### Initial Modelling

### Imports

In [1]:
# General Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# General modeling imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor


In [2]:
# Load the modelling dataset from the analysis data folder
df = pd.read_csv(filepath_or_buffer='../Data/Analysis/model_data.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,time,oil_price,hour_of_day,date,t_price_0,t_price_1,t_price_2,t_price_3,t_price_4,t_price_5,...,y_price_18,y_price_17,y_price_16,y_price_15,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2015-01-02 14:00:00+00:00,43.8585,14,2015-01-02,66.82,63.35,58.79,57.44,55.29,56.22,...,74.26,64.74,61.18,59.76,0,0,0,0,0,0
1,2015-01-03 14:00:00+00:00,43.8585,14,2015-01-03,55.22,50.54,48.68,48.02,47.06,46.79,...,82.55,72.85,70.64,71.24,0,1,0,0,0,0
2,2015-01-04 14:00:00+00:00,43.7237,14,2015-01-04,70.77,64.89,60.91,59.68,58.04,59.57,...,71.5,66.69,62.03,62.76,0,0,1,0,0,0
3,2015-01-05 14:00:00+00:00,41.9574,14,2015-01-05,71.48,64.76,60.22,57.14,53.94,53.43,...,71.98,61.12,52.15,52.07,1,0,0,0,0,0
4,2015-01-06 14:00:00+00:00,40.3817,14,2015-01-06,67.24,64.17,62.12,62.11,60.05,62.48,...,89.08,80.53,71.85,71.5,0,0,0,0,1,0


In [5]:
# Remove the 'time' column
df = df.drop(columns=['time'])

In [8]:
# Index the rows by a DatetimeIndex built from the 'date' column
# (the 'date' column itself is still present after this step)
df = df.set_index(pd.DatetimeIndex(df['date']))

In [9]:
# Order the rows by the datetime index
df = df.sort_index()

In [12]:
# Remove 'hour_of_day' and the 'date' column (the date now lives in the index)
df = df.drop(columns=['hour_of_day', 'date'])

In [13]:
# Preview the first five rows after re-indexing and column removal
df.head(n=5)

Unnamed: 0_level_0,oil_price,t_price_0,t_price_1,t_price_2,t_price_3,t_price_4,t_price_5,t_price_6,t_price_7,t_price_8,...,y_price_18,y_price_17,y_price_16,y_price_15,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,43.8585,66.82,63.35,58.79,57.44,55.29,56.22,58.13,62.06,67.36,...,74.26,64.74,61.18,59.76,0,0,0,0,0,0
2015-01-03,43.8585,55.22,50.54,48.68,48.02,47.06,46.79,47.63,47.44,50.84,...,82.55,72.85,70.64,71.24,0,1,0,0,0,0
2015-01-04,43.7237,70.77,64.89,60.91,59.68,58.04,59.57,69.73,72.97,77.92,...,71.5,66.69,62.03,62.76,0,0,1,0,0,0
2015-01-05,41.9574,71.48,64.76,60.22,57.14,53.94,53.43,56.68,63.49,66.75,...,71.98,61.12,52.15,52.07,1,0,0,0,0,0
2015-01-06,40.3817,67.24,64.17,62.12,62.11,60.05,62.48,70.01,75.2,83.3,...,89.08,80.53,71.85,71.5,0,0,0,0,1,0


### Functions Used

In [36]:
def lin_reg_metrics(lr, X_train, y_train, y_pred, X_test, y_test):
    """Print a test RMSE and the train/test scores of a fitted model.

    Three lines are printed, each followed by a blank line:

    1. The square root of the mean of ``(y_test - y_pred) ** 2``.
    2. ``lr.score(X_train, y_train)``, labelled as the train R^2.
    3. ``lr.score(X_test, y_test)``, labelled as the test R^2.

    Parameters
    ----------
    lr : object
        Fitted estimator exposing ``score(X, y)``.
    X_train, y_train
        Training features and targets, passed straight to ``lr.score``.
    y_pred
        Predictions to compare against ``y_test``.
    X_test, y_test
        Test features and targets.

    Returns
    -------
    None
        The function only prints.

    Notes
    -----
    The mean is taken with the ``.mean()`` method of the squared-difference
    object, so the RMSE may be a single number or one value per column
    depending on the type of ``y_test`` / ``y_pred`` -- TODO confirm which
    is intended for multi-column targets.
    """
    squared_residuals = (y_test - y_pred) ** 2
    test_rmse = np.sqrt(squared_residuals.mean())
    print(f'Test RMSE Score (original units): {test_rmse} \n')

    train_r2 = lr.score(X_train, y_train)
    print(f'Train Score (R^2): {train_r2} \n')

    test_r2 = lr.score(X_test, y_test)
    print(f'Test Score (R^2): {test_r2} \n')

### Prepare Data

In [17]:
# Target columns: every column whose name starts with 't_price'
is_target = df.columns.str.startswith('t_price')
y_cols = df.columns[is_target].tolist()

In [19]:
# Targets are the t_price columns; features are every remaining column
y = df.loc[:, y_cols]
X = df.drop(columns=y_cols)

In [31]:
# Split into train and test sets without shuffling, so the rows keep
# the date-sorted order of the frame
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

In [32]:
# Fit a linear regression on the training split (fit returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

In [33]:
# Model score on the training split
lr.score(X=X_train, y=y_train)



0.8478348240813925

In [34]:
# Model score on the held-out test split
lr.score(X=X_test, y=y_test)



0.7107352824933267

In [37]:
# Predict the target columns for the test split
pred = lr.predict(X=X_test)

In [38]:
# Display the predictions returned by lr.predict for the test split
pred

array([[31.77059248, 30.55427902, 27.57187385, ..., 51.32963265,
        47.53332965, 40.40510643],
       [30.15185231, 25.41985807, 20.94103778, ..., 36.87993257,
        32.60202683, 29.84014527],
       [50.16630009, 44.52623886, 40.49694906, ..., 52.43709673,
        48.55004197, 45.03318894],
       ...,
       [65.77789209, 60.29483015, 57.05887473, ..., 68.54299784,
        65.67694664, 64.09738788],
       [61.80365175, 57.07844341, 52.00062984, ..., 73.48192982,
        70.69542062, 66.51388586],
       [69.19704776, 61.73020239, 57.16651035, ..., 78.52279481,
        73.58317252, 69.80582645]])