# Regression Modeling

Notebook for testing various non-neural net regression models. Going through GridSearches to find best hyperparameters and the best models.

## Problem Statement

Predict electricity prices in Spain for each hour of the upcoming day more accurately than estimates provided by the Spanish transmission agent and operator. 

Use information available during the 2pm-3pm window the previous day during which generators in Spain submit their bids. 

## Contents

### Imports

In [193]:
# General Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# General modeling imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

In [32]:
df = pd.read_csv('../Data/Analysis/model_data.csv')
visuals = pd.read_csv('../Data/intermediary/energy.csv')

In [33]:
df.head(3)

Unnamed: 0,time,oil_price,hour_of_day,date,t_price_0,t_price_1,t_price_2,t_price_3,t_price_4,t_price_5,...,y_price_18,y_price_17,y_price_16,y_price_15,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,2015-01-02 14:00:00+00:00,43.8585,14,2015-01-02,66.82,63.35,58.79,57.44,55.29,56.22,...,74.26,64.74,61.18,59.76,0,0,0,0,0,0
1,2015-01-03 14:00:00+00:00,43.8585,14,2015-01-03,55.22,50.54,48.68,48.02,47.06,46.79,...,82.55,72.85,70.64,71.24,0,1,0,0,0,0
2,2015-01-04 14:00:00+00:00,43.7237,14,2015-01-04,70.77,64.89,60.91,59.68,58.04,59.57,...,71.5,66.69,62.03,62.76,0,0,1,0,0,0


In [34]:
# visuals contains the prices attached to their original hour
# and will be used to visualize and compare our predictions
visuals.head(3)

Unnamed: 0,time,generation_biomass,generation_fossil_brown_coal/lignite,generation_fossil_gas,generation_fossil_hard_coal,generation_fossil_oil,generation_hydro_pumped_storage_consumption,generation_hydro_run-of-river_and_poundage,generation_hydro_water_reservoir,generation_nuclear,...,generation_wind_onshore,forecast_solar_day_ahead,forecast_wind_onshore_day_ahead,total_load_forecast,total_load_actual,price_day_ahead,price_actual,diff,day_of_week,hour_of_day
0,2015-01-01 00:00:00+00:00,447.0,329.0,4844.0,4821.0,162.0,863.0,1051.0,1899.0,7096.0,...,6378.0,17.0,6436.0,26118.0,25385.0,50.1,65.41,2866.0,3,0
1,2015-01-01 01:00:00+00:00,449.0,328.0,5196.0,4755.0,158.0,920.0,1009.0,1658.0,7096.0,...,5890.0,16.0,5856.0,24934.0,24382.0,48.1,64.92,3436.0,3,1
2,2015-01-01 02:00:00+00:00,448.0,323.0,4857.0,4581.0,157.0,1164.0,973.0,1371.0,7099.0,...,5461.0,8.0,5454.0,23515.0,22734.0,47.33,64.48,4062.0,3,2


In [35]:
visuals.set_index(pd.DatetimeIndex(visuals['time']), inplace=True)
visuals = visuals[['price_actual','price_day_ahead']]
visuals.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 00:00:00+00:00,65.41,50.1
2015-01-01 01:00:00+00:00,64.92,48.1


### Functions Used

In [89]:
# Function for evaluating the regressions
# and outputting a dataframe of different metrics
# for each hour predicted
def reg_metrics(y_train, y_train_p, y_test, y_test_p, mod):
    test_rmse = np.sqrt(((y_test-y_test_p)**(2)).mean())
    train_r2 = metrics.r2_score(y_train, y_train_p, multioutput='raw_values')
    test_r2 =  metrics.r2_score(y_test, y_test_p, multioutput='raw_values')
    metrics_df = pd.DataFrame(data = zip(test_rmse, train_r2, test_r2),
                              columns=[mod+'test_rmse',mod+'train_r2',mod+'test_r2'])
    return metrics_df

In [122]:
# function to convert predictions into dataframe for plotting
def append_preds(preds, previous_preds, name):
    new_preds = pd.DataFrame(np.ravel(preds),columns=[name], index=previous_preds.index)
    return previous_preds.join(new_preds)

### Baseline

For a basline comparison we will look at the rmse and r2 scores for the actual prices and day ahead prices. This will give us a sense of how our models compare to previous estimates

In [80]:
np.sqrt(((visuals['price_actual']-visuals['price_day_ahead'])**(2)).mean())

13.249856853754808

Our baseline rmse is 13.25 euros.

In [82]:
metrics.r2_score(visuals['price_actual'],visuals['price_day_ahead'])

0.12982152162245753

In [87]:
visuals.corr()

Unnamed: 0,price_actual,price_day_ahead
price_actual,1.0,0.732155
price_day_ahead,0.732155,1.0


### Prepare Data

In [37]:
# Set up data frame for modeling
# Drop time column
df.drop(columns=['time'], inplace=True)
# set index as date
df.set_index(pd.DatetimeIndex(df['date']), inplace=True)
# sort index
df.sort_index(inplace=True)
# drop hour of day and date column
df.drop(columns=['hour_of_day','date'], inplace=True)
# check result
df.head(3)

Unnamed: 0_level_0,oil_price,t_price_0,t_price_1,t_price_2,t_price_3,t_price_4,t_price_5,t_price_6,t_price_7,t_price_8,...,y_price_18,y_price_17,y_price_16,y_price_15,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,43.8585,66.82,63.35,58.79,57.44,55.29,56.22,58.13,62.06,67.36,...,74.26,64.74,61.18,59.76,0,0,0,0,0,0
2015-01-03,43.8585,55.22,50.54,48.68,48.02,47.06,46.79,47.63,47.44,50.84,...,82.55,72.85,70.64,71.24,0,1,0,0,0,0
2015-01-04,43.7237,70.77,64.89,60.91,59.68,58.04,59.57,69.73,72.97,77.92,...,71.5,66.69,62.03,62.76,0,0,1,0,0,0


In [38]:
# Get columns for y
y_cols = [col for col in df.columns if col.startswith('t_price')]

In [39]:
# Set X and y
X = df.drop(columns=y_cols)
y = df[y_cols]

In [40]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    shuffle=False)

# Scaled data for KNeighbors
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [96]:
# Get data frame of predictions for plotting
y_test.head(1)

Unnamed: 0_level_0,t_price_0,t_price_1,t_price_2,t_price_3,t_price_4,t_price_5,t_price_6,t_price_7,t_price_8,t_price_9,...,t_price_14,t_price_15,t_price_16,t_price_17,t_price_18,t_price_19,t_price_20,t_price_21,t_price_22,t_price_23
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-31,20.76,19.02,17.9,20.88,18.83,15.8,18.7,19.97,21.91,17.82,...,17.69,17.62,16.99,20.66,28.2,32.37,30.39,32.2,32.9,26.95


Our test data starts on the last day of 2017 meaning that predictions from it pertain to all 2018

In [135]:
pred_df = visuals[visuals.index.year == 2018]
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00+00:00,20.76,6.74
2018-01-01 01:00:00+00:00,19.02,4.74


### Linear Regression

A baseline model, does not have any hyperparameters that we want to search over.

In [74]:
# Linear regression
# Fit model
lr = LinearRegression()
lr.fit(X_train, y_train)
# Get predictions
lr_train_preds = lr.predict(X_train)
lr_test_preds = lr.predict(X_test)
# Get stats
lr_metrics = reg_metrics(y_train, lr_train_preds, y_test, lr_test_preds,'lin_reg_')
# Display summary
lr_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lin_reg_test_rmse,24.0,5.978276,0.405687,5.402746,5.659744,5.906032,6.190153,6.82617
lin_reg_train_r2,24.0,0.844108,0.024345,0.79403,0.828518,0.853359,0.860473,0.878429
lin_reg_test_r2,24.0,0.698824,0.079239,0.517575,0.694516,0.721589,0.744032,0.798843


From our linear regression we can see that we have significantly improved the rmse, getting a mean of 5.98 euros vs over 13 for our baseline. However, the model is clearly overfit with a 0.15 difference between train and test rmse. Let's see if we can improve this by running and elastic net to test different regularization types and strengths.

In [136]:
# Append predictions to pred df
pred_df = append_preds(lr_test_preds, pred_df, 'lin_reg')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279


### Elastic Net

Our elastic net will allow us to try different combinations of LASSO (L1) and Ridge (L2) regularization. The 2 hyperparameters we will search over are:
- alpha: regularization strength, higher alpha means more regularization
- l1_ratio: how much LASSO vs Ridge regularization is used, an l1_ratio of 1 mean pure LASSO regularization

In [132]:
# Set up a list of alphas to check.
enet_alphas = np.linspace(0.01, 5, 20)

# Set up our l1 ratio
enet_ratio = np.linspace(0.01, 1, 20)

# Pipe parameters
en_params = {
    'alpha': enet_alphas,
    'l1_ratio': enet_ratio
}

# Instantiate GridSearchCV
en_gs = GridSearchCV(ElasticNet(), 
                    en_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
en_gs.fit(X_train,y_train);

# Get predictions
en_train_preds = en_gs.predict(X_train)
en_test_preds = en_gs.predict(X_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {en_gs.best_params_}\n')

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best hyperparameters: {'alpha': 0.01, 'l1_ratio': 1.0}



  positive)
  positive)
  positive)
  positive)


In [133]:
# Get stats
en_metrics = reg_metrics(y_train, en_train_preds, y_test, en_test_preds, 'e_net_')
# Display summary
en_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
e_net_test_rmse,24.0,5.942493,0.400009,5.368821,5.615814,5.918572,6.140093,6.767992
e_net_train_r2,24.0,0.843372,0.024433,0.792995,0.827807,0.852643,0.859852,0.877983
e_net_test_r2,24.0,0.7027,0.076778,0.526648,0.70007,0.723849,0.74714,0.801361


Our regularization has not done much to improve the issue of overfitting, though the test r2 and rmse are slightly improved from the simple linear regression.

Additionally we need to be wary of the fact that we had numerous convergence warnings and can't completely trust the results of this gridsearch.

Of note the alpha and l1_ratio indicate that our model performed best with a weak LASSO penalty. This makes sense intuitively as LASSO will 'zero out' coefficients and our X data includes terms from many different hours of the day so for each of our y predictions it likely makes sense to not include past terms from hours not similar to the hour we are attempting to predict. 

In [137]:
# append predictions to predictions df
pred_df = append_preds(en_test_preds, pred_df, 'e_net')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg,e_net
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592,30.864549
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279,28.474588


### *K*NN

In [77]:
# Parameters
knn_params = {
    'n_neighbors': [3,5,11,15]
}

# Instantiate GridSearchCV
knn_gs = GridSearchCV(KNeighborsRegressor(), 
                    knn_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
knn_gs.fit(Z_train,y_train);

# Get predictions
knn_train_preds = knn_gs.predict(Z_train)
knn_test_preds = knn_gs.predict(Z_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {knn_gs.best_params_}\n')

Best hyperparameters: {'n_neighbors': 11}



In [76]:
# Get stats
knn_metrics = reg_metrics(y_train, knn_train_preds, y_test, knn_test_preds,'knn_')
# Display summary
knn_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
knn_test_rmse,24.0,7.472363,0.698402,5.993699,6.928633,7.612452,8.018745,8.487968
knn_train_r2,24.0,0.753281,0.017677,0.724929,0.740688,0.754058,0.761393,0.783366
knn_test_r2,24.0,0.542787,0.046053,0.445879,0.511994,0.54151,0.574639,0.616905


While an improvement on the baseline our knn model is clearly worse than the previous linear models and thus not worth considering as our final model.

In [138]:
# append predictions to predictions df
pred_df = append_preds(en_test_preds, pred_df, 'knn')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg,e_net,knn
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592,30.864549,30.864549
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279,28.474588,28.474588


### Random Forest

In [70]:
# Parameters
rf_params = {
    'max_depth': [5,15,None],
    'warm_start': [True, False],
    'min_samples_leaf': [1,5,15]
}

# Instantiate GridSearchCV
rf_gs = GridSearchCV(RandomForestRegressor(), 
                    rf_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
rf_gs.fit(X_train,y_train);

# Get predictions
rf_train_preds = rf_gs.predict(X_train)
rf_test_preds = rf_gs.predict(X_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {rf_gs.best_params_}\n')

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'warm_start': True}



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
test_rmse,24.0,6.873277,0.836594,5.540561,6.110746,6.927468,7.352547,8.375877
train_r2,24.0,0.966834,0.003503,0.959394,0.965325,0.967594,0.969373,0.971603
test_r2,24.0,0.611773,0.066241,0.490914,0.564321,0.619236,0.662106,0.709628


In [206]:
# Get stats
rf_metrics = reg_metrics(y_train, rf_train_preds, y_test, rf_test_preds, 'knn_')
# Display summary
rf_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
knn_test_rmse,24.0,6.89289,0.865281,5.520268,6.11885,6.911735,7.399846,8.491909
knn_train_r2,24.0,0.966443,0.003234,0.959216,0.964651,0.966897,0.968481,0.971398
knn_test_r2,24.0,0.60987,0.066824,0.496458,0.564644,0.621383,0.662448,0.707163


This was not a particularly good result, ending in a wildly overfit model. Let's try again, testing more hyperparameters to see if we can improve the model.

In [72]:
# Parameters
rf_params = {
    'max_depth': [7, 9, 11],
    'min_samples_split': [5,7,9],
    'n_estimators':[125]
}

# Instantiate GridSearchCV
rf_gs = GridSearchCV(RandomForestRegressor(), 
                    rf_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
rf_gs.fit(X_train,y_train);

# Get predictions
rf_train_preds = rf_gs.predict(X_train)
rf_test_preds = rf_gs.predict(X_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {rf_gs.best_params_}\n')

Best hyperparameters: {'max_depth': 11, 'min_samples_split': 5, 'n_estimators': 125}



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
test_rmse,24.0,6.908248,0.868783,5.515654,6.169793,6.946663,7.410626,8.486542
train_r2,24.0,0.927524,0.014136,0.891138,0.91894,0.929317,0.936305,0.947962
test_r2,24.0,0.608473,0.065143,0.501732,0.564371,0.619814,0.65219,0.707049


In [None]:
# Get stats
rf_metrics = reg_metrics(y_train, rf_train_preds, y_test, rf_test_preds)
# Display summary
rf_metrics.describe().T

Attempting to lower the variance of the model succeded in lowering the difference between the train and test R2 values. However, this was not because our test R2 improved but rather because our train R2 was lowered. 

Based on these results it seems as though random forest is not a good model for what we are attempting. Let's create a new random forest using the best parameters (from the first gridsearch) so that we can keep the results for our analysis.

In [177]:
# Our best hyperparameters ended up being the base parameters 
# for an sklearn random forest
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

# Get predictions
rf_train_preds = rf.predict(X_train)
rf_test_preds = rf.predict(X_test)

# Get stats
rf_metrics = reg_metrics(y_train, rf_train_preds, y_test, rf_test_preds,'rf_')
# Display summary
rf_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rf_test_rmse,24.0,6.89289,0.865281,5.520268,6.11885,6.911735,7.399846,8.491909
rf_train_r2,24.0,0.966443,0.003234,0.959216,0.964651,0.966897,0.968481,0.971398
rf_test_r2,24.0,0.60987,0.066824,0.496458,0.564644,0.621383,0.662448,0.707163


In [141]:
# append predictions to predictions df
pred_df = append_preds(rf_test_preds, pred_df, 'rf')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg,e_net,knn,rf
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592,30.864549,30.864549,35.2893
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279,28.474588,28.474588,31.3505


### AdaBoost

In [187]:
# Parameters
ada_params = {
    'estimator__n_estimators': [50,100],
    'estimator__loss': ['linear', 'exponential', 'square'],
    'estimator__random_state': [33]
}
# thanks Marco Antonio Yamada for helping w/MultiOutputRegressor
# https://stackoverflow.com/questions/43532811/gridsearch-over-multioutputregressor
ada = AdaBoostRegressor()

# Instantiate GridSearchCV
ada_gs = GridSearchCV(MultiOutputRegressor(ada), 
                    ada_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
ada_gs.fit(X_train,y_train);

# Get predictions
ada_train_preds = ada_gs.predict(X_train)
ada_test_preds = ada_gs.predict(X_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {ada_gs.best_params_}\n')

Best hyperparameters: {'estimator__loss': 'linear', 'estimator__n_estimators': 100, 'estimator__random_state': 33}



In [194]:
# Get stats
ada_metrics = reg_metrics(y_train, ada_train_preds, y_test, ada_test_preds, 'ada_')
# Display summary
ada_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ada_test_rmse,24.0,7.119702,0.729551,5.683866,6.432974,7.259775,7.732652,8.080586
ada_train_r2,24.0,0.810865,0.020563,0.775386,0.79324,0.811091,0.827315,0.847691
ada_test_r2,24.0,0.579495,0.095926,0.224895,0.565427,0.586845,0.638845,0.671181


In [196]:
# append predictions to predictions df
pred_df = append_preds(ada_test_preds, pred_df, 'ada')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg,e_net,knn,rf,ada
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592,30.864549,30.864549,35.2893,38.492384
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279,28.474588,28.474588,31.3505,30.774154


### Support Vector Regressor

In [195]:
# SVR uses standardized x variables
# Parameters
svr_params = {
    'estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'estimator__gamma': ['scale','auto'],
    'estimator__C':[0.1, 1, 2]
}


svr = SVR()

# Instantiate GridSearchCV
svr_gs = GridSearchCV(MultiOutputRegressor(svr), 
                    svr_params, 
                    cv=TimeSeriesSplit(n_splits=4),
                    n_jobs = 2) 

# Fit
svr_gs.fit(Z_train,y_train);

# Get predictions
svr_train_preds = svr_gs.predict(Z_train)
svr_test_preds = svr_gs.predict(Z_test)

# Show metrics and best parameters
print(f'Best hyperparameters: {svr_gs.best_params_}\n')

Best hyperparameters: {'estimator__C': 1, 'estimator__gamma': 'scale', 'estimator__kernel': 'linear'}



In [198]:
# Get stats
svr_metrics = reg_metrics(y_train, svr_train_preds, y_test, svr_test_preds, 'svr_')
# Display summary
svr_metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
svr_test_rmse,24.0,5.861017,0.347601,5.38048,5.565653,5.916765,6.092138,6.528117
svr_train_r2,24.0,0.829972,0.02695,0.777344,0.813562,0.840064,0.848223,0.867341
svr_test_r2,24.0,0.712508,0.064067,0.564932,0.703917,0.732584,0.74876,0.797224


In [200]:
# append predictions to predictions df
pred_df = append_preds(svr_test_preds, pred_df, 'svr')
pred_df.head(2)

Unnamed: 0_level_0,price_actual,price_day_ahead,lin_reg,e_net,knn,rf,ada,svr
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01 00:00:00+00:00,20.76,6.74,31.770592,30.864549,30.864549,35.2893,38.492384,29.132312
2018-01-01 01:00:00+00:00,19.02,4.74,30.554279,28.474588,28.474588,31.3505,30.774154,28.383815


### Results for Analysis

In [203]:
pred_df.to_csv('../Data/Analysis/reg_predictions.csv')

In [201]:
metrics_df = pd.concat([lr_metrics, en_metrics,
                        knn_metrics, rf_metrics,
                        ada_metrics, svr_metrics],
                     axis=1, sort=False)
metrics_df.head()

Unnamed: 0,lin_reg_test_rmse,lin_reg_train_r2,lin_reg_test_r2,e_net_test_rmse,e_net_train_r2,e_net_test_r2,knn_test_rmse,knn_train_r2,knn_test_r2,rf_test_rmse,rf_train_r2,rf_test_r2,ada_test_rmse,ada_train_r2,ada_test_r2,svr_test_rmse,svr_train_r2,svr_test_r2
0,5.748025,0.83747,0.714437,5.772251,0.836942,0.712025,7.165323,0.761384,0.556252,6.482274,0.969178,0.636822,6.264047,0.827333,0.660863,5.832793,0.82198,0.705952
1,6.103116,0.825918,0.717623,6.119451,0.825228,0.716109,7.893097,0.748584,0.527696,7.369454,0.96832,0.588284,6.897932,0.833366,0.639285,6.168888,0.806493,0.711503
2,5.95236,0.817095,0.743166,5.970468,0.816358,0.741601,8.104286,0.740708,0.523894,7.679308,0.967051,0.572517,7.127891,0.82659,0.631704,5.997368,0.797643,0.739267
3,5.859704,0.807637,0.751702,5.866675,0.807067,0.751111,8.214508,0.735229,0.512039,7.998256,0.965623,0.537393,7.612675,0.812419,0.58092,5.926283,0.790757,0.746028
4,5.693829,0.801997,0.762886,5.669691,0.801217,0.764892,8.167019,0.727499,0.512163,8.184219,0.962909,0.510106,7.552819,0.806807,0.582779,5.623372,0.781412,0.768718


In [202]:
metrics_df.to_csv('../Data/Analysis/reg_metrics.csv')

In [204]:
metrics_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lin_reg_test_rmse,24.0,5.978276,0.405687,5.402746,5.659744,5.906032,6.190153,6.82617
lin_reg_train_r2,24.0,0.844108,0.024345,0.79403,0.828518,0.853359,0.860473,0.878429
lin_reg_test_r2,24.0,0.698824,0.079239,0.517575,0.694516,0.721589,0.744032,0.798843
e_net_test_rmse,24.0,5.942493,0.400009,5.368821,5.615814,5.918572,6.140093,6.767992
e_net_train_r2,24.0,0.843372,0.024433,0.792995,0.827807,0.852643,0.859852,0.877983
e_net_test_r2,24.0,0.7027,0.076778,0.526648,0.70007,0.723849,0.74714,0.801361
knn_test_rmse,24.0,7.472363,0.698402,5.993699,6.928633,7.612452,8.018745,8.487968
knn_train_r2,24.0,0.753281,0.017677,0.724929,0.740688,0.754058,0.761393,0.783366
knn_test_r2,24.0,0.542787,0.046053,0.445879,0.511994,0.54151,0.574639,0.616905
rf_test_rmse,24.0,6.89289,0.865281,5.520268,6.11885,6.911735,7.399846,8.491909
