# Parameter Tuning
    
    Find best predictive window & best number of splits

# Importing libraries:

In [6]:
!pip install pandas
!pip install sklearn

import pandas as pd;
import numpy as np;
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler;
from sklearn import metrics;
from sklearn.model_selection import TimeSeriesSplit;
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

# empty list to append metric values
mae_gen = []
mse_gen  =[]
rmse_gen = []
mae_nor = []
mae_spi = []
mse_nor = []
mse_spi = []
rmse_nor = []
rmse_spi = []



# Prepare parameters:

In [7]:
from sklearn.model_selection import ParameterGrid

# Dictionary to include the parameters
parameters = {'dates':  [2017010000,
                         2017030000, 
                         2017050000, 
                         2017070000,
                         2017090000, 
                         2017110000,
                         2018010000, 
                         2018030000, 
                         2018050000,
                         2018070000,
                         2018090000,
                         2018110000],
              'splits': [4, 
                         5, 
                         6, 
                         7,
                         8, 
                         9, 
                         10, 
                         11]
               }

all_param = ParameterGrid(parameters)

# Data set processing and tuning:

In [8]:
for i in range(len(all_param)):
    
    # parameters to change
    splits = all_param[i]['splits']
    dates = all_param[i]['dates']
    
    # data
    data = pd.read_csv('Data_set_1_smaller.csv', index_col = 0)
    data = data.loc[data.index > dates, :]
    
    # reset index
    data.reset_index(inplace = True)
    data.drop('index', axis = 1, inplace = True)
    
    # Divide features and labels
    X = data.iloc[:, 0:15]
    y = data.loc[:, 'Offers']
    
    X.fillna(X.mean(), inplace = True)
    y.fillna(y.mean(), inplace = True)
    
    X = X.astype('float64')
    X = X.round(20)
    
    # divide data into train and test with 15% test data
    X_train, X_test, y_train, y_test = train_test_split(
             X, y, test_size = 0.15, shuffle=False)
    
    # feature scaling
    sc_X = MinMaxScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    
    # create time series split for CV
    tscv = TimeSeriesSplit(n_splits = splits)
    
    # create linear regressor 
    regressor = LinearRegression()
    
    for train_index, test_index in tscv.split(X_train):
          X_train_split, X_test_split = X_train[train_index], X_train[test_index]
          y_train_split, y_test_split = y_train[train_index], y_train[test_index]
          regressor.fit(X_train_split, y_train_split)
    
    # predict for X_test  
    y_pred = regressor.predict(X_test)

    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae

    rmse_error = mse(y_test, y_pred, squared = False)
    mse_error = mse(y_test, y_pred) # 1479.61335
    mae_error = mae(y_test, y_pred) # 23.1525
    
    rmse_gen.append(rmse_error)
    mse_gen.append(mse_error)
    mae_gen.append(mae_error)
    
    # =============================================================================
    # Metrics evaluation on spike regions
    # =============================================================================
    
    y_spike_occ = pd.read_csv('Spike_binary_1std.csv', usecols = [6])
    
    # create array same size as y_test
    y_spike_occ = y_spike_occ.iloc[- len(y_test):]
    y_spike_occ = pd.Series(y_spike_occ.iloc[:,0]).values
    
    # smal adjustment
    y_test.replace(0, 0.0001,inplace = True)

    # select y_pred and y_test only for regions with spikes
    y_test_spike = (y_test.T * y_spike_occ).T
    y_pred_spike = (y_pred.T * y_spike_occ).T
    y_test_spike = y_test_spike[y_test_spike != 0]
    y_pred_spike = y_pred_spike[y_pred_spike != 0]
    
    # calculate metric
    rmse_spike = mse(y_test_spike, y_pred_spike, squared = False)
    mse_spike = mse(y_test_spike, y_pred_spike)
    mae_spike = mae(y_test_spike, y_pred_spike)
    
    rmse_spi.append(rmse_spike)
    mse_spi.append(mse_spike)
    mae_spi.append(mae_spike)
    
    # =============================================================================
    # Metric evaluation on normal regions
    # =============================================================================
    
    # inverse y_spike_occ so the only normal occurences are chosen
    y_normal_occ = (y_spike_occ - 1) * (-1)
    
    # sanity check
    y_normal_occ.sum() + y_spike_occ.sum() # gives the correct total 
    
    # select y_pred and y_test only for normal regions
    y_test_normal = (y_test.T * y_normal_occ).T
    y_pred_normal = (y_pred.T * y_normal_occ).T
    y_test_normal = y_test_normal[y_test_normal != 0.00]
    y_pred_normal = y_pred_normal[y_pred_normal != 0.00]
    
    # calculate metric
    rmse_normal = mse(y_test_normal, y_pred_normal, squared = False)
    mse_normal = mse(y_test_normal, y_pred_normal)
    mae_normal = mae(y_test_normal, y_pred_normal)
    
    rmse_nor.append(rmse_normal)
    mse_nor.append(mse_normal)
    mae_nor.append(mae_normal)  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a s

In [19]:
results = pd.DataFrame({'all_param':all_param,
                        
                        'rmse_general': rmse_gen, 
                 
                        'mae_general': mae_gen,
                        
                        'rmse_spike': rmse_spi,
                 
                        'mae_spike': mae_spi,
                        
                        'rmse_normal': rmse_nor,
                    
                        'mae_normal': mae_nor})

for i in range(len(all_param)):
    print(all_param[i])

{'splits': 4, 'dates': 2017010000}
{'splits': 5, 'dates': 2017010000}
{'splits': 6, 'dates': 2017010000}
{'splits': 7, 'dates': 2017010000}
{'splits': 8, 'dates': 2017010000}
{'splits': 9, 'dates': 2017010000}
{'splits': 10, 'dates': 2017010000}
{'splits': 11, 'dates': 2017010000}
{'splits': 4, 'dates': 2017030000}
{'splits': 5, 'dates': 2017030000}
{'splits': 6, 'dates': 2017030000}
{'splits': 7, 'dates': 2017030000}
{'splits': 8, 'dates': 2017030000}
{'splits': 9, 'dates': 2017030000}
{'splits': 10, 'dates': 2017030000}
{'splits': 11, 'dates': 2017030000}
{'splits': 4, 'dates': 2017050000}
{'splits': 5, 'dates': 2017050000}
{'splits': 6, 'dates': 2017050000}
{'splits': 7, 'dates': 2017050000}
{'splits': 8, 'dates': 2017050000}
{'splits': 9, 'dates': 2017050000}
{'splits': 10, 'dates': 2017050000}
{'splits': 11, 'dates': 2017050000}
{'splits': 4, 'dates': 2017070000}
{'splits': 5, 'dates': 2017070000}
{'splits': 6, 'dates': 2017070000}
{'splits': 7, 'dates': 2017070000}
{'splits': 8, 

In [20]:
results_1 = pd.DataFrame({                        
                        'rmse_general': rmse_gen, 
                 
                        'mae_general': mae_gen,
                        
                        'rmse_spike': rmse_spi,
                 
                        'mae_spike': mae_spi,
                        
                        'rmse_normal': rmse_nor,
                    
                        'mae_normal': mae_nor})

def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

results_1.style.apply(highlight_min)

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal
0,47.007429,33.585565,86.587406,57.351248,33.14152,28.590614
1,46.502913,32.570383,87.246191,57.891265,31.893548,27.248568
2,46.229773,31.996806,87.629968,58.199055,31.183974,26.48975
3,46.239043,32.097747,87.490483,58.073913,31.282767,26.638207
4,46.252955,32.186396,87.396737,58.007267,31.362629,26.759496
5,46.142993,31.949053,87.538296,58.118815,31.08238,26.448825
6,46.05559,31.777541,87.630975,58.192784,30.869891,26.225718
7,46.007374,31.664514,87.708863,58.253288,30.736103,26.076219
8,46.344051,31.41502,88.936203,57.802076,30.867118,25.942443
9,46.05521,30.755547,89.349777,58.090586,30.087589,25.086362


In [21]:
print(results.iloc[80, :])
print(results.iloc[88, :])

all_param       {'dates': 2018090000, 'splits': 4}
rmse_general                               34.1209
mae_general                                28.5281
rmse_spike                                 61.6195
mae_spike                                  50.6259
rmse_normal                                27.7119
mae_normal                                 25.2019
Name: 80, dtype: object
all_param       {'dates': 2018110000, 'splits': 4}
rmse_general                                30.829
mae_general                                22.1787
rmse_spike                                 63.5435
mae_spike                                  51.0044
rmse_normal                                20.8449
mae_normal                                 17.3617
Name: 88, dtype: object


In [None]:
!pip install matplotlib

%matplotlib notebook

import matplotlib.pyplot as plt

In [None]:
for i in range(6):
    one.append(rmse_spi[8 * i])
    two.append(rmse_spi[1 + (8 * i)])
    three.append(rmse_spi[2 + (8 * i)])
    four.append(rmse_spi[3 + (8 * i)])
    five.append(rmse_spi[4 + (8 * i)])
    six.append(rmse_spi[5 + (8 * i)])
    seven.append(rmse_spi[6 + (8 * i)])
    eight.append(rmse_spi[7 + (8 * i)])

In [None]:
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('RMSE (£/MWh)')
plt.title('RMSE on spike regions for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('RMSE_spike_n_hidden_n_neurons.png')

In [None]:
one = []
two = []
three = []
four = []
five = []
six = []
seven = []
eight = []


for i in range(6):
    one.append(rmse_nor[8 * i])
    two.append(rmse_nor[1 + (8 * i)])
    three.append(rmse_nor[2 + (8 * i)])
    four.append(rmse_nor[3 + (8 * i)])
    five.append(rmse_nor[4 + (8 * i)])
    six.append(rmse_nor[5 + (8 * i)])
    seven.append(rmse_nor[6 + (8 * i)])
    eight.append(rmse_nor[7 + (8 * i)])
    
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('RMSE (£/MWh)')
plt.title('RMSE on normal regions for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('RMSE_normal_n_hidden_n_neurons.png')

In [None]:
one = []
two = []
three = []
four = []
five = []
six = []
seven = []
eight = []


for i in range(6):
    one.append(rmse_gen[8 * i])
    two.append(rmse_gen[1 + (8 * i)])
    three.append(rmse_gen[2 + (8 * i)])
    four.append(rmse_gen[3 + (8 * i)])
    five.append(rmse_gen[4 + (8 * i)])
    six.append(rmse_gen[5 + (8 * i)])
    seven.append(rmse_gen[6 + (8 * i)])
    eight.append(rmse_gen[7 + (8 * i)])
    
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('RMSE (£/MWh)')
plt.title('RMSE for all test set for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('RMSE_general_n_hidden_n_neurons.png')

In [None]:
one = []
two = []
three = []
four = []
five = []
six = []
seven = []
eight = []


for i in range(6):
    one.append(mae_spi[8 * i])
    two.append(mae_spi[1 + (8 * i)])
    three.append(mae_spi[2 + (8 * i)])
    four.append(mae_spi[3 + (8 * i)])
    five.append(mae_spi[4 + (8 * i)])
    six.append(mae_spi[5 + (8 * i)])
    seven.append(mae_spi[6 + (8 * i)])
    eight.append(mae_spi[7 + (8 * i)])
    
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('MAE (£/MWh)')
plt.title('MAE on spike regions for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('MAE_spike_n_hidden_n_neurons.png')

In [None]:
one = []
two = []
three = []
four = []
five = []
six = []
seven = []
eight = []


for i in range(6):
    one.append(mae_nor[8 * i])
    two.append(mae_nor[1 + (8 * i)])
    three.append(mae_nor[2 + (8 * i)])
    four.append(mae_nor[3 + (8 * i)])
    five.append(mae_nor[4 + (8 * i)])
    six.append(mae_nor[5 + (8 * i)])
    seven.append(mae_nor[6 + (8 * i)])
    eight.append(mae_nor[7 + (8 * i)])
    
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('MAE (£/MWh)')
plt.title('MAE on normal regions for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('MAE_normal_n_hidden_n_neurons.png')

In [None]:
one = []
two = []
three = []
four = []
five = []
six = []
seven = []
eight = []


for i in range(6):
    one.append(mae_gen[8 * i])
    two.append(mae_gen[1 + (8 * i)])
    three.append(mae_gen[2 + (8 * i)])
    four.append(mae_gen[3 + (8 * i)])
    five.append(mae_gen[4 + (8 * i)])
    six.append(mae_gen[5 + (8 * i)])
    seven.append(mae_gen[6 + (8 * i)])
    eight.append(mae_gen[7 + (8 * i)])
    
plt.figure(figsize=(9,4))
plt.plot(one, label = '10 neurons')
plt.plot(two, label = '15 neurons')
plt.plot(three, label = '20 neurons')
plt.plot(four, label = '25 neurons')
plt.plot(five, label = '30 neurons')
plt.plot(six, label = '35 neurons')
plt.plot(seven, label = '40 neurons')
plt.plot(eight, label = '45 neurons')
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.xticks([0,1,2,3,4,5], [1,2,3,4,5,6])
plt.xlabel('Number of hidden layers')
plt.ylabel('MAE (£/MWh)')
plt.title('MAE for all test set for different combinations of: \nNumber hidden layers & Number of neurons per layer ')
plt.legend(bbox_to_anchor=(1.0, 1.02))
plt.tight_layout()
plt.savefig('MAE_general_n_hidden_n_neurons.png')