# Parameter Tuning
    Find best predictive window from 2 years to 2 months

In [16]:
!pip install pandas
!pip install sklearn

import pandas as pd;
import numpy as np;
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler;
from sklearn import metrics;
from sklearn.model_selection import TimeSeriesSplit;
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression

dates = [2017010000,
         2017030000, 
         2017050000, 
         2017070000,
         2017090000, 
         2017110000,
         2018010000, 
         2018030000, 
         2018050000,
         2018070000,
         2018090000,
         2018110000]

dates_labels = ['24 ', 
                '22 ',
                '20 ',  
                '18 ', 
                '16 ', 
                '14 ', 
                '12 ',
                '10 ',
                '8 ',
                '6 ',
                '4 ',
                '2 ']

# empty list to append metric values
mae_gen = []
mse_gen  =[]
rmse_gen = []
mae_nor = []
mae_spi = []
mse_nor = []
mse_spi = []
rmse_nor = []
rmse_spi = []



# Data set processing and tuning:

In [17]:
for i in dates:
    # data
    data = pd.read_csv('Data_set_1_smaller.csv', index_col = 0)
    data = data.loc[data.index > i, :]
    
    # reset index
    data.reset_index(inplace = True)
    data.drop('index', axis = 1, inplace = True)
    
    # Divide features and labels
    X = data.iloc[:, 0:15]
    y = data.loc[:, 'Offers']
    
    X.fillna(X.mean(), inplace = True)
    y.fillna(y.mean(), inplace = True)
    
    X = X.astype('float64')
    X = X.round(20)
    
    # divide data into train and test with 15% test data
    X_train, X_test, y_train, y_test = train_test_split(
             X, y, test_size = 0.15, shuffle=False)
    
    # feature scaling
    sc_X = MinMaxScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    
    # create time series split for CV
    splits = 7
    tscv = TimeSeriesSplit(n_splits = splits)
    
    # create linear regressor 
    regressor = LinearRegression()
    
    for train_index, test_index in tscv.split(X_train):
          X_train_split, X_test_split = X_train[train_index], X_train[test_index]
          y_train_split, y_test_split = y_train[train_index], y_train[test_index]
          regressor.fit(X_train_split, y_train_split)
    
    # predict for X_test  
    y_pred = regressor.predict(X_test)

    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae

    rmse_error = mse(y_test, y_pred, squared = False)
    mse_error = mse(y_test, y_pred) # 1479.61335
    mae_error = mae(y_test, y_pred) # 23.1525
    
    rmse_gen.append(rmse_error)
    mse_gen.append(mse_error)
    mae_gen.append(mae_error)
    
    # =============================================================================
    # Metrics evaluation on spike regions
    # =============================================================================
    
    y_spike_occ = pd.read_csv('Spike_binary_1std.csv', usecols = [6])
    
    # create array same size as y_test
    y_spike_occ = y_spike_occ.iloc[- len(y_test):]
    y_spike_occ = pd.Series(y_spike_occ.iloc[:,0]).values
    
    
    # smal adjustment
    y_test.replace(0, 0.0001,inplace = True)

    
    # select y_pred and y_test only for regions with spikes
    y_test_spike = (y_test.T * y_spike_occ).T
    y_pred_spike = (y_pred.T * y_spike_occ).T
    y_test_spike = y_test_spike[y_test_spike != 0]
    y_pred_spike = y_pred_spike[y_pred_spike != 0]
    
    # calculate metric
    rmse_spike = mse(y_test_spike, y_pred_spike, squared = False)
    mse_spike = mse(y_test_spike, y_pred_spike)
    mae_spike = mae(y_test_spike, y_pred_spike)
    
    rmse_spi.append(rmse_spike)
    mse_spi.append(mse_spike)
    mae_spi.append(mae_spike)
    
    # =============================================================================
    # Metric evaluation on normal regions
    # =============================================================================
    
    # inverse y_spike_occ so the only normal occurences are chosen
    y_normal_occ = (y_spike_occ - 1) * (-1)
    
    # sanity check
    y_normal_occ.sum() + y_spike_occ.sum() # gives the correct total 
    
    # select y_pred and y_test only for normal regions
    y_test_normal = (y_test.T * y_normal_occ).T
    y_pred_normal = (y_pred.T * y_normal_occ).T
    y_test_normal = y_test_normal[y_test_normal != 0.00]
    y_pred_normal = y_pred_normal[y_pred_normal != 0.00]
    
    # calculate metric
    rmse_normal = mse(y_test_normal, y_pred_normal, squared = False)
    mse_normal = mse(y_test_normal, y_pred_normal)
    mae_normal = mae(y_test_normal, y_pred_normal)
    
    rmse_nor.append(rmse_normal)
    mse_nor.append(mse_normal)
    mae_nor.append(mae_normal)  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a s

# Results:

In [18]:
results = pd.DataFrame({                        
                        'rmse_general': rmse_gen, 
                 
                        'mae_general': mae_gen,
                        
                        'rmse_spike': rmse_spi,
                 
                        'mae_spike': mae_spi,
                        
                        'rmse_normal': rmse_nor,
                    
                        'mae_normal': mae_nor}, index = dates_labels)

results.to_csv('Results_Linear_Regression_predictive_window.csv')

# Highlight best results:

In [19]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

results.style.apply(highlight_min)

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal
24,46.239043,32.097747,87.490483,58.073913,31.282767,26.638207
22,46.004745,30.810083,89.226219,58.025463,30.070412,25.165714
20,46.13321,30.329918,91.434238,59.145912,29.215028,24.435558
18,45.786912,29.570111,91.401886,57.76518,27.812719,23.609756
16,47.490328,31.626817,93.992376,57.263661,30.49619,26.46256
14,48.410827,30.844238,97.805482,58.22007,30.004619,25.373354
12,53.133037,32.010633,105.971584,59.076499,35.697182,27.024174
10,48.776812,30.073776,86.262064,57.196603,37.730283,24.953501
8,32.763167,25.043521,65.640231,53.108762,22.627303,20.173157
6,34.523382,27.112522,70.964036,58.64354,24.723575,22.350025


In [20]:
results.min()

rmse_general    31.145243
mae_general     23.008685
rmse_spike      62.712436
mae_spike       50.735031
rmse_normal     21.792485
mae_normal      18.375370
dtype: float64

# Plot results:

In [21]:
!pip install matplotlib

%matplotlib notebook

import matplotlib.pyplot as plt




In [26]:
fontsize = 13

plt.figure(figsize=(10,3.5))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('Linear Regression: RMSE for different training time window sizes', fontsize = fontsize + 2)
plt.plot(rmse_gen, label = 'All test set')
plt.plot(rmse_spi, label = 'Spike regions')
plt.plot(rmse_nor, label = 'Non - spike regions')
plt.xlim(0, 11)
plt.legend(loc = 'upper right', fontsize = fontsize - 1)
plt.ylabel('RMSE (£/MWh)', fontsize = fontsize)
plt.xlabel('Time window (in months)', fontsize = fontsize)
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11], dates_labels, fontsize = fontsize)
plt.yticks(fontsize = fontsize)
plt.tight_layout()
plt.savefig('RMSE_predictive_window.png')


<IPython.core.display.Javascript object>