# Kernel Tuning

In [1]:
!pip install pandas
!pip install sklearn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

# empty list to append metric values
mae_gen = []
mae_nor = []
mae_spi = []
rmse_gen = []
rmse_nor = []
rmse_spi = []



# Data set processing and tuning:

In [2]:
data = pd.read_csv('Data_set_1_smaller.csv', index_col = 0)

# set predictive window 
data = data.loc[data.index > 2018070000, :]

# reset index
data.reset_index(inplace = True)
data.drop('index', axis = 1, inplace = True)

# divide features and labels
X = data.iloc[:, 0:15]
y = data.loc[:, 'Offers']

X.fillna(X.median(), inplace = True)
y.fillna(y.median(), inplace = True)

# small fix
X = X.astype('float64')
X = X.round(20)

# divide into train and test
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size = 0.075, shuffle = False)

# feature scaling
sc_X = MinMaxScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# the different regressors to train
# gamma = kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’
svr_rbf_auto = SVR(kernel='rbf', gamma= 'auto')
svr_rbf_scale = SVR(kernel='rbf', gamma= 'scale')
svr_poly_auto = SVR(kernel='poly', gamma='auto')
svr_poly_scale = SVR(kernel='poly', gamma='scale')
svr_sigmoid_auto = SVR(kernel='sigmoid', gamma='auto')
svr_sigmoid_scale = SVR(kernel='sigmoid', gamma='scale')
svr_lin = SVR(kernel='linear')

kernel_range = [svr_rbf_auto, svr_rbf_scale, 
                svr_poly_auto, svr_poly_scale,
                svr_sigmoid_auto, svr_sigmoid_scale,
                svr_lin ]

kernel_label = ['RBF auto', 'RBF scale',
                'Polynomial auto', 'Polynomial scale',
                'Sigmoid auto', 'Sigmoid scale',
                'Linear']

# loop
for i in kernel_range:
    # create regressor 
    regressor = i
    regressor.fit(X_train, y_train)

    # predict for X_test  
    y_pred = regressor.predict(X_test)

    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae

    rmse_error = mse(y_test, y_pred, squared = False)
    mae_error = mae(y_test, y_pred) 
    
    rmse_gen.append(rmse_error)
    mae_gen.append(mae_error)
    
    # =============================================================================
    # Metrics evaluation on spike regions
    # =============================================================================
    
    y_spike_occ = pd.read_csv('Spike_binary_1std.csv', usecols = [6])
    
    # create array same size as y_test
    y_spike_occ = y_spike_occ.iloc[- len(y_test):]
    y_spike_occ = pd.Series(y_spike_occ.iloc[:,0]).values
    
    # smal adjustment
    y_test.replace(0, 0.0001,inplace = True)

    
    # select y_pred and y_test only for regions with spikes
    y_test_spike = (y_test.T * y_spike_occ).T
    y_pred_spike = (y_pred.T * y_spike_occ).T
    y_test_spike = y_test_spike[y_test_spike != 0]
    y_pred_spike = y_pred_spike[y_pred_spike != 0]
    
    # calculate metric
    rmse_spike = mse(y_test_spike, y_pred_spike, squared = False)
    mae_spike = mae(y_test_spike, y_pred_spike)
    
    rmse_spi.append(rmse_spike)
    mae_spi.append(mae_spike)
    
    # =============================================================================
    # Metric evaluation on normal regions
    # =============================================================================
    
    # inverse y_spike_occ so the only normal occurences are chosen
    y_normal_occ = (y_spike_occ - 1) * (-1)
    
    # sanity check
    y_normal_occ.sum() + y_spike_occ.sum() # gives the correct total 
    
    # select y_pred and y_test only for normal regions
    y_test_normal = (y_test.T * y_normal_occ).T
    y_pred_normal = (y_pred.T * y_normal_occ).T
    y_test_normal = y_test_normal[y_test_normal != 0.00]
    y_pred_normal = y_pred_normal[y_pred_normal != 0.00]
    
    # calculate metric
    rmse_normal = mse(y_test_normal, y_pred_normal, squared = False)
    mae_normal = mae(y_test_normal, y_pred_normal)
    
    rmse_nor.append(rmse_normal)
    mae_nor.append(mae_normal)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


# Results:

In [3]:
results = pd.DataFrame({                        
                        'rmse_general': rmse_gen, 
                 
                        'mae_general': mae_gen,
                        
                        'rmse_spike': rmse_spi,
                 
                        'mae_spike': mae_spi,
                        
                        'rmse_normal': rmse_nor,
                    
                        'mae_normal': mae_nor}, index = kernel_range)

results

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.568874,13.923926,71.49058,57.350793,10.745355,6.838489
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.303229,14.557012,70.289201,55.828511,11.210746,7.823242
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.677633,13.853012,71.618368,57.858753,10.941699,6.673128
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.254527,14.999839,69.692045,55.238537,11.666851,8.434578
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.644321,13.863798,71.685345,57.592341,10.767254,6.729141
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.502708,14.763839,70.660888,57.099795,11.415497,7.856393
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.328231,14.237349,70.721431,56.462858,10.834365,7.347923


# Highlight best results:

In [4]:
def highlight_min(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_max]

results.style.apply(highlight_min)

Unnamed: 0,rmse_general,mae_general,rmse_spike,mae_spike,rmse_normal,mae_normal
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.568874,13.923926,71.49058,57.350793,10.745355,6.838489
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.303229,14.557012,70.289201,55.828511,11.210746,7.823242
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.677633,13.853012,71.618368,57.858753,10.941699,6.673128
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.254527,14.999839,69.692045,55.238537,11.666851,8.434578
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',  kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.644321,13.863798,71.685345,57.592341,10.767254,6.729141
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',  kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.502708,14.763839,70.660888,57.099795,11.415497,7.856393
"SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)",28.328231,14.237349,70.721431,56.462858,10.834365,7.347923


In [5]:
!pip install matplotlib



# Plot results:

In [20]:
%matplotlib notebook

import matplotlib.pyplot as plt

plt.figure(figsize=(11, 9))

plt.subplot(2, 1, 1)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged RMSE on all test set \n for different kernels', fontsize = 15)
plt.scatter(x = list(range(len(rmse_gen))), y = rmse_gen, label = 'Overall error')
plt.legend()
plt.ylabel('RMSE (£/MWh)', fontsize = 13)
plt.yticks(fontsize = 13)
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90, fontsize = 13)


plt.subplot(2, 2, 3)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged RMSE on spike regions \n for different kernels', fontsize = 15)
plt.scatter(x = list(range(len(rmse_gen))), y = rmse_spi, label = 'Spike regions', color = 'orange')
plt.legend(loc = 'lower right')
plt.ylabel('RMSE (£/MWh)', fontsize = 13)
plt.yticks(fontsize = 13)
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90, fontsize = 13)

plt.subplot(2, 2, 4)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged RMSE on normal regions \n for different kernels', fontsize = 15)
plt.scatter(x = list(range(len(rmse_gen))), y = rmse_nor, label = 'Normal regions', color = 'green')
plt.legend()
plt.ylabel('RMSE (£/MWh)', fontsize = 13)
plt.yticks(fontsize = 13)
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90, fontsize = 13)
plt.tight_layout()
plt.savefig('_TRIAL_RMSE_best_kernel_normal.png')

<IPython.core.display.Javascript object>

In [7]:
%matplotlib notebook

import matplotlib.pyplot as plt

plt.figure(figsize=(7,4.5))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged MAE on all test set \n for different kernels')
plt.scatter(x = list(range(len(mae_gen))), y = mae_gen, label = 'Overall error')
plt.legend()
plt.ylabel('MAE (£/MWh)')
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90)
plt.tight_layout()
plt.savefig('MAE_best_kernel_all.png')

plt.figure(figsize=(7,4.5))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged MAE on spike regions \n for different kernels')
plt.scatter(x = list(range(len(mae_gen))), y = mae_spi, label = 'Spike regions', color = 'orange')
plt.legend()
plt.ylabel('MAE (£/MWh)')
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90)
plt.tight_layout()
plt.savefig('MAE_best_kernel_spike.png')


plt.figure(figsize=(7,4.5))
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5')
plt.grid(which='minor', linestyle=':', linewidth='0.5')
plt.title('SVM: Averaged MAE on normal regions \n for different kernels')
plt.scatter(x = list(range(len(mae_gen))), y = mae_nor, label = 'Normal regions', color = 'green')
plt.legend()
plt.ylabel('MAE (£/MWh)')
plt.xticks(list(range(len(rmse_gen))), kernel_label, rotation = 90)
plt.tight_layout()
plt.savefig('MAE_best_kernel_normal.png')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>