In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/ul-pfe-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'LinReg'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.52877974e+00 -2.16464313e+11 -4.48331102e+00 -4.89128889e+00
 -4.44136320e+00 -4.66205931e+11 -4.09752531e+00 -4.30348247e+00
 -4.35774819e+00 -4.34765560e+00]
MinMaxScaler() MSE-scores: [-3.17605482e+01 -6.18509746e+24 -3.09040552e+01 -3.54911780e+01
 -2.97422047e+01 -2.86899320e+25 -2.55209405e+01 -2.93520912e+01
 -2.97492253e+01 -3.21351488e+01]
MinMaxScaler() RMSE-scores: [-5.63564976e+00 -2.48698562e+12 -5.55914159e+00 -5.95744727e+00
 -5.45364141e+00 -5.35629835e+12 -5.05182546e+00 -5.41775703e+00
 -5.45428504e+00 -5.66878725e+00]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.52877974e+00 -4.69810032e+00 -4.47828203e+00 -4.89128889e+00
 -4.44136320e+00 -2.63942447e+10 -4.09752531e+00 -4.30348247e+00
 -4.35759909e+00 -4.34765560e+00]
RobustScaler() MSE-scores: [-3.17605482e+01 -3.28869732e+01 -3.09277548e+01 -3.54911780e+01
 -2.97422047e+01 -9.19586124e+22 -2.55209405e+01 -2.93520912e+01
 -2.97808821e+01 -3.21351488e+01]
RobustScaler() RMSE-s

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,68267020000.0,2639424000.0,1344139000000.0
MSE,3.487503e+24,9.195861e+21,1.637616e+27
RMSE,784328400000.0,30324680000.0,15442980000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

# Poisson Regressor

In [15]:
model_name = 'PoiReg'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-5.43448946 -5.59018201 -5.06268748 -5.87306525 -4.90775553 -5.14390266
 -4.62335324 -4.78131881 -5.2678741  -5.23419079]
MinMaxScaler() MSE-scores: [-40.91413435 -42.15908271 -40.27596866 -49.03080802 -34.98355889
 -38.73017792 -31.63067713 -34.11318265 -39.02401271 -41.63023172]
MinMaxScaler() RMSE-scores: [-6.39641574 -6.4930026  -6.34633506 -7.00220023 -5.91469009 -6.22335745
 -5.62411567 -5.84064916 -6.24692026 -6.45214939]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.81025358 -5.08721286 -4.88169352 -5.45177358 -4.40059456 -4.90122461
 -4.07257787 -4.25958542 -4.73402873 -4.86620108]
RobustScaler() MSE-scores: [-33.71554869 -37.57168535 -40.18692657 -41.15454604 -29.07845639
 -36.2043109  -25.48698018 -29.95674583 -35.23289167 -37.0608788 ]
RobustScaler() RMSE-scores: [-5.80650917 -6.12957465 -6.33931594 -6.4151809  -5.39244438 -6.01700182
 -5.04846315 -5.4732756  -5.93573009 -6.08776468]
RobustScaler() done!


StandardScaler() MAE-scores: [-4

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,5.191882,4.746515,4.599563
MSE,39.249183,34.564897,32.396149
RMSE,6.253984,5.864526,5.685198


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

# Support Vector Regressor

In [25]:
model_name = 'SVR'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-5.13858805 -5.18677899 -4.69815458 -5.39285225 -4.56200214 -4.93164024
 -4.26607081 -4.42422889 -4.86908717 -5.10857194]
MinMaxScaler() MSE-scores: [-39.09115152 -38.23189941 -36.07790473 -42.77065716 -32.47371281
 -36.83808013 -28.28494169 -31.40535623 -35.19648403 -42.61191762]
MinMaxScaler() RMSE-scores: [-6.2522917  -6.18319492 -6.00648855 -6.53992792 -5.69857112 -6.06943821
 -5.31835893 -5.6040482  -5.93266247 -6.52778045]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-5.17163839 -5.25342532 -4.77615137 -5.2215659  -4.54045038 -5.10708349
 -4.16827078 -4.50757533 -4.94876155 -5.214592  ]
RobustScaler() MSE-scores: [-43.75531786 -42.7708013  -36.53546858 -41.4292383  -34.09816885
 -38.86457851 -28.72990214 -34.19070576 -37.2679849  -45.24982551]
RobustScaler() RMSE-scores: [-6.61478026 -6.53993894 -6.04445767 -6.43655485 -5.83936374 -6.23414617
 -5.36002818 -5.84728191 -6.10475101 -6.72679905]
RobustScaler() done!


StandardScaler() MAE-scores: [-5

In [28]:
min_index(mae_list)

'MinMaxScaler()'

In [29]:
min_index(mse_list)

'MinMaxScaler()'

In [30]:
min_index(rmse_list)

'MinMaxScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.857798,4.890951,5.088836
MSE,36.298211,38.289199,41.790082
RMSE,6.013276,6.17481,6.451861


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'RF'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.67606526 -2.9716964  -3.13682407 -2.83328125 -2.48252304 -3.30619703
 -2.6949694  -2.64747899 -2.86785642 -2.7682976 ]
MinMaxScaler() MSE-scores: [-13.88307292 -15.28164409 -22.87440137 -14.66938936 -11.7819451
 -18.12657902 -15.46179559 -12.58659285 -16.34182706 -18.67151728]
MinMaxScaler() RMSE-scores: [-3.6805268  -3.9801155  -4.73044434 -3.87029182 -3.43238137 -4.26583313
 -3.86508779 -3.58086383 -3.97030847 -4.26351005]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-2.72528422 -3.0113267  -3.13345598 -2.85416243 -2.55639204 -3.27211227
 -2.79782754 -2.52809361 -2.83850806 -2.84350115]
RobustScaler() MSE-scores: [-14.42387019 -15.42258292 -22.67427269 -14.7499135  -11.91698858
 -17.841119   -15.4905078  -12.64447462 -16.27897347 -18.0611915 ]
RobustScaler() RMSE-scores: [-3.6719889  -3.88641599 -4.76073739 -3.81071498 -3.32197752 -4.31540053
 -3.89093682 -3.46556914 -3.97989466 -4.37782951]
RobustScaler() done!


StandardScaler() MAE-scores: [-2.

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'MinMaxScaler()'

In [42]:
min_index(mse_list)

'RobustScaler()'

In [43]:
min_index(rmse_list)

'RobustScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,2.838519,2.856066,2.847194
MSE,15.967876,15.950389,16.059677
RMSE,3.963936,3.948147,3.975017


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

DONE!