In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/dl-nfe-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'LinReg'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.29991922e+00 -3.51321470e+00 -4.30763886e+10 -2.36967839e+11
 -3.57772400e+00 -3.13283707e+00 -1.46745968e+11 -3.91089926e+00
 -6.00351583e+11 -1.88994546e+11]
MinMaxScaler() MSE-scores: [-1.70693944e+01 -2.73285161e+01 -2.67202837e+23 -4.28568095e+24
 -2.11977518e+01 -1.52918804e+01 -3.07941622e+24 -2.46995925e+01
 -2.58944837e+25 -5.10780817e+24]
MinMaxScaler() RMSE-scores: [-4.13151236e+00 -5.22766832e+00 -5.16916664e+11 -2.07018863e+12
 -4.60410162e+00 -3.91048340e+00 -1.75482655e+12 -4.96986846e+00
 -5.08866227e+12 -2.26004605e+12]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.30476716 -3.51425973 -3.42361624 -3.78707881 -3.577724   -3.1380967
 -3.25163631 -3.90866413 -3.14763595 -3.64465367]
RobustScaler() MSE-scores: [-17.10839283 -27.33992015 -21.40827128 -25.26192732 -21.19775177
 -15.34843546 -19.22501916 -24.64375134 -17.87274432 -26.30077257]
RobustScaler() RMSE-scores: [-4.1362293  -5.22875895 -4.62690731 -5.02612448 -4.60410162 -3.91

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,121613600000.0,3.469813,1892438000000.0
MSE,3.863459e+24,21.570699,1.182243e+27
RMSE,1169064000000.0,4.624476,18415410000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

# Poisson Regressor

In [15]:
model_name = 'PoiReg'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.99830061 -4.18194142 -4.05822045 -4.67453496 -3.82469971 -3.72964035
 -4.33458683 -4.41732445 -3.48330877 -4.18515077]
MinMaxScaler() MSE-scores: [-23.28865416 -32.23971368 -28.50544383 -32.70699738 -20.41917561
 -20.9603554  -30.99720284 -29.97417449 -22.12562924 -27.0828639 ]
MinMaxScaler() RMSE-scores: [-4.82583197 -5.67800261 -5.33904896 -5.71900318 -4.51875819 -4.57824807
 -5.56751316 -5.47486753 -4.70378882 -5.2041199 ]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.44752765 -3.55052689 -3.41520412 -3.93537467 -3.44983903 -3.09403443
 -3.61726699 -3.88068067 -3.17533258 -3.59168522]
RobustScaler() MSE-scores: [-17.6478597  -25.900412   -22.06178961 -25.62428154 -17.20902408
 -15.28869649 -22.69030163 -23.8458462  -18.52437713 -21.61183776]
RobustScaler() RMSE-scores: [-4.20093557 -5.08924474 -4.69699794 -5.06204322 -4.14837608 -3.91007628
 -4.76343381 -4.88322088 -4.30399548 -4.64885338]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.088771,3.515747,3.338589
MSE,26.830021,21.040443,19.781246
RMSE,5.160918,4.570718,4.434923


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

# Support Vector Regressor

In [25]:
model_name = 'SVR'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.57457245 -3.72172938 -3.4836075  -4.01087608 -3.40743798 -3.16638812
 -3.59260824 -3.97329324 -3.25020396 -3.79563859]
MinMaxScaler() MSE-scores: [-19.88989697 -30.50215737 -23.92292236 -27.13880442 -17.28199234
 -16.16736476 -24.61222184 -25.33499273 -20.75557209 -25.2985247 ]
MinMaxScaler() RMSE-scores: [-4.45980907 -5.52287582 -4.89110646 -5.20949176 -4.15716157 -4.02086617
 -4.96107063 -5.0333878  -4.55582836 -5.02976388]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.49561075 -3.80201581 -3.6509275  -3.86778742 -3.27026967 -3.01067693
 -3.48210329 -3.72989678 -3.12295352 -3.45857809]
RobustScaler() MSE-scores: [-20.41443991 -29.92032926 -24.7504133  -27.42583604 -15.99649423
 -15.39911643 -23.77412685 -23.32165492 -19.63585238 -22.18568215]
RobustScaler() RMSE-scores: [-4.51823416 -5.46994783 -4.97497872 -5.23696821 -3.99956175 -3.9241708
 -4.87587191 -4.82924993 -4.43123599 -4.71016795]
RobustScaler() done!


StandardScaler() MAE-scores: [-3.

In [28]:
min_index(mae_list)

'RobustScaler()'

In [29]:
min_index(mse_list)

'RobustScaler()'

In [30]:
min_index(rmse_list)

'RobustScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.597636,3.489082,3.83852
MSE,23.090445,22.282395,25.53851
RMSE,4.784136,4.697039,5.033716


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'RF'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.90432467 -3.5549293  -3.26936111 -3.36674075 -2.71529663 -2.54958342
 -3.19306292 -3.43545544 -2.83518068 -3.38720904]
MinMaxScaler() MSE-scores: [-13.94790814 -27.67362594 -23.40448082 -21.46925361 -12.31367392
 -11.83427762 -18.15362996 -19.54766295 -14.80591541 -21.71737653]
MinMaxScaler() RMSE-scores: [-3.75292715 -5.26075841 -4.75678798 -4.68039479 -3.56743792 -3.34335313
 -4.34306865 -4.35852763 -3.90383836 -4.67307816]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-2.89725723 -3.49196431 -3.30036561 -3.39418524 -2.66441725 -2.48942422
 -3.17510722 -3.37686957 -2.88379252 -3.38061236]
RobustScaler() MSE-scores: [-13.54318679 -27.24715034 -22.26203759 -21.41542048 -12.30123059
 -10.92786002 -18.13579879 -19.10244515 -14.69107854 -21.43482417]
RobustScaler() RMSE-scores: [-3.77505756 -5.31722037 -4.81307278 -4.61320123 -3.51998339 -3.36113496
 -4.31124544 -4.29204578 -3.91592632 -4.59363829]
RobustScaler() done!


StandardScaler() MAE-scores: [-2

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'RobustScaler()'

In [42]:
min_index(mse_list)

'RobustScaler()'

In [43]:
min_index(rmse_list)

'RobustScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.121114,3.1054,3.11347
MSE,18.48678,18.106103,18.468607
RMSE,4.264017,4.251253,4.273295


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

DONE!