In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/d-wo-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'lin'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.31684955e+00 -1.26335829e+10 -3.29969556e+00 -3.26534808e+00
 -1.43152556e+11 -6.41354675e+10 -9.64065835e+10 -3.32176380e+00
 -4.50374071e+10 -3.70325737e+10]
MinMaxScaler() MSE-scores: [-1.69671704e+01 -3.08042315e+22 -1.71260151e+01 -2.32301756e+01
 -3.95508230e+24 -4.00213154e+23 -1.78449203e+24 -1.97053341e+01
 -3.89446663e+23 -1.90216394e+23]
MinMaxScaler() RMSE-scores: [-4.11912252e+00 -1.75511343e+11 -4.13835898e+00 -4.81976925e+00
 -1.98873887e+12 -6.32624022e+11 -1.33584881e+12 -4.43906906e+00
 -6.24056618e+11 -4.36138045e+11]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.28710178 -3.23900263 -3.29993451 -3.26534808 -3.30040569 -3.48738587
 -3.40672355 -3.3217638  -3.37326807 -3.5235812 ]
RobustScaler() MSE-scores: [-16.70071589 -19.22896508 -17.13131745 -23.23017559 -20.02288822
 -22.2047757  -18.05039757 -19.70533412 -20.11370607 -23.02961075]
RobustScaler() RMSE-scores: [-4.08665094 -4.38508439 -4.13899957 -4.81976925 -4.4746942  -4.7

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,39839820000.0,3.350452,1920546000000.0
MSE,6.750255e+23,19.941789,1.58675e+27
RMSE,519291800000.0,4.458879,24727020000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

# Poisson Regressor

In [15]:
model_name = 'poi'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.84662944 -4.05523473 -4.0903293  -3.87960648 -4.09892334 -4.33663429
 -3.77518904 -4.3298293  -3.84841555 -4.07860893]
MinMaxScaler() MSE-scores: [-23.5907888  -25.65925403 -25.7243442  -26.85222706 -27.87941259
 -28.13269911 -21.06589786 -30.94937633 -24.81452891 -26.24246773]
MinMaxScaler() RMSE-scores: [-4.85703498 -5.06549642 -5.07191721 -5.18191346 -5.28009589 -5.30402669
 -4.58976011 -5.56321637 -4.98141836 -5.12274026]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.34428936 -3.46249729 -3.45625522 -3.31559788 -3.45244336 -3.68912853
 -3.27214373 -3.59711767 -3.42953052 -3.54530066]
RobustScaler() MSE-scores: [-17.54307315 -20.41385088 -18.16503995 -21.62899003 -21.68336317
 -21.89783368 -16.58300868 -22.41284392 -20.56158926 -21.09858703]
RobustScaler() RMSE-scores: [-4.1884452  -4.51816897 -4.26204645 -4.6506978  -4.65653983 -4.67951212
 -4.07222405 -4.73422052 -4.53448886 -4.59331983]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.03394,3.45643,3.294599
MSE,26.0911,20.198818,18.886119
RMSE,5.101762,4.488966,4.341527


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

# Support Vector Regressor

In [25]:
model_name = 'svr'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.24279859 -3.29719305 -3.59268095 -3.33981114 -3.43013458 -3.58566365
 -3.27615874 -3.57064433 -3.41416529 -3.63435548]
MinMaxScaler() MSE-scores: [-18.196192   -19.35204491 -20.71573432 -24.03742865 -21.96327667
 -22.18669032 -16.81336476 -23.40690218 -20.4955409  -23.29200457]
MinMaxScaler() RMSE-scores: [-4.26569947 -4.39909592 -4.55145409 -4.90279804 -4.6864994  -4.71027497
 -4.10041032 -4.83806802 -4.52720012 -4.82617909]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.20886173 -3.36567222 -3.44520808 -3.31002794 -3.43370981 -3.51094043
 -2.96771837 -3.41417029 -3.18839017 -3.36835001]
RobustScaler() MSE-scores: [-17.87441392 -19.9347764  -19.91914755 -23.30195309 -21.79195653
 -22.46992199 -14.55640074 -23.05126643 -18.3725854  -21.90658609]
RobustScaler() RMSE-scores: [-4.22781432 -4.46483778 -4.46308722 -4.82720966 -4.66818557 -4.74024493
 -3.81528515 -4.80117344 -4.2863254  -4.68044721]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [28]:
min_index(mae_list)

'RobustScaler()'

In [29]:
min_index(mse_list)

'RobustScaler()'

In [30]:
min_index(rmse_list)

'RobustScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.438361,3.321305,3.623027
MSE,21.045918,20.317901,23.048861
RMSE,4.580768,4.497461,4.792348


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'rf'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.9972226  -2.9875783  -2.95545789 -3.36789564 -3.2302077  -3.00011203
 -2.73231039 -2.99880202 -3.08519259 -3.2551668 ]
MinMaxScaler() MSE-scores: [-15.63804221 -17.25400367 -15.15017385 -23.33901466 -23.09447721
 -18.1870698  -12.52985298 -17.39247288 -17.79647224 -19.05305125]
MinMaxScaler() RMSE-scores: [-3.90692076 -4.16493694 -4.03904811 -4.76106457 -4.7799185  -4.25555535
 -3.62938881 -4.1475583  -4.14737459 -4.35019011]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.02329248 -2.98882803 -2.98244818 -3.24864647 -3.16495413 -2.98169185
 -2.69927873 -3.03827643 -3.11737348 -3.20799682]
RobustScaler() MSE-scores: [-15.3644501  -17.76783958 -15.75698487 -23.16352044 -23.29789049
 -18.30194333 -12.56600528 -17.24125263 -17.18944635 -18.99273683]
RobustScaler() RMSE-scores: [-3.94801595 -4.19198187 -4.00185797 -4.82953268 -4.76461773 -4.22350057
 -3.57953216 -4.05042319 -4.15176703 -4.38837242]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'StandardScaler()'

In [42]:
min_index(mse_list)

'StandardScaler()'

In [43]:
min_index(rmse_list)

'RobustScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.060995,3.045279,3.045243
MSE,17.943463,17.964207,17.800979
RMSE,4.218196,4.21296,4.233853


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

DONE!