In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/d-no-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'lin'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.11018119e+00 -4.45825109e+00 -4.48462686e+00 -4.66081263e+00
 -4.92740761e+00 -4.51204866e+00 -4.51965011e+00 -1.88724616e+12
 -4.13922565e+00 -4.42166971e+00]
MinMaxScaler() MSE-scores: [-2.56957776e+01 -3.06565809e+01 -3.18616829e+01 -3.19490494e+01
 -3.70021154e+01 -3.06179941e+01 -3.13090844e+01 -6.30420560e+26
 -2.66434276e+01 -3.13516393e+01]
MinMaxScaler() RMSE-scores: [-5.06910028e+00 -5.53683853e+00 -5.64461539e+00 -5.65234902e+00
 -6.08293641e+00 -5.53335288e+00 -5.59545212e+00 -2.51081771e+13
 -5.16172719e+00 -5.59925346e+00]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.11018119e+00 -4.45841528e+00 -4.49212240e+00 -2.46930082e+11
 -4.92740761e+00 -4.51202094e+00 -4.51965011e+00 -6.41381544e+10
 -4.13922565e+00 -4.41758966e+00]
RobustScaler() MSE-scores: [-2.56957776e+01 -3.06784657e+01 -3.18938443e+01 -1.08534548e+25
 -3.70021154e+01 -3.05600648e+01 -3.13090844e+01 -7.28125405e+23
 -2.66434276e+01 -3.13347482e+01]
RobustScaler() RMSE-s

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,188724600000.0,31106820000.0,6602992000000.0
MSE,6.304206e+25,1.158158e+24,4.981415e+28
RMSE,2510818000000.0,414776100000.0,86895180000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

# Poisson Regressor

In [15]:
model_name = 'poi'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-5.2228701  -5.83113052 -4.99865206 -5.67563686 -5.43576027 -5.13887445
 -5.12442732 -4.69808525 -4.7913836  -5.33461106]
MinMaxScaler() MSE-scores: [-38.40413755 -46.24784416 -36.90626799 -42.33387134 -44.41992013
 -38.6542465  -37.94517151 -33.48797655 -34.31478141 -41.31800399]
MinMaxScaler() RMSE-scores: [-6.19710719 -6.80057675 -6.07505292 -6.50644844 -6.66482709 -6.21725394
 -6.15996522 -5.78687969 -5.85788199 -6.42790821]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.66383329 -5.18481497 -4.50363616 -5.12848034 -5.24926547 -4.74377278
 -4.72620474 -4.1773513  -4.29677916 -4.96620732]
RobustScaler() MSE-scores: [-31.89112101 -39.95812992 -32.07179278 -36.59840074 -43.03455072
 -32.96034094 -33.83550177 -28.29552231 -30.83910011 -36.56517141]
RobustScaler() RMSE-scores: [-5.64722242 -6.32124433 -5.66319634 -6.04966121 -6.56007246 -5.74110973
 -5.81682918 -5.31935356 -5.55329633 -6.04691421]
RobustScaler() done!


StandardScaler() MAE-scores: [-4

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,5.225143,4.764035,4.580342
MSE,39.403222,34.604963,32.189225
RMSE,6.26939,5.87189,5.665851


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

# Support Vector Regressor

In [25]:
model_name = 'svr'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.6389655  -5.14731491 -4.6594998  -5.00077934 -4.99819021 -4.44771182
 -4.73192981 -4.22498301 -4.16028188 -4.97576063]
MinMaxScaler() MSE-scores: [-31.43794757 -38.9247923  -33.86203649 -35.90552955 -39.17583999
 -31.08125766 -33.90239695 -29.0981727  -28.21224939 -38.59694109]
MinMaxScaler() RMSE-scores: [-5.60695528 -6.23897366 -5.8191096  -5.99212229 -6.25906063 -5.57505674
 -5.82257649 -5.39427221 -5.31152044 -6.21264365]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.52403435 -5.24743224 -4.74563216 -4.98759065 -4.89743309 -4.45362385
 -4.75046397 -4.24389158 -4.22871094 -4.98094547]
RobustScaler() MSE-scores: [-31.86824535 -42.18592538 -36.87752124 -38.62436142 -37.66409368
 -31.48818834 -34.59370168 -29.3165338  -30.09614751 -39.6684311 ]
RobustScaler() RMSE-scores: [-5.64519666 -6.49506931 -6.07268649 -6.21485007 -6.13710793 -5.61143372
 -5.88164107 -5.41447447 -5.48599558 -6.29828795]
RobustScaler() done!


StandardScaler() MAE-scores: [-4

In [28]:
min_index(mae_list)

'MinMaxScaler()'

In [29]:
min_index(mse_list)

'MinMaxScaler()'

In [30]:
min_index(rmse_list)

'MinMaxScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.698542,4.705976,4.885692
MSE,34.019716,35.238315,38.769867
RMSE,5.823229,5.925674,6.215588


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'rf'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.79927273 -2.63835534 -2.61472362 -2.89193913 -3.03303902 -2.45142552
 -2.93371328 -2.7061655  -2.53480267 -2.90079746]
MinMaxScaler() MSE-scores: [-14.81213198 -14.1592188  -13.88364934 -15.43683052 -20.98477783
 -11.17908308 -16.13447287 -13.9104826  -12.19419235 -17.63566575]
MinMaxScaler() RMSE-scores: [-3.88747259 -3.83753649 -3.74266299 -3.91561299 -4.54729476 -3.35335102
 -3.95770667 -3.65099744 -3.54900578 -4.17406876]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-2.7835137  -2.62600345 -2.56635016 -2.92517384 -3.11182181 -2.46704494
 -2.8783769  -2.72196925 -2.52809173 -2.81256048]
RobustScaler() MSE-scores: [-15.3184463  -13.91134199 -14.29443072 -15.06354697 -21.59342479
 -11.42413248 -15.65098805 -14.02899658 -12.6731     -17.77746787]
RobustScaler() RMSE-scores: [-3.86303066 -3.76423095 -3.74819785 -3.93562443 -4.58095146 -3.3649351
 -3.98389591 -3.67103581 -3.53014934 -4.21104962]
RobustScaler() done!


StandardScaler() MAE-scores: [-2.

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'RobustScaler()'

In [42]:
min_index(mse_list)

'StandardScaler()'

In [43]:
min_index(rmse_list)

'StandardScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,2.750423,2.742091,2.742594
MSE,15.033051,15.173588,14.966572
RMSE,3.861571,3.86531,3.851702


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

DONE!