In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/dl-pfe-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [15]:
model_name = 'LinReg'

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.65497450e+08 -2.60943965e+11 -3.05180186e+00 -3.44210885e+00
 -1.58601477e+10 -3.61073947e+00 -3.27890522e+00 -3.09733345e+00
 -2.75373390e+00 -2.93065408e+00]
MinMaxScaler() MSE-scores: [-1.38861577e+19 -9.05620316e+24 -1.61670028e+01 -1.92514481e+01
 -3.32038455e+22 -2.31452021e+01 -1.79974263e+01 -1.58342133e+01
 -1.29571075e+01 -1.61919161e+01]
MinMaxScaler() RMSE-scores: [-3.72641352e+09 -3.00935261e+12 -4.02082116e+00 -4.38764721e+00
 -1.82219224e+11 -4.81094607e+00 -4.24233736e+00 -3.97922271e+00
 -3.59959824e+00 -4.02391801e+00]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-1.20532725e+12 -2.42252522e+11 -3.05576685e+00 -3.43971806e+00
 -1.10330867e+12 -3.60802825e+00 -3.27890522e+00 -3.09733345e+00
 -2.75373390e+00 -2.93107324e+00]
RobustScaler() MSE-scores: [-1.18871374e+26 -7.80527582e+24 -1.61799580e+01 -1.92409135e+01
 -1.60682283e+26 -2.31187839e+01 -1.79974263e+01 -1.58342133e+01
 -1.29571075e+01 -1.62008021e+01]
RobustScaler() RMSE-s

In [18]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [19]:
min_index(mae_list)

'MinMaxScaler()'

In [20]:
min_index(mse_list)

'MinMaxScaler()'

In [21]:
min_index(rmse_list)

'MinMaxScaler()'

In [22]:
err_list = [mae_list, mse_list, rmse_list]

In [23]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [24]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,27716960000.0,255088800000.0,3428406000000.0
MSE,9.089421e+23,2.873589e+25,7.769378e+27
RMSE,319529800000.0,2637266000000.0,38840150000000.0


In [25]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

# Poisson Regressor

In [26]:
model_name = 'PoiReg'

In [27]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [28]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.84560388 -4.01276974 -4.01379454 -4.18745159 -4.00997316 -4.61611536
 -3.61973899 -3.33099149 -3.41478902 -3.76879175]
MinMaxScaler() MSE-scores: [-21.93368454 -25.61094209 -26.00600841 -24.38733702 -24.64566261
 -33.92884122 -19.11828423 -17.14871036 -16.56318104 -24.10805285]
MinMaxScaler() RMSE-scores: [-4.68334117 -5.06072545 -5.09960865 -4.93835368 -4.96443981 -5.82484688
 -4.37244602 -4.14110014 -4.06978882 -4.9099952 ]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.38665692 -3.68304343 -3.37067662 -3.61832448 -3.44083954 -3.8349792
 -3.19412472 -3.11618621 -2.99565037 -3.17397426]
RobustScaler() MSE-scores: [-19.19358842 -22.07879334 -18.93440133 -19.70100947 -19.09366172
 -25.57308754 -15.79230346 -15.45941076 -14.07128431 -17.62408532]
RobustScaler() RMSE-scores: [-4.38104878 -4.69880765 -4.35136775 -4.43858192 -4.36962947 -5.05698404
 -3.97395313 -3.93184572 -3.75117106 -4.19810497]
RobustScaler() done!


StandardScaler() MAE-scores: [-3.

In [29]:
min_index(mae_list)

'StandardScaler()'

In [30]:
min_index(mse_list)

'StandardScaler()'

In [31]:
min_index(rmse_list)

'StandardScaler()'

In [32]:
err_list = [mae_list, mse_list, rmse_list]

In [33]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [34]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.882002,3.381446,3.251639
MSE,23.34507,18.752163,17.757713
RMSE,4.806465,4.315149,4.203541


In [35]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

# Support Vector Regressor

In [36]:
model_name = 'SVR'

In [37]:
from sklearn.svm import SVR
model = SVR()

In [38]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.32188843 -3.54272184 -3.39099767 -3.63946868 -3.38944601 -3.97613093
 -3.16522318 -2.94463405 -2.82163205 -3.1457765 ]
MinMaxScaler() MSE-scores: [-17.95923517 -21.29882823 -21.05783998 -20.82521198 -20.26188129
 -27.64123347 -16.46066654 -14.00568036 -12.70980489 -19.29097449]
MinMaxScaler() RMSE-scores: [-4.23783378 -4.61506535 -4.58888221 -4.56346491 -4.50131995 -5.25749308
 -4.0571747  -3.74241638 -3.56508133 -4.39214919]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.24212753 -3.58705906 -3.30808436 -3.4373774  -3.33540011 -3.71618628
 -3.10649782 -2.88611925 -2.86233909 -3.10408514]
RobustScaler() MSE-scores: [-17.67022413 -22.76551843 -19.70829204 -19.50730761 -20.49546936
 -25.16456757 -16.67733208 -13.52108646 -13.50684539 -18.98103523]
RobustScaler() RMSE-scores: [-4.20359657 -4.7713225  -4.43940222 -4.41670778 -4.52719222 -5.01642976
 -4.08378894 -3.677103   -3.67516604 -4.35672299]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [39]:
min_index(mae_list)

'RobustScaler()'

In [40]:
min_index(mse_list)

'RobustScaler()'

In [41]:
min_index(rmse_list)

'RobustScaler()'

In [42]:
err_list = [mae_list, mse_list, rmse_list]

In [43]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [44]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.333792,3.258528,3.717978
MSE,19.151136,18.799768,23.170326
RMSE,4.352088,4.316743,4.789031


In [45]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

In [46]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [47]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [4]:
model_name = 'RF'

In [5]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.26094993 -3.35499843 -2.71996252 -3.54012992 -2.99669916 -3.30461545
 -3.0813636  -2.71217336 -2.84891772 -2.85215081]
MinMaxScaler() MSE-scores: [-21.821406   -20.41840459 -14.55634021 -20.36740242 -17.71855485
 -20.18962942 -15.48415526 -13.94490666 -13.07212101 -14.98059201]
MinMaxScaler() RMSE-scores: [-4.69404766 -4.50137529 -3.89755416 -4.40829365 -4.22833988 -4.55994863
 -3.85191174 -3.77593704 -3.64805727 -3.88776048]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.28001747 -3.33307445 -2.70720681 -3.44921539 -3.01773748 -3.37332662
 -3.05122132 -2.69913581 -2.81209659 -2.91146218]
RobustScaler() MSE-scores: [-21.54865976 -20.25735784 -15.10697264 -19.78060356 -16.84728059
 -20.07714838 -15.09801252 -13.72622129 -12.39717914 -14.93349809]
RobustScaler() RMSE-scores: [-4.49879892 -4.48671113 -3.81040824 -4.45666922 -4.14478568 -4.53154382
 -3.87684881 -3.68202395 -3.65329525 -3.90262659]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'StandardScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.067196,3.063449,3.057996
MSE,17.255351,16.977293,17.17558
RMSE,4.145323,4.104371,4.121753


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err.csv')

DONE!