In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/d-no-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'lin'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.19789131e+00 -3.05134687e+00 -3.20904192e+00 -3.31414168e+00
 -1.73748476e+10 -3.26949831e+00 -6.45413173e+09 -3.11183876e+00
 -2.97194458e+00 -2.85807538e+00]
MinMaxScaler() MSE-scores: [-1.70398274e+01 -1.65409409e+01 -1.66425805e+01 -1.82386817e+01
 -5.37355884e+22 -1.69142602e+01 -7.41473530e+21 -1.65256762e+01
 -1.47436740e+01 -1.46374182e+01]
MinMaxScaler() RMSE-scores: [-4.12793259e+00 -4.06705556e+00 -4.07953190e+00 -4.27067696e+00
 -2.31809380e+11 -4.11269500e+00 -8.61088573e+10 -4.06517850e+00
 -3.83974921e+00 -3.82588790e+00]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.19907889 -3.04965708 -3.20904192 -3.31350615 -3.16858144 -3.26850698
 -3.39898964 -3.10707478 -2.97310285 -2.8544241 ]
RobustScaler() MSE-scores: [-17.04662562 -16.52680696 -16.64258054 -18.2537581  -17.3033322
 -16.9085637  -21.0040796  -16.43288096 -14.74468834 -14.64165446]
RobustScaler() RMSE-scores: [-4.12875594 -4.06531757 -4.0795319  -4.2724417  -4.15972742 -4.11

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,2382898000.0,3.154196,6099259000000.0
MSE,6.115032e+21,16.950497,4.014219e+28
RMSE,31791820000.0,4.112087,64976210000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

# Poisson Regressor

In [15]:
model_name = 'poi'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.86610336 -3.83007871 -3.81746525 -3.98388856 -4.01674501 -4.16441492
 -4.38352876 -3.53749411 -3.48408254 -3.61979619]
MinMaxScaler() MSE-scores: [-23.8627277  -23.59996751 -22.46344964 -24.61816722 -25.1022728
 -25.48852773 -30.26421579 -19.32041755 -17.32937084 -21.38610533]
MinMaxScaler() RMSE-scores: [-4.8849491  -4.85797978 -4.73956218 -4.9616698  -5.01021684 -5.04861642
 -5.50129219 -4.39549969 -4.16285609 -4.62451136]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.48499397 -3.33159237 -3.38952257 -3.62253194 -3.46462562 -3.57151104
 -3.75074699 -3.14380839 -3.23571139 -3.08120083]
RobustScaler() MSE-scores: [-20.04725053 -18.64940066 -18.09635922 -21.68286417 -19.61970926
 -19.14326453 -23.94593468 -15.89325516 -15.76704954 -15.94961451]
RobustScaler() RMSE-scores: [-4.47741561 -4.31849518 -4.25398157 -4.65648625 -4.4294141  -4.37530165
 -4.89345836 -3.98663457 -3.97077443 -3.99369685]
RobustScaler() done!


StandardScaler() MAE-scores: [-3.

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.87036,3.407625,3.224268
MSE,23.343522,18.87947,17.237562
RMSE,4.818715,4.335566,4.14626


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

# Support Vector Regressor

In [25]:
model_name = 'svr'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.27362479 -3.26848712 -3.24739593 -3.41932586 -3.25489949 -3.36106469
 -3.59581583 -3.07334336 -2.92381515 -2.8907573 ]
MinMaxScaler() MSE-scores: [-19.14325346 -18.76092865 -17.76072928 -20.0393581  -19.02409301
 -18.89451138 -23.24612376 -15.87148271 -13.42338091 -15.57877611]
MinMaxScaler() RMSE-scores: [-4.37530039 -4.33138877 -4.21434803 -4.47653416 -4.36166173 -4.34678173
 -4.82142342 -3.98390295 -3.66379324 -3.94699583]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.5258303  -3.39736271 -3.29623005 -3.43556872 -3.33477559 -3.37036535
 -3.5775804  -2.95395862 -3.04018569 -3.00012188]
RobustScaler() MSE-scores: [-21.35602327 -19.45315436 -17.78831782 -20.95002151 -19.43650145
 -18.94809936 -23.41989081 -15.59896344 -14.49813728 -16.41174216]
RobustScaler() RMSE-scores: [-4.62125776 -4.41057302 -4.21761992 -4.57711935 -4.40868478 -4.35294146
 -4.83941017 -3.94955231 -3.80764196 -4.05114085]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [28]:
min_index(mae_list)

'MinMaxScaler()'

In [29]:
min_index(mse_list)

'MinMaxScaler()'

In [30]:
min_index(rmse_list)

'MinMaxScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.230853,3.293198,3.553873
MSE,18.174264,18.786085,21.149728
RMSE,4.252213,4.323594,4.585628


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'rf'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.99734058 -2.83424088 -3.2649077  -3.10695449 -2.83656159 -3.29594312
 -3.39039485 -3.03450811 -2.77079313 -2.77035001]
MinMaxScaler() MSE-scores: [-17.39711443 -15.66420311 -20.42233615 -19.38745168 -14.67680722
 -17.80544844 -20.84423781 -14.91511604 -14.0691739  -14.37850415]
MinMaxScaler() RMSE-scores: [-4.17126185 -4.00593795 -4.47390002 -4.34728292 -3.86522501 -4.19688114
 -4.56625123 -3.91563671 -3.64077324 -3.76652878]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.04008547 -2.86776835 -3.24902648 -3.0722059  -2.86684865 -3.31866403
 -3.30962734 -3.01819151 -2.77497299 -2.75868015]
RobustScaler() MSE-scores: [-16.85778589 -15.82112046 -19.5180286  -18.92332788 -14.90974125
 -17.89155816 -21.05135509 -14.9930447  -14.10979457 -14.53291824]
RobustScaler() RMSE-scores: [-4.0849968  -3.95034615 -4.50783119 -4.34501504 -3.88283281 -4.25233608
 -4.54675789 -3.9091822  -3.70588452 -3.75235326]
RobustScaler() done!


StandardScaler() MAE-scores: [-3

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'StandardScaler()'

In [42]:
min_index(mse_list)

'StandardScaler()'

In [43]:
min_index(rmse_list)

'StandardScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.030199,3.027607,3.022005
MSE,16.956039,16.860867,16.839399
RMSE,4.094968,4.093754,4.089235


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-no.csv')

DONE!