In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/d-wo-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'lin'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-8.56137455e+09 -4.95564680e+11 -4.28623553e+00 -4.19446020e+00
 -7.11603122e+10 -4.93175014e+00 -4.29759639e+00 -7.27136753e+10
 -7.17345955e+11 -4.56699035e+11]
MinMaxScaler() MSE-scores: [-1.41463469e+22 -4.73977800e+25 -2.85807561e+01 -2.74393513e+01
 -4.89049612e+23 -3.80861430e+01 -2.86523498e+01 -1.01515749e+24
 -4.98494304e+25 -4.00462096e+25]
MinMaxScaler() RMSE-scores: [-1.18938416e+11 -6.88460457e+12 -5.34609728e+00 -5.23825842e+00
 -6.99320822e+11 -6.17139716e+00 -5.35278898e+00 -1.00755024e+12
 -7.06041291e+12 -6.32820746e+12]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.05999257 -5.23258377 -4.27368365 -4.19665424 -4.65141011 -4.93195488
 -4.29187386 -4.00130188 -4.775689   -4.68530046]
RobustScaler() MSE-scores: [-25.4203735  -42.01663193 -28.45698555 -27.44304367 -33.66199007
 -38.09133588 -28.58893538 -25.04565529 -32.78869514 -32.86310721]
RobustScaler() RMSE-scores: [-5.04186211 -6.48202375 -5.33450893 -5.23861085 -5.80189539 -6.1

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df
# -> model produces bad predictions! -> skip linear model with no regularization and go straigth to elastic net model

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,182204500000.0,4.510044,2172924000000.0
MSE,1.388118e+25,31.437675,1.887842e+27
RMSE,2209903000000.0,5.588092,25988660000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

# Poisson Regressor

In [15]:
model_name = 'poi'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-5.12749283 -5.9137481  -5.13333563 -5.22435439 -5.62168201 -5.39582849
 -4.7990192  -5.17076078 -5.4893257  -5.27806285]
MinMaxScaler() MSE-scores: [-36.79626515 -49.82908155 -38.39794681 -38.72531133 -46.0741724
 -42.45565127 -34.68344399 -38.69858346 -42.86752053 -41.05275678]
MinMaxScaler() RMSE-scores: [-6.06599251 -7.05897171 -6.19660769 -6.22296644 -6.78779584 -6.51580012
 -5.88926515 -6.22081855 -6.54732927 -6.40724253]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.37877267 -5.22383166 -4.65221504 -4.70955294 -4.89569948 -5.15580498
 -4.44712911 -4.61277363 -4.95985894 -4.74238624]
RobustScaler() MSE-scores: [-28.76991647 -41.72242104 -33.7929554  -32.80201863 -37.89693255
 -41.03936851 -31.1489553  -31.78570084 -36.77056208 -32.82756382]
RobustScaler() RMSE-scores: [-5.36375955 -6.45928951 -5.81317086 -5.72730466 -6.15604845 -6.40619766
 -5.58112491 -5.63788088 -6.06387352 -5.72953435]
RobustScaler() done!


StandardScaler() MAE-scores: [-4.

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df
# -> StandardScaler -> use StandardScaler for Elastic Net also

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,5.315361,4.777802,4.577593
MSE,40.958073,34.855639,32.060898
RMSE,6.391279,5.893818,5.649586


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

# Support Vector Regressor

In [25]:
model_name = 'svr'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.46228244 -5.30657891 -4.60207978 -4.64709339 -4.98762856 -4.86015145
 -4.30809958 -4.63501161 -4.94092774 -4.78183516]
MinMaxScaler() MSE-scores: [-29.69795624 -42.44157363 -33.17501143 -32.95183124 -37.67674096
 -36.89149768 -30.09733418 -32.32265509 -35.9738163  -34.73919184]
MinMaxScaler() RMSE-scores: [-5.44958313 -6.51471977 -5.75977529 -5.74036856 -6.13813823 -6.07383715
 -5.48610373 -5.68530167 -5.99781763 -5.89399625]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-5.17847904 -5.95791665 -4.97662762 -5.33755831 -5.63063489 -5.27606829
 -4.89265852 -5.33276772 -5.20913764 -4.89294282]
RobustScaler() MSE-scores: [-40.77521156 -51.21665706 -41.19556614 -43.42648076 -47.51224479
 -43.49107336 -38.3217912  -43.2467605  -42.48746563 -37.04220933]
RobustScaler() RMSE-scores: [-6.38554708 -7.15658138 -6.41837722 -6.58987714 -6.89291265 -6.59477622
 -6.19045969 -6.57622692 -6.51824099 -6.08623113]
RobustScaler() done!


StandardScaler() MAE-scores: [-4

In [28]:
min_index(mae_list)

'MinMaxScaler()'

In [29]:
min_index(mse_list)

'MinMaxScaler()'

In [30]:
min_index(rmse_list)

'MinMaxScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df
# -> RobustScaler

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.753169,5.268479,4.921331
MSE,34.596761,42.871546,38.551971
RMSE,5.873964,6.540923,6.193866


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'rf'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.51902018 -3.50414799 -2.94292598 -2.69167594 -2.63451314 -2.65297809
 -2.97957268 -2.6793037  -2.77528607 -2.67152796]
MinMaxScaler() MSE-scores: [-12.1916587  -23.66618305 -17.00152489 -14.9292984  -14.15868393
 -14.90109344 -16.37634508 -13.61073807 -14.05439817 -12.12381543]
MinMaxScaler() RMSE-scores: [-3.52751898 -4.95140921 -4.19418341 -3.77616553 -3.73876079 -3.88201817
 -4.05718136 -3.6668904  -3.72495327 -3.56538653]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-2.44751959 -3.44815776 -2.90819867 -2.77134031 -2.63028391 -2.65619106
 -2.96264054 -2.675063   -2.76715903 -2.59673378]
RobustScaler() MSE-scores: [-11.94848838 -23.88055464 -17.36889942 -14.21862585 -13.72196315
 -15.42610698 -16.44710064 -13.5901223  -13.96846325 -12.60827085]
RobustScaler() RMSE-scores: [-3.41178756 -4.88379848 -4.20578423 -3.70797126 -3.74233538 -3.82866818
 -4.09456665 -3.69072569 -3.70833291 -3.4729535 ]
RobustScaler() done!


StandardScaler() MAE-scores: [-2

In [40]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [41]:
min_index(mae_list)

'RobustScaler()'

In [42]:
min_index(mse_list)

'MinMaxScaler()'

In [43]:
min_index(rmse_list)

'RobustScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df
# -> RobustScaler (marginal difference)

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,2.805095,2.786329,2.794931
MSE,15.301374,15.31786,15.356044
RMSE,3.908447,3.874692,3.881379


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-wo.csv')

DONE!