In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./data/ul-nfe-ns.csv')

X = df.drop('throughput',axis=1)

y = df['throughput']

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded non of the Features are normally distributed and the Features have a variaty of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler yields the smallest error metric (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn import preprocessing
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X_train)

rb_scaler = preprocessing.RobustScaler()
rb_scaler.fit(X_train)

st_scaler = preprocessing.StandardScaler()
st_scaler.fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [3]:
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

# Linear Regression

In [4]:
model_name = 'LinReg'

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.35638952e+00 -4.37352082e+00 -2.97906900e+11 -2.25280273e+11
 -4.79037536e+00 -4.04934316e+00 -2.74316730e+11 -1.88105385e+12
 -4.76328944e+00 -7.31665187e+11]
MinMaxScaler() MSE-scores: [-3.02192047e+01 -2.90222655e+01 -1.27797871e+25 -7.30817298e+24
 -3.88078736e+01 -2.50063656e+01 -1.07607026e+25 -2.62344680e+26
 -3.27849646e+01 -7.65527543e+25]
MinMaxScaler() RMSE-scores: [-5.49719972e+00 -5.38723171e+00 -3.57488280e+12 -2.70336327e+12
 -6.22959659e+00 -5.00063652e+00 -3.28035099e+12 -1.61970578e+13
 -5.72581563e+00 -8.74944309e+12]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.36087186 -4.37263411 -4.65467221 -5.06617317 -4.79037536 -4.04369449
 -4.14659076 -4.30266143 -4.763889   -4.89936755]
RobustScaler() MSE-scores: [-30.2848188  -29.01371247 -32.64838968 -40.70481599 -38.80787362
 -24.97366994 -27.75378048 -26.69038543 -32.78811251 -35.21663572]
RobustScaler() RMSE-scores: [-5.50316444 -5.38643783 -5.71387694 -6.3800326  -6.22959659 -4.9

In [7]:
def min_index(l):
     return scalers_str[min(range(len(l)), key=l.__getitem__)]

In [8]:
min_index(mae_list)

'RobustScaler()'

In [9]:
min_index(mse_list)

'RobustScaler()'

In [10]:
min_index(rmse_list)

'RobustScaler()'

In [11]:
err_list = [mae_list, mse_list, rmse_list]

In [12]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [13]:
err_df

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,341022300000.0,4.540093,1184985000000.0
MSE,3.697461e+25,31.888219,5.38963e+26
RMSE,3450510000000.0,5.630539,14158180000000.0


In [14]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

# Poisson Regressor

In [15]:
model_name = 'PoiReg'

In [16]:
from sklearn.linear_model import PoissonRegressor
model = PoissonRegressor()

In [17]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-5.15905815 -5.58809325 -5.49164941 -5.57176205 -5.4915588  -4.5867889
 -5.34161224 -4.96969821 -5.62338402 -5.39118565]
MinMaxScaler() MSE-scores: [-38.20603168 -42.97494095 -45.48974688 -43.84743074 -43.85448862
 -32.42634416 -41.23730868 -36.53923353 -43.6603461  -43.3548836 ]
MinMaxScaler() RMSE-scores: [-6.18110279 -6.55552751 -6.74460873 -6.62173925 -6.62227216 -5.69441342
 -6.4216282  -6.0447691  -6.6075976  -6.58444254]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-4.7264365  -4.99860576 -4.78092855 -5.18405274 -5.2219084  -4.16605452
 -4.79261669 -4.48326763 -5.08283917 -4.91598991]
RobustScaler() MSE-scores: [-34.35521545 -35.49663865 -37.34984366 -39.22035616 -43.54556381
 -27.5051494  -34.24856332 -30.49221609 -37.502191   -34.84556006]
RobustScaler() RMSE-scores: [-5.86133223 -5.95790556 -6.11145185 -6.26261576 -6.59890626 -5.24453519
 -5.85222721 -5.52197574 -6.12390325 -5.9030128 ]
RobustScaler() done!


StandardScaler() MAE-scores: [-4.

In [18]:
min_index(mae_list)

'StandardScaler()'

In [19]:
min_index(mse_list)

'StandardScaler()'

In [20]:
min_index(rmse_list)

'StandardScaler()'

In [21]:
err_list = [mae_list, mse_list, rmse_list]

In [22]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [23]:
err_df

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,5.321479,4.83527,4.65347
MSE,41.159075,35.45613,33.214081
RMSE,6.40781,5.943787,5.754888


In [24]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

# Support Vector Regressor

In [25]:
model_name = 'SVR'

In [26]:
from sklearn.svm import SVR
model = SVR()

In [27]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-4.84092558 -5.06128616 -5.13366898 -5.09367041 -5.0850733  -4.23827897
 -5.00206035 -4.55822647 -5.24291137 -4.9969369 ]
MinMaxScaler() MSE-scores: [-35.34725096 -37.67158409 -39.80009638 -38.7714151  -39.97059578
 -28.9467117  -37.35862601 -32.18569824 -39.03196911 -38.4736273 ]
MinMaxScaler() RMSE-scores: [-5.94535541 -6.13771815 -6.30873176 -6.22666966 -6.32223029 -5.38021484
 -6.11217032 -5.67324407 -6.24755705 -6.20271129]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-5.51967724 -5.90855574 -5.85541027 -5.76314326 -5.62271287 -4.82698341
 -5.84962338 -4.95588732 -5.79506688 -5.0747286 ]
RobustScaler() MSE-scores: [-45.99931554 -51.288383   -51.05073025 -49.4343824  -48.01248349
 -36.98242102 -51.76112095 -38.26210436 -48.3976456  -40.94236679]
RobustScaler() RMSE-scores: [-6.78227952 -7.16159081 -7.14497937 -7.03095885 -6.92910409 -6.08131738
 -7.1945202  -6.18563694 -6.95684164 -6.39862226]
RobustScaler() done!


StandardScaler() MAE-scores: [-5

In [28]:
min_index(mae_list)

'MinMaxScaler()'

In [29]:
min_index(mse_list)

'MinMaxScaler()'

In [30]:
min_index(rmse_list)

'MinMaxScaler()'

In [31]:
err_list = [mae_list, mse_list, rmse_list]

In [32]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [33]:
err_df

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,4.925304,5.517179,5.138459
MSE,36.755757,46.213095,41.336337
RMSE,6.05566,6.786585,6.419866


In [34]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

In [35]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [36]:
# scalers_df.transpose().style.apply(highlight_min)

# Random Forest Regressor

In [37]:
model_name = 'RF'

In [38]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

In [39]:
from sklearn.model_selection import cross_val_score

scalers_str = []
mae_list = []
mse_list = []
rmse_list = []

for scaler in scalers: 
    # Scale data
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_absolute_error',cv=10)
    mae_list.append(abs(scores.mean()))
    print(f"{scaler} MAE-scores: {scores}")
    
    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_mean_squared_error',cv=10)
    mse_list.append(abs(scores.mean()))
    print(f"{scaler} MSE-scores: {scores}")

    scores = cross_val_score(model,X_train_s,y_train, scoring='neg_root_mean_squared_error',cv=10)
    rmse_list.append(abs(scores.mean()))
    print(f"{scaler} RMSE-scores: {scores}")
    
    print(f"{scaler} done!")
    print('\n')
    
    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-2.99248403 -2.84562639 -2.61321293 -2.99254696 -3.13560899 -2.77484129
 -2.88831294 -2.81653975 -2.66520857 -2.68114836]
MinMaxScaler() MSE-scores: [-18.2626101  -15.6043602  -15.36955631 -16.91645521 -26.11232748
 -12.38561602 -16.58129954 -13.31134031 -12.66423576 -13.66994899]
MinMaxScaler() RMSE-scores: [-4.22872102 -3.98493175 -3.93382654 -4.10919076 -5.07145452 -3.45414522
 -4.02696938 -3.64613876 -3.51382837 -3.78484904]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.00843171 -2.7597923  -2.70819244 -3.0288732  -3.12510471 -2.72654387
 -2.86991144 -2.71608296 -2.6752036  -2.696265  ]
RobustScaler() MSE-scores: [-18.71190395 -15.67653116 -15.50199143 -17.22291266 -25.99485575
 -12.61307756 -16.63486015 -13.65361981 -12.33836933 -14.245372  ]
RobustScaler() RMSE-scores: [-4.33577024 -3.94766419 -3.95115898 -4.12626203 -5.01477658 -3.50918871
 -4.16899034 -3.62977489 -3.60391972 -3.77421653]
RobustScaler() done!


StandardScaler() MAE-scores: [-2

In [41]:
min_index(mae_list)

'RobustScaler()'

In [42]:
min_index(mse_list)

'StandardScaler()'

In [43]:
min_index(rmse_list)

'StandardScaler()'

In [44]:
err_list = [mae_list, mse_list, rmse_list]

In [45]:
err_df = pd.DataFrame(columns=scalers_str, data=err_list, index=['MAE','MSE','RMSE'])

In [46]:
err_df

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,2.840553,2.83144,2.832119
MSE,16.087775,16.259349,16.01891
RMSE,3.975406,4.006172,3.966615


In [47]:
err_df.to_csv(f'./scaler-err/{model_name}-scaler-err-nfe.csv')

DONE!