In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from a CSV file; the path is relative to the current working directory.
df = pd.read_csv('./data/dl-pfe-ns.csv')

In [3]:
# df.columns

In [4]:
# df.head()

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1976, 81)

In [6]:
# df.info()

In [7]:
# Feature matrix: every column of df except 'throughput', which is used as the target.
X = df.drop(columns='throughput')

In [8]:
# Sanity check: dropping one column should leave X one column narrower than df.
X.shape

(1976, 80)

In [9]:
# Regression target: the 'throughput' column that was removed from X.
y = df['throughput']

In [10]:
# Sanity check: the target should have one entry per row of df.
y.shape

(1976,)

# Scaler Selection
---
In the last Notebook we examined the Features and their distributions. We concluded that none of the Features are normally distributed and that the Features have a variety of scales and units.

In this section we settle on the Scaler that yields the best performance in a Linear Regression Model, i.e., the Scaler that yields the smallest error metrics (MAE, MSE, and RMSE). This will be the baseline model. 

Note: when fitting data to the Scaler, it's important to fit ONLY on the train data set to avoid data leakage. 

In [11]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Each candidate scaler learns its statistics from the training split only,
# so nothing about the held-out test rows enters the scaling parameters.
# `fit` returns the estimator itself, which lets construction and fitting be chained.
mm_scaler = preprocessing.MinMaxScaler().fit(X_train)
rb_scaler = preprocessing.RobustScaler().fit(X_train)
st_scaler = preprocessing.StandardScaler().fit(X_train)

scalers = [mm_scaler, rb_scaler, st_scaler]

In [12]:
# Display the list of candidate scalers that will be compared.
scalers

[MinMaxScaler(), RobustScaler(), StandardScaler()]

In [13]:
from sklearn.linear_model import LinearRegression
# Baseline estimator used for the scaler comparison; constructed with default settings.
model = LinearRegression()

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Scoring name handed to scikit-learn for each reported metric. These scorers
# return negated errors ("greater is better"), so the sign is dropped when the
# fold average is stored.
metric_scorers = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'RMSE': 'neg_root_mean_squared_error',
}

# One entry per scaler, in the same order as `scalers`.
scalers_str = []
mae_list = []
mse_list = []
rmse_list = []
metric_means = {'MAE': mae_list, 'MSE': mse_list, 'RMSE': rmse_list}

for scaler in scalers:
    # Scaled copies of the full train/test splits. They are NOT used for the
    # cross-validation below; they are still assigned because later cells may
    # read them (the values left behind belong to the last scaler in the list).
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)

    # FIX: the cross-validation used to be run on the unscaled `X_train`, so
    # every scaler was scored on exactly the same data and produced identical
    # numbers. Wrapping scaler + model in a pipeline makes the scaling part of
    # the estimator: cross_val_score clones the pipeline for every fold, so the
    # scaler is refitted on that fold's training portion only and the
    # validation portion never influences the scaling statistics.
    scaled_model = make_pipeline(scaler, model)

    for label, scoring in metric_scorers.items():
        scores = cross_val_score(scaled_model, X_train, y_train, scoring=scoring, cv=10)
        # Store the mean over the 10 folds as a positive error value.
        metric_means[label].append(abs(scores.mean()))
        print(f"{scaler} {label}-scores: {scores}")

    print(f"{scaler} done!")
    print('\n')

    scalers_str.append(str(scaler))

MinMaxScaler() MAE-scores: [-3.19907889 -3.04984672 -3.20904192 -3.31414168 -3.16858144 -3.26850698
 -3.39898964 -3.11183876 -2.97194458 -2.8544241 ]
MinMaxScaler() MSE-scores: [-17.04662562 -16.53010689 -16.64258054 -18.2386817  -17.3033322
 -16.9085637  -21.0040796  -16.52567622 -14.74367401 -14.64165446]
MinMaxScaler() RMSE-scores: [-4.12875594 -4.06572341 -4.0795319  -4.27067696 -4.15972742 -4.1120024
 -4.58302079 -4.0651785  -3.83974921 -3.82644149]
MinMaxScaler() done!


RobustScaler() MAE-scores: [-3.19907889 -3.04984672 -3.20904192 -3.31414168 -3.16858144 -3.26850698
 -3.39898964 -3.11183876 -2.97194458 -2.8544241 ]
RobustScaler() MSE-scores: [-17.04662562 -16.53010689 -16.64258054 -18.2386817  -17.3033322
 -16.9085637  -21.0040796  -16.52567622 -14.74367401 -14.64165446]
RobustScaler() RMSE-scores: [-4.12875594 -4.06572341 -4.0795319  -4.27067696 -4.15972742 -4.1120024
 -4.58302079 -4.0651785  -3.83974921 -3.82644149]
RobustScaler() done!


StandardScaler() MAE-scores: [-3.199

In [15]:
# scalers_str

In [16]:
# mae_list

In [17]:
# mse_list

In [18]:
# rmse_list

In [19]:
# def min_index(l):
#     return min(range(len(l)), key=l.__getitem__)

In [20]:
# min_index(mae_list)

In [21]:
# min_index(mse_list)

In [22]:
# min_index(rmse_list)

In [23]:
# One row per metric (MAE, MSE, RMSE); each row holds one averaged value per scaler.
err_list = [mae_list, mse_list, rmse_list]

In [24]:
# Summary table: metrics as rows, scalers as columns.
err_df = pd.DataFrame(
    data=err_list,
    index=['MAE', 'MSE', 'RMSE'],
    columns=scalers_str,
)

In [25]:
err_df
# NOTE(review): identical metrics for every scaler are not, on their own,
# evidence that the choice of scaler is irrelevant. First confirm that the
# cross-validation was actually run on scaled features. Also, ordinary least
# squares predictions are expected to be unchanged (up to rounding) by
# per-feature rescaling, so a plain LinearRegression baseline may be unable to
# separate the scalers at all -- TODO confirm with a scale-sensitive model.

Unnamed: 0,MinMaxScaler(),RobustScaler(),StandardScaler()
MAE,3.154639,3.154639,3.154639
MSE,16.958497,16.958497,16.958497
RMSE,4.113081,4.113081,4.113081


In [26]:
from pathlib import Path

# Persist the scaler comparison table. The output directory is created first
# because `to_csv` does not create missing parent directories, which would make
# this cell fail on a fresh checkout.
err_path = Path('./scaler-err/scaler-err.csv')
err_path.parent.mkdir(parents=True, exist_ok=True)
err_df.to_csv(err_path)

In [27]:
# def highlight_min(s):
#     is_min = s == s.min()
#     return ['background-color: grey' if v else '' for v in is_min]

In [28]:
# scalers_df.transpose().style.apply(highlight_min)

DONE!