# Normalization vs Standardization  - Quantitative analysis

This notebook contains the code used to extract the tables and information for my Medium article. **TODO:** add link.

# Let's read the data

In [13]:
import os
import sys
import sys
sys.path.append("../..")
from src.data.bank_additional import BandAdditionalParser
from src.data.income_evaluation import IncomeEvaluationParser
from src.data.skyserver import SkyserverParser
from src.data.sonar import SonarParser
from src.data.weather_aus import WeatherAUSParser



# Registry of the available results files: results CSV file name -> parser class
# that is instantiated (in the next cell) to load the matching dataset.
avaliable_restul_files = {
    "sonar_results.csv": SonarParser,
    "Skyserver_results.csv": SkyserverParser,
    "income_evaluation_results.csv": IncomeEvaluationParser,
    "bank-additional_results.csv": BandAdditionalParser,
    "weatherAUS_results.csv": WeatherAUSParser
}


# Pick one of the key names in the dictionary above and assign it to results_file. Then run all the cells.
results_file = "income_evaluation_results.csv"

# Fail fast with the list of valid choices instead of a bare KeyError in a later cell.
if results_file not in avaliable_restul_files:
    raise ValueError(
        "Unknown results_file " + repr(results_file)
        + "; expected one of: " + ", ".join(sorted(avaliable_restul_files))
    )

In [14]:
# Look up the parser class registered for the selected results file, instantiate it,
# and keep the X / y attributes it exposes for the cells below.
parser_cls = avaliable_restul_files[results_file]
parser = parser_cls()
X = parser.X
y = parser.y
# Preview the first five rows of X as the cell output.
X.head(5)

############################## Start Dataset - income_evaluation Stats ##############################
Dataset shape: (32561, 7)
Counts for each class:
 <=50K    24720
 >50K     7841 
Name:  income, dtype: int64
Sample of first 5 rows:
   age   fnlwgt   education-num   capital-gain   capital-loss   hours-per-week  income
0  39   77516    13              2174           0              40                <=50K
1  50   83311    13              0              0              13                <=50K
2  38   215646   9               0              0              40                <=50K
3  53   234721   7               0              0              40                <=50K
4  28   338409   13              0              0              40                <=50K
############################## End Dataset Stats ##############################


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [15]:
# Report the dimensions of X, then the number of rows per distinct value of y.
data_shape = X.shape
print("Data shape: ", data_shape)
class_counts = y.value_counts()
print(class_counts)

Data shape:  (32561, 6)
0    24720
1    7841 
dtype: int64


### Let's read the results file

In [16]:
import os
import pandas as pd


# Read the selected results file from data/processed (relative to this notebook),
# drop rows with missing values and round every numeric column to 3 decimals.
results_path = os.path.join("..", "..", "data", "processed", results_file)
results_raw = pd.read_csv(results_path)
results_df = results_raw.dropna().round(3)
results_df

Unnamed: 0,Dataset,Classifier_Name,CV_mean,CV_std,Test_score
0,income_evaluation,_LR,0.583,0.013,0.587
1,income_evaluation,StandardScaler_LR,0.833,0.011,0.825
2,income_evaluation,MinMaxScaler_LR,0.831,0.011,0.824
3,income_evaluation,MaxAbsScaler_LR,0.831,0.011,0.824
4,income_evaluation,RobustScaler_LR,0.833,0.011,0.825
5,income_evaluation,QuantileTransformer-Normal_LR,0.829,0.01,0.82
6,income_evaluation,QuantileTransformer-Uniform_LR,0.833,0.01,0.822
7,income_evaluation,PowerTransformer-Yeo-Johnson_LR,0.829,0.01,0.82
8,income_evaluation,Normalizer_LR,0.684,0.007,0.686
10,income_evaluation,_LR-PCA,0.764,0.007,0.76


<a id="Out-of-the-box_classifier"></a>
# 1. Out-of-the-box classifiers

In [17]:
import operator
# Keep only rows whose classifier name starts with "_" (empty scaler prefix)
# and does not end with "PCA".
has_empty_scaler_prefix = results_df["Classifier_Name"].str.startswith("_")
ends_with_pca = results_df["Classifier_Name"].str.endswith("PCA")
results_df.loc[has_empty_scaler_prefix & ~ends_with_pca].dropna()

Unnamed: 0,Dataset,Classifier_Name,CV_mean,CV_std,Test_score
0,income_evaluation,_LR,0.583,0.013,0.587
20,income_evaluation,_LDA,0.822,0.011,0.813
40,income_evaluation,_KNN,0.662,0.011,0.683
60,income_evaluation,_CART,0.695,0.015,0.68
80,income_evaluation,_NB,0.824,0.007,0.819
100,income_evaluation,_SVM,0.582,0.01,0.589
120,income_evaluation,_RF,0.836,0.012,0.828
140,income_evaluation,_MLP,0.602,0.071,0.63


<a id="Classifiers_Scaling"></a>
# 2. Classifiers+Scaling

In [18]:
import operator
import numpy as np


# Work on the rows whose classifier name does not end with "PCA".
temp = results_df.loc[~results_df["Classifier_Name"].str.endswith("PCA")].dropna()

# "Classifier_Name" is "<scaler>_<model>"; the scaler part is empty when no scaler was used.
name_parts = temp["Classifier_Name"].str.split("_")
temp["model"] = name_parts.str[1]
temp["scaler"] = name_parts.str[0]


def df_style(val):
    """Return the CSS that renders a cell in bold; ``val`` (the cell value) is ignored."""
    return 'font-weight: 800'


def bold_column_best(column):
    """Return one CSS string per cell of ``column``, bolding only its best score.

    ``column`` is a single model column of the pivot table, indexed by scaler.
    Ties go to the first row, because ``idxmax`` returns the first occurrence.
    """
    best_row = column.idxmax()
    return [df_style(score) if row == best_row else '' for row, score in column.items()]


# One row per scaler, one column per model, CV_mean in the cells.
# NOTE(review): presumably every (scaler, model) pair occurs once, so "sum" just
# carries the single value through - confirm against the results file.
pivot_t = pd.pivot_table(temp, values='CV_mean', index=["scaler"], columns=['model'], aggfunc="sum")

# Style every column in a single pass. No model name is hard-coded, so the cell
# also works for results files that lack a given classifier.
pivot_t_bold = pivot_t.style.apply(bold_column_best, axis=0)
pivot_t_bold

model,CART,KNN,LDA,LR,MLP,NB,RF,SVM
scaler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,0.695,0.662,0.822,0.583,0.602,0.824,0.836,0.582
MaxAbsScaler,0.696,0.786,0.822,0.831,0.852,0.831,0.836,0.836
MinMaxScaler,0.696,0.783,0.822,0.831,0.852,0.831,0.836,0.84
Normalizer,0.621,0.696,0.676,0.684,0.685,0.68,0.701,0.684
PowerTransformer-Yeo-Johnson,0.679,0.776,0.826,0.829,0.843,0.826,0.836,0.809
QuantileTransformer-Normal,0.696,0.786,0.823,0.829,0.856,0.825,0.836,0.822
QuantileTransformer-Uniform,0.696,0.778,0.83,0.833,0.845,0.828,0.836,0.819
RobustScaler,0.696,0.821,0.822,0.833,0.797,0.83,0.836,0.823
StandardScaler,0.695,0.792,0.822,0.833,0.856,0.831,0.836,0.817


In [19]:
# Print table for the Medium article

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100000)
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

dict2 = {'StandardScaler': "StandardScaler",
'MinMaxScaler':"MinMaxScaler",
'MaxAbsScaler':"MaxAbsScaler",
'RobustScaler':"RobustScaler",
'QuantileTransformer-Normal':"QuantileTransformer(output_distribution='normal')",
'QuantileTransformer-Uniform':"QuantileTransformer(output_distribution='uniform')",
'PowerTransformer-Yeo-Johnson':"PowerTransformer(method='yeo-johnson')",
'Normalizer':"Normalizer"}

scalers_df = pd.DataFrame(list(dict2.items()), columns=["Name","Sklearn_Class"])
s = scalers_df.style.set_properties(subset=["Name", "Sklearn_Class"], **{'text-align': 'left'})
s.set_table_styles([ dict(selector='th', props=[('text-align', 'left')] ) ])

Unnamed: 0,Name,Sklearn_Class
0,StandardScaler,StandardScaler
1,MinMaxScaler,MinMaxScaler
2,MaxAbsScaler,MaxAbsScaler
3,RobustScaler,RobustScaler
4,QuantileTransformer-Normal,QuantileTransformer(output_distribution='normal')
5,QuantileTransformer-Uniform,QuantileTransformer(output_distribution='uniform')
6,PowerTransformer-Yeo-Johnson,PowerTransformer(method='yeo-johnson')
7,Normalizer,Normalizer


In [20]:
# Print table for the Medium article
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100000)
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

dict2 = {'LR': "LogisticRegression",
'LDA':"LinearDiscriminantAnalysis",
'KNN':"KNeighborsClassifier",
'CART':"DecisionTreeClassifier",
'NB':"GaussianNB",
'SVM':"SVC",
'RF':"RandomForestClassifier",
'MLP':"MLPClassifier"}

scalers_df = pd.DataFrame(list(dict2.items()), columns=["Name","Sklearn_Class"])
s = scalers_df.style.set_properties(subset=["Name", "Sklearn_Class"], **{'text-align': 'left'})
s.set_table_styles([ dict(selector='th', props=[('text-align', 'left')] ) ])

Unnamed: 0,Name,Sklearn_Class
0,LR,LogisticRegression
1,LDA,LinearDiscriminantAnalysis
2,KNN,KNeighborsClassifier
3,CART,DecisionTreeClassifier
4,NB,GaussianNB
5,SVM,SVC
6,RF,RandomForestClassifier
7,MLP,MLPClassifier


In [21]:
import operator

# For every model column of pivot_t, record the best score and the row label
# (scaler) that achieved it.
cols_max_vals = {model_name: pivot_t[model_name].max() for model_name in pivot_t}
cols_max_row_names = {model_name: pivot_t[model_name].idxmax() for model_name in pivot_t}

# (model, best score) pairs, highest score first; ties keep their column order.
sorted_cols_max_vals = sorted(cols_max_vals.items(), key=operator.itemgetter(1), reverse=True)

print("Best classifiers sorted:\n")
for rank, (model, score) in enumerate(sorted_cols_max_vals, start=1):
    # The scaler label is an empty string when the best row is the unscaled one.
    print(str(rank) + ". " + model + " + " + cols_max_row_names[model] + " : " + str(score))

Best classifiers sorted:

1. MLP + QuantileTransformer-Normal : 0.856
2. SVM + MinMaxScaler : 0.84
3. RF +  : 0.836
4. LR + QuantileTransformer-Uniform : 0.833
5. NB + MaxAbsScaler : 0.831
6. LDA + QuantileTransformer-Uniform : 0.83
7. KNN + RobustScaler : 0.821
8. CART + MaxAbsScaler : 0.696


# 3. Classifier+Scaling+PCA

In [22]:
import operator
# Work on a copy of all results, PCA variants included.
temp = results_df.copy()

# "Classifier_Name" is "<scaler>_<model>"; the scaler part is empty when no scaler was used.
name_parts = temp["Classifier_Name"].str.split("_")
temp["model"] = name_parts.str[1]
temp["scaler"] = name_parts.str[0]


def df_style(val):
    """Return the CSS that renders a cell in bold; ``val`` (the cell value) is ignored."""
    return 'font-weight: 800'


def bold_column_best(column):
    """Return one CSS string per cell of ``column``, bolding only its best score.

    ``column`` is a single model column of the pivot table, indexed by scaler.
    Ties go to the first row, because ``idxmax`` returns the first occurrence.
    """
    best_row = column.idxmax()
    return [df_style(score) if row == best_row else '' for row, score in column.items()]


# One row per scaler, one column per model (with and without PCA), CV_mean in the cells.
# NOTE(review): presumably every (scaler, model) pair occurs once, so "sum" just
# carries the single value through - confirm against the results file.
pivot_t = pd.pivot_table(temp, values='CV_mean', index=["scaler"], columns=['model'], aggfunc="sum")

# Style every column in a single pass. No model name is hard-coded, so the cell
# also works for results files that lack a given classifier.
pivot_t_bold = pivot_t.style.apply(bold_column_best, axis=0)
pivot_t_bold

model,CART,CART-PCA,KNN,KNN-PCA,LDA,LDA-PCA,LR,LR-PCA,MLP,MLP-PCA,NB,NB-PCA,RF,RF-PCA,SVM,SVM-PCA
scaler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,0.695,0.659,0.662,0.662,0.822,0.751,0.583,0.764,0.602,0.644,0.824,0.768,0.836,0.772,0.582,0.586
MaxAbsScaler,0.696,0.673,0.786,0.775,0.822,0.808,0.831,0.808,0.852,0.853,0.831,0.812,0.836,0.834,0.836,0.818
MinMaxScaler,0.696,0.673,0.783,0.772,0.822,0.807,0.831,0.808,0.852,0.853,0.831,0.813,0.836,0.837,0.84,0.819
Normalizer,0.621,0.609,0.696,0.641,0.676,0.683,0.684,0.683,0.685,0.685,0.68,0.631,0.701,0.684,0.684,0.684
PowerTransformer-Yeo-Johnson,0.679,0.639,0.776,0.77,0.826,0.827,0.829,0.827,0.843,0.838,0.826,0.816,0.836,0.813,0.809,0.8
QuantileTransformer-Normal,0.696,0.679,0.786,0.79,0.823,0.819,0.829,0.825,0.856,0.849,0.825,0.815,0.836,0.831,0.822,0.813
QuantileTransformer-Uniform,0.696,0.646,0.778,0.773,0.83,0.827,0.833,0.829,0.845,0.839,0.828,0.822,0.836,0.814,0.819,0.811
RobustScaler,0.696,0.703,0.821,0.79,0.822,0.746,0.833,0.756,0.797,0.72,0.83,0.753,0.836,0.802,0.823,0.688
StandardScaler,0.695,0.662,0.792,0.779,0.822,0.823,0.833,0.828,0.856,0.85,0.831,0.769,0.836,0.8,0.817,0.809



# 4. Classifiers+Scaling+PCA+Hyperparameter tuning

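I tuned the hyperparameters only on the Sonar dataset, so this section always loads the Sonar results, regardless of the `results_file` selected at the top of the notebook.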

In [23]:
import operator

import os
import pandas as pd

# The hypertuned results file is fixed here; it does not depend on results_file.
results_hyper_file = "sonar_results_hypertuned.csv"
# Read it from data/processed, drop rows with missing values, round to 3 decimals.
results_hyper_df = pd.read_csv(os.path.join("..", "..", "data", "processed", results_hyper_file)).dropna().round(3)


temp = results_hyper_df.copy()

# "Classifier_Name" is "<scaler>_<model>"; the scaler part is empty when no scaler was used.
name_parts = temp["Classifier_Name"].str.split("_")
temp["model"] = name_parts.str[1]
temp["scaler"] = name_parts.str[0]


def df_style(val):
    """Return the CSS that renders a cell in bold; ``val`` (the cell value) is ignored."""
    return 'font-weight: 800'


def bold_column_best(column):
    """Return one CSS string per cell of ``column``, bolding only its best score.

    ``column`` is a single model column of the pivot table, indexed by scaler.
    Ties go to the first row, because ``idxmax`` returns the first occurrence.
    """
    best_row = column.idxmax()
    return [df_style(score) if row == best_row else '' for row, score in column.items()]


# One row per scaler, one column per model, CV_mean in the cells.
# The string "sum" also removes this cell's reliance on numpy being imported by an earlier cell.
# NOTE(review): presumably every (scaler, model) pair occurs once, so "sum" just
# carries the single value through - confirm against the results file.
pivot_t = pd.pivot_table(temp, values='CV_mean', index=["scaler"], columns=['model'], aggfunc="sum")

# Style every column in a single pass. No model name is hard-coded, so the cell
# does not fail when a classifier is missing from the hypertuned results.
pivot_t_bold = pivot_t.style.apply(bold_column_best, axis=0)
pivot_t_bold

model,CART,CART-PCA,KNN,KNN-PCA,LDA,LDA-PCA,LR,LR-PCA,MLP,MLP-PCA,RF,RF-PCA,SVM,SVM-PCA
scaler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
,0.734,0.704,0.85,0.771,0.77,0.676,0.76,0.689,0.734,0.651,0.771,0.699,0.789,0.67
MaxAbsScaler,0.734,0.723,0.843,0.741,0.765,0.76,0.736,0.766,0.728,0.754,0.776,0.758,0.759,0.743
MinMaxScaler,0.734,0.657,0.837,0.753,0.782,0.754,0.711,0.766,0.746,0.742,0.776,0.722,0.735,0.737
PowerTransformer-Yeo-Johnson,0.65,0.74,0.874,0.777,0.778,0.771,0.789,0.777,0.807,0.681,0.776,0.772,0.873,0.728
QuantileTransformer-Normal,0.64,0.687,0.806,0.731,0.771,0.778,0.795,0.766,0.694,0.735,0.795,0.773,0.837,0.735
QuantileTransformer-Uniform,0.651,0.717,0.891,0.771,0.783,0.765,0.814,0.778,0.758,0.741,0.746,0.741,0.814,0.753
RobustScaler,0.734,0.686,0.838,0.789,0.776,0.73,0.758,0.724,0.783,0.71,0.776,0.711,0.771,0.759
StandardScaler,0.734,0.736,0.825,0.776,0.783,0.753,0.741,0.748,0.777,0.742,0.776,0.722,0.861,0.718
