# How does standardization affect accuracies?

In this document, I compare prediction accuracy with and without standardization of the features.
As the prediction model, I use logistic regression for simplicity.

## Preparation

First, I import libraries:

In [1]:
# Import libraries
%matplotlib inline

import numpy as np
import pandas as pd
import itertools
import random

import matplotlib.pyplot as plt


Then, I load the datasets:

In [2]:
# Read the training features and the training labels, both indexed by patient_id
train_values = pd.read_csv(
    '../Data/train_values.csv',
    index_col='patient_id',
)
train_labels = pd.read_csv(
    '../Data/train_labels.csv',
    index_col='patient_id',
)

The head of the training features looks like this:

In [3]:
# Preview the first rows of the training features
train_values.head()

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In this exercise, I just use the following selected numeric features: `age`, `sex`, `max_heart_rate_achieved`, and `resting_blood_pressure`.

In [4]:
# Columns used as model inputs in this exercise
selected_features = [
    'age',
    'sex',
    'max_heart_rate_achieved',
    'resting_blood_pressure',
]
# Cast to float so that all selected columns share one numeric dtype
train_values_subset = train_values.loc[:, selected_features].astype(float)

In [None]:
# Show the dtype of each selected column
train_values_subset.dtypes

## Logistic Regression

In [None]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.linear_model import LogisticRegression

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline

# for optimizing parameters of the pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score


For predictions, I use `Pipeline`s.
Here I define two pipelines, with and without standardizations:

In [None]:
# Baseline: logistic regression fitted directly on the input features
pipe_wo_standard = Pipeline([
    ('logistic', LogisticRegression(solver='liblinear')),
])

# Same classifier, but with a StandardScaler step applied to the features first
pipe_with_standard = Pipeline([
    ('scale', StandardScaler()),
    ('logistic', LogisticRegression(solver='liblinear')),
])

For hyperparameter tuning, I use three-fold cross-validation.
The grid for the regularization parameter, $C$, consists of 100 evenly spaced values between 0.0001 and 10, and for the regularization term I try both L1 and L2.
The cost function is hence represented as
\begin{equation*}
    J(w) = \sum_{i=1}^n \{ - y_i \log (\phi (w' x_i)) - (1 - y_i) \log (1 - \phi (w' x_i)) \} + \frac{1}{C} L(w),
\end{equation*}
where $\phi$ is the logistic (sigmoid) function: $\phi(z) = \frac{1}{1 + e^{-z}}$.
The function $L$ is the penalty term, which is based on either the L1 norm or the L2 norm of $w$.

In [None]:
# Search space shared by both pipelines: regularization parameter C and penalty type
param_grid = dict(
    logistic__C=np.linspace(0.0001, 10, num=100),
    logistic__penalty=['l1', 'l2'],
)

# Three-fold grid search for the pipeline without standardization ...
gs_wo_standard = GridSearchCV(pipe_wo_standard, param_grid, cv=3)
# ... and for the pipeline with standardization
gs_with_standard = GridSearchCV(pipe_with_standard, param_grid, cv=3)

Now I estimate the parameters, with and without standardizations:

In [None]:
# Run both grid searches on the same features and the same label column
gs_wo_standard.fit(X=train_values_subset,
                   y=train_labels['heart_disease_present'])
gs_with_standard.fit(X=train_values_subset,
                     y=train_labels['heart_disease_present'])

In [None]:
gs_wo_standard.best_params_

In [None]:
gs_with_standard.best_params_

To evaluate performance, I use the log loss, averaged over the samples, as the metric:
\begin{equation*}
\text{Log loss} = \frac{1}{n} \sum_{i = 1}^n \{- y_i \log(\phi(w' x_i)) - (1 - y_i) \log (1 - \phi(w' x_i))\}.
\end{equation*}
For the evaluation, I report the mean of the scores over five-fold cross-validation.

In [None]:
# Mean 'neg_log_loss' score (i.e. the negated log loss) over five folds for the
# model without standardization. The label column is passed explicitly, as in
# the other cells, instead of flattening the whole label frame with np.ravel.
print(cross_val_score(gs_wo_standard.best_estimator_,
                      train_values_subset,
                      train_labels.heart_disease_present,
                      cv = 5,
                      scoring = 'neg_log_loss').mean())

In [None]:
from sklearn.metrics import log_loss

# Five-fold 'neg_log_loss' scores of the best estimator found by each grid search
cv_score_wo_standard = cross_val_score(
    gs_wo_standard.best_estimator_,
    train_values_subset,
    train_labels.heart_disease_present,
    cv=5,
    scoring='neg_log_loss',
)
cv_score_with_standard = cross_val_score(
    gs_with_standard.best_estimator_,
    train_values_subset,
    train_labels.heart_disease_present,
    cv=5,
    scoring='neg_log_loss',
)

The means of log losses in cross validations are

In [None]:
# The scores were computed with scoring='neg_log_loss', so flip the sign to report log losses
print(
    'Log loss without standardization:',
    -cv_score_wo_standard.mean(),
    ', \n Log loss with standardization:',
    -cv_score_with_standard.mean(),
)

## Standardization for subset of variables

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

class ColSelect(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen set of columns of a DataFrame.

    Parameters
    ----------
    col_list : column indexer accepted by ``DataFrame.loc``, or None (default)
        Columns to keep. ``None`` means no selection is configured, in which
        case every column is passed through.
    """

    def __init__(self, col_list = None):
        # Store the argument untouched so it can be read back and replaced
        # through the estimator's parameters (e.g. by a grid search).
        self.col_list = col_list

    def fit(self, X, y = None):
        """Learn nothing from the data; return self."""
        return self

    def transform(self, X, y = None):
        """Return the columns of ``X`` selected by ``col_list``."""
        if self.col_list is None:
            # No selection configured: pass all columns through rather than
            # indexing with `.loc[:, None]`.
            return X.copy()
        return X.loc[:, self.col_list]
        

In [None]:
# Branch 1: select columns, then apply StandardScaler to them
std_col_pipe = Pipeline([
    ('std_col_select', ColSelect()),
    ('scale', StandardScaler()),
])
# Branch 2: select columns only, with no scaling step
no_std_col_pipe = Pipeline([
    ('non_std_col_select', ColSelect()),
])

# Combine the outputs of both branches into one feature matrix
input_pipeline = FeatureUnion([
    ('std_col', std_col_pipe),
    ('non_std_col', no_std_col_pipe),
])

# Feed the combined features into the logistic regression
full_pipeline = Pipeline([
    ('input', input_pipeline),
    ('logistic', LogisticRegression(solver='liblinear')),
])

In [None]:
data_columns = list(train_values_subset.columns)

std_list_list = []       # subsets of columns that were standardized
best_mean_cv_list = []   # mean five-fold log loss of the tuned model for each subset

# Try every non-empty subset of the columns as the set to standardize
for n_std in range(1, len(data_columns) + 1):
    for std_list in itertools.combinations(data_columns, n_std):
        # Columns outside the chosen subset are passed through unscaled
        non_std_list = [col for col in data_columns if col not in std_list]
        param_grid =  {'input__std_col__std_col_select__col_list': [std_list],
                        'input__non_std_col__non_std_col_select__col_list': [non_std_list],
                        'logistic__C': np.linspace(0.0001, 10, num = 100),
                        'logistic__penalty': ['l1', 'l2']}
        gs_cv = GridSearchCV(estimator = full_pipeline,
                            param_grid = param_grid,
                            cv = 3)
        # scikit-learn estimators with random_state=None draw from NumPy's
        # global RNG, so that is the generator to seed for reproducibility;
        # seeding the stdlib `random` module does not affect them.
        np.random.seed(10)
        gs_cv.fit(train_values_subset, train_labels.heart_disease_present)
        # Re-seed so every subset is scored from the same RNG state
        np.random.seed(10)
        # 'neg_log_loss' scores are negated losses; flip the sign to store a log loss
        best_mean_cv = - cross_val_score(gs_cv.best_estimator_, train_values_subset, train_labels.heart_disease_present, cv = 5, scoring = 'neg_log_loss').mean()
        std_list_list.append(std_list)
        best_mean_cv_list.append(best_mean_cv)
        
        



In [None]:
# One row per standardized-column subset together with its mean log loss
log_loss_table = pd.DataFrame({
    'standardized_col': std_list_list,
    'mean_log_loss': best_mean_cv_list,
})
log_loss_table

## Adding a noisy column and standardizing it

In [None]:
# Work on a copy: a plain assignment would only alias train_values_subset, and
# adding the noise column would then silently change the frame used by the
# earlier cells.
train_values_with_noise = train_values_subset.copy()

# Seed NumPy's global RNG so the noise draw is reproducible. Seeding the stdlib
# `random` module would not affect np.random.normal.
np.random.seed(10)
train_values_with_noise['noise'] = np.random.normal(0, 0.0001, train_values_with_noise.shape[0])

data_columns = list(train_values_with_noise.columns)

# with noise standardized: only the noise column goes through the scaler
std_list = ['noise']
non_std_list = [col for col in data_columns if col not in std_list]
param_grid =  {'input__std_col__std_col_select__col_list': [std_list],
                'input__non_std_col__non_std_col_select__col_list': [non_std_list],
                'logistic__C': np.linspace(0.0001, 10, num = 100),
                'logistic__penalty': ['l1', 'l2']}
gs_cv = GridSearchCV(estimator = full_pipeline,
                    param_grid = param_grid,
                    cv = 3)
gs_cv.fit(train_values_with_noise, train_labels.heart_disease_present)
# scikit-learn estimators with random_state=None use NumPy's global RNG, so
# re-seed it before scoring to start both variants from the same RNG state.
np.random.seed(10)
best_mean_cv_noise_std = - cross_val_score(gs_cv.best_estimator_, train_values_with_noise, train_labels.heart_disease_present, cv = 5, scoring = 'neg_log_loss').mean()

# without noise standardized: no column is scaled at all

param_grid = {'logistic__C': np.linspace(0.0001, 10, num = 100),
              'logistic__penalty': ['l1', 'l2']}

gs_wo_noise_std = GridSearchCV(estimator = pipe_wo_standard,
                                param_grid = param_grid,
                                cv = 3)

gs_wo_noise_std.fit(train_values_with_noise, train_labels.heart_disease_present)

np.random.seed(10)
best_mean_cv_noise_non_std = - cross_val_score(gs_wo_noise_std.best_estimator_, train_values_with_noise, train_labels.heart_disease_present, cv = 5, scoring = 'neg_log_loss').mean()




In [None]:
# Both values were already sign-flipped into log losses when they were computed
print(
    'Log loss with noise standardized:',
    best_mean_cv_noise_std,
    ', \n Log loss without noise standardized:',
    best_mean_cv_noise_non_std,
)


In [None]:
# Compare the unstandardized model on the original features with the
# unstandardized model fitted on the features plus the noise column.
# The names previously used here (cv_score_wo_noise, cv_score_with_noise) are
# never assigned in this notebook, so the cell failed on a fresh kernel; the
# corresponding quantities are cv_score_wo_standard (negated losses, hence the
# sign flip) and best_mean_cv_noise_non_std (already a log loss).
# NOTE(review): confirm this is the comparison that was intended.
print('Log loss without noise:', -cv_score_wo_standard.mean(),
      ', \n Log loss with noise:', best_mean_cv_noise_non_std)