# Regularization


- This tutorial illustrates how to perform regularization in a model. 
- Example code to perform statistical analysis to select features before the model training is also shown.

In [64]:
import pandas as pd
import numpy as np

# Read data

Wine data: original source is https://archive.ics.uci.edu/ml/machine-learning-databases/wine/

Class label is the first column. Three classes : `1, 2, 3`

In [65]:
# Column names for the UCI wine file, which ships without a header row.
# The first column is the class label; the remaining 13 are the features.
wine_column_names = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Read the raw file and attach the column names in a single step.
df_wine = pd.read_csv('wine.data', header=None, names=wine_column_names)

# Show which class labels are present, then preview the frame.
print('Class labels', np.unique(df_wine['Class label']))
df_wine

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


# Train test split

In [68]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column from 'Alcohol' through 'Proline'
# (label-based slices with .loc include both end points).
df_wine_features = df_wine.loc[:, 'Alcohol':'Proline']
# Target vector: the wine class label.
df_wine_class = df_wine.loc[:, 'Class label']

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so that a fresh "Restart & Run All" reproduces
#   the same split (and therefore the same downstream results).
# - stratify keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_wine_features,
    df_wine_class,
    test_size=0.2,
    random_state=42,
    stratify=df_wine_class,
)

display(X_train, X_test)

print("Shapes of training and testing data")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
42,13.88,1.89,2.59,15.0,101,3.25,3.56,0.17,1.70,5.43,0.88,3.56,1095
148,13.32,3.24,2.38,21.5,92,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650
81,12.72,1.81,2.20,18.8,86,2.20,2.53,0.26,1.77,3.90,1.16,3.14,714
11,14.12,1.48,2.32,16.8,95,2.20,2.43,0.26,1.57,5.00,1.17,2.82,1280
137,12.53,5.51,2.64,25.0,96,1.79,0.60,0.63,1.10,5.00,0.82,1.69,515
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,12.08,1.83,2.32,18.5,81,1.60,1.50,0.52,1.64,2.40,1.08,2.27,480
100,12.08,2.08,1.70,17.5,97,2.23,2.17,0.26,1.40,3.30,1.27,2.96,710
117,12.42,1.61,2.19,22.5,108,2.00,2.09,0.34,1.61,2.06,1.06,2.96,345
124,11.87,4.31,2.39,21.0,82,2.86,3.03,0.21,2.91,2.80,0.75,3.64,380


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
73,12.99,1.67,2.6,30.0,139,3.3,2.89,0.21,1.96,3.35,1.31,3.5,985
70,12.29,1.61,2.21,20.4,103,1.1,1.02,0.37,1.46,3.05,0.906,1.82,870
97,12.29,1.41,1.98,16.0,85,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428
135,12.6,2.46,2.2,18.5,94,1.62,0.66,0.63,0.94,7.1,0.73,1.58,695
166,13.45,3.7,2.6,23.0,111,1.7,0.92,0.43,1.46,10.68,0.85,1.56,695
55,13.56,1.73,2.46,20.5,116,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120
172,14.16,2.51,2.48,20.0,91,1.68,0.7,0.44,1.24,9.7,0.62,1.71,660
58,13.72,1.43,2.5,16.7,108,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285
87,11.65,1.67,2.62,26.0,88,1.92,1.61,0.4,1.34,2.6,1.36,3.21,562
83,13.05,3.86,2.32,22.5,85,1.65,1.59,0.61,1.62,4.8,0.84,2.01,515


Shapes of training and testing data
(142, 13) (36, 13) (142,) (36,)


# Univariate analysis to remove features


There are three class labels. For each feature, perform a one-way ANOVA test to check whether the feature's values differ significantly across the class labels.

In [146]:
import scipy.stats as stats

print("Performing ANOVA analysis\n=====================================\n")

# A feature is kept only when its ANOVA p-value is below this threshold.
P_VALUE_THRESHOLD = 0.001

selected_features = []  # names of the features that pass the test

# Read the class labels from the training targets instead of hard-coding 1, 2, 3,
# so the test works unchanged for any number of classes.
class_labels = np.unique(y_train)

# Run one one-way ANOVA per feature.
for feature in X_train.columns:
    # One sample per class: the values this feature takes for rows of that class.
    samples_by_class = [X_train.loc[y_train == label, feature] for label in class_labels]
    fvalue, pvalue = stats.f_oneway(*samples_by_class)
    print('p-value for feature ', feature , " = ", pvalue)
    if pvalue < P_VALUE_THRESHOLD:  # significantly different across classes
        selected_features.append(feature)  # remember the feature name

print('\n\nselected features are ', selected_features)


Performing ANOVA analysis

p-value for feature  Alcohol  =  1.606674897466022e-30
p-value for feature  Malic acid  =  6.503110923769575e-10
p-value for feature  Ash  =  4.554866513787937e-05
p-value for feature  Alcalinity of ash  =  2.5346464383089883e-14
p-value for feature  Magnesium  =  1.5976118310882153e-08
p-value for feature  Total phenols  =  9.981308645344049e-23
p-value for feature  Flavanoids  =  7.806792905696504e-42
p-value for feature  Nonflavanoid phenols  =  7.517791843495314e-12
p-value for feature  Proanthocyanins  =  3.1715011858050654e-09
p-value for feature  Color intensity  =  4.542497943686269e-24
p-value for feature  Hue  =  1.7753009377149963e-22
p-value for feature  OD280/OD315 of diluted wines  =  2.6586321928271736e-36
p-value for feature  Proline  =  3.4868571801566934e-42


selected features are  ['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', '

It looks like all the features take significantly different values across the class labels (every p-value is below 0.001). Therefore, do not remove any feature.
If, for example, the features `Alcohol` and `Flavanoids` were not significantly different across the class labels (e.g. `p-value > 0.001`),
you could choose to remove them as shown below. In that case, you would use `X_train_new` and `X_test_new` for the subsequent analyses.


`X_train_new = X_train.drop(columns = ['Alcohol','Flavanoids'])`

`X_test_new = X_test.drop(columns = ['Alcohol','Flavanoids'])`


Or you can simply write the following. The code will select only the selected features.

`X_train = X_train.loc[:,selected_features]`

`X_test = X_test.loc[:,selected_features]`



In [148]:
# Restrict both splits to the columns that passed the ANOVA screen.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Preview the first few rows of the reduced training frame.
X_train.head()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
42,13.88,1.89,2.59,15.0,101,3.25,3.56,0.17,1.7,5.43,0.88,3.56,1095
148,13.32,3.24,2.38,21.5,92,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650
81,12.72,1.81,2.2,18.8,86,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714
11,14.12,1.48,2.32,16.8,95,2.2,2.43,0.26,1.57,5.0,1.17,2.82,1280
137,12.53,5.51,2.64,25.0,96,1.79,0.6,0.63,1.1,5.0,0.82,1.69,515


# Scale the features

In [69]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits so no test information leaks into training.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

# Perform Regularization using Logistic Regression

Logistic Regression parameters can be found here

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


### Set up the models for training

In [150]:
from sklearn.linear_model import LogisticRegression

# Settings shared by all three models. The 'saga' solver supports l1, l2 and
# elastic-net penalties; it shuffles the data internally, so random_state is
# fixed to make the fitted coefficients reproducible from run to run.
_saga_settings = dict(solver='saga', max_iter=10000, random_state=42)

# Three logistic models, one per type of regularization: l1, l2 and l1+l2 (elastic net).
lr_l1 = LogisticRegression(penalty='l1', **_saga_settings)
lr_l2 = LogisticRegression(penalty='l2', **_saga_settings)
lr_elastic = LogisticRegression(penalty='elasticnet', **_saga_settings)

# Candidate values of C, shared by every grid below.
# In scikit-learn, C is the inverse of the shrinkage parameter: a smaller C means
# a higher penalty. If C = 0.01, the shrinkage parameter is 1/0.01 = 100.
_C_VALUES = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 1, 5, 10, 20, 30, 40,
             50, 60, 100, 200, 400, 500, 1000]

# Parameter grid for l1 or l2.
param_grid_l1_or_l2 = [{'C': list(_C_VALUES)}]

# Parameter grid for elastic net.
# l1_ratio has to be between 0 and 1. It plays the role of alpha in the lecture
# notes (the mix between the l1 and l2 penalties).
param_grid_elastic = {
    'C': list(_C_VALUES),
    'l1_ratio': [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9],
}



### Train the regularized models 

In [140]:
from sklearn.model_selection import GridSearchCV


def _tune_and_report(tag, estimator, param_grid):
    """Grid-search `estimator` with 5-fold CV on accuracy and print the best parameters.

    The search is fitted on the standardized training data (X_train_std, y_train)
    and the fitted GridSearchCV object is returned.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', verbose=0)
    search.fit(X_train_std, y_train)
    print(tag + ' best param', search.best_params_)
    return search


# One tuned model per penalty type, fitted in the order l1, l2, elastic net.
clf_l1 = _tune_and_report('l1', lr_l1, param_grid_l1_or_l2)
clf_l2 = _tune_and_report('l2', lr_l2, param_grid_l1_or_l2)
clf_elastic = _tune_and_report('elastic net', lr_elastic, param_grid_elastic)


l1 best param {'C': 40}
l2 best param {'C': 0.05}
elastic net best param {'C': 0.05, 'l1_ratio': 0.0001}


In [161]:
def _report_coefficients(search, heading):
    """Print the coefficient-matrix shape and heading, then display and return the table.

    Rows are labelled '1', '2', '3' (one per class) and columns carry the
    feature names of X_train.
    """
    weights = search.best_estimator_.coef_
    print(weights.shape)
    print(heading)
    table = pd.DataFrame(weights, columns=X_train.columns, index=['1', '2', '3'])
    display(table)
    return table


# Coefficients of the best model found for each penalty type.
coef_l1 = _report_coefficients(
    clf_l1, "logistic regresion l1 regularization model parameters")
coef_l2 = _report_coefficients(
    clf_l2, "logistic regresion l2 regularization model parameters")
coef_elastic = _report_coefficients(
    clf_elastic, "logistic regresion elastic-net regularization model parameters")

(3, 13)
logistic regresion l1 regularization model parameters


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,1.9785,0.197272,0.0,-1.913536,0.0,0.0,0.990308,-0.326474,0.0,0.0,0.0,0.788207,3.052882
2,-2.219681,-1.305874,-2.289691,0.0,0.0,0.0,0.0,0.838775,0.0,-3.174802,1.641868,0.0,-4.024307
3,0.0,0.0,0.238624,0.203722,0.0,-0.566354,-2.851821,0.0,0.0,1.832769,-1.574006,-2.126928,0.0


(3, 13)
logistic regresion l2 regularization model parameters


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,0.343011,-0.012703,0.146314,-0.289019,0.10962,0.199876,0.244079,-0.187341,0.056757,0.087557,0.114663,0.260261,0.442775
2,-0.429732,-0.171143,-0.246253,0.137687,-0.15105,0.010583,0.113601,0.06877,0.091178,-0.382442,0.205739,0.108427,-0.412928
3,0.08672,0.183846,0.099939,0.151332,0.04143,-0.210459,-0.35768,0.118571,-0.147935,0.294885,-0.320402,-0.368688,-0.029847


(3, 13)
logistic regresion elastic-net regularization model parameters


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,0.342951,-0.012642,0.146192,-0.289007,0.109556,0.199835,0.244059,-0.187312,0.056735,0.087471,0.114612,0.260247,0.442791
2,-0.429692,-0.171097,-0.246222,0.137585,-0.151044,0.010564,0.113589,0.068643,0.091149,-0.382397,0.205722,0.108437,-0.412872
3,0.086641,0.183839,0.09993,0.151322,0.041387,-0.210499,-0.357748,0.11857,-0.147984,0.294826,-0.320434,-0.368784,-0.029819


Recall from the logistic regression material in data mining:

- Note that there are 13 features.
- If there are only two class labels, the logistic regression model has 14 parameters (13 feature weights + 1 intercept). The model is a hyperplane in 13 dimensions.
- Since we have three class labels, we have a multiclass logistic regression model. For a multiclass problem, the model has
(`13 * num_class_labels` weights + `num_class_labels` intercepts) parameters. Here that is `13 * 3 + 3 = 42`, which matches the `(3, 13)` coefficient matrices shown above plus one intercept per class.

# Accuracy of regularized logistic regression on testing set

In [190]:
# Accuracy of the tuned L1 model on the held-out test split.
print("accuracy score on test set : l1 regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_test, y_pred=clf_l1.predict(X_test_std))

accuracy score on test set : l1 regularization


0.9722222222222222

In [192]:
# Accuracy of the tuned L1 model on the training split (compare with the test score).
print("accuracy score on train set : l1 regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_train, y_pred=clf_l1.predict(X_train_std))

accuracy score on train set : l1 regularization


1.0

In [158]:

# Accuracy of the tuned L2 model on the held-out test split.
print("accuracy score on test set: l2 regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_test, y_pred=clf_l2.predict(X_test_std))


accuracy score on test set: l2 regularization


0.9444444444444444

In [193]:
# Accuracy of the tuned L2 model on the training split (compare with the test score).
print("accuracy score on train set: l2 regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_train, y_pred=clf_l2.predict(X_train_std))

accuracy score on train set: l2 regularization


0.9929577464788732

In [160]:
# Accuracy of the tuned elastic-net model on the held-out test split.
print("accuracy score on test set: elastic-net regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_test, y_pred=clf_elastic.predict(X_test_std))

accuracy score on test set: elastic-net regularization


0.9444444444444444

In [194]:
# Accuracy of the tuned elastic-net model on the training split (compare with the test score).
print("accuracy score on train set: elastic-net regularization")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_train, y_pred=clf_elastic.predict(X_train_std))

accuracy score on train set: elastic-net regularization


0.9929577464788732

The gap between the train and test accuracies is smallest for l1 regularization.

# Use the features selected by L1-regularized logistic regression to train a random forest model

In [165]:
# Features eliminated by L1 regularization: those whose coefficient is exactly
# zero for every class in the tuned L1 model. Deriving the list from the fitted
# model (instead of hard-coding feature names) keeps this cell correct when the
# train/test split or the tuned C changes.
l1_coefficients = clf_l1.best_estimator_.coef_
features_dropped_by_l1 = list(X_train.columns[(l1_coefficients == 0).all(axis=0)])
print('features dropped by l1 regularization', features_dropped_by_l1)

X_train_new = X_train.drop(columns=features_dropped_by_l1)
X_test_new = X_test.drop(columns=features_dropped_by_l1)


from sklearn.preprocessing import StandardScaler
# NOTE: this rebinds `stdsc` to a scaler fitted on the reduced feature set;
# X_train_std / X_test_std computed earlier are not affected.
stdsc = StandardScaler()
X_train_new_std = stdsc.fit_transform(X_train_new)  # statistics learned from training data only
X_test_new_std = stdsc.transform(X_test_new)

In [168]:
# Preview the first five rows of the reduced training frame.
X_train_new.head(5)

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Total phenols,Flavanoids,Nonflavanoid phenols,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
42,13.88,1.89,2.59,15.0,3.25,3.56,0.17,5.43,0.88,3.56,1095
148,13.32,3.24,2.38,21.5,1.93,0.76,0.45,8.42,0.55,1.62,650
81,12.72,1.81,2.2,18.8,2.2,2.53,0.26,3.9,1.16,3.14,714
11,14.12,1.48,2.32,16.8,2.2,2.43,0.26,5.0,1.17,2.82,1280
137,12.53,5.51,2.64,25.0,1.79,0.6,0.63,5.0,0.82,1.69,515


In [166]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters to tune: how many features each split considers and how
# many trees are grown.
param_grid_rf = {
    'max_features': ['log2', 'sqrt'],
    'n_estimators': [100, 200, 300, 500, 800, 1000, 1500, 2000]
}

# Random forests are stochastic (bootstrap samples and random feature subsets);
# fixing random_state makes the selected parameters and scores reproducible.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search scored on accuracy.
rf_search = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5, verbose=1)
rf_search.fit(X_train_new_std, y_train)
print('RF best param',rf_search.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
RF best param {'max_features': 'log2', 'n_estimators': 1500}


In [169]:
# Accuracy of the tuned random forest on the held-out test split.
print("accuracy score on test set: RF ")
from sklearn.metrics import accuracy_score
# GridSearchCV.predict delegates to the refitted best estimator.
accuracy_score(y_true=y_test, y_pred=rf_search.predict(X_test_new_std))

accuracy score on test set: RF 


0.9444444444444444

# Code to illustrate the effect of value of shrinkage parameter on model coefficients.

In [180]:
import matplotlib.pyplot as plt

# Settings shared by the three LASSO (L1) models; only C differs between them.
# The 'saga' solver shuffles the data internally, so random_state is fixed to
# make the displayed coefficients reproducible.
_lasso_settings = dict(penalty='l1', solver='saga', max_iter=10000, random_state=42)

# Set up three LASSO models with different values of C.
lr_small_C = LogisticRegression(C=0.1, **_lasso_settings)
lr_mid_C = LogisticRegression(C=1, **_lasso_settings)
lr_large_C = LogisticRegression(C=10000, **_lasso_settings)

# Train the three LASSO models on the standardized training data.
for _lasso_model in (lr_small_C, lr_mid_C, lr_large_C):
    _lasso_model.fit(X_train_std, y_train)


def _coefficient_table(model):
    """Return the fitted coefficients of `model` as a DataFrame.

    Columns carry the feature names of X_train; rows are labelled with the
    class labels the model was fitted on (as strings).
    """
    return pd.DataFrame(
        model.coef_,
        columns=X_train.columns,
        index=[str(label) for label in model.classes_],
    )


# Display the learned model parameters, from strongest to weakest shrinkage.
print("\n===============================\nlogistic regresion l1 regularization model parameters with small C (large shrinkage parameter)")
coef_lr_small_C = _coefficient_table(lr_small_C)
display(coef_lr_small_C)


print("\n===============================\nlogistic regresion l1 regularization model parameters with mid C (mid shrinkage parameter)")
coef_lr_mid_C = _coefficient_table(lr_mid_C)
display(coef_lr_mid_C)


print("\n===============================\nlogistic regresion l1 regularization model parameters with large C (small shrinkage parameter)")
coef_lr_large_C = _coefficient_table(lr_large_C)
display(coef_lr_large_C)


logistic regresion l1 regularization model parameters with small C (large shrinkage parameter)


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,0.0,0.0,0.0,0.0,0.0,0.0,0.054656,0.0,0.0,0.0,0.0,0.029474,1.122934
2,-0.829377,0.0,-0.116295,0.0,0.0,0.0,0.0,0.0,0.0,-0.452828,0.0,0.0,-0.118933
3,0.0,0.0,0.0,0.0,0.0,0.0,-0.744157,0.0,0.0,0.0,-0.377808,-0.637326,0.0



logistic regresion l1 regularization model parameters with mid C (mid shrinkage parameter)


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,0.077684,0.0,0.0,-0.61097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.528856,1.36408
2,-1.234259,-0.638874,-0.758388,0.0,0.0,0.0,0.0,0.217575,0.0,-1.578316,0.151377,0.0,-1.519194
3,0.0,0.0,0.0,0.0,0.0,0.0,-2.109837,0.0,0.0,0.0,-0.67747,-0.913208,0.0



logistic regresion l1 regularization model parameters with large C (small shrinkage parameter)


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
1,2.544689,1.000444,1.153002,-2.506126,0.173453,1.046524,1.91777,-1.452925,-0.739927,0.6632,0.403984,1.827075,3.522916
2,-2.710206,-1.327131,-2.354267,1.15986,-0.240221,0.159458,0.999513,1.190136,-0.067374,-3.373865,1.77968,0.488003,-4.239559
3,0.157722,0.318892,1.193479,1.33847,0.059214,-1.213777,-2.925075,0.25501,0.813847,2.702869,-2.19146,-2.322873,0.708852


From the above illustration: as the value of C (the inverse of the shrinkage parameter) increases, the shrinkage parameter decreases, and hence more features are retained under L1 (LASSO) regularization.