* Regularization 

  * Adds a penalty on the parameters of the model to reduce the freedom of the model. 
  * Less overfit, better generalization. 
  * For linear models, there are three types of regularization: 
    * L1 (Lasso)
    * L2 (Ridge)
    * L1/L2 (Elastic net)

* L1 (Lasso)

  * $$
    \frac{1}{2m}\sum(y-\hat{y})^2 + \lambda \sum |\theta|
    $$

  * $\hat{y}=\theta_1 X_1 + \theta_2 X_2 + ... + \theta_n X_n$

  * $\lambda$ is the regularization parameter, i.e. the strength of the penalty. The higher the penalty, the more the coefficients are shrunk and the simpler (more general) the model becomes. If the penalty is too high, the model underfits and loses predictive power. 

  * L1 will shrink some parameters to zero, allowing for feature elimination 

* L2 (Ridge)

  * $$
    \frac{1}{2m}\sum(y-\hat{y})^2 + \lambda \sum \theta^2
    $$

  * $\hat{y}=\theta_1 X_1 + \theta_2 X_2 + ... + \theta_n X_n$

  * $\lambda$ is the regularization parameter, i.e. the strength of the penalty. The higher the penalty, the more the coefficients are shrunk and the simpler (more general) the model becomes. If the penalty is too high, the model underfits and loses predictive power. 

  * L2 will shrink coefficients toward zero, but never exactly to zero. No variable is ever excluded.  

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Classification

In [2]:
# load the classification dataset from the relative datasets folder
# and display its (rows, columns) shape
data = pd.read_csv('../datasets/dataset_2.csv')
data.shape

(50000, 109)

In [3]:
# preview the first rows of the loaded dataset
data.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [4]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='target'),
    data['target'],
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((35000, 108), (15000, 108))

In [5]:
# learn the per-feature mean and standard deviation from the training set only
scaler = StandardScaler().fit(X_train)
scaler

StandardScaler()

### Using L1-regularized logistic regression

In [6]:
# L1-penalised logistic regression (liblinear supports the l1 penalty),
# trained on the standardised training features
model = LogisticRegression(
    penalty='l1',
    C=0.5,
    solver='liblinear',
    random_state=0,
)
model.fit(X=scaler.transform(X_train), y=y_train)

LogisticRegression(C=0.5, penalty='l1', random_state=0, solver='liblinear')

In [7]:
# inspect the fitted coefficients of the L1-penalised model;
# the next cell treats entries equal to zero as excluded features
model.coef_

array([[ 0.00614903, -0.01263699,  0.03864236,  0.01362317,  0.03910865,
        -0.02312949, -0.02346654,  0.        ,  0.02198417, -0.00886113,
         0.05656938,  0.03480483, -0.0348094 ,  0.0186791 , -0.03739686,
         0.11022365,  0.01064827,  0.0368803 ,  0.        ,  0.01533676,
         0.12677206,  0.00325506, -0.11345834,  0.08742256,  0.01239674,
         0.06037315,  0.01660917,  0.00297285, -0.00293245, -0.01876521,
         0.00663995, -0.12553687, -0.07940795,  0.03136287,  0.00748913,
         0.02530746,  0.01787214,  0.04440867,  0.06288793,  0.06703192,
        -0.0036661 ,  0.        , -0.0348187 ,  0.04153097,  0.0029443 ,
         0.02186785,  0.        ,  0.30941427,  0.01970609,  0.04219899,
         0.01308563,  0.05011866,  0.        ,  0.01244806,  0.82078495,
        -0.08294842, -0.00187706,  0.02769523,  0.        , -0.0645345 ,
         0.        , -0.09353409,  0.        , -0.00326007,  0.0168383 ,
         0.01279489,  0.00199789, -0.22568649, -0.1

In [8]:
# features whose coefficient was shrunk to exactly zero by the L1 penalty
X_train.columns[model.coef_[0, :] == 0]

Index(['var_8', 'var_19', 'var_42', 'var_47', 'var_53', 'var_59', 'var_62',
       'var_64', 'var_73', 'var_75', 'var_85', 'var_87', 'var_91', 'var_105',
       'var_109'],
      dtype='object')

### Using SelectFromModel

In [9]:
# wrap the same L1-penalised logistic regression in a selector, then fit it
# on the standardised training features
sel_ = SelectFromModel(
    estimator=LogisticRegression(
        penalty='l1',
        C=0.5,
        solver='liblinear',
        random_state=10,
    )
)

sel_.fit(X=scaler.transform(X_train), y=y_train)

SelectFromModel(estimator=LogisticRegression(C=0.5, penalty='l1',
                                             random_state=10,
                                             solver='liblinear'))

In [10]:
# features rejected by the selector (inverse of the support mask);
# the same columns as the zero-coefficient features above
X_train.columns[~sel_.get_support()]

Index(['var_8', 'var_19', 'var_42', 'var_47', 'var_53', 'var_59', 'var_62',
       'var_64', 'var_73', 'var_75', 'var_85', 'var_87', 'var_91', 'var_105',
       'var_109'],
      dtype='object')

In [11]:
# Benefit of SelectFromModel: transform() keeps only the selected columns.
# The selector was fitted on the scaled array, so it is given the same scaled
# representation here. Passing the raw DataFrame would mismatch the fit input
# (scikit-learn warns about unexpected feature names) and would return
# unscaled values that are inconsistent with what the estimator was trained on.
X_train_selected = sel_.transform(scaler.transform(X_train))
X_test_selected = sel_.transform(scaler.transform(X_test))

X_train_selected.shape, X_test_selected.shape


((35000, 93), (15000, 93))

However, the output of `SelectFromModel.transform` is a NumPy array, not a pandas DataFrame.

### (Comparison) Ridge Regression does not shrink coefficients to zero 

In [12]:
# Comparison: the same logistic regression, but with a Ridge (L2) penalty,
# fitted on the standardised training features
l2_logit = LogisticRegression(
    penalty='l2',
    C=0.5,
    max_iter=300,
    random_state=10,
).fit(scaler.transform(X_train), y_train)

# number of coefficients that are exactly zero;
# with an L2 penalty this is expected to be 0
(l2_logit.coef_ == 0).sum()

0

# Regression

In [13]:
# load the house-price dataset for the regression example
# and display its (rows, columns) shape
data = pd.read_csv('../datasets/houseprice.csv')
data.shape

(1460, 81)

In [14]:
# Feature selection normally comes after pre-processing: categorical
# variables would first be encoded as numbers so that their relationship
# with the target can be assessed as well.
#
# To keep this example simple, only the numerical columns are kept.

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = data.select_dtypes(include=numerics).columns.tolist()
data = data.loc[:, numerical_vars]
data.shape

(1460, 38)

In [15]:
# hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='SalePrice'),
    data['SalePrice'],
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 37), (438, 37))

In [16]:
# Impute missing values with 0. The names are rebound instead of using
# inplace=True: the frames come out of train_test_split as slices of `data`,
# and mutating such slices in place triggers pandas' SettingWithCopyWarning.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [17]:
# learn the per-feature mean and standard deviation from the training set only
scaler = StandardScaler().fit(X_train)
scaler

StandardScaler()

### Default Lasso

In [18]:
# Lasso (L1-penalised linear regression) with penalty strength alpha=100,
# trained on the standardised training features
model = Lasso(
    alpha=100,
    random_state=10,
)
model.fit(X=scaler.transform(X_train), y=y_train)

Lasso(alpha=100, random_state=10)

In [19]:
# inspect the fitted Lasso coefficients;
# the next cell treats entries equal to zero as excluded features
model.coef_

array([   -0.        , -5779.76732378,  2290.34505559,  4607.37122564,
       21269.51543947,  5577.91163722,  9423.03808179,  2956.43804576,
        5778.94116166,  6863.37372868,   353.2013171 ,    -0.        ,
        7466.00135269,     0.        ,  1201.11596689,  -809.019151  ,
       25174.1846504 ,  1555.24630906,   200.40775198,  1754.58826035,
        -226.59179722, -7994.10139938, -3545.95816149,  7261.09279627,
        3043.8007292 , -5619.28076081,  4394.6987103 ,  6281.15069601,
        2780.8447284 ,   -35.28691684,   104.04769839,   502.8896326 ,
        1396.47063167,   863.85518229,    -0.        ,  -811.25380751,
        -424.20697403])

In [20]:
# features whose Lasso coefficient is exactly zero, i.e. dropped by the model
X_train.columns[(model.coef_ == 0)]

Index(['Id', 'BsmtUnfSF', '1stFlrSF', 'MiscVal'], dtype='object')

### Using SelectFromModel

In [21]:
# wrap the same Lasso configuration in a selector, then fit it on the
# standardised training features
sel_ = SelectFromModel(
    estimator=Lasso(alpha=100, random_state=10)
)
sel_.fit(X=scaler.transform(X_train), y=y_train)

SelectFromModel(estimator=Lasso(alpha=100, random_state=10))

In [22]:
# features rejected by the selector (inverse of the support mask);
# the same columns as the zero-coefficient features above
X_train.columns[~sel_.get_support()]

Index(['Id', 'BsmtUnfSF', '1stFlrSF', 'MiscVal'], dtype='object')