# Feature selection

### Why do we need to select variables?

1. For production: Fewer variables mean smaller client input requirements (e.g. customers filling out a form on a website or mobile app), and hence less code for error handling. This reduces the chances of bugs.
2. For model performance: Fewer variables mean simpler, more interpretable models that are less prone to over-fitting.


**We will select variables using Lasso regression: Lasso has the property of setting the coefficients of non-informative variables to zero. This way we can identify those variables and remove them from our final models.**

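We pass a fixed `random_state` to the Lasso below so that the selection can be reproduced from run to run. This is perhaps one of the most important lessons that you need to take away from this course: **Always set the seeds**.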

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# Call the public `pd.set_option` directly: `pd.pandas` is only an incidental
# self-reference on the package, not a documented entry point.
pd.set_option('display.max_columns', None)

In [6]:
# Load the train and test splits from disk, in that order.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/xtrain.csv', './data/xtest.csv')
)

# Preview the first rows of the training split.
X_train.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,931,12.21106,0.0,0.75,0.461171,0.377048,1.0,1.0,0.333333,1.0,1.0,0.0,0.0,0.863636,0.4,1.0,0.75,0.6,0.777778,0.5,0.014706,0.04918,0.0,0.0,1.0,1.0,0.25,0.0,0.666667,1.0,1.0,0.75,0.666667,0.75,1.0,0.002835,0.8,0.0,0.673479,0.239935,1.0,1.0,1.0,1.0,0.55976,0.0,0.0,0.52325,0.0,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.0,0.2,0.75,0.018692,1.0,0.75,0.430183,1.0,1.0,1.0,0.116686,0.032907,0.0,0.0,0.0,0.0,0.0,0.666667,1.0,0.0,0.545455,0.75,0.666667,0.75,0.0,0.0,0.0
1,657,11.887931,0.0,0.75,0.456066,0.399443,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,0.444444,0.75,0.360294,0.04918,0.0,0.0,0.6,0.6,0.5,0.03375,0.666667,1.0,0.5,0.5,0.666667,0.25,0.666667,0.142807,0.8,0.0,0.114724,0.17234,1.0,1.0,1.0,1.0,0.434539,0.0,0.0,0.406196,0.333333,0.0,0.333333,0.5,0.375,0.333333,0.666667,0.25,1.0,0.0,0.2,0.75,0.457944,0.666667,0.25,0.220028,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,1.0,0.0,0.636364,0.5,0.666667,0.75,0.0,0.0,0.0
2,46,12.675764,0.588235,0.75,0.394699,0.347082,1.0,1.0,0.0,0.333333,1.0,0.0,0.0,0.954545,0.4,1.0,1.0,0.6,0.888889,0.5,0.036765,0.098361,1.0,0.0,0.3,0.2,0.5,0.2575,1.0,1.0,1.0,1.0,0.666667,0.25,1.0,0.080794,0.8,0.0,0.601951,0.286743,1.0,1.0,1.0,1.0,0.627205,0.0,0.0,0.586296,0.333333,0.0,0.666667,0.0,0.25,0.333333,1.0,0.333333,1.0,0.333333,0.8,0.75,0.046729,0.666667,0.5,0.406206,1.0,1.0,1.0,0.228705,0.149909,0.0,0.0,0.0,0.0,0.0,0.666667,1.0,0.0,0.090909,1.0,0.666667,0.75,0.0,0.0,0.0
3,1349,12.278393,0.0,0.75,0.388581,0.493677,1.0,1.0,0.666667,0.666667,1.0,0.0,0.0,0.454545,0.4,1.0,0.75,0.6,0.666667,0.5,0.066176,0.163934,0.0,0.0,1.0,1.0,0.25,0.0,0.666667,1.0,1.0,0.75,0.666667,1.0,1.0,0.25567,0.8,0.0,0.018114,0.242553,1.0,1.0,1.0,1.0,0.56692,0.0,0.0,0.529943,0.333333,0.0,0.666667,0.0,0.375,0.333333,0.666667,0.25,1.0,0.333333,0.4,0.75,0.084112,0.666667,0.5,0.362482,1.0,1.0,1.0,0.469078,0.045704,0.0,0.0,0.0,0.0,0.0,0.666667,1.0,0.0,0.636364,0.25,0.666667,0.75,1.0,0.0,0.0
4,56,12.103486,0.0,0.75,0.577658,0.402702,1.0,1.0,0.333333,0.333333,1.0,0.0,0.0,0.363636,0.4,1.0,0.75,0.6,0.555556,0.5,0.323529,0.737705,0.0,0.0,0.6,0.7,0.5,0.17,0.333333,1.0,0.5,0.5,0.666667,0.25,0.333333,0.086818,0.8,0.0,0.434278,0.233224,1.0,0.75,1.0,1.0,0.549026,0.0,0.0,0.513216,0.0,0.0,0.666667,0.0,0.375,0.333333,0.333333,0.416667,1.0,0.333333,0.8,0.75,0.411215,0.666667,0.5,0.406206,1.0,1.0,1.0,0.0,0.0,0.0,0.801181,0.0,0.0,0.0,0.666667,1.0,0.0,0.545455,0.5,0.666667,0.75,0.0,0.0,0.0


In [7]:
# Pull the target out of each split before removing it from the predictors.
y_train = X_train['SalePrice']
y_test = X_test['SalePrice']

# Remove the target and the 'Id' column (presumably a row identifier - confirm)
# so that neither is offered to the model as a feature.
# Reassign instead of using `inplace=True`: the frames loaded from disk are not
# mutated behind the reader's back, and `columns=` states the axis explicitly.
# NOTE: re-running this cell still needs the loading cell to be re-run first,
# because the columns are gone from X_train / X_test afterwards.
X_train = X_train.drop(columns=['Id', 'SalePrice'])
X_test = X_test.drop(columns=['Id', 'SalePrice'])

In [9]:
# Sanity check: look at the first five target values.
y_train.head(n=5)

0    12.211060
1    11.887931
2    12.675764
3    12.278393
4    12.103486
Name: SalePrice, dtype: float64

## Feature selection

In [10]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [16]:
# Wrap a Lasso in SelectFromModel so that features whose coefficients are
# shrunk to (near) zero can be identified and dropped.
# `random_state` is pinned so the fit can be reproduced.
selector = SelectFromModel(
    estimator=Lasso(alpha=0.005, random_state=0),
)

# Fit on the training split only; the fitted selector is the cell's output.
selector.fit(X=X_train, y=y_train)

In [20]:
# Coefficients of the fitted Lasso, one per column of X_train, in column order.
[coef for coef in selector.estimator_.coef_]

[-0.017698768083641517,
 0.04393320058028792,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3263118047012549,
 0.0,
 0.0,
 -0.0,
 0.0,
 0.43593388123540194,
 0.004903110889512029,
 -0.0,
 -0.10211318977986941,
 0.020621825516689004,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.049905187972440354,
 0.0,
 0.07450757785010376,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.0,
 0.0,
 0.0,
 0.004993749816432765,
 0.06640260833106877,
 0.0,
 0.11354815514881689,
 0.0,
 0.0,
 0.7269405624284083,
 0.06334553047828123,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.0,
 0.11612131666534585,
 0.0,
 0.0,
 0.05225707353265747,
 0.09143145576070821,
 0.0496017539562728,
 -0.0,
 0.040609401244999704,
 0.17182331727745687,
 0.0,
 0.0,
 0.0,
 0.006646101818277878,
 0.0,
 0.0,
 -0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.0,
 0.0,
 -0.0,
 -0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.0]

In [24]:
# Importance cut-off the fitted selector applies to the coefficients.
# No `threshold` was passed to SelectFromModel above, so this is the value the
# selector chose on its own (1e-05 in the recorded output).
selector.threshold_

1e-05

In [25]:
# Boolean mask over the columns of X_train: True marks a feature the selector keeps.
selector.get_support(indices=False)

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [27]:
# Translate the selector's decision into column names by indexing the columns
# with the integer positions of the kept features.
selected_features = X_train.columns[selector.get_support(indices=True)]

# Display the retained feature names.
selected_features

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'BsmtQual', 'BsmtExposure', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars',
       'PavedDrive'],
      dtype='object')

In [29]:
# Summarise the outcome of the selection.
# The discarded count comes from the selector's own support mask rather than
# from `coef_ == 0`: the selector decides with a threshold on the coefficients'
# magnitude, so a tiny non-zero coefficient would otherwise be counted as
# neither selected nor discarded and the numbers would not add up to the total.
print(f"""
Total features: {len(X_train.columns)}
Selected features: {len(selected_features)}
Discarded cols: {np.sum(~selector.get_support())} 
""")


Total features: 82
Selected features: 21
Discarded cols: 61 



In [30]:
# Persist the selected feature names, one per line, for later steps.
# `header` is set explicitly because the default of Series.to_csv has differed
# between pandas releases, so the file layout would otherwise depend on the
# installed version. header=False gives a plain list of names with no header row.
# NOTE(review): confirm that whatever loads this file reads it without a
# header row (e.g. `header=None`).
pd.Series(selected_features).to_csv(
    './data/selected_features.csv',
    index=False,
    header=False,
)

  """Entry point for launching an IPython kernel.
