## Table of Contents

1. [Univariate Feature Selection](#univariate)
2. [Feature Selection from Model](#feature_from_model)

In [5]:
# Read data
import pandas as pd
%matplotlib inline
train_final = pd.read_csv("housing_df_mod.csv")


In [6]:
# Separate the target (SalePrice) from the predictors; Id is dropped as well,
# so neither column ends up in the feature matrix.
y = train_final["SalePrice"]
X = train_final.drop(columns=["Id", "SalePrice"])

## Univariate Feature Selection <a name="univariate"></a>

In [7]:
# Univariate selection scores each feature against the target on its own.
# For a regression target the score function can be f_regression or
# mutual_info_regression; SelectKBest then keeps the k highest-scoring features.
# CAUTION: discarding features indiscriminately can hurt model performance.
# A univariate score also ignores interactions between features, so it may
# not reflect how well a feature performs inside a model.

from sklearn.feature_selection import SelectKBest, f_regression

sel = SelectKBest(score_func=f_regression, k=15)
sel.fit(X, y)
kbest = sel  # fit() returns the fitted selector itself


In [8]:
# Shape check: rows are unchanged, only the selected columns remain
X_kbest = kbest.transform(X)
X_kbest.shape

(1460, 15)

In [9]:
# get_support() returns a boolean mask over X's columns: True where the
# feature is part of the k best. Indexing the column Index with that mask
# yields the selected names directly, with no element-wise `== True` test.
idx = kbest.get_support()
X.columns[idx].tolist()

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'ExterQual',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'KitchenQual',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea',
 'property_age',
 'Foundation_PConc',
 'BsmtQual_5']

## Feature Selection from Model <a name="feature_from_model"></a>

### Using Lasso

In [11]:
# Regularized (L1) regression via Lasso drives the feature selection below.
# LassoCV chooses the regularization strength by cross-validation, trying
# 100 candidate alphas along the regularization path.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

lasso_reg = LassoCV(n_alphas=100)

In [12]:
# Wrap the Lasso in SelectFromModel, fit it, and keep only the columns the
# selector retains.
# NOTE(review): X does not appear to be standardized in this notebook, and the
# L1 penalty is scale-sensitive -- consider scaling before trusting this subset.
sel_model = SelectFromModel(lasso_reg).fit(X, y)  # fit() returns the selector
train_reduced = sel_model.transform(X)
train_reduced.shape





(1460, 12)

#### Question

<b> Which variables have been picked up by the Lasso model? </b>

In [9]:
# Map the Lasso selector's boolean support mask back to column names.
# final_cols is derived here from sel_model instead of being taken from kernel
# state: on a fresh top-to-bottom run it would otherwise be undefined, and
# after the random-forest cells it would hold that selector's columns instead.
# The array is re-derived from sel_model so the cell can be re-run safely.
final_cols = X.columns[sel_model.get_support()].tolist()
train_reduced = pd.DataFrame(sel_model.transform(X), columns=final_cols)

In [10]:
# Preview the first five rows of the reduced feature set
train_reduced.head(n=5)

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,2ndFlrSF,GrLivArea,GarageArea,WoodDeckSF,MiscVal,property_age
0,8450.0,2003.0,2003.0,196.0,706.0,856.0,854.0,1710.0,548.0,0.0,0.0,5.0
1,9600.0,1976.0,1976.0,0.0,978.0,1262.0,0.0,1262.0,460.0,298.0,0.0,31.0
2,11250.0,2001.0,2002.0,162.0,486.0,920.0,866.0,1786.0,608.0,0.0,0.0,7.0
3,9550.0,1915.0,1970.0,0.0,216.0,756.0,756.0,1717.0,642.0,0.0,0.0,91.0
4,14260.0,2000.0,2000.0,350.0,655.0,1145.0,1053.0,2198.0,836.0,192.0,0.0,8.0


### Using Random Forest

In [11]:
# Build a random forest regressor (fixed seed, all available cores)
from sklearn.ensemble import RandomForestRegressor

rf_mod = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)

# Fit the regressor on the full feature matrix
rf_mod.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
# Pair every column name with its importance from the fitted forest
feature_importance = pd.DataFrame(
    dict(feature=X.columns, imp=rf_mod.feature_importances_)
)


#### Question

<b> Find the top 10 features based on random forest importance (impurity-based importance, also known as Gini importance) </b>

In [17]:
# Use SelectFromModel with the random forest to keep the 10 most important
# features. threshold=-np.inf switches off the default importance cut-off
# (the mean importance for a non-L1 estimator), so the selection is driven
# by max_features alone; with the default, fewer than 10 features come back
# whenever fewer than 10 reach the mean importance.
import numpy as np

sfm = SelectFromModel(rf_mod, threshold=-np.inf, max_features=10)

# Train the selector
sfm.fit(X, y)

SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
        max_features=10, norm_order=1, prefit=False, threshold=None)

In [20]:
# Reduce X to the columns retained by the random-forest selector.
# Note: this rebinds train_reduced, replacing the Lasso-based result built earlier.
train_reduced = sfm.transform(X)
train_reduced.shape

(1460, 10)

In [21]:
# Translate the selector's boolean support mask into column names by
# indexing X's column Index with it (no element-wise `== True` test needed).
idx = sfm.get_support()
final_cols = X.columns[idx].tolist()
final_cols

['LotArea',
 'OverallQual',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'GarageCars',
 'GarageArea']