https://www.kaggle.com/raviprakash438/wrapper-method-feature-selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the training table from the relative input directory and show its shape.
TRAIN_CSV = 'input/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.shape

(1460, 81)

In [3]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Keep only the int64/float64 columns; object-typed columns are left out.
colType = ['int64', 'float64']
numCols = df.select_dtypes(include=colType).columns.tolist()

# NOTE(review): `Id` is int64, so it stays in `data` as a candidate feature;
# confirm whether a row identifier should take part in feature selection.
data = df.loc[:, numCols]
data.shape

(1460, 38)

In [5]:
# Separate the target from the predictors, then hold out 20% of the rows.
# The fixed random_state keeps the split identical across runs.
predictors = data.drop(columns='SalePrice')
target = data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=2020
)

X_train.shape, X_test.shape

((1168, 37), (292, 37))

In [6]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is reported when the absolute correlation
    between it and at least one column that precedes it (in the matrix's
    column order) is strictly greater than ``threshold``. The first column of
    each correlated pair is therefore never reported on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Column names exceeding the threshold against an earlier column.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Only the strict lower triangle is scanned, so each pair is checked once.
    return {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, earlier]) > threshold
               for earlier in range(row))
    }

# Collect training-set features whose absolute correlation with an earlier
# feature exceeds 0.8. Only X_train is inspected.
col = correlation(dataset=X_train, threshold=0.8)
print('Correlated columns: ', col)

Correlated columns:  {'TotRmsAbvGrd', 'GarageArea', '1stFlrSF', 'GarageYrBlt'}


In [7]:
# Remove the columns listed in `col` from both splits.
# - The frames are rebound instead of mutated with inplace=True, so the
#   objects returned by train_test_split are never modified behind the scenes.
# - `columns=` already names the axis, so the redundant `axis=1` is dropped.
# - errors='ignore' keeps the cell re-runnable: on a second execution the
#   columns are already gone and the drop becomes a no-op instead of a KeyError.
drop_cols = sorted(col)
X_train = X_train.drop(columns=drop_cols, errors='ignore')
X_test = X_test.drop(columns=drop_cols, errors='ignore')

X_train.shape, X_test.shape

((1168, 33), (292, 33))

In [8]:
# Replace missing values with 0 in BOTH splits so the held-out set gets the
# same preprocessing as the training set (otherwise X_test keeps its NaNs).
# Rebinding instead of inplace=True avoids mutating the split outputs.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [9]:
# Largest per-column null count in X_train; 0 means no missing values remain.
X_train.isnull().sum().max()

0

## Forward Feature Selection

In [10]:
# Forward sequential selection: start from an empty set and add one feature
# per round, scored by 5-fold cross-validated R^2 of a random forest.
# k_features must not exceed the number of columns in X_train (33 after the
# correlated columns are dropped); 10 is the target subset size.
# random_state pins the forest so the selected subset is reproducible.
model = sfs(RandomForestRegressor(random_state=2020),
            k_features=10,
            forward=True,
            verbose=2,
            cv=5,
            n_jobs=-1,
            scoring='r2')
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  22 out of  33 | elapsed:    4.8s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  26 out of  33 | elapsed:    5.6s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  30 out of  33 | elapsed:    5.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    6.1s finished

[2020-04-29 20:57:44] Features: 1/10 -- score: 0.64120180718231[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  21 out of  32 | elapsed:    3.4s remaining:    1.7s
[Parallel(n_jobs=

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          max_samples=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
   

In [11]:
# Positional indices (into X_train's columns) of the features kept by forward selection.
model.k_feature_idx_

(2, 4, 7, 11, 12, 13, 15, 16, 20, 23)

In [12]:
# Column names of the features kept by forward selection.
model.k_feature_names_

('LotFrontage',
 'OverallQual',
 'YearRemodAdd',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'BedroomAbvGr',
 'GarageCars')

## Backward Feature Selection

In [13]:
# Backward sequential selection: start from all columns of X_train and remove
# one feature per round until 10 remain, scored by 5-fold cross-validated R^2.
# random_state pins the forest so the selected subset is reproducible.
backwardModel = sfs(RandomForestRegressor(random_state=2020),
                    k_features=10,
                    forward=False,
                    verbose=2,
                    cv=5,
                    n_jobs=-1,
                    scoring='r2')
backwardModel.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:   31.6s finished

[2020-04-29 21:00:16] Features: 32/10 -- score: 0.8414230280759964[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:   30.8s finished

[2020-04-29 21:00:47] Features: 31/10 -- score: 0.8430996754848181[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:   29.5s finished

[2020-04-29 21:01:17] Features: 30/10 -- score: 0.8448351341604479[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   28.0s finished

[2020-04-29 21:01:45] Features: 29/10 -- score: 0.8438761007388507[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:   26.5s remain

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          max_samples=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
   

In [None]:
# Positional indices (into X_train's columns) of the features kept by backward selection.
backwardModel.k_feature_idx_

In [None]:
# Map the selected positions back to X_train's column labels.
X_train.columns[list(backwardModel.k_feature_idx_)]

## Exhaustive Feature Selection

In [None]:
# Exhaustive search is combinatorial, so it runs only on the columns kept by
# backward selection. With 10 candidate columns and subset sizes 1..5 that is
# 10 + 45 + 120 + 210 + 252 = 637 subsets to score.
miniData = X_train[X_train.columns[list(backwardModel.k_feature_idx_)]]

# random_state pins the forest so the best subset is reproducible across runs.
emodel = efs(RandomForestRegressor(random_state=2020),
             min_features=1,
             max_features=5,
             scoring='r2',
             n_jobs=-1)

# The selector is fitted on a plain array, so results are positional indices
# into miniData's columns.
emodel.fit(np.array(miniData), y_train)

In [None]:
# Positional indices (into miniData's columns) of the best-scoring subset.
emodel.best_idx_

In [None]:
# Translate the best subset's positions back to column labels.
miniData.columns[list(emodel.best_idx_)]