https://www.kaggle.com/raviprakash438/wrapper-method-feature-selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('input/train.csv')
# Show (rows, columns) as the cell output.
df.shape

(1460, 81)

In [3]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
# Keep only the numeric (int64 / float64) columns.
# NOTE(review): this keeps the 'Id' column as a candidate feature -- it looks
# like a row identifier rather than a predictor; confirm whether it should be dropped.
colType = ['int64', 'float64']
numCols = df.select_dtypes(include=colType).columns.tolist()

data = df[numCols]
data.shape

(1460, 38)

In [5]:
# Hold out 20% of the rows for testing; 'SalePrice' is the target and every
# other numeric column is a candidate feature. The seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='SalePrice'),
    data['SalePrice'],
    test_size=0.2,
    random_state=2020,
)

(X_train.shape, X_test.shape)

((1168, 37), (292, 37))

In [6]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. A column is flagged when the absolute value of its
    correlation with any column that precedes it (in column order) is
    strictly greater than ``threshold``. Only the later column of each
    correlated pair is reported, so dropping the returned columns keeps one
    representative of every correlated group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Names of the flagged columns; empty if no pair exceeds the threshold.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    # Only the strictly lower triangle is inspected: each unordered pair is
    # visited once and the diagonal (self-correlation) is skipped.
    return {
        names[row]
        for row in range(len(names))
        if any(abs(corr_matrix.iloc[row, earlier]) > threshold
               for earlier in range(row))
    }

# Flag training features whose absolute correlation with an earlier column
# exceeds 0.8; computed on the training split only.
col = correlation(dataset=X_train, threshold=0.8)
print('Correlated columns: ', col)

Correlated columns:  {'GarageArea', 'TotRmsAbvGrd', 'GarageYrBlt', '1stFlrSF'}


In [7]:
# Remove the highly correlated columns (held in `col`) from both splits.
# Reassigning instead of using inplace=True avoids mutating a frame that may
# be a view of `data`; errors='ignore' keeps the cell safe to re-run after the
# columns are already gone. `columns=` alone is enough, so axis is not passed.
X_train = X_train.drop(columns=list(col), errors='ignore')
X_test = X_test.drop(columns=list(col), errors='ignore')

X_train.shape, X_test.shape

((1168, 33), (292, 33))

In [8]:
# Replace missing values with 0 in both splits so the held-out data receives
# the same preprocessing as the training data. Reassignment (no inplace=True)
# avoids mutating a frame that may be a view of `data`.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [9]:
# Largest per-column count of missing values in the training features;
# 0 means no NaNs remain.
X_train.isna().sum().max()

0

## Forward Feature Selection

In [10]:
# Forward sequential selection: starting from no features, add one feature at
# a time until 10 are selected, scoring each candidate set by 5-fold
# cross-validated R^2. Note that `model` is the selector, not a fitted regressor.
# The forest is seeded (same seed as the train/test split) so that the chosen
# features are reproducible across runs.
model = sfs(RandomForestRegressor(random_state=2020),
            k_features=10,
            forward=True,
            verbose=2,
            cv=5,
            n_jobs=-1,
            scoring='r2')
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    5.7s finished

[2020-04-29 22:43:17] Features: 1/10 -- score: 0.6404767115586607[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    1.2s finished

[2020-04-29 22:43:18] Features: 2/10 -- score: 0.6860100086659132[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  31 | elapsed:    1.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    1.3s finished

[2020-04-29 22:43:19] Features: 3/10 -- score: 0.7204727786959871[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.0s finished

[2020-04-29 22:43:20] Features: 4/10 -- score: 0.7369196936210333[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent 

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [11]:
# Positional indices (into X_train's columns) of the features chosen by forward selection.
model.k_feature_idx_

(2, 4, 5, 6, 13, 15, 16, 18, 20, 23)

In [12]:
# Column names of the features chosen by forward selection.
model.k_feature_names_

('LotFrontage',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'BedroomAbvGr',
 'GarageCars')

## Backward Feature Selection

In [13]:
# Backward sequential selection: starting from all features, remove one
# feature at a time until 10 remain, scoring each candidate set by 5-fold
# cross-validated R^2.
# The forest is seeded (same seed as the train/test split) so that the chosen
# features are reproducible across runs.
backwardModel = sfs(RandomForestRegressor(random_state=2020),
                    k_features=10,
                    forward=False,
                    verbose=2,
                    cv=5,
                    n_jobs=-1,
                    scoring='r2')
backwardModel.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    5.3s finished

[2020-04-29 22:43:35] Features: 32/10 -- score: 0.8325082899819449[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    4.7s finished

[2020-04-29 22:43:40] Features: 31/10 -- score: 0.8317132195561042[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    4.4s finished

[2020-04-29 22:43:45] Features: 30/10 -- score: 0.8244514521610562[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.2s finished

[2020-04-29 22:43:49] Features: 29/10 -- score: 0.8375587702597528[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:    3.9s finish

SequentialFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [14]:
# Positional indices (into X_train's columns) of the features kept by backward selection.
backwardModel.k_feature_idx_

(1, 4, 5, 12, 17, 18, 22, 23, 25, 29)

In [15]:
# Map the retained positional indices back to their column names.
X_train.columns[list(backwardModel.k_feature_idx_)]

Index(['MSSubClass', 'OverallQual', 'OverallCond', 'TotalBsmtSF',
       'BsmtHalfBath', 'FullBath', 'Fireplaces', 'GarageCars', 'OpenPorchSF',
       'PoolArea'],
      dtype='object')

## Exhaustive Feature Selection

In [16]:
# Exhaustive selection: evaluate every feature subset of size 1 to 5 and keep
# the one with the best cross-validated R^2. The forest is seeded (same seed
# as the train/test split) so that the winning subset is reproducible.
emodel = efs(RandomForestRegressor(random_state=2020),
             min_features=1,
             max_features=5,
             scoring='r2',
             n_jobs=-1)

# Search only the 10 columns kept by backward selection to keep the run
# tractable: subsets of size 1..5 out of 10 give 637 candidates.
miniData = X_train.iloc[:, list(backwardModel.k_feature_idx_)]

emodel.fit(miniData, y_train)

Features: 637/637

ExhaustiveFeatureSelector(clone_estimator=True, cv=5,
                          estimator=RandomForestRegressor(bootstrap=True,
                                                          criterion='mse',
                                                          max_depth=None,
                                                          max_features='auto',
                                                          max_leaf_nodes=None,
                                                          min_impurity_decrease=0.0,
                                                          min_impurity_split=None,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2,
                                                          min_weight_fraction_leaf=0.0,
                                                          n_estimators='warn',
                                                          n_jobs=None,
  

In [17]:
# Positional indices (into miniData's columns) of the best-scoring subset.
emodel.best_idx_

(0, 1, 3, 8)

In [18]:
# Map the best subset's positional indices back to column names.
miniData.columns[list(emodel.best_idx_)]

Index(['MSSubClass', 'OverallQual', 'TotalBsmtSF', 'OpenPorchSF'], dtype='object')