In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [2]:
data = pd.read_csv('santander.csv')
data.shape

(76020, 371)

In [3]:
data.head(2)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0


In [5]:
# Target variable will be "TARGET"
x_train, x_test, y_train, y_test = train_test_split(data.drop(labels = ['TARGET'], axis = 1), 
                                                    data['TARGET'], 
                                                    test_size = 0.3,
                                                    random_state = 123)

In [7]:
x_train.shape,x_test.shape

((53214, 370), (22806, 370))

In [6]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(x_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  202


In [8]:
# removed correlated  features
x_train.drop(labels=corr_features, axis=1, inplace=True)
x_test.drop(labels=corr_features, axis=1, inplace=True)

x_train.shape,x_test.shape

((53214, 168), (22806, 168))

#### Step Forward Feature Selection

In [12]:
# Step Forward Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sfs1 = SFS(RandomForestRegressor(),
          k_features = 10,
          forward = True,
          floating = False, 
          verbose = 2,
          scoring = 'r2',
          cv = 3) 
# cv performs cross-validation
# floating: The floating algorithms have an additional exclusion or inclusion step to remove features 
# once they were included (or excluded), so that a larger number of feature subset 
# combinations can be sampled.
# verbose:  level of verbosity to use in logging.

In [13]:
sfs1 = sfs1.fit(np.array(x_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 168 out of 168 | elapsed:   17.9s finished

[2019-02-15 22:53:20] Features: 1/10 -- score: 0.024170955460577053[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:   22.7s finished

[2019-02-15 22:53:43] Features: 2/10 -- score: 0.04700188885874138[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 166 out of 166 | elapsed:   53.2s finished

[2019-02-15 22:54:36] Features: 3/10 -- score: 0.04913975051856737[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed:   53.8s finished

[2019-02-15 22:55:30] Features: 4/10 -- score: 0.04931403785984471[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 164 out of 164 | elapsed

In [14]:
selected_feat = x_train.columns[list(sfs1.k_feature_idx_)]

In [15]:
selected_feat

Index(['var15', 'ind_var28', 'ind_var30_0', 'ind_var41', 'num_var4',
       'imp_aport_var13_hace3', 'imp_trasp_var33_out_ult1',
       'num_reemb_var17_ult1', 'saldo_medio_var13_largo_hace3',
       'saldo_medio_var44_hace3'],
      dtype='object')

In [16]:
def run_randomForests(x_train, x_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(x_train, y_train)
    print('Train set')
    pred = rf.predict_proba(x_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(x_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [19]:
# evaluate performance of algorithm built
# using selected features

run_randomForests(x_train[selected_feat].fillna(0),
                  x_test[selected_feat].fillna(0),
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.797867363239713
Test set
Random Forests roc-auc: 0.7832090292397456


#### Step Backward Feature Selection

In [33]:
# step backward feature selection
# I indicate that I want to select 15 features from
# the total, and that I want to select those features
# based on the optimal roc_auc

sfs2 = SFS(RandomForestClassifier(), 
           k_features=6, 
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='roc_auc',
           cv=2)

sfs2= sfs2.fit(np.array(x_train.fillna(0)), y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 168 out of 168 | elapsed:  3.6min finished

[2019-02-16 00:45:57] Features: 167/6 -- score: 0.6985813055189178[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:  3.5min finished

[2019-02-16 00:49:28] Features: 166/6 -- score: 0.7000613351155902[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 166 out of 166 | elapsed:  3.8min finished

[2019-02-16 00:53:19] Features: 165/6 -- score: 0.696182976456716[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed:  3.4min finished

[2019-02-16 00:56:40] Features: 164/6 -- score: 0.705457464881275[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 164 out of 164 | elapsed:  

[2019-02-16 02:28:37] Features: 130/6 -- score: 0.6981134045833578[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 130 out of 130 | elapsed:  2.2min finished

[2019-02-16 02:30:48] Features: 129/6 -- score: 0.6994319639123233[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 129 out of 129 | elapsed:  2.2min finished

[2019-02-16 02:33:00] Features: 128/6 -- score: 0.7004513040226517[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 128 out of 128 | elapsed:  2.2min finished

[2019-02-16 02:35:09] Features: 127/6 -- score: 0.7036207384228088[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 127 out of 127 | elapsed:  2.1min finished

[2019-02-16 02:37:18] Features: 126/6 -- score: 0.7004591624563088[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remai

[2019-02-16 03:34:28] Features: 92/6 -- score: 0.6978234444000225[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  92 out of  92 | elapsed:  1.3min finished

[2019-02-16 03:35:45] Features: 91/6 -- score: 0.6983483659535006[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  91 out of  91 | elapsed:  1.3min finished

[2019-02-16 03:37:00] Features: 90/6 -- score: 0.6942353388762613[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.2min finished

[2019-02-16 03:38:15] Features: 89/6 -- score: 0.6993248516795627[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  89 out of  89 | elapsed:  1.2min finished

[2019-02-16 03:39:27] Features: 88/6 -- score: 0.7029972506511217[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:

[2019-02-16 04:09:51] Features: 54/6 -- score: 0.694330804798551[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:   37.2s finished

[2019-02-16 04:10:28] Features: 53/6 -- score: 0.6984593612269806[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:   36.2s finished

[2019-02-16 04:11:04] Features: 52/6 -- score: 0.6964813964854062[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:   35.8s finished

[2019-02-16 04:11:40] Features: 51/6 -- score: 0.6935819863806681[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:   34.9s finished

[2019-02-16 04:12:15] Features: 50/6 -- score: 0.6903392578026872[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining: 

[2019-02-16 04:24:08] Features: 16/6 -- score: 0.6885890767058542[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    9.0s finished

[2019-02-16 04:24:17] Features: 15/6 -- score: 0.6861562483100623[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.4s finished

[2019-02-16 04:24:26] Features: 14/6 -- score: 0.6863010693743623[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    7.8s finished

[2019-02-16 04:24:34] Features: 13/6 -- score: 0.6865120470076718[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    7.3s finished

[2019-02-16 04:24:41] Features: 12/6 -- score: 0.6861526169837302[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:

In [34]:
selected_feat= x_train.columns[list(sfs2.k_feature_idx_)]
selected_feat

Index(['ID', 'var15', 'imp_op_var41_efect_ult3', 'saldo_var30', 'saldo_var37',
       'var38'],
      dtype='object')

#### Exhaustive Feature Selection

In [29]:
# exhaustive feature selection
# I indicate that I want to select 10 features from
# the total, and that I want to select those features
# based on the optimal roc_auc

# in order to shorter search time for the demonstration
# i will ask the algorithm to try all possible 1,2,3 and 4
# feature combinations from a dataset of 4 features

# if you have access to a multicore or distributed computer
# system you can try more greedy searches

efs3 = EFS(RandomForestClassifier(), 
           min_features=1,
           max_features=4, 
           scoring='roc_auc',
           print_progress=True,
           cv=2)

efs3 = efs3.fit(np.array(x_train[x_train.columns[0:4]].fillna(0)), y_train)

Features: 15/15

In [30]:
efs3.best_idx_

(2,)

In [32]:
selected_feat= x_train.columns[list(efs3.best_idx_)]
selected_feat

Index(['var15'], dtype='object')