In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.19.0


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [2]:
# Load the dataset from a CSV located relative to the working directory, then display
# its (rows, columns) shape as the cell output.
data = pd.read_csv('dataset_2.csv')
data.shape

(50000, 109)

In [3]:
# Every column except 'target' is a predictor; 'target' itself is the label.
feature_frame = data.drop(columns=['target'])
target_series = data['target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target_series, test_size=0.3, random_state=0
)

# Show the shapes of both predictor splits.
X_train.shape, X_test.shape

((35000, 108), (15000, 108))

In [4]:
# Step-backward feature selection (forward=False): the captured log shows the feature
# count going down by one per round (107, 106, 105, ...) until k_features remain.
# Note: n_jobs=4 is passed to the random forest, not to the selector itself.
sfs = SFS(RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0), 
           k_features=10, # in backward mode, the fewer features we keep, the more rounds it runs
           forward=False, 
           floating=False, # see the docs for more details in this parameter
           verbose=2, # this indicates how much to print out intermediate steps
           scoring='roc_auc',
           cv=2)



In [5]:
# Run the selection on the training split only. This is slow: the captured log shows
# about 2.9 minutes for the first elimination round alone.
fea = sfs.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  2.9min finished

[2021-12-21 16:04:10] Features: 107/10 -- score: 0.6231792953008196[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 107 out of 107 | elapsed:  2.8min finished

[2021-12-21 16:06:59] Features: 106/10 -- score: 0.6309133431379182[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 106 out of 106 | elapsed:  2.8min finished

[2021-12-21 16:09:47] Features: 105/10 -- score: 0.6297957167278045[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: 

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  1.6min finished

[2021-12-21 17:05:58] Features: 80/10 -- score: 0.6327129879850415[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  1.6min finished

[2021-12-21 17:07:34] Features: 79/10 -- score: 0.6303351447085885[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed:  1.6min finished

[2021-12-21 17:09:09] Features: 78/10 -- score: 0.6313534207949818[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  78 out of  78 | elapsed:  1.6min finished

[2021-12-21 17:10:42] Features: 77/

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:   56.3s finished

[2021-12-21 17:41:22] Features: 52/10 -- score: 0.6369678426827285[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:   54.7s finished

[2021-12-21 17:42:17] Features: 51/10 -- score: 0.6369779762859136[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:   54.1s finished

[2021-12-21 17:43:11] Features: 50/10 -- score: 0.6373177092423776[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done

In [13]:
# Positional column indices of the features kept by the selector
# (used below to index X_train.columns).
sfs.k_feature_idx_

(13, 15, 18, 37, 44, 54, 56, 68, 89, 106)

In [17]:
# Map the first selected position back to its column name. The position is read from
# the fitted selector rather than hard-coded, so the cell stays correct after a re-run.
X_train.columns[sfs.k_feature_idx_[0]]

'var_14'

In [19]:
# Translate the selector's positional indices into the matching column labels.
selected_positions = list(sfs.k_feature_idx_)
reg = X_train.columns[selected_positions]

In [20]:
# Display the labels of the selected columns.
reg

Index(['var_14', 'var_16', 'var_19', 'var_38', 'var_45', 'var_55', 'var_57',
       'var_70', 'var_91', 'var_108'],
      dtype='object')

In [21]:
# The columns above are the 10 features retained by step-backward selection; subsequent modelling can be restricted to them.