## Feature Selection
* Using the new features built in the previous notebook, I will perform feature selection with the Recursive Feature Elimination (RFE) algorithm. The goal is to reduce the current 459 features to a more manageable ~200.
* I will then test the new features in the model


In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime
import lightgbm as lgb
import xgboost as xgb
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split,StratifiedKFold
import gc
from sklearn.feature_selection import RFECV,RFE
import time
import pickle

import os
# Echo the full path of every file found beneath the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a no-op off Kaggle.
kaggle_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_files:
    print(path)

In [17]:
# Load the pickled train and test feature frames (train first, then test).
train_feat, test_feat = map(
    pd.read_pickle,
    ('data/train_feat.pkl', 'data/test_feat.pkl'),
)

In [4]:
# Label Encoding
# Every column that is object-typed in either frame is integer-encoded. Missing values
# get an explicit sentinel label first, and the encoder is fit on the union of train and
# test values so that labels present only in the test frame are still part of the mapping.
#
# Only train_feat is transformed: test_feat is deleted at the end of this cell, so
# encoding it (or filling its NaNs) would be wasted work on a large frame.
#
# NOTE: this cell mutates train_feat in place and deletes test_feat, so it cannot be
# re-run without first re-running the cell that loads both frames.
for f in test_feat.columns:
    if train_feat[f].dtype=='object' or test_feat[f].dtype=='object':
        train_feat[f] = train_feat[f].fillna('unseen_before_label')
        # Test values are needed only to fit the encoder; the test frame itself is left untouched.
        test_values = list(test_feat[f].fillna('unseen_before_label').values)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_feat[f].values) + test_values)
        train_feat[f] = lbl.transform(list(train_feat[f].values))

# Remaining missing values in the training frame are replaced with the -999 sentinel.
train_feat = train_feat.fillna(-999)

# The test frame is not used again in this cell's downstream feature selection; free its memory.
del test_feat
gc.collect()

14

In [5]:
# Columns excluded from the model inputs: 'isFraud' is the target passed to the selector,
# and the other two are (judging by their names) the transaction id and timestamp.
rm_cols = ['TransactionID', 'TransactionDT', 'isFraud']

# Final features: every remaining training column, in its original order.
features_columns = list(filter(lambda name: name not in rm_cols, train_feat.columns))

In [None]:
# Disabled LightGBM hyperparameter dictionary, kept for reference only.
# Nothing in this cell executes; the RFE cell below builds lgb.LGBMClassifier()
# with its default arguments and does not read `params`.
# params = {
#                     'objective':'binary',
#                     'boosting_type':'gbdt',
#                     'metric':'auc',
#                     'n_jobs':-1,
#                     'learning_rate':0.01,
#                     'num_leaves': 2**8,
#                     'max_depth':-1,
#                     'tree_learner':'serial',
#                     'colsample_bytree': 0.7,
#                     'subsample_freq':1,
#                     'subsample':0.7,
#                     'n_estimators':500,
#                     'max_bin':255,
#                     'verbose':-1,
#                     'seed': 2019,
# #                     'early_stopping_rounds':100, 
#                 } 

In [6]:
# --- Recursive Feature Elimination (RFE) settings ---
folds = 5                    # CV folds; only used by the disabled RFECV variant below
step = 10                    # number of features dropped per elimination round
n_features_to_select = 200   # size of the feature set RFE stops at

# Cross-validated alternative (RFECV), left disabled:
# rfecv = RFECV(
#               estimator=lgb.LGBMClassifier(),
#               step=step,
#               cv=StratifiedKFold(
#                                  n_splits=folds,
#                                  shuffle=True,
#                                  random_state=42).split(train_feat[features_columns], train_feat['isFraud']),
#               scoring='roc_auc',
#               n_jobs=1,
#               verbose=2)

# Plain RFE: repeatedly fit a default LightGBM classifier and drop the `step` least
# important features until `n_features_to_select` remain.
rfe = RFE(
    estimator=lgb.LGBMClassifier(),
    n_features_to_select=n_features_to_select,
    step=step,
    verbose=2,
)

# perf_counter is monotonic, so the measured duration cannot be skewed by system clock
# adjustments during this long-running fit.
start = time.perf_counter()
rfe.fit(train_feat[features_columns], train_feat['isFraud'])
end = time.perf_counter()
print('Time Elapsed (min): ',(end - start)/60)

Fitting estimator with 456 features.
Fitting estimator with 446 features.
Fitting estimator with 436 features.
Fitting estimator with 426 features.
Fitting estimator with 416 features.
Fitting estimator with 406 features.
Fitting estimator with 396 features.
Fitting estimator with 386 features.
Fitting estimator with 376 features.
Fitting estimator with 366 features.
Fitting estimator with 356 features.
Fitting estimator with 346 features.
Fitting estimator with 336 features.
Fitting estimator with 326 features.
Fitting estimator with 316 features.
Fitting estimator with 306 features.
Fitting estimator with 296 features.
Fitting estimator with 286 features.
Fitting estimator with 276 features.
Fitting estimator with 266 features.
Fitting estimator with 256 features.
Fitting estimator with 246 features.
Fitting estimator with 236 features.
Fitting estimator with 226 features.
Fitting estimator with 216 features.
Fitting estimator with 206 features.
Time Elapsed (min):  24.03383366664250

In [9]:
# Boolean mask over the columns RFE was fitted on: True where the feature was kept.
rfe_support = rfe.get_support()

# The mask is positional over `features_columns`, so pair it with the names directly
# instead of slicing two copies of the full training frame just to read the header.
assert len(rfe_support) == len(features_columns), \
    "RFE support mask does not line up with features_columns"
rfe_feature = [col for col, keep in zip(features_columns, rfe_support) if keep]
print(len(rfe_feature), 'selected features')

200 selected features


In [14]:
# Persist the list of RFE-selected feature names to disk.
rfe_features_path = "data/rfe_features_200.pkl"
with open(rfe_features_path, mode='wb') as sink:
    pickle.dump(rfe_feature, sink)

In [15]:
# Reload the saved feature-name list. The file is written by this notebook itself,
# so unpickling it is trusted; do not point this at a file from an unknown source.
with open('data/rfe_features_200.pkl', mode='rb') as source:
    rfe_feature = pickle.loads(source.read())