** Objective. **

* Input - Preprocessed data with golden features.
* Rearrange features - Based on the feature importance values spit out by the tree based selector.
* Forward step feature selection - Choose features until AUC score on the test set improves.

In [1]:
import numpy as np
import pandas as pd
import os, sys

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import Imputer

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

from data import *
from features import *

In [2]:
train = pd.read_csv(os.path.join(basepath, 'data/raw/train_v2.csv'),
                    index_col='id')

** Create a binary variable out of the loss. **

In [3]:
train['is_default'] = train.loss.map(lambda x: 1 if x > 1 else 0)

** Work with a stratified sample. **

In [4]:
itrain, itest = get_stratified_sample(train, train.is_default, train_size=0.2)

train_sample  = train.iloc[itrain]
y_default     = train_sample.is_default
y_loss        = train_sample.loss

del train # because of memory constraints

** Drop features with constant values. **

In [5]:
constant_features = ['f33', 'f678', 'f37', 'f764', \
                      'f700', 'f34', 'f38', 'f702', \
                      'f701', 'f736', 'f35']

In [6]:
train_sample = drop_features(train_sample, constant_features)

** Transformation **

In [7]:
numerical_features = train_sample.select_dtypes(exclude=['object']).columns
features           = numerical_features.drop(['loss', 'is_default']) 

In [8]:
train_sample = transform(train_sample, features, np.log1p)

** Golden Features **

In [9]:
train_sample = create_golden_feature(train_sample)

** Fill missing values. **

In [10]:
train_sample = fill_missing_values(train_sample)

** Split the sample further into train and test sets. **

In [11]:
itrain, itest = get_stratified_sample(train_sample, y_default, train_size=0.7, random_state=8)

X_train = train_sample[features].iloc[itrain]
X_test  = train_sample[features].iloc[itest]

y_train = y_default.iloc[itrain]
y_test  = y_default.iloc[itest]

In [12]:
# feature importance
forest          = ExtraTreesClassifier(random_state=111, n_jobs=2)
feature_indices = feature_importance(forest, X_train, y_train)

** Feature Selection **

In [13]:
X_train = train_sample[list(features) + ['f528-f527', 'f528-f274']].iloc[itrain]
X_test  = train_sample[list(features) + ['f528-f527', 'f528-f274']].iloc[itest]

In [16]:
potential_features = list(features[feature_indices])
selected_features  = ['f528-f527', 'f528-f274']

In [17]:
final_selected_features = forward_step_selection(X_train, y_train, X_test, y_test, selected_features, potential_features)

No longer able to improve AUC score


In [18]:
final_selected_features

['f528-f527', 'f528-f274', 'f271', 'f2', 'f727', 'f555', 'f201', 'f594']