**Two-stage problem.**
* Default classifier: use AUC or F1-score to evaluate performance, because the classes are imbalanced.
* Default loss: use MAE (the evaluation metric) to evaluate performance.

In [16]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so that calls drawing from it are repeatable.
np.random.seed(3)

# Project root, resolved under the current user's home directory. Its `src`
# folder is appended to the module search path.
basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
project_src = os.path.join(basepath, 'src')
sys.path.append(project_src)

from data import *
from features import *

In [4]:
# Read the raw training CSV with `id` as the index and float32 as the dtype.
train_csv_path = os.path.join(basepath, 'data/raw/train_v2.csv')
train = pd.read_csv(train_csv_path, index_col='id', dtype=np.float32)
# test   = pd.read_csv(os.path.join(basepath, 'data/raw/test_v2.csv'), index_col='id')
# sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [5]:
# Binary target: 1 when the recorded loss is greater than zero, otherwise 0.
# The builtin `int` is used because `np.int` was only an alias for it and is
# no longer available in recent NumPy releases.
train['is_default'] = (train.loss > 0).astype(int)

**The training examples are in chronological order, but the test examples are not, so the training set is shuffled below.**

In [6]:
# Shuffle the training rows: draw a random ordering of the row positions
# from NumPy's global RNG, then reorder the frame by it.
shuffled_positions = np.random.permutation(len(train))
train = train.iloc[shuffled_positions]

In [7]:
# Columns excluded from the model inputs: a fixed list of feature columns
# (the reason for their exclusion is not recorded here), followed by the
# two target columns.
features_to_remove = [
    'f33', 'f678', 'f37', 'f764', 'f700',
    'f34', 'f38', 'f702', 'f701', 'f736', 'f35',
    'is_default', 'loss',
]

In [8]:
# Split stratified on the default flag. The two results are used as
# positional row selectors (via .iloc) below.
itrain, itest = get_stratified_sample(
    train, train.is_default, train_size=.2, random_state=11
)

# Model inputs: every column except the excluded features and the targets.
feature_columns = train.columns.drop(features_to_remove)

X_train = train.iloc[itrain][feature_columns]
y_train = train.is_default.iloc[itrain]

X_test = train.iloc[itest][feature_columns]
y_test = train.is_default.iloc[itest]

In [9]:
# Sanity check: shapes of the training and held-out feature matrices.
train_shape, test_shape = X_train.shape, X_test.shape
print(train_shape, test_shape)

(21094, 758) (84377, 758)


**Two-step modelling.**

* Pipeline for predicting whether there was any loss.
* Pipeline to predict the actual value in case any loss was incurred.

In [18]:
# Stage one: classify whether a loan incurs any loss.
# Each step is built separately, in pipeline order, and then assembled.
golden_step = FeatureUnion([('golden_feature', GoldenFeatures())])
impute_step = Imputer()
scale_step = MinMaxScaler()

# NOTE(review): ExtraTreesClassifier() is given no random_state, unlike the
# final model below — confirm this is intended.
# NOTE(review): the selector receives y_train at construction time — verify
# against its implementation that this is safe when the pipeline is applied
# to other data.
select_step = TreeBasedSelection(ExtraTreesClassifier(), y_train, n_features_to_select=30)

interaction_step = FeatureUnion([('feature_interaction', FeatureInteraction())])
model_step = RandomForestClassifier(n_estimators=25, n_jobs=2, random_state=5)

pipeline_default = Pipeline([
    ('feature_union', golden_step),
    ('imputer', impute_step),
    ('scaler', scale_step),
    ('select', select_step),
    ('union', interaction_step),
    ('model', model_step),
])

In [19]:
# Fit the default-classification pipeline on the training split. The call is
# left as the cell's last expression so its result is shown as the output.
pipeline_default.fit(X_train, y_train)

Pipeline(steps=[('feature_union', FeatureUnion(n_jobs=1,
       transformer_list=[('golden_feature', GoldenFeatures())],
       transformer_weights=None)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('s...estimators=25, n_jobs=2,
            oob_score=False, random_state=5, verbose=0, warm_start=False))])

In [22]:
# Evaluate on the held-out split: AUC is computed from column 1 of the
# predicted probabilities (presumably the is_default == 1 class — confirm
# against the classifier's class order), F1 from the hard label predictions.
class_probabilities = pipeline_default.predict_proba(X_test)
yhat = class_probabilities[:, 1]
ypred = pipeline_default.predict(X_test)

auc_value = roc_auc_score(y_test, yhat)
f1_value = f1_score(y_test, ypred)

print('AUC score: %f' % auc_value)
print('F1 score: %f' % f1_value)

AUC score: 0.986068
F1 score: 0.801074


**Submission**

In [None]:
# Load the sample submission file from the raw data folder.
sample_sub_path = os.path.join(basepath, 'data/raw/sampleSubmission.csv')
sample_sub = pd.read_csv(sample_sub_path)

In [None]:
# Fill in the `loss` column and write the submission CSV without the index.
# NOTE(review): `loss` is not assigned in any cell visible in this notebook —
# the predicted losses have to be computed before this cell can run; confirm
# where they are meant to come from.
submission_path = os.path.join(basepath, 'submissions/baseline.csv')
sample_sub['loss'] = loss
sample_sub.to_csv(submission_path, index=False)