** Two stage problem. **
* Default Classifier: Use AUC or F1-score to evaluate performance because of class imbalance.
* Default Loss: Use MAE ( Evaluation metric ) to evaluate performance

In [50]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import Lasso

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

np.random.seed(3)

basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

In [38]:
train  = pd.read_csv(os.path.join(basepath, 'data/raw/train_v2.csv'),
                     index_col='id',
                     dtype=np.float32
                    )
# test   = pd.read_csv(os.path.join(basepath, 'data/raw/test_v2.csv'), index_col='id')
# sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [40]:
# loss target
train['is_loss'] = (train.loss > 0).astype(np.int)

** Training examples are in the chronological order but not in the test set. **

In [41]:
train = train.iloc[np.random.permutation(len(train))] # shuffle training set.

In [43]:
# features to remove from training set
features_to_remove = ['f33', 'f678', 'f37', 'f764', 'f700', \
                      'f34', 'f38', 'f702', 'f701', 'f736', 'f35', 'loss',
                      'is_loss']

** Split dataset **

In [45]:
itrain, itest = train_test_split(range(len(train)), test_size=0.3, random_state=10)

X_train = train.iloc[itrain][train.columns.drop(features_to_remove)]
X_test  = train.iloc[itest][train.columns.drop(features_to_remove)]

y_train = train['is_loss'].iloc[itrain]
y_test  = train['is_loss'].iloc[itest]

** Feature Importance (Default Classifier) **

In [46]:
def get_xgb_imp(trained_xgb, feature_names):
    imp_vals = trained_xgb.booster().get_fscore()
    imp_dict = {feature_names[i]:float(imp_vals.get(feature_names[i],0.)) for i in range(len(feature_names))}
    
    return pd.DataFrame.from_dict(imp_dict, orient='index')

In [48]:
pipeline = Pipeline([
        ('imputer', Imputer()),
        ('model', xgb.XGBClassifier())
    ])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('model', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [58]:
feat_importance = get_xgb_imp(pipeline.get_params()['model'], X_train.columns)
print('Feature Importance ( XGBoost): \n')
print(feat_importance.sort_values(by=0, ascending=False).iloc[:10])

Feature Importance ( XGBoost): 

         0
f657  64.0
f1    32.0
f460  30.0
f59   25.0
f653  18.0
f263  17.0
f743  16.0
f523  15.0
f453  13.0
f501  11.0


In [35]:
def cv(estimators, X, y):
    skf = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=12)
    best_estimators = [] # list of best estimators 
    scores = [] # mae score for every fold
    
    for itrain, itest in skf:
        Xtr = X[itrain]
        Xte = X[itest]
        
        ytr = y[itrain]
        yte = y[itest]
        
        min_score = np.Infinity
        
        for est in estimators:
            est.fit(Xtr, ytr)
            pred = est.predict(Xte)
            score = mean_absolute_error(yte, pred)
            
            if score < min_score:
                min_score      = score
                best_estimator = est
        
        best_estimators.append(best_estimator)
        scores.append(min_score)
    
    return best_estimators, scores

In [36]:
pipeline_rf = Pipeline([
        ('imputer', Imputer()),
        ('select', SelectKBest(f_regression, k=100)),
        ('model', RandomForestRegressor(random_state=3, n_jobs=3))
    ])

pipeline_linear = Pipeline([
        ('imputer', Imputer()),
        ('scale', StandardScaler()),
        ('select', SelectKBest(f_regression, k=100)),
        ('model', Lasso(random_state=3))
    ])

# pipeline_gbr = Pipeline([
#         ('imputer', Imputer()),
#         ('select', SelectKBest(f_classif, k=100)),
#         ('model', GradientBoostingRegressor(random_state=3))
#     ])

In [None]:
best_ests, scores = cv([pipeline_linear, pipeline_rf, pipeline_gbr], X_train.values, y_train.values)

In [24]:
best_ests

[Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select', SelectKBest(k=100, score_func=<function f_classif at 0x7f0ce5e02e18>)), ('model', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=3,
    selection='cyclic', tol=0.0001, warm_start=False))]),
 Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select', SelectKBest(k=100, score_func=<function f_classif at 0x7f0ce5e02e18>)), ('model', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=3,
    selection='cyclic', tol=0.0001, warm_start=False))]),
 Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='

In [25]:
scores

[1.4506707830779793, 1.4357816056997013, 1.4294655744290388]

In [13]:
# test on the held out examples
best_ests[0].fit(X_train, y_train)
print('MAE Random Forest Regressor ', mean_absolute_error(y_test, best_ests[0].predict(X_test)))

MAE Random Forest Regressor  0.00916973702793


In [14]:
best_ests[1].fit(X_train, y_train)
print('MAE Lasso Regression ', mean_absolute_error(y_test, best_ests[1].predict(X_test)))

MAE Lasso Regression  0.00916903832835


In [46]:
# train on the full dataset
best_ests[2].fit(train, target)
loss = pipeline_linear.predict(test)

In [47]:
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [50]:
sample_sub['loss'] = loss
sample_sub.to_csv(os.path.join(basepath, 'submissions/baseline.csv'), index=False)