**Two-stage problem.**
* Default classifier: evaluate with AUC or F1-score, because the classes are imbalanced.
* Default loss: evaluate with MAE (the evaluation metric).

In [1]:
%matplotlib inline

import os, sys

import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so that the shuffle of the training set is repeatable.
np.random.seed(3)

# Project root (expanded from the user's home directory); its `src` folder is
# appended to sys.path so project-local modules can be imported.
basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

In [2]:
# Read the raw training table; the `id` column becomes the row index.
# (Passing dtype=np.float32 to read_csv is left disabled.)
train_csv_path = os.path.join(basepath, 'data/raw/train_v2.csv')
train = pd.read_csv(train_csv_path, index_col='id')
# test   = pd.read_csv(os.path.join(basepath, 'data/raw/test_v2.csv'), index_col='id')
# sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [3]:
# Binary classification target: 1 when the recorded loss is positive, else 0.
# The builtin `int` is used because the `np.int` alias has been removed from
# recent NumPy releases; the resulting dtype is the same.
train['is_loss'] = (train.loss > 0).astype(int)

**The training examples are in chronological order, but the test examples are not, so the training set is shuffled below.**

In [4]:
# Shuffle the training rows by drawing a random permutation of row positions.
shuffled_positions = np.random.permutation(len(train))
train = train.iloc[shuffled_positions]

In [5]:
# Columns excluded from the feature matrix: a hand-picked set of features
# plus the two target columns ('loss' and the derived 'is_loss').
features_to_remove = [
    'f33',
    'f678',
    'f37',
    'f764',
    'f700',
    'f34',
    'f38',
    'f702',
    'f701',
    'f736',
    'f35',
    'loss',
    'is_loss',
]

**Split the dataset into training and held-out parts.**

In [6]:
# Hold out 30% of the rows (by position) for evaluation; the split is seeded.
row_positions = range(len(train))
itrain, itest = train_test_split(row_positions, test_size=0.3, random_state=10)

# Every column except the excluded features and the target columns.
feature_columns = train.columns.drop(features_to_remove)

train_rows = train.iloc[itrain]
heldout_rows = train.iloc[itest]

X_train, y_train = train_rows[feature_columns], train_rows['is_loss']
X_test, y_test = heldout_rows[feature_columns], heldout_rows['is_loss']

**Feature Importance (Random Forest Classifier)**

In [7]:
# Impute missing values and standardise the features. Both transformers are
# fitted on the training split only and then applied to the held-out split.
#
# The transformers return bare NumPy arrays. The results are wrapped back into
# DataFrames (same index and column labels) because later cells rely on
# `X_train.columns`, `X_train[...]` label indexing and `X_train.values`.
# NOTE(review): this assumes the imputer keeps every input column; if it drops
# any (e.g. a column with no observed values), the DataFrame construction below
# fails loudly instead of silently mislabelling features - confirm on the data.
imputer = Imputer()
scaler = StandardScaler()

X_train_values = scaler.fit_transform(imputer.fit_transform(X_train))
X_test_values = scaler.transform(imputer.transform(X_test))

X_train = pd.DataFrame(X_train_values, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_values, index=X_test.index, columns=X_test.columns)

In [8]:
# Fit a random forest on the training split and rank features by importance.
forest = RandomForestClassifier(random_state=0, n_jobs=3)
forest.fit(X_train, y_train)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees
# (only needed by the optional bar plot kept at the end of this cell).
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)

# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
features = train.columns.drop(features_to_remove)

# Print the feature ranking
print("Feature ranking:")
for rank, column_position in enumerate(indices, start=1):
    print("%d. feature %s (%f)" % (rank, features[column_position], importances[column_position]))

# Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(X_train.shape[1]), importances[indices],
#        color="r", yerr=std[indices], align="center")
# plt.xticks(range(X_train.shape[1]), indices, rotation=90)
# plt.xlim([-1, X_train.shape[1]])
# plt.show();

Feature ranking:
1. feature f67 (0.006034)
2. feature f767 (0.004061)
3. feature f3 (0.003396)
4. feature f670 (0.003368)
5. feature f746 (0.003209)
6. feature f766 (0.003162)
7. feature f76 (0.002961)
8. feature f404 (0.002921)
9. feature f412 (0.002916)
10. feature f19 (0.002914)
11. feature f468 (0.002881)
12. feature f533 (0.002879)
13. feature f514 (0.002865)
14. feature f212 (0.002854)
15. feature f696 (0.002853)
16. feature f655 (0.002817)
17. feature f75 (0.002799)
18. feature f639 (0.002712)
19. feature f406 (0.002695)
20. feature f413 (0.002681)
21. feature f601 (0.002678)
22. feature f672 (0.002664)
23. feature f774 (0.002660)
24. feature f281 (0.002607)
25. feature f211 (0.002589)
26. feature f322 (0.002581)
27. feature f271 (0.002569)
28. feature f727 (0.002564)
29. feature f479 (0.002550)
30. feature f598 (0.002542)
31. feature f201 (0.002534)
32. feature f149 (0.002518)
33. feature f647 (0.002518)
34. feature f471 (0.002492)
35. feature f402 (0.002484)
36. feature f525 (

**Feature Importance (Default Classifier)**

In [None]:
def get_xgb_imp(trained_xgb, feature_names):
    """Collect the booster's per-feature f-scores into a DataFrame.

    Parameters
    ----------
    trained_xgb : fitted xgboost scikit-learn style model
        Must expose ``get_booster()`` or, on older xgboost releases,
        a callable ``booster()``.
    feature_names : sequence of str
        Names to look up in the booster's f-score dictionary.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with a single column ``0`` holding the
        f-score as a float. Names missing from the f-score dictionary get 0.0.
    """
    # Newer xgboost exposes the booster through `get_booster()`; there the
    # `booster` attribute is no longer a method, so calling it would fail.
    # Fall back to the legacy `booster()` accessor only when needed.
    booster_accessor = getattr(trained_xgb, 'get_booster', None)
    if booster_accessor is None:
        booster_accessor = trained_xgb.booster
    imp_vals = booster_accessor().get_fscore()

    # NOTE(review): if the model was fitted on a bare NumPy array, the f-score
    # keys are presumably positional ('f0', 'f1', ...) rather than dataset
    # column names - confirm the keys really match `feature_names`.
    imp_dict = {name: float(imp_vals.get(name, 0.)) for name in feature_names}

    return pd.DataFrame.from_dict(imp_dict, orient='index')

In [None]:
# Classification pipeline: mean-impute -> standardise -> gradient-boosted trees.
xgb_classifier = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
pipeline_steps = [
    ('imputer', Imputer()),
    ('scale', StandardScaler()),
    ('model', xgb_classifier),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Fit the whole pipeline (imputer, scaler and XGBoost classifier) on the
# training split; the `is_loss` labels are the target.
pipeline.fit(X_train, y_train)

In [None]:
# Feature names are taken from `train` rather than from `X_train`, because
# `X_train` may be a plain NumPy array (without `.columns`) by this point.
feature_names = train.columns.drop(features_to_remove)

# `named_steps['model']` is the fitted XGBoost step of the pipeline.
feat_importance = get_xgb_imp(pipeline.named_steps['model'], feature_names)

# Column 0 holds the f-score; rank features from most to least important.
sorted_feat_importance = feat_importance.sort_values(by=0, ascending=False)
print('Feature Importance ( XGBoost): \n')
print(sorted_feat_importance.iloc[:10])

**Train on the features deemed important.**

In [None]:
# Peek at the names of the features ranked 71st-80th in the sorted importance table.
sorted_feat_importance.index[70:80]

In [None]:
# Keep the 20 highest-ranked features and refit the pipeline on them only.
selected_features = sorted_feat_importance.index[:20]

# Select columns by position so this works whether X_train / X_test are
# DataFrames or plain NumPy arrays (label indexing fails on an array).
# The column order of the feature matrix is that of
# `train.columns.drop(features_to_remove)`.
# NOTE(review): assumes no preprocessing step dropped columns - confirm.
feature_names = train.columns.drop(features_to_remove)
selected_positions = feature_names.get_indexer(selected_features)

X_train_sub = np.asarray(X_train)[:, selected_positions]
X_test_sub  = np.asarray(X_test)[:, selected_positions]

pipeline.fit(X_train_sub, y_train)

**Test on the held-out set.**

In [None]:
# Score the held-out rows: probability of the positive class (column 1 of
# predict_proba) and the predicted class labels.
class_probabilities = pipeline.predict_proba(X_test_sub)
preds_prob = class_probabilities[:, 1]
preds = pipeline.predict(X_test_sub)

In [None]:
# Show held-out labels at positions 10-19 for an eyeball comparison with the predictions.
y_test.iloc[10: 20]

In [None]:
# Predicted labels for the same positions (10-19) of the held-out set.
preds[10:20]

In [None]:
# AUC is computed from the positive-class probabilities, F1 from the hard labels.
auc_value = roc_auc_score(y_test, preds_prob)
print('AUC score: %f' % (auc_value,))

f1_value = f1_score(y_test, preds)
print('F1 score: %f' % (f1_value,))

In [None]:
def cv(estimators, X, y):
    """Pick the estimator with the lowest MAE on each of 3 stratified folds.

    Parameters
    ----------
    estimators : iterable of estimators
        Objects with ``fit(X, y)`` and ``predict(X)``. Must not be empty.
    X, y : array-like
        Must support indexing with the index arrays produced by
        ``StratifiedKFold`` (e.g. NumPy arrays).

    Returns
    -------
    (best_estimators, scores) : tuple of lists
        One entry per fold: the estimator with the lowest mean absolute error
        on that fold (the first one wins ties) and its score.

    Raises
    ------
    ValueError
        If ``estimators`` is empty.

    Notes
    -----
    The estimators are fitted in place and the returned list holds references
    to the same objects that were passed in, not per-fold copies. After the
    call each of them carries the fit from the last fold.
    """
    # Materialise once so a generator is not exhausted after the first fold.
    estimators = list(estimators)
    if not estimators:
        raise ValueError('cv() needs at least one estimator to compare')

    skf = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=12)
    best_estimators = []  # best estimator for every fold
    scores = []           # lowest MAE for every fold

    for itrain, itest in skf:
        Xtr, Xte = X[itrain], X[itest]
        ytr, yte = y[itrain], y[itest]

        # Reset per fold so a winner from a previous fold can never leak in
        # when no estimator produces a comparable score.
        best_estimator = None
        min_score = np.inf

        for est in estimators:
            est.fit(Xtr, ytr)
            score = mean_absolute_error(yte, est.predict(Xte))

            if score < min_score:
                min_score = score
                best_estimator = est

        best_estimators.append(best_estimator)
        scores.append(min_score)

    return best_estimators, scores

In [None]:
# Candidate regression pipelines compared by cv(). Each one imputes missing
# values and keeps the 100 features with the best f_regression score before
# fitting its model.
pipeline_rf = Pipeline([
        ('imputer', Imputer()),
        ('select', SelectKBest(f_regression, k=100)),
        ('model', RandomForestRegressor(random_state=3, n_jobs=3))
    ])

# The linear model additionally standardises the features before selection.
pipeline_linear = Pipeline([
        ('imputer', Imputer()),
        ('scale', StandardScaler()),
        ('select', SelectKBest(f_regression, k=100)),
        ('model', Lasso(random_state=3))
    ])

# `pipeline_gbr` is passed to cv() further down but had no definition in this
# notebook. It is defined here to mirror `pipeline_rf`, with a gradient
# boosting regressor left at its default hyper-parameters.
# NOTE(review): the originally intended settings are unknown - confirm or tune.
pipeline_gbr = Pipeline([
        ('imputer', Imputer()),
        ('select', SelectKBest(f_regression, k=100)),
        ('model', GradientBoostingRegressor(random_state=3))
    ])

In [None]:
# cv() indexes X and y with positional index arrays, so plain NumPy arrays are
# passed. np.asarray accepts DataFrames/Series as well as arrays, whereas
# `.values` fails when X_train is already a NumPy array.
# All three pipelines must already be defined when this cell runs.
best_ests, scores = cv([pipeline_linear, pipeline_rf, pipeline_gbr],
                       np.asarray(X_train), np.asarray(y_train))

In [None]:
# Display the winning estimator of each cross-validation fold.
best_ests

In [None]:
# Display the lowest MAE achieved on each cross-validation fold.
scores

In [None]:
# Refit the fold-0 winner on the whole training split and report its MAE on
# the held-out examples.
# NOTE(review): best_ests[0] is whichever estimator won fold 0 in cv(), so the
# 'Random Forest Regressor' label below may not match it - confirm.
fold0_winner = best_ests[0]
fold0_winner.fit(X_train, y_train)
fold0_heldout_mae = mean_absolute_error(y_test, fold0_winner.predict(X_test))
print('MAE Random Forest Regressor ', fold0_heldout_mae)

In [None]:
# Same check for the fold-1 winner: refit on the training split, then score
# on the held-out examples.
# NOTE(review): best_ests[1] is whichever estimator won fold 1 in cv(), so the
# 'Lasso Regression' label below may not match it - confirm.
fold1_winner = best_ests[1]
fold1_winner.fit(X_train, y_train)
fold1_heldout_mae = mean_absolute_error(y_test, fold1_winner.predict(X_test))
print('MAE Lasso Regression ', fold1_heldout_mae)

In [None]:
# Train on the full dataset, then predict the loss for the test set.
# NOTE(review): `target` and `test` are not defined in any cell visible in this
# notebook (the test-set load is commented out), and `train` still contains
# the 'loss' and 'is_loss' columns at this point - confirm before running.
# NOTE(review): the estimator fitted here is best_ests[2], but the predictions
# come from pipeline_linear - verify these are meant to be the same estimator.
best_ests[2].fit(train, target)
loss = pipeline_linear.predict(test)

In [None]:
# Load the sample submission file; its 'loss' column is filled in by the next cell.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [None]:
# Put the predictions into the submission frame and write it out without the
# row index.
submission_path = os.path.join(basepath, 'submissions/baseline.csv')
sample_sub['loss'] = loss
sample_sub.to_csv(submission_path, index=False)