# 3. Home Credit Default Risk - Classical Machine Learning Algorithms

In [1]:
import pandas as pd 
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import cPickle as pickle
from sklearn.externals import joblib
import os

In [2]:
# Directory holding the pickled, preprocessed train/test frames.
data_path = "../data/pickles"

In [3]:
# with engineered features
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by
# this project's own preprocessing notebooks.
# `with` guarantees the file handles are closed once the frames are loaded.
with open(os.path.join(data_path, "prep_train_FE.pkl"), "rb") as pkl_file:
    prep_train = pickle.load(pkl_file)
with open(os.path.join(data_path, "prep_test_FE.pkl"), "rb") as pkl_file:
    prep_test = pickle.load(pkl_file)
# without engineered features (WOFE) -- swap in these two files instead:
# with open(os.path.join(data_path, "prep_train.pkl"), "rb") as pkl_file:
#     prep_train = pickle.load(pkl_file)
# with open(os.path.join(data_path, "prep_test.pkl"), "rb") as pkl_file:
#     prep_test = pickle.load(pkl_file)

In [4]:
# Sanity check: (rows, columns) of the train frame, then of the test frame.
print("\n".join(str(frame.shape) for frame in (prep_train, prep_test)))

(307504, 169)
(48744, 168)


----
## Helper Functions

In [5]:
# ROC
def plot_roc_curve(y_groundtruth, y_predicted_proba, caption=None, legend=None, lable=None):
    """Plot the ROC curve of a binary classifier together with the chance diagonal.

    Parameters
    ----------
    y_groundtruth : array-like
        True binary labels (a pandas Series, list or numpy array).
    y_predicted_proba : array-like
        Predicted score / probability of the positive class, one per sample.
    caption : str, optional
        Title of the plot.
    legend : str, optional
        Legend entry for the curve.
    lable : str, optional
        Fallback legend entry, used only when ``legend`` is not given. The
        misspelt name is kept so existing keyword calls keep working.

    Returns
    -------
    None
        The figure is drawn on the current pyplot axes and shown.
    """
    from sklearn.metrics import roc_curve, roc_auc_score

    # Flatten once so Series, column vectors and plain lists are all accepted.
    y_true = np.asarray(y_groundtruth).ravel()
    fpr, tpr, _ = roc_curve(y_true, y_predicted_proba)
    auc = roc_auc_score(y_true, y_predicted_proba)

    # Always give the curve a label: an explicit legend wins, then the legacy
    # `lable` argument, otherwise fall back to the AUC so plt.legend() never
    # has to render an empty legend.
    if legend is not None:
        curve_label = legend
    elif lable is not None:
        curve_label = lable
    else:
        curve_label = 'AUC = %0.5f' % auc

    plt.plot(fpr, tpr, linewidth=2, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--')  # chance level
    plt.axis([0, 1, 0, 1])
    plt.grid()
    plt.title(caption)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc="lower right")
    plt.show()

In [6]:
def get_roc_with_auc(y_train, y_train_predicted):
    """Compute the ROC curve and its AUC from ``predict_proba``-style output.

    Parameters
    ----------
    y_train : array-like
        True binary labels (a pandas Series, list or numpy array).
    y_train_predicted : 2-D array-like
        Class probabilities with one column per class; assumed to be
        ``predict_proba`` output for a 0/1 target, so the last column holds
        the positive class.

    Returns
    -------
    tuple
        ``(fpr, tpr, thresholds, auc)`` as returned by
        ``sklearn.metrics.roc_curve`` and ``sklearn.metrics.roc_auc_score``.
    """
    from sklearn.metrics import roc_curve, roc_auc_score

    y_true = np.asarray(y_train).ravel()
    # Use the LAST column as a 1-D score vector: for two-column predict_proba
    # output this is the positive class (column 0 would invert the curve),
    # while a single-column input still resolves to that one column.
    positive_scores = np.asarray(y_train_predicted)[:, -1]

    fpr, tpr, thresholds = roc_curve(y_true, positive_scores)
    auc = roc_auc_score(y_true, positive_scores)
    return fpr, tpr, thresholds, auc

In [7]:
# Submission 
def make_submission(y_test_predicted, sub_file_name):
    """Write a submission CSV with columns ``SK_ID_CURR`` and ``TARGET``.

    Parameters
    ----------
    y_test_predicted : 2-D array
        Per-class scores for the test rows; column 1 is written as ``TARGET``.
    sub_file_name : str
        File name without extension; the file is written to
        ``./single_ml/submissions/<sub_file_name>.csv``.

    Notes
    -----
    Reads the notebook-level ``X_test_SK_IDS`` for the id column, so that
    variable must be defined before this function is called.
    """
    out_dir = './single_ml/submissions/'
    # Create the target folder up front so a long training run is not lost to
    # a missing-directory error at the very last step.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': X_test_SK_IDS, 'TARGET': y_test_predicted[:, 1]})
    submission.to_csv(out_dir + sub_file_name + '.csv', index=False)

----
# Classification

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict

In [9]:
# Training frame: keep the ids aside, separate the label from the features.
X_train_SK_IDS = prep_train['SK_ID_CURR']
y_train = prep_train['TARGET']
X_train = prep_train.drop(['TARGET', 'SK_ID_CURR'], axis=1)

# Test frame: only the id column is removed from the features.
X_test_SK_IDS = prep_test['SK_ID_CURR']
X_test = prep_test.drop('SK_ID_CURR', axis=1)

# The original frames are no longer needed; drop the references to free memory.
del prep_train, prep_test

## Scaler

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Fit the scaler on the training features only.
std_scaler = StandardScaler().fit(X_train)
# Echo the fitted scaler so the cell still displays its configuration.
std_scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
# Apply the scaler fitted on the training data to both sets.
# NOTE: this overwrites X_train / X_test, so do not run this cell twice.
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)

### Dimension Reduction with PCA

In [13]:
#from sklearn.decomposition import PCA

In [14]:
#pca = PCA(n_components=20)
#pca.fit(X_train)

In [15]:
#X_train = pca.transform(X_train)
#X_test = pca.transform(X_test)

## LogisticRegression

In [16]:
#from sklearn.linear_model import LogisticRegression

In [17]:
#log_reg = LogisticRegression()

#param_logreg = {
#  "C": [0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4],
#  "class_weight" : ['balanced'],
#  "random_state" : [42]
#}

In [18]:
#grid_logreg = GridSearchCV(log_reg, param_grid=param_logreg, cv=5, n_jobs=3, verbose=5, scoring='roc_auc')
#grid_logreg.fit(X_train, y_train)

In [19]:
#print("Best Score: {}".format(grid_logreg.best_score_))
#print("Best Params: {}".format(grid_logreg.best_params_))

## RandomForest 

In [20]:
#from sklearn.ensemble import RandomForestClassifier
#clf_rf = RandomForestClassifier()

#param_rf = {
#  "n_estimators": [100, 150],
#  "max_depth": [25, 50],
#  "random_state": [42]
#}

In [21]:
#grid_rf = GridSearchCV(clf_rf, param_grid=param_rf, cv=5, n_jobs=3, verbose=5, scoring='roc_auc')
#grid_rf.fit(X_train, y_train)

In [22]:
#print("Best Score: {}".format(grid_rf.best_score_))
#print("Best Params: {}".format(grid_rf.best_params_))

## ElasticNet

In [23]:
#from sklearn.linear_model import ElasticNet
#clf_eln = ElasticNet()

#param_eln = {
#  'alpha': [0.001, 0.01, 0.1 , 1, 10],
#  'l1_ratio': [0.1, 0.2, 0.3, 0.5, 0.7],
#  'random_state': [42]
#}

In [24]:
#grid_eln = GridSearchCV(clf_eln, param_grid=param_eln, cv=5, n_jobs=-1, verbose=5, scoring='roc_auc')
#grid_eln.fit(X_train, y_train)

In [25]:
#print("Best Score: {}".format(grid_eln.best_score_))
#print("Best Params: {}".format(grid_eln.best_params_))

## Multilayer Perceptron

In [26]:
#from sklearn.neural_network import MLPClassifier
#from skopt import BayesSearchCV
#from skopt.space import Real, Integer, Categorical

In [27]:
#clf_meta_mlp = MLPClassifier()

#clf_meta_grid_mlp = {
#  'hidden_layer_sizes': Categorical([(70, 30)]),
#  'alpha': Real(0.0000001, 8, prior='log-uniform'),
#}

In [28]:
#gridcv_meta_mlp = BayesSearchCV(clf_meta_mlp, search_spaces=clf_meta_grid_mlp, cv=5, n_jobs=-1, 
#                                verbose=5, random_state=42, scoring='roc_auc', n_iter=10)
#gridcv_meta_mlp.fit(X_train, y_train)

In [29]:
#print("Best Score: {}".format(gridcv_meta_mlp.best_score_))
#print("Best Params: {}".format(gridcv_meta_mlp.best_params_))
#for mean, std, params in zip(gridcv_meta_mlp.cv_results_['mean_test_score'], gridcv_meta_mlp.cv_results_['std_test_score'], gridcv_meta_mlp.cv_results_['params']):
#  print("mean: %0.3f std: (+/-%0.03f) for %r" % (mean, std * 2, params))

## XGBoost with GridSearchCV

In [30]:
from xgboost import XGBClassifier
# Base estimator with default settings; the hyper-parameters are supplied
# later through the GridSearchCV parameter grid.
clf_xgb = XGBClassifier()

In [31]:
# Parameter grid for the XGBoost search. Every entry holds a single value, so
# the grid contains exactly one candidate.
param_xgb = dict(
    n_estimators=[1000],
    max_depth=[8],  # 4 was tried as an alternative
    min_child_weight=[4],
    subsample=[0.8],
    learning_rate=[0.01],
    colsample_bytree=[0.8],
    objective=['binary:logistic'],
    random_state=[42],
)

In [None]:
# 5-fold cross-validated search over the XGBoost grid, scored by ROC AUC.
grid_xgb = GridSearchCV(
    estimator=clf_xgb,
    param_grid=param_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=1,
    verbose=5,
)
grid_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=0.8, learning_rate=0.01, max_depth=8, min_child_weight=4, n_estimators=1000, objective=binary:logistic, random_state=42, subsample=0.8 


In [None]:
# Out-of-fold class probabilities on the training set from the best XGBoost model.
predicts_xgb = cross_val_predict(
    grid_xgb.best_estimator_,
    X_train,
    y_train,
    cv=5,
    n_jobs=1,
    verbose=2,
    method='predict_proba',
)

In [None]:
# Persist the fitted search object and the out-of-fold predictions.
# For the run without engineered features, save under the *_wofe names instead
# (grid_xgb_wofe / predicts_xgb_wofe).
joblib.dump(value=grid_xgb, filename='./single_ml/models/grid_xgb.joblib')
joblib.dump(value=predicts_xgb, filename='./single_ml/predictions/predicts_xgb.joblib')

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
print("\n".join([
    "Best Score: {}".format(grid_xgb.best_score_),
    "Best Params: {}".format(grid_xgb.best_params_),
]))

In [None]:
# Class probabilities for the test set from the best XGBoost estimator found
# by the grid search; make_submission later reads column 1 of this array.
y_test_xgb = grid_xgb.best_estimator_.predict_proba(X_test)

In [None]:
# Make the submission dataframe and write it to
# ./single_ml/submissions/xgb_submission.csv
make_submission(y_test_xgb,'xgb_submission')

## LightGBM with GridSearchCV

In [None]:
from lightgbm import LGBMClassifier
# Base estimator with default settings; the hyper-parameters are supplied
# later through the GridSearchCV parameter grid.
clf_lgbm = LGBMClassifier()

In [None]:
# Parameter grid for the LightGBM search. Every entry holds a single value, so
# the grid contains exactly one candidate.
lgbm_params = dict(
    nthread=[3],
    n_estimators=[10000],  # 500 was tried as an alternative
    learning_rate=[0.02],
    num_leaves=[34],
    colsample_bytree=[0.9497036],
    subsample=[0.8715623],
    max_depth=[8],
    reg_alpha=[0.041545473],
    reg_lambda=[0.0735294],
    min_split_gain=[0.0222415],
    min_child_weight=[39.3259775],
    verbose=[3],
    metric=['auc'],
)

In [None]:
# 5-fold cross-validated search over the LightGBM grid, scored by ROC AUC.
grid_lgdb = GridSearchCV(
    estimator=clf_lgbm,
    param_grid=lgbm_params,
    scoring='roc_auc',
    cv=5,
    n_jobs=1,
    verbose=3,
)
grid_lgdb.fit(X_train, y_train)

In [None]:
# Out-of-fold class probabilities on the training set from the best LightGBM model.
# The result must be named `predicts_lgbm`: that is the name the dump cell and
# the ROC plot read (the former `predicts_lgbme` spelling left it undefined).
predicts_lgbm = cross_val_predict(
    grid_lgdb.best_estimator_,
    X_train,
    y_train,
    cv=5,
    verbose=2,
    n_jobs=1,
    method='predict_proba',
)

In [None]:
# Persist the fitted search object and the out-of-fold predictions.
# For the run without engineered features, save under the *_wofe names instead
# (grid_lgdb_wofe / predicts_lgbm_wofe).
joblib.dump(value=grid_lgdb, filename='./single_ml/models/grid_lgdb.joblib')
joblib.dump(value=predicts_lgbm, filename='./single_ml/predictions/predicts_lgbm.joblib')

In [None]:
# Report the best cross-validated score and the parameters that achieved it.
print("\n".join([
    "Best Score: {}".format(grid_lgdb.best_score_),
    "Best Params: {}".format(grid_lgdb.best_params_),
]))

In [None]:
# Class probabilities for the test set from the best LightGBM estimator found
# by the grid search; make_submission later reads column 1 of this array.
y_test_lgbm = grid_lgdb.best_estimator_.predict_proba(X_test)

In [None]:
# create the submission and write it to
# ./single_ml/submissions/lgbm_submission_gs.csv
make_submission(y_test_lgbm,'lgbm_submission_gs')

## Load all Predictions

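This only works if the predictions have already been created and saved to `./single_ml/predictions/` by the cells above: once with the engineered-feature pickles, and once without them (saving that run under the `_wofe` file names).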

In [None]:
# LightGBM run WITHOUT engineered features (saved under the *_wofe names).
#grid_lgdb_wofe = joblib.load('./single_ml/models/grid_lgdb_wofe.joblib')
predicts_lgbm_wofe = joblib.load(filename='./single_ml/predictions/predicts_lgbm_wofe.joblib')

In [None]:
# LightGBM run with engineered features.
#grid_lgdb = joblib.load('./single_ml/models/grid_lgdb.joblib')
predicts_lgbm = joblib.load(filename='./single_ml/predictions/predicts_lgbm.joblib')

In [None]:
# XGBoost run WITHOUT engineered features (saved under the *_wofe names).
#grid_xgb_wofe = joblib.load('./single_ml/models/grid_xgb_wofe.joblib')
predicts_xgb_wofe = joblib.load(filename='./single_ml/predictions/predicts_xgb_wofe.joblib')

In [None]:
# XGBoost run with engineered features.
#grid_xgb = joblib.load('./single_ml/models/grid_xgb.joblib')
predicts_xgb = joblib.load(filename='./single_ml/predictions/predicts_xgb.joblib')

## Plot ROC for XGBoost and LightGBM

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve and AUC for each of the four out-of-fold prediction sets.
# Column 1 is used as the positive-class score, assuming every `predicts_*`
# array is predict_proba output for the 0/1 TARGET.
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_train, predicts_xgb[:, 1])
auc_xgb = roc_auc_score(y_train.values.ravel(), predicts_xgb[:, 1])

fpr_lgbm, tpr_lgbm, thresholds_lgbm = roc_curve(y_train, predicts_lgbm[:, 1])
auc_lgbm = roc_auc_score(y_train.values.ravel(), predicts_lgbm[:, 1])

fpr_xgb_wofe, tpr_xgb_wofe, thresholds_xgb_wofe = roc_curve(y_train, predicts_xgb_wofe[:, 1])
auc_xgb_wofe = roc_auc_score(y_train.values.ravel(), predicts_xgb_wofe[:, 1])

fpr_lgbm_wofe, tpr_lgbm_wofe, thresholds_lgbm_wofe = roc_curve(y_train, predicts_lgbm_wofe[:, 1])
auc_lgbm_wofe = roc_auc_score(y_train.values.ravel(), predicts_lgbm_wofe[:, 1])

caption = 'ROC -XGBoost vs LightGBM (with and without EF)'

fig, ax = plt.subplots()
fig.set_size_inches(13.5, 10, forward=True)

# One curve per model: dotted = with engineered features (EF), solid = without.
# The PubLB / PriLB numbers in the labels are hard-coded leaderboard scores.
ax.plot(fpr_xgb, tpr_xgb, 'r:', label='XGBoost with EF (CV=%0.5f, PubLB=%0.5f, PriLB=%0.5f)' % (round(auc_xgb,5), 0.75977, 0.75930))
ax.plot(fpr_xgb_wofe, tpr_xgb_wofe, 'r' ,label='XGBoost (CV=%0.5f, PubLB=%0.5f, PriLB=%0.5f)' % (round(auc_xgb_wofe,5), 0.74427, 0.74407))
# y-axis must be the TRUE positive rate; plotting fpr against itself would
# draw this model as the chance diagonal.
ax.plot(fpr_lgbm, tpr_lgbm, 'b:', label='LightGBM with EF (CV=%0.5f, PubLB=%0.5f, PriLB=%0.5f)' % (round(auc_lgbm,5), 0.74261, 0.74478))
ax.plot(fpr_lgbm_wofe, tpr_lgbm_wofe, 'b' ,label='LightGBM (CV=%0.5f, PubLB=%0.5f, PriLB=%0.5f)' % (round(auc_lgbm_wofe,5), 0.73145, 0.73287))

# Chance-level reference line (unlabelled, so it stays out of the legend).
ax.plot([0, 1], [0, 1], 'k--')
ax.axis([0, 1, 0, 1])
ax.grid()
ax.set_title(caption, fontsize=14)
ax.set_xlabel('False positive rate', fontsize=14)
ax.set_ylabel('True positive rate', fontsize=14)

# Build the legend exactly once: a second legend call would replace this one
# and silently drop the shadow / frame customisation below.
legend = ax.legend(loc='lower right', shadow=True, prop={'size': 14})

# The frame is matplotlib.patches.Rectangle instance surrounding the legend.
frame = legend.get_frame()
frame.set_facecolor('0.9')

fig.savefig('xgb_vs_lgbm.png', dpi=600)
plt.show()