In [1]:
import warnings
warnings.filterwarnings("ignore")

In [187]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, KFold

from sklearn.metrics import recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, roc_auc_score, \
                            classification_report, confusion_matrix

In [3]:
from sklearn.linear_model import LogisticRegression

### Read data

#### Regular features

In [4]:
df_selected = pd.read_csv('./data/df_selected.csv')

In [5]:
df_selected.drop('fico_range_low', axis = 1, inplace=True)

In [6]:
df_selected.drop(['funded_amnt','funded_amnt_inv'], axis = 1, inplace=True)

In [7]:
df_selected.drop('earliest_cr_year', axis = 1, inplace=True)

In [24]:
df_selected.drop('issue_year', axis = 1, inplace=True)

### Class balance

In [8]:
df_selected.loan_status.value_counts(normalize=True)

0    0.783494
1    0.216506
Name: loan_status, dtype: float64

In [9]:
df_selected.loan_status.value_counts()

0    358436
1     99048
Name: loan_status, dtype: int64

Loan status class is imbalanced. We need to treat this with some special techniques: 

(1) Assign class weight
(2) Use ensemble algorithoms with cross validation
(3) Upsample minority class or downsample the majority class

#### Upsample the minority class
To upsample I used scikit learn resample method setting replacement option true


In [25]:
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [26]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 2018)

In [27]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])

In [28]:
df_minor_upsmapled.loan_status.value_counts()

1    358436
0    358436
Name: loan_status, dtype: int64

#### 0. Evaluate the model
The following function prints out the evaluation information on the estimator: the AUC-ROC score, Accuracy, classification report, confusion matrix.

In [29]:
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### 1. Standarize the data

In [30]:
X = df_minor_upsmapled.drop('loan_status', axis = 1)
Y = df_minor_upsmapled.loan_status

In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [32]:
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

In [34]:
np.shape(df_minor_upsmapled)

(716872, 30)

#### 2. logistic regression model

In [35]:
logisticRegr = LogisticRegression()

In [36]:
logisticRegr.fit(xtrain_scaled, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
xtest_scaled = mms.transform(xtest)

In [38]:
lr_pred = logisticRegr.predict(xtest_scaled)

In [39]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.66409066053633

Classification report: 
             precision    recall  f1-score   support

          0       0.66      0.68      0.67     89877
          1       0.67      0.65      0.66     89341

avg / total       0.66      0.66      0.66    179218


Confusion matrix: 
[[60846 29031]
 [31170 58171]]



### 3. Random forest model

In [40]:
def random_forest(xtrain, xtest, ytrain):
    rf_params = {
        'n_estimators': 126, 
        'max_depth': 14
    }

    rf = RandomForestClassifier(**rf_params)
    rf.fit(xtrain, ytrain)
    rfpred = rf.predict(xtest)
    rfpred_proba = rf.predict_proba(xtest)
    
    return rfpred, rfpred_proba

In [41]:
rfpred, rfpred_proba = random_forest(xtrain_scaled, xtest_scaled, ytrain)

In [42]:
evaluate_model(ytest, rfpred, rfpred_proba)

ROC-AUC score of the model: 0.8054282761077389
Accuracy of the model: 0.7304177035788816

Classification report: 
             precision    recall  f1-score   support

          0       0.75      0.69      0.72     89877
          1       0.71      0.77      0.74     89341

avg / total       0.73      0.73      0.73    179218


Confusion matrix: 
[[61972 27905]
 [20409 68932]]



### Cross validation

In [38]:
scoring = ['accuracy', 'recall', 'roc_auc', 'f1']
scores = cross_validate(rf, X = xtrain_scaled, y = ytrain, scoring=scoring,
                         cv = 10, return_train_score = False, verbose = 10, n_jobs= -1)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  , accuracy=0.7333258936874605, recall=0.7784095131921219, roc_auc=0.8090472984618574, f1=0.7450206288234457, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.7297920618978536, recall=0.767372723894463, roc_auc=0.8047750367596309, f1=0.7397721573404027, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.731019603466875, recall=0.7719435154217763, roc_auc=0.8065173873635427, f1=0.7417868875874875, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.7312427928430607, recall=0.7732441471571906, roc_auc=0.803348966208371, f1=0.7422680412371134, tota

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 17.2min remaining: 17.2min


[CV]  , accuracy=0.729991630242723, recall=0.7712289568545839, roc_auc=0.803770231154411, f1=0.740874283776306, total= 8.4min
[CV]  ................................................................
[CV]  , accuracy=0.7293778480424068, recall=0.770374224237244, roc_auc=0.8041309804369061, f1=0.740224959828602, total= 8.4min


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 17.2min remaining:  7.4min


[CV]  , accuracy=0.7343625034873988, recall=0.7772864097513843, roc_auc=0.8082918628438596, f1=0.7454824108065723, total= 8.4min
[CV]  , accuracy=0.7320747698316749, recall=0.7717492288825301, roc_auc=0.8079808736099967, f1=0.7424873522944636, total= 5.1min
[CV]  , accuracy=0.7320139870545347, recall=0.7727154483630012, roc_auc=0.8082303043905488, f1=0.7426867164339036, total= 5.1min


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 22.3min finished


In [39]:
scores

{'fit_time': array([519.01635194, 519.08185387, 518.91476393, 514.854949  ,
        491.86662292, 491.60002613, 492.63204885, 491.53150296,
        296.78954768, 297.04013801]),
 'score_time': array([10.45862293, 10.56293011, 10.65213013, 10.40467215, 10.21855617,
        10.32528472, 10.14684105, 10.33355498,  6.49080539,  6.19442701]),
 'test_accuracy': array([0.72979206, 0.7310196 , 0.73124279, 0.73332589, 0.73371648,
        0.72999163, 0.7343625 , 0.72937785, 0.73207477, 0.73201399]),
 'test_recall': array([0.76737272, 0.77194352, 0.77324415, 0.77840951, 0.77421033,
        0.77122896, 0.77728641, 0.77037422, 0.77174923, 0.77271545]),
 'test_roc_auc': array([0.80477504, 0.80651739, 0.80334897, 0.8090473 , 0.80837787,
        0.80377023, 0.80829186, 0.80413098, 0.80798087, 0.8082303 ]),
 'test_f1': array([0.73977216, 0.74178689, 0.74226804, 0.74502063, 0.74427079,
        0.74087428, 0.74548241, 0.74022496, 0.74248735, 0.74268672])}

In [54]:
print('F1 score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_f1']), np.var(scores['test_f1'])))
print('Recall score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_recall']), np.var(scores['test_recall'])))
print('Accuracy score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_accuracy']), np.var(scores['test_accuracy'])))

F1 score# (1) mean: 0.7424874224946193 (2)variance: 3.4239691671447294e-06
Recall score# (1) mean: 0.7728534498486367 (2)variance: 9.340496661280428e-06
Accuracy score# (1) mean: 0.7316917565649772 (2)variance: 2.6660110240196636e-06


### 4. LightGBM model

In [227]:
import lightgbm

In [228]:
lbg_params = {
    'n_estimators': 8000,
    'max_depth': 100,
    'objective': 'binary',
    'learning_rate' : 0.02,
    'num_leaves' : 250,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 1,
    'boosting_type' : 'gbdt'
}

In [229]:
lgb = lightgbm.LGBMClassifier(**lbg_params)

In [230]:
lgb.fit(xtrain_scaled, ytrain)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=1, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.64,
        learning_rate=0.02, max_depth=100, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=8000,
        n_jobs=-1, num_leaves=250, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

#### Test the model with test data

In [231]:
lgb_pred = lgb.predict(xtest_scaled)

  if diff:


In [232]:
lgb_pred_proba = lgb.predict_proba(xtest_scaled)

In [233]:
evaluate_model(ytest, lgb_pred, lgb_pred_proba)

ROC-AUC score of the model: 0.9586191656898193
Accuracy of the model: 0.890546708477943

Classification report: 
             precision    recall  f1-score   support

          0       0.93      0.85      0.89     89877
          1       0.86      0.94      0.90     89341

avg / total       0.89      0.89      0.89    179218


Confusion matrix: 
[[75963 13914]
 [ 5702 83639]]



#### Cross validation

In [240]:
folds = list(KFold(5, shuffle=True, random_state=2016)\
             .split(xtrain_scaled, ytrain))

In [241]:
for i, (train_idx, valid_idx) in enumerate(folds):
    
    ytrain = np.array(ytrain)
    X_train = xtrain_scaled[train_idx]
    y_train = ytrain[train_idx]
    X_valid = xtrain_scaled[valid_idx]
    y_valid = ytrain[valid_idx]
    
    lgb.fit(X_train, y_train)
    pred = lgb.predict(X_valid)
    pred_proba = lgb.predict_proba(X_valid)
    
    print('\ncv: {}\n'.format(i))
    evaluate_model(y_valid, pred, pred_proba)

  if diff:



cv: 0

ROC-AUC score of the model: 0.9483670270426987
Accuracy of the model: 0.8754963684890869

Classification report: 
             precision    recall  f1-score   support

          0       0.91      0.83      0.87     53472
          1       0.84      0.92      0.88     54059

avg / total       0.88      0.88      0.88    107531


Confusion matrix: 
[[44284  9188]
 [ 4200 49859]]



  if diff:



cv: 1

ROC-AUC score of the model: 0.9478880521895745
Accuracy of the model: 0.8756730617217361

Classification report: 
             precision    recall  f1-score   support

          0       0.92      0.83      0.87     53861
          1       0.84      0.92      0.88     53670

avg / total       0.88      0.88      0.88    107531


Confusion matrix: 
[[44633  9228]
 [ 4141 49529]]



  if diff:



cv: 2

ROC-AUC score of the model: 0.9490374658331387
Accuracy of the model: 0.8780909691158829

Classification report: 
             precision    recall  f1-score   support

          0       0.92      0.83      0.87     53504
          1       0.85      0.92      0.88     54027

avg / total       0.88      0.88      0.88    107531


Confusion matrix: 
[[44457  9047]
 [ 4062 49965]]



  if diff:



cv: 3

ROC-AUC score of the model: 0.9490402272662238
Accuracy of the model: 0.8781188680473538

Classification report: 
             precision    recall  f1-score   support

          0       0.92      0.83      0.87     53958
          1       0.85      0.92      0.88     53573

avg / total       0.88      0.88      0.88    107531


Confusion matrix: 
[[44916  9042]
 [ 4064 49509]]



  if diff:



cv: 4

ROC-AUC score of the model: 0.9486846347287888
Accuracy of the model: 0.8762112898725937

Classification report: 
             precision    recall  f1-score   support

          0       0.91      0.83      0.87     53764
          1       0.84      0.92      0.88     53766

avg / total       0.88      0.88      0.88    107530


Confusion matrix: 
[[44671  9093]
 [ 4218 49548]]

