In [6]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle, class_weight
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
from sklearn.linear_model import LogisticRegression

### Read data

In [7]:
df_selected = pd.read_csv('./data/df_selected.csv')

In [8]:
print(df_selected.columns)

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'loan_status', 'purpose',
       'addr_state', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'application_type', 'acc_open_past_24mths',
       'mort_acc', 'pub_rec_bankruptcies', 'tax_liens', 'disbursement_method',
       'issue_month', 'issue_year', 'earliest_cr_year', 'credit_history'],
      dtype='object')


In [9]:
df_selected.drop('application_type', axis = 1, inplace=True)

In [10]:
df_selected.drop('issue_year', axis = 1, inplace=True)

In [11]:
df_selected.drop('fico_range_low', axis = 1, inplace=True)

In [12]:
df_selected.drop(['funded_amnt','funded_amnt_inv'], axis = 1, inplace=True)

In [13]:
df_selected.drop('earliest_cr_year', axis = 1, inplace=True)

### Class balance

In [14]:
df_selected.loan_status.value_counts(normalize=True)

0    0.783494
1    0.216506
Name: loan_status, dtype: float64

In [15]:
df_selected.loan_status.value_counts()

0    358436
1     99048
Name: loan_status, dtype: int64

Loan status class is imbalanced. We need to treat this with some special techniques: 

(1) Assign class weight
(2) Use ensemble algorithoms with cross validation
(3) Overpredict minor class

#### Downsmaple_majority class

In [16]:
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [17]:
from sklearn.utils import resample

In [18]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 2018)

In [19]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])

In [20]:
df_minor_upsmapled.loan_status.value_counts()

1    358436
0    358436
Name: loan_status, dtype: int64

#### 0. Evaluation metrics:

In [21]:
def evaluate_model(ytest, ypred):
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### 1. Standarize the data 

In [22]:
X = df_minor_upsmapled.drop('loan_status', axis = 1)
Y = df_minor_upsmapled.loan_status

In [23]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [24]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    new_y = np_utils.to_categorical(encoded_y)
    return new_y

# Neural Network model
ytrain_encoded = encode_label(ytrain)
ytest_encoded = encode_label(ytest)

Using TensorFlow backend.


In [25]:
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

In [26]:
np.shape(df_minor_upsmapled)

(716872, 29)

#### 2. logistic regression model

In [27]:
logisticRegr = LogisticRegression()

In [28]:
logisticRegr.fit(xtrain_scaled, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
xtest_scaled = mms.transform(xtest)

In [30]:
lr_pred = logisticRegr.predict(xtest_scaled)

In [31]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.6635215212757647

Classification report: 
             precision    recall  f1-score   support

          0       0.66      0.68      0.67     89877
          1       0.67      0.65      0.66     89341

avg / total       0.66      0.66      0.66    179218


Confusion matrix: 
[[60806 29071]
 [31232 58109]]



### 3. Random forest model

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score

In [33]:
rf_params = {
    'n_estimators': 126, 
    'max_depth': 14
}

rf = RandomForestClassifier(**rf_params)
rf.fit(xtrain_scaled, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=126, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
rfpred = rf.predict(xtest_scaled)

In [35]:
rfpred_proba = rf.predict_proba(xtest_scaled)

In [36]:
evaluate_model(ytest, rfpred)

Accuracy of the model: 0.7320470042071667

Classification report: 
             precision    recall  f1-score   support

          0       0.75      0.69      0.72     89877
          1       0.71      0.77      0.74     89341

avg / total       0.73      0.73      0.73    179218


Confusion matrix: 
[[62025 27852]
 [20170 69171]]



In [37]:
roc_auc_score(ytest, rfpred_proba[:, 1])

0.8076692169438009

### Cross validation

In [281]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, roc_auc_score, f1_score

scoring = ['accuracy', 'recall', 'roc_auc', 'f1']
scores = cross_validate(rf, X = xtrain_scaled, y = ytrain, scoring=scoring,
                         cv = 10, return_train_score = False, verbose = 10, n_jobs= -1)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  , accuracy=0.7373061042294387, recall=0.7758825715347455, roc_auc=0.8111402603388849, f1=0.7472531405461509, total= 9.6min
[CV]  ................................................................
[CV]  , accuracy=0.735111408696946, recall=0.7725009290226682, roc_auc=0.8089891978767917, f1=0.7448493317567809, total= 9.6min
[CV]  ................................................................
[CV]  , accuracy=0.7383104564222743, recall=0.7801560758082497, roc_auc=0.812914957923603, f1=0.7490099539762389, total= 9.6min
[CV]  ................................................................
[CV]  , accuracy=0.735371796302496, recall=0.77785209959123, roc_auc=0.8078189212776576, f1=0.74634528988091, total= 9

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 18.3min remaining: 18.3min


[CV]  , accuracy=0.7328559471775319, recall=0.7733472072540786, roc_auc=0.806607263795885, f1=0.7434399728489006, total= 8.7min
[CV]  ................................................................
[CV]  , accuracy=0.7374500139495954, recall=0.7775837080530678, roc_auc=0.8106767958310224, f1=0.7477664212708169, total= 8.7min


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 18.4min remaining:  7.9min


[CV]  , accuracy=0.7326513531107598, recall=0.7741276152959976, roc_auc=0.807700040681469, f1=0.7434863302162895, total= 8.8min
[CV]  , accuracy=0.7351622802938714, recall=0.7742019398714185, roc_auc=0.8109214101969001, f1=0.745300062606207, total= 5.3min
[CV]  , accuracy=0.7356223495275649, recall=0.7745364004608124, roc_auc=0.8107587403732223, f1=0.7457154102114567, total= 5.3min


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 23.7min finished
