In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle, class_weight
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Read data

In [3]:
df_selected = pd.read_csv('./data/df_selected.csv')

### Class balance

In [5]:
df_selected.loan_status.value_counts(normalize=True)

0    0.783494
1    0.216506
Name: loan_status, dtype: float64

In [6]:
df_selected.loan_status.value_counts()

0    358436
1     99048
Name: loan_status, dtype: int64

Loan status class is imbalanced. We need to treat this with some special techniques: 

(1) Assign class weight
(2) Use ensemble algorithoms with cross validation
(3) Overpredict minor class

#### Downsmaple_majority class

In [7]:
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [8]:
from sklearn.utils import resample

In [9]:
df_major_downsmapled = resample(df_major, replace = False, n_samples = 99048, random_state = 2018)

In [10]:
df_downsampled = pd.concat([df_major_downsmapled, df_minor])

In [11]:
df_downsampled.loan_status.value_counts()

1    99048
0    99048
Name: loan_status, dtype: int64

#### 0. Evaluation metrics:

In [12]:
def evaluate_model(ytest, ypred):
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### 1. Standarize the data 

In [13]:
X = df_downsampled.drop('loan_status', axis = 1)
Y = df_downsampled.loan_status
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [14]:
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

#### 2. logistic regression model

In [317]:
logisticRegr = LogisticRegression()

In [318]:
logisticRegr.fit(xtrain_scaled, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [319]:
xtest_scaled = mms.transform(xtest)

In [320]:
lr_pred = logisticRegr.predict(xtest_scaled)

In [321]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.6632743720216461

Classification report: 
             precision    recall  f1-score   support

          0       0.66      0.68      0.67     24617
          1       0.67      0.65      0.66     24907

avg / total       0.66      0.66      0.66     49524


Confusion matrix: 
[[16771  7846]
 [ 8830 16077]]



### 3. Random forest model

In [322]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score

In [394]:
rf_params = {
    'n_estimators': 120, 
    'max_depth': 15
}

rf = RandomForestClassifier(**rf_params)
rf.fit(xtrain_scaled, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [395]:
rfpred = rf.predict(xtest_scaled)

In [396]:
rfpred_proba = rf.predict_proba(xtest_scaled)

In [397]:
evaluate_model(ytest, rfpred)

Accuracy of the model: 0.6700791535417171

Classification report: 
             precision    recall  f1-score   support

          0       0.68      0.65      0.66     24617
          1       0.66      0.69      0.68     24907

avg / total       0.67      0.67      0.67     49524


Confusion matrix: 
[[15910  8707]
 [ 7632 17275]]



In [398]:
roc_auc_score(ytest, rfpred_proba[:, 1])

0.7328277400240223

### Cross validation

In [399]:
from sklearn.model_selection import cross_val_score