In [218]:
import warnings
warnings.filterwarnings("ignore")

In [219]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle, class_weight
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [220]:
from sklearn.linear_model import LogisticRegression

### Read data

In [221]:
df_selected = pd.read_csv('./data/df_selected.csv')

### Class balance

In [222]:
df_selected.loan_status.value_counts(normalize=True)

0    0.783494
1    0.216506
Name: loan_status, dtype: float64

In [223]:
df_selected.loan_status.value_counts()

0    358436
1     99048
Name: loan_status, dtype: int64

Loan status class is imbalanced. We need to treat this with some special techniques: 

(1) Assign class weight
(2) Use ensemble algorithoms with cross validation
(3) Overpredict minor class

#### Downsmaple_majority class

In [224]:
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [225]:
from sklearn.utils import resample

In [226]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 2018)

In [227]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])

In [228]:
df_minor_upsmapled.loan_status.value_counts()

1    358436
0    358436
Name: loan_status, dtype: int64

#### 0. Evaluation metrics:

In [229]:
def evaluate_model(ytest, ypred):
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### 1. Standarize the data 

In [230]:
X = df_minor_upsmapled.drop('loan_status', axis = 1)
Y = df_minor_upsmapled.loan_status

In [231]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [232]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    new_y = np_utils.to_categorical(encoded_y)
    return new_y

# Neural Network model
ytrain_encoded = encode_label(ytrain)
ytest_encoded = encode_label(ytest)

In [233]:
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

In [234]:
np.shape(df_minor_upsmapled)

(716872, 35)

#### 2. logistic regression model

In [235]:
logisticRegr = LogisticRegression()

In [236]:
logisticRegr.fit(xtrain_scaled, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [237]:
xtest_scaled = mms.transform(xtest)

In [238]:
lr_pred = logisticRegr.predict(xtest_scaled)

In [239]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.6641576180963966

Classification report: 
             precision    recall  f1-score   support

          0       0.66      0.68      0.67     89877
          1       0.67      0.65      0.66     89341

avg / total       0.66      0.66      0.66    179218


Confusion matrix: 
[[61006 28871]
 [31318 58023]]



### 3. Random forest model

In [240]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score

In [241]:
rf_params = {
    'n_estimators': 126, 
    'max_depth': 14
}

rf = RandomForestClassifier(**rf_params)
rf.fit(xtrain_scaled, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=126, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [242]:
rfpred = rf.predict(xtest_scaled)

In [243]:
rfpred_proba = rf.predict_proba(xtest_scaled)

In [244]:
evaluate_model(ytest, rfpred)

Accuracy of the model: 0.7357743083842024

Classification report: 
             precision    recall  f1-score   support

          0       0.76      0.69      0.73     89877
          1       0.72      0.78      0.75     89341

avg / total       0.74      0.74      0.74    179218


Confusion matrix: 
[[62422 27455]
 [19899 69442]]



In [245]:
roc_auc_score(ytest, rfpred_proba[:, 1])

0.8108352361790795

### Cross validation