In [76]:
import warnings
warnings.filterwarnings("ignore")

In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle, class_weight
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [78]:
from sklearn.linear_model import LogisticRegression

### Read data

In [79]:
df_selected = pd.read_csv('./data/df_selected.csv')

In [80]:
print(df_selected.columns)

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'loan_status', 'purpose',
       'addr_state', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'application_type', 'acc_open_past_24mths',
       'mort_acc', 'pub_rec_bankruptcies', 'tax_liens', 'disbursement_method',
       'issue_month', 'issue_year', 'earliest_cr_year', 'credit_history'],
      dtype='object')


In [81]:
df_selected.drop('application_type', axis = 1, inplace=True)

In [82]:
df_selected.drop('issue_year', axis = 1, inplace=True)

In [83]:
df_selected.drop('fico_range_low', axis = 1, inplace=True)

In [84]:
df_selected.drop(['funded_amnt','funded_amnt_inv'], axis = 1, inplace=True)

In [85]:
df_selected.drop('earliest_cr_year', axis = 1, inplace=True)

### Class balance

In [86]:
df_selected.loan_status.value_counts(normalize=True)

0    0.783494
1    0.216506
Name: loan_status, dtype: float64

In [87]:
df_selected.loan_status.value_counts()

0    358436
1     99048
Name: loan_status, dtype: int64

Loan status class is imbalanced. We need to treat this with some special techniques: 

(1) Assign class weight
(2) Use ensemble algorithoms with cross validation
(3) Overpredict minor class

#### Downsmaple_majority class

In [88]:
df_major = df_selected[df_selected.loan_status == 0]
df_minor = df_selected[df_selected.loan_status == 1]

In [89]:
from sklearn.utils import resample

In [90]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 2018)

In [91]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])

In [92]:
df_minor_upsmapled.loan_status.value_counts()

1    358436
0    358436
Name: loan_status, dtype: int64

#### 0. Evaluation metrics:

In [93]:
def evaluate_model(ytest, ypred):
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### 1. Standarize the data 

In [94]:
X = df_minor_upsmapled.drop('loan_status', axis = 1)
Y = df_minor_upsmapled.loan_status

In [95]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)

In [96]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    new_y = np_utils.to_categorical(encoded_y)
    return new_y

# Neural Network model
ytrain_encoded = encode_label(ytrain)
ytest_encoded = encode_label(ytest)

In [97]:
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

In [98]:
np.shape(df_minor_upsmapled)

(716872, 29)

#### 2. logistic regression model

In [27]:
logisticRegr = LogisticRegression()

In [28]:
logisticRegr.fit(xtrain_scaled, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
xtest_scaled = mms.transform(xtest)

In [30]:
lr_pred = logisticRegr.predict(xtest_scaled)

In [31]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.6635215212757647

Classification report: 
             precision    recall  f1-score   support

          0       0.66      0.68      0.67     89877
          1       0.67      0.65      0.66     89341

avg / total       0.66      0.66      0.66    179218


Confusion matrix: 
[[60806 29071]
 [31232 58109]]



### 3. Random forest model

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score

In [33]:
rf_params = {
    'n_estimators': 126, 
    'max_depth': 14
}

rf = RandomForestClassifier(**rf_params)
rf.fit(xtrain_scaled, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=126, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
rfpred = rf.predict(xtest_scaled)

In [35]:
rfpred_proba = rf.predict_proba(xtest_scaled)

In [36]:
evaluate_model(ytest, rfpred)

Accuracy of the model: 0.7320470042071667

Classification report: 
             precision    recall  f1-score   support

          0       0.75      0.69      0.72     89877
          1       0.71      0.77      0.74     89341

avg / total       0.73      0.73      0.73    179218


Confusion matrix: 
[[62025 27852]
 [20170 69171]]



In [37]:
roc_auc_score(ytest, rfpred_proba[:, 1])

0.8076692169438009

### Cross validation

In [38]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, roc_auc_score, f1_score

scoring = ['accuracy', 'recall', 'roc_auc', 'f1']
scores = cross_validate(rf, X = xtrain_scaled, y = ytrain, scoring=scoring,
                         cv = 10, return_train_score = False, verbose = 10, n_jobs= -1)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  , accuracy=0.7333258936874605, recall=0.7784095131921219, roc_auc=0.8090472984618574, f1=0.7450206288234457, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.7297920618978536, recall=0.767372723894463, roc_auc=0.8047750367596309, f1=0.7397721573404027, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.731019603466875, recall=0.7719435154217763, roc_auc=0.8065173873635427, f1=0.7417868875874875, total= 8.8min
[CV]  ................................................................
[CV]  , accuracy=0.7312427928430607, recall=0.7732441471571906, roc_auc=0.803348966208371, f1=0.7422680412371134, tota

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 17.2min remaining: 17.2min


[CV]  , accuracy=0.729991630242723, recall=0.7712289568545839, roc_auc=0.803770231154411, f1=0.740874283776306, total= 8.4min
[CV]  ................................................................
[CV]  , accuracy=0.7293778480424068, recall=0.770374224237244, roc_auc=0.8041309804369061, f1=0.740224959828602, total= 8.4min


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 17.2min remaining:  7.4min


[CV]  , accuracy=0.7343625034873988, recall=0.7772864097513843, roc_auc=0.8082918628438596, f1=0.7454824108065723, total= 8.4min
[CV]  , accuracy=0.7320747698316749, recall=0.7717492288825301, roc_auc=0.8079808736099967, f1=0.7424873522944636, total= 5.1min
[CV]  , accuracy=0.7320139870545347, recall=0.7727154483630012, roc_auc=0.8082303043905488, f1=0.7426867164339036, total= 5.1min


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 22.3min finished


In [39]:
scores

{'fit_time': array([519.01635194, 519.08185387, 518.91476393, 514.854949  ,
        491.86662292, 491.60002613, 492.63204885, 491.53150296,
        296.78954768, 297.04013801]),
 'score_time': array([10.45862293, 10.56293011, 10.65213013, 10.40467215, 10.21855617,
        10.32528472, 10.14684105, 10.33355498,  6.49080539,  6.19442701]),
 'test_accuracy': array([0.72979206, 0.7310196 , 0.73124279, 0.73332589, 0.73371648,
        0.72999163, 0.7343625 , 0.72937785, 0.73207477, 0.73201399]),
 'test_recall': array([0.76737272, 0.77194352, 0.77324415, 0.77840951, 0.77421033,
        0.77122896, 0.77728641, 0.77037422, 0.77174923, 0.77271545]),
 'test_roc_auc': array([0.80477504, 0.80651739, 0.80334897, 0.8090473 , 0.80837787,
        0.80377023, 0.80829186, 0.80413098, 0.80798087, 0.8082303 ]),
 'test_f1': array([0.73977216, 0.74178689, 0.74226804, 0.74502063, 0.74427079,
        0.74087428, 0.74548241, 0.74022496, 0.74248735, 0.74268672])}

In [54]:
print('F1 score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_f1']), np.var(scores['test_f1'])))
print('Recall score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_recall']), np.var(scores['test_recall'])))
print('Accuracy score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_accuracy']), np.var(scores['test_accuracy'])))

F1 score# (1) mean: 0.7424874224946193 (2)variance: 3.4239691671447294e-06
Recall score# (1) mean: 0.7728534498486367 (2)variance: 9.340496661280428e-06
Accuracy score# (1) mean: 0.7316917565649772 (2)variance: 2.6660110240196636e-06


### Neural Network

In [60]:
## Keras related modules
from keras.models import Sequential, save_model, load_model
from keras.layers import Dense, Dropout, BatchNormalization
from keras import optimizers, regularizers, initializers
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [99]:
HIDDEN_LAYER_SIZE = 32
L2_REGULARIZER = 5
LEARNING_RATE = 0.025
TRAINING_EPOCHS = 100
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.30

In [102]:
model = Sequential()
model.add(Dense(32, 
                activation = 'relu',
                kernel_initializer = initializers.RandomNormal(mean = 0.0, stddev = 1, seed = 100),
                bias_initializer = 'Ones',
                input_shape = (np.shape(xtrain_scaled)[1],),
#                 kernel_regularizer = regularizers.l2(L2_REGULARIZER),
                name = 'layer_1'))

model.add(Dense(16, 
                activation = 'relu',
                kernel_initializer = initializers.RandomNormal(mean = 0.0, stddev = 1, seed = 100),
                bias_initializer = 'Ones',
#                 kernel_regularizer = regularizers.l2(L2_REGULARIZER),
                name = 'layer_2'))

adm_optz = optimizers.Adam(lr = LEARNING_RATE)

model.add(Dense(1, activation = "sigmoid", name = 'Output_layer'))
model.compile(optimizer = adm_optz, loss = 'binary_crossentropy', 
              metrics = ['accuracy'])

RuntimeError: ('The name "layer_1" is used 2 times in the model. All layer names should be unique. Layer names: ', ['layer_1_input', 'layer_1', 'layer_1', 'Output_layer'])

In [101]:

# filepath = './logs/random_seed_'+ str(random_seed) +'_{epoch:02d}.hdf5'# filepat 
# checkpoint = ModelCheckpoint(filepath, verbose = 0, period = 20)
# early_stopping = EarlyStopping(monitor = 'val_loss', patience = 20)
              
# callback_list = [early_stopping]

history = model.fit(xtrain_scaled, ytrain, 
                    batch_size = BATCH_SIZE, 
                    epochs = TRAINING_EPOCHS,
                    validation_split = VALIDATION_SPLIT, 
                    verbose = 1)

Train on 376357 samples, validate on 161297 samples
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 

In [58]:
plt..figurefigure((figsizefigsize==  ((1212,,  44))))
 # summarize history for accuracy# summari 
plt.subplot(1, 2, 1)
plt.plot(history.history['val_acc'], 'b-')
plt.plot(history.history['acc'], 'r-')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')

# summarize history for loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], 'r-')
plt.plot(history.history['val_loss'], 'b-')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [59]:
evaluate_model(ytest, xgb_pred)

Accuracy of the model: 0.6655469874677766

Classification report: 
             precision    recall  f1-score   support

          0       0.67      0.65      0.66     89877
          1       0.66      0.68      0.67     89341

avg / total       0.67      0.67      0.67    179218


Confusion matrix: 
[[58153 31724]
 [28216 61125]]

