In [2]:
# Import pandas module
import pandas as pd

# Remove the cap on the number of displayed columns so wide results show every column
pd.set_option('display.max_columns', None)

# Load the cleaned loan data from loan_clean.csv (path is relative to the working directory)
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like a
# saved index - confirm whether it should be read with index_col=0 or dropped
loan = pd.read_csv('loan_clean.csv')

In [3]:
# Preview the first rows to check the columns and values loaded from the CSV
loan.head()

Unnamed: 0,unique_num,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,issue_d,loan_status,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,mths_since_last_delinq,open_acc,pub_rec,revol_util,total_acc,total_rec_prncp,application_type,annual_inc_joint,dti_joint,acc_now_delinq,tot_cur_bal,total_rev_hi_lim,avg_cur_bal,delinq_amnt,mort_acc,num_accts_ever_120_pd,num_tl_30dpd,num_tl_90g_dpd_24m,percent_bc_gt_75,pub_rec_bankruptcies,total_bal_ex_mort,revol_bal_joint,sec_app_mort_acc,hardship_flag
0,0,2500,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Dec-2018,Current,debt_consolidation,NY,18.24,0.0,Apr-2001,999.0,9.0,1.0,10.3,34.0,113.98,Individual,,,0.0,16901.0,42000.0,1878.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16901.0,,,N
1,1,30000,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Dec-2018,Current,debt_consolidation,LA,26.52,0.0,Jun-1987,71.0,13.0,1.0,24.2,44.0,612.25,Individual,,,0.0,321915.0,50800.0,24763.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,99468.0,,,N
2,2,5000,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Dec-2018,Current,debt_consolidation,MI,10.51,0.0,Apr-2011,999.0,8.0,0.0,19.1,13.0,212.79,Individual,,,0.0,110299.0,24100.0,18383.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,11749.0,,,N
3,3,4000,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000.0,Dec-2018,Current,debt_consolidation,WA,16.74,0.0,Feb-2006,999.0,10.0,0.0,78.1,13.0,168.07,Individual,,,0.0,305049.0,7000.0,30505.0,0.0,3.0,0.0,0.0,0.0,100.0,0.0,36151.0,,,N
4,4,30000,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250.0,Dec-2018,Current,debt_consolidation,MD,26.35,0.0,Dec-2000,999.0,12.0,0.0,3.6,26.0,660.98,Individual,,,0.0,116007.0,23100.0,9667.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,29674.0,,,N


# Preparing data for Train/Test Split

In [4]:
# Create a function to encode 'acc_now_delinq' to 1 for delinquent and 0 for non-delinquent (to use as the target/result)

def delinq(delinq_count):
    """Encode a delinquent-account count as a binary flag.

    Returns 0 when ``delinq_count`` equals 0 and 1 for any other value.
    NaN never compares equal to 0, so a missing count is encoded as 1.
    """
    return 0 if delinq_count == 0 else 1

# Derive the binary target 'delinq_status' from 'acc_now_delinq' (the number of delinquent accounts):
# 1 when the count is non-zero, 0 when it is zero.
# The vectorised comparison yields the same values as calling delinq() on every element
# (NaN compares unequal to 0, so it also maps to 1) without a Python-level call per row.
loan['delinq_status'] = loan['acc_now_delinq'].ne(0).astype('int64')

In [5]:
# Sanity-check the encoding: count of non-null 'unique_num' values in each 'delinq_status' class

loan.groupby('delinq_status')['unique_num'].count().to_frame(name='count')

Unnamed: 0_level_0,count
delinq_status,Unnamed: 1_level_1
0,1994496
1,8173


In [6]:
# One-hot encoding categorical features using pd.get_dummies

In [7]:
# One-hot encode sub_grade. drop_first=True omits the first level's column,
# because n categories only need n-1 indicator columns.
sub_grade_dummies = pd.get_dummies(loan.sub_grade, prefix='sub_grade', drop_first=True)

# Append the sub_grade indicator columns to the main 'loan' dataframe
loan_grades = pd.concat([loan, sub_grade_dummies], axis=1)

In [8]:
# One-hot encode purpose. drop_first=True omits the first level's column,
# because n categories only need n-1 indicator columns.
purpose_dummies = pd.get_dummies(loan.purpose, prefix='purpose', drop_first=True)

# Append the purpose indicator columns to the frame that already holds the sub_grade dummies
loan_grades_purpose = pd.concat([loan_grades, purpose_dummies], axis=1)

In [9]:
# One-hot encode addr_state. drop_first=True omits the first level's column,
# because n categories only need n-1 indicator columns.
state_dummies = pd.get_dummies(loan.addr_state, prefix='state', drop_first=True)

# Append the state indicator columns to the frame that already holds the sub_grade and purpose dummies
loan_grades_purpose_state = pd.concat([loan_grades_purpose, state_dummies], axis=1)

In [10]:
# One-hot encode emp_length. drop_first=True omits the first level's column,
# because n categories only need n-1 indicator columns.
emplen_dummies = pd.get_dummies(loan.emp_length, prefix='emplen', drop_first=True)

# Append the emp_length indicator columns to the frame that already holds the
# sub_grade, purpose and state dummies
loan_grades_purpose_state_emplen = pd.concat([loan_grades_purpose_state, emplen_dummies], axis=1)

In [11]:
# Give the fully encoded frame a shorter name; this binds a second name to the same
# DataFrame object (no copy is made)
loan_encoded = loan_grades_purpose_state_emplen

# Performing Train/Test Split

In [12]:
# Importing relevant modules for the train/test split
from sklearn.model_selection import train_test_split


Based on the analysis done in the EDA stage, we have selected the features below for use in the model:

<u>Numerical</u>
- annual_inc
- dti
- pub_rec_bankruptcies
- delinq_2yrs
- mths_since_last_delinq

<u>Categorical</u>
- addr_state
- sub_grade
- purpose
- emp_length

Note: The above is the complete list of features used when testing the predictive models. However, the model finally selected as the best may not use all of the features above.

In [13]:
# Feature matrix (X) columns, i.e. the factors expected to influence the delinquency prediction.
# The list starts with the numerical features only.
feature_cols = [
    'annual_inc',
    'dti',
    'pub_rec_bankruptcies',
    'delinq_2yrs',
    'mths_since_last_delinq',
]


In [14]:
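# Use list comprehension to add the dummies columns
# NOTE(review): this cell contains no code, so no dummy columns are added and feature_cols
# still holds only the five numerical features - confirm whether that is intended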



In [15]:
# Number of feature columns that will go into the feature matrix
len(feature_cols)

5

In [16]:
# Feature matrix (X): every row, restricted to the selected feature columns
X = loan_encoded.loc[:, feature_cols]

# Response vector (y): the binary delinquency flag
y = loan_encoded['delinq_status']


In [17]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified although the positive class is rare -
# consider stratify=y if the class ratio should be identical in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Setting up a baseline model and determining baseline accuracy

In [18]:
import numpy as np

# Find the most frequent class in y_train. Series.mode() returns a Series of the modal
# value(s) labelled 0, 1, ..., so the class must be read by position with .iloc[0].
# (.index[0] would give the positional label, which is always 0, not the class value.)
most_freq_class = y_train.mode().iloc[0]

# Baseline accuracy: the fraction of test-set rows whose label equals the most frequent
# training class, i.e. the accuracy of a model that always predicts that class
baseline_accuracy = np.mean(y_test == most_freq_class)
baseline_accuracy

# The baseline accuracy is at 99.6%

0.9959320973167488

In [19]:
# Number of non-null 'delinq_status' values in the full dataset (before the train/test split)
loan.delinq_status.count()

2002669

# Creating an oversampled training set using SMOTE

In [23]:
# Importing SMOTE module
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE with a fixed seed so the synthetic samples are reproducible across runs
# (123 is the same seed this notebook uses for the train/test split and the CV folds)
smote = SMOTE(random_state=123)

# Oversample the minority class on the training set only; the test set is left untouched
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)


In [24]:
len(X_train_oversampled)

2792278

# Tuning the XGBoost model

In [25]:
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Hyper-parameter grid: every combination of tree depth and number of trees,
# with the learning rate (eta) and the seed each held at a single value
xgb = XGBClassifier()
n_estimators = [500, 750, 1000, 1250]
max_depth = [9, 10, 11, 12]
print(max_depth)
param_grid = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'eta': [0.1],
    'seed': [123],
}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Exhaustive grid search scored on accuracy, using all available cores.
# NOTE(review): the folds are drawn from the SMOTE-oversampled training set, so validation
# folds contain synthetic samples and the scores may be optimistic - consider resampling
# inside each fold instead.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=kfold,
    verbose=1,
)
grid_result = grid_search.fit(X_train_oversampled, y_train_oversampled)

# Report the best configuration, then the mean and standard deviation of the
# cross-validated score for every candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



[9, 10, 11, 12]
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best: 0.998622 using {'eta': 0.1, 'max_depth': 12, 'n_estimators': 1250, 'seed': 123}
0.998409 (0.000081) with: {'eta': 0.1, 'max_depth': 9, 'n_estimators': 500, 'seed': 123}
0.998485 (0.000087) with: {'eta': 0.1, 'max_depth': 9, 'n_estimators': 750, 'seed': 123}
0.998524 (0.000093) with: {'eta': 0.1, 'max_depth': 9, 'n_estimators': 1000, 'seed': 123}
0.998551 (0.000093) with: {'eta': 0.1, 'max_depth': 9, 'n_estimators': 1250, 'seed': 123}
0.998468 (0.000089) with: {'eta': 0.1, 'max_depth': 10, 'n_estimators': 500, 'seed': 123}
0.998529 (0.000089) with: {'eta': 0.1, 'max_depth': 10, 'n_estimators': 750, 'seed': 123}
0.998562 (0.000090) with: {'eta': 0.1, 'max_depth': 10, 'n_estimators': 1000, 'seed': 123}
0.998574 (0.000090) with: {'eta': 0.1, 'max_depth': 10, 'n_estimators': 1250, 'seed': 123}
0.998514 (0.000083) with: {'eta': 0.1, 'max_depth': 11, 'n_estimators': 500, 'seed': 123}
0.998564 (0.000093) with: {