In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
warnings.filterwarnings("ignore")

In [3]:
loan_data_clean = pd.read_csv("loan_data_clean.csv", header = 0, index_col = 0)

In [3]:
loan_data_clean.head(5)

Unnamed: 0_level_0,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,total_rev_hi_lim,good_bad,mths_since_earliest_cr_line,mths_since_issue_d,mths_since_last_pymnt_d,mths_since_last_credit_pull_d
loan_amnt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5000,5000,4975.0,36,10.65,162.87,B,B2,10.0,RENT,24000.0,...,0.0,1,INDIVIDUAL,0.0,30379.087771,1,427.0,104.0,67.0,55.0
2500,2500,2500.0,60,15.27,59.83,C,C4,0.0,RENT,30000.0,...,0.0,1,INDIVIDUAL,0.0,30379.087771,0,256.0,104.0,88.0,83.0
2400,2400,2400.0,36,15.96,84.33,C,C5,10.0,RENT,12252.0,...,0.0,1,INDIVIDUAL,0.0,30379.087771,1,225.0,104.0,74.0,55.0
10000,10000,10000.0,36,13.49,339.31,C,C1,10.0,RENT,49200.0,...,0.0,1,INDIVIDUAL,0.0,30379.087771,1,294.0,104.0,67.0,67.0
3000,3000,3000.0,60,12.69,67.79,B,B5,1.0,RENT,80000.0,...,0.0,1,INDIVIDUAL,0.0,30379.087771,1,295.0,104.0,55.0,55.0


In [4]:
loan_data_clean.columns

Index(['funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment',
       'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'purpose', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'last_pymnt_amnt', 'collections_12_mths_ex_med',
       'policy_code', 'application_type', 'acc_now_delinq', 'total_rev_hi_lim',
       'good_bad', 'mths_since_earliest_cr_line', 'mths_since_issue_d',
       'mths_since_last_pymnt_d', 'mths_since_last_credit_pull_d'],
      dtype='object')

In [5]:
# To see which columns still have the nan values, and impute these nan values. 
loan_data_clean.isnull().sum()

funded_amnt                        0
funded_amnt_inv                    0
term                               0
int_rate                           0
installment                        0
grade                              0
sub_grade                          0
emp_length                         0
home_ownership                     0
annual_inc                         4
verification_status                0
purpose                            0
dti                                0
delinq_2yrs                       29
inq_last_6mths                    29
open_acc                          29
pub_rec                           29
revol_bal                          0
revol_util                         0
total_acc                         29
initial_list_status                0
out_prncp                          0
out_prncp_inv                      0
total_pymnt                        0
total_pymnt_inv                    0
total_rec_prncp                    0
total_rec_int                      0
t

In [6]:
loan_data_clean.fillna(0, inplace=True)

In [7]:
loan_dummy_df = pd.get_dummies(loan_data_clean)
loan_dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466285 entries, 5000 to 10000
Data columns (total 100 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   funded_amnt                          466285 non-null  int64  
 1   funded_amnt_inv                      466285 non-null  float64
 2   term                                 466285 non-null  int64  
 3   int_rate                             466285 non-null  float64
 4   installment                          466285 non-null  float64
 5   emp_length                           466285 non-null  float64
 6   annual_inc                           466285 non-null  float64
 7   dti                                  466285 non-null  float64
 8   delinq_2yrs                          466285 non-null  float64
 9   inq_last_6mths                       466285 non-null  float64
 10  open_acc                             466285 non-null  float64
 11  pub_rec   

In [8]:
# save new dataset for checkpointing
# loan_dummy_df.to_csv("loan_dummy.csv", index=False)
print(loan_dummy_df.shape)

# splitting into test and train after treating variables
X_train, X_test, y_train, y_test = train_test_split(loan_dummy_df, loan_data_clean['good_bad'], 
                                                    test_size=0.3, stratify = loan_data_clean['good_bad'], #to keep class balance during splitting,
                                                    random_state=42)

(466285, 100)


In [9]:
X_train.drop(columns=['good_bad'], axis=1, inplace=True)
X_test.drop(columns=['good_bad'], axis=1, inplace=True)

## Feature Selection

In [28]:
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold,RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, VarianceThreshold, RFE, f_regression, mutual_info_regression
import time

# metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score,auc,roc_curve,confusion_matrix
from statistics import mean

# models
from sklearn import model_selection
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoLars, LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier

## Full Model

First, we will attempt to train a number of machine learning algorithms on the full subset of predictors at our disposal. For all of the models, we train ans test the model's performance. This lets us form unbiased predictions since we test on “uncontaminated” data.

In [30]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.score(X_test, y_test)
print(f'Accuracy is: {y_pred*100.0: 0.3f}')

Accuracy is:  98.072


In [12]:
model = DecisionTreeClassifier(random_state=99)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)
print(f'accuracy is: {result*100.0: 0.3f}')

accuracy is:  99.043


In [13]:
model = KNeighborsClassifier(n_neighbors = 1)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)
print(f'accuracy is: {result*100.0: 0.3f}')

accuracy is:  95.794


In [18]:
X = loan_dummy_df
Y = loan_data_clean['good_bad']

In [25]:
seedValue = 99
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))
models.append(('Decision Tree Classifer', DecisionTreeClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state=seedValue)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.98130971 0.98142767 0.9811167  0.98094513 0.98192093]
List of possible Precision for Logistic Regression is: [0.97950399 0.97974424 0.97925067 0.97962001 0.98023062]
List of possible Recall for Logistic Regression is: [0.99993981 0.99981942 0.99998796 0.99939805 0.99986757]
List of possible F1 score for Logistic Regression is: [0.98961641 0.98968003 0.98951068 0.9894102  0.98995173]
List of possible ROC_AUC for Logistic Regression is: [0.9565127  0.95907888 0.95680259 0.95955572 0.95914343]
Logistic Regression: 0.981344 (0.000332)
---Classifier Logistic Regression use 247.9367 seconds ---
List of possible accuracies for XGBoost is: [0.99656862 0.99630055 0.99620404 0.99625765 0.99679381]
List of possible Precision for XGBoost is: [0.99635282 0.99616118 0.9960774  0.99616095 0.99675886]
List of possible Recall for XGBoost is: [0.99980738 0.99969903 0.99967495 0.99965087 0.99965087]
List of possible F1 score for XGBoost is: [0.99

Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,0.9964,0.9963,0.9997,0.997997,0.997689
2,Decision Tree Classifer,0.9906,0.9949,0.9946,0.99469,0.976432
0,Logistic Regression,0.9813,0.9797,0.9998,0.989634,0.958219
