In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

#model
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency

In [2]:
X_train = pd.read_csv("TrainData.csv")

In [3]:
X_train.head(5)

Unnamed: 0,good_bad,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,...,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,total_rev_hi_lim,mths_since_earliest_cr_line,mths_since_issue_d,mths_since_last_pymnt_d,mths_since_last_credit_pull_d
0,1,15000,15000,15000.0,36,8.9,476.3,A,A5,7.0,...,8940.22,0.0,1,INDIVIDUAL,0.0,35100.0,296.0,79.0,62.0,55.0
1,1,8000,8000,8000.0,60,18.25,204.24,D,D3,10.0,...,204.24,0.0,1,INDIVIDUAL,0.0,41300.0,301.0,79.0,55.0,55.0
2,1,12150,12150,12100.0,60,18.92,314.65,D,D4,3.0,...,314.65,0.0,1,INDIVIDUAL,0.0,9800.0,359.0,75.0,55.0,55.0
3,1,10000,10000,10000.0,36,6.03,304.36,A,A1,3.0,...,3553.37,0.0,1,INDIVIDUAL,0.0,10100.0,224.0,84.0,59.0,59.0
4,0,15825,15825,15825.0,36,12.12,526.53,B,B3,10.0,...,526.53,0.0,1,INDIVIDUAL,0.0,32400.0,494.0,87.0,78.0,55.0


In [4]:
y_train = X_train['good_bad']
X_train.drop(columns=['good_bad'], axis=1, inplace=True)

In [5]:
X_test = pd.read_csv("TestData.csv")

In [6]:
y_test = X_test['good_bad']
X_test.drop(columns=['good_bad'], axis=1, inplace=True)

### Creating dummy variables

##### Convert discrete variables to dummy variables

In [7]:
X_train = pd.get_dummies(X_train)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373028 entries, 0 to 373027
Columns: 108 entries, loan_amnt to application_type_INDIVIDUAL
dtypes: float64(27), int64(5), uint8(76)
memory usage: 118.1 MB


In [9]:
X_test = pd.get_dummies(X_test)
# reindex the dummied test set variables to make sure all the feature columns in the training set are also available in the test set
X_test = X_test.reindex(labels=X_train.columns, axis=1, fill_value=0)

In [10]:
X_test

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,...,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_INDIVIDUAL
0,1800,1800,1800.0,36,14.64,62.09,10.0,50000.0,19.11,0.0,...,0,0,0,0,0,0,0,0,1,1
1,6000,6000,6000.0,36,7.12,185.60,1.0,63000.0,7.98,0.0,...,0,0,0,0,0,0,0,0,1,1
2,10700,10700,10650.0,36,14.99,370.87,0.0,82000.0,23.35,1.0,...,0,0,0,0,0,0,0,1,0,1
3,15000,15000,15000.0,36,9.67,481.69,2.0,60000.0,28.34,0.0,...,0,0,0,0,0,0,0,1,0,1
4,14000,14000,13925.0,60,18.25,357.42,10.0,86000.0,31.02,0.0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93252,2500,2500,2500.0,36,6.39,76.51,10.0,33600.0,3.39,0.0,...,1,0,0,0,0,0,0,1,0,1
93253,10400,10400,10400.0,36,19.52,383.97,6.0,42000.0,21.71,0.0,...,0,0,0,0,0,0,0,1,0,1
93254,3000,3000,3000.0,36,23.40,116.76,8.0,30000.0,17.48,0.0,...,0,0,1,0,0,0,0,1,0,1
93255,24000,24000,24000.0,60,15.61,578.68,5.0,80000.0,20.60,0.0,...,0,0,0,0,0,0,0,1,0,1


In [11]:
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold
import time

# metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score,auc,roc_curve,confusion_matrix
from statistics import mean

# models
from sklearn import model_selection
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

## Train Test Split

In [12]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.score(X_test, y_test)
print(f'Accuracy is: {y_pred*100.0: 0.3f}')

Accuracy is:  98.166


In [13]:
model = DecisionTreeClassifier(random_state=99)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)
print(f'Accuracy is: {result*100.0: 0.3f}')

Accuracy is:  100.000


## Stratified Kfold Cross Validation

In [14]:
X = X_train
Y = y_train

In [15]:
seedValue = 99
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))
models.append(('Decision Tree Classifer', DecisionTreeClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits = 5, shuffle = True, random_state=seedValue)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.98217302 0.98189154 0.98098008 0.98178406 0.98182427]
List of possible Precision for Logistic Regression is: [0.98039216 0.98028684 0.97930425 0.98011447 0.98038434]
List of possible Recall for Logistic Regression is: [0.99998495 0.99977427 0.99977427 0.99983446 0.99959368]
List of possible F1 score for Logistic Regression is: [0.99009163 0.98993466 0.98943339 0.98987627 0.98989583]
List of possible ROC_AUC for Logistic Regression is: [0.95794226 0.95916265 0.95517434 0.95998362 0.95956228]
Logistic Regression: 0.981731 (0.000399)
---Classifier Logistic Regression use 245.8669 seconds ---
List of possible accuracies for XGBoost is: [1. 1. 1. 1. 1.]
List of possible Precision for XGBoost is: [1. 1. 1. 1. 1.]
List of possible Recall for XGBoost is: [1. 1. 1. 1. 1.]
List of possible F1 score for XGBoost is: [1. 1. 1. 1. 1.]
List of possible ROC_AUC for XGBoost is: [1. 1. 1. 1. 1.]
XGBoost: 1.000000 (0.000000)
---Classifier XGBoost

Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,1.0,1.0,1.0,1.0,1.0
2,Decision Tree Classifer,1.0,1.0,1.0,1.0,1.0
0,Logistic Regression,0.9817,0.9801,0.9998,0.989846,0.958365
