In [5]:
#importing libraries that are required for the project model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_selection import SelectFromModel
%matplotlib inline

In [6]:
# load data
data = pd.read_csv("cleaned_dataset.csv",encoding='utf-8',index_col='Unnamed: 0')
data.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,target
0,11981122.0,20800.0,20800.0,20800.0,0,13.53,706.16,1,9,1,...,3.0,90.2,50.0,0.0,0.0,43100.0,23473.0,15000.0,0.0,1
1,1319523.0,12000.0,12000.0,12000.0,0,6.62,368.45,0,1,1,...,2.0,95.5,0.0,0.0,0.0,333044.0,42603.0,52600.0,42769.0,0
2,11981072.0,12000.0,12000.0,12000.0,0,10.99,392.81,1,6,4,...,4.0,100.0,0.0,0.0,0.0,29700.0,7137.0,18100.0,0.0,1
3,11991209.0,12000.0,12000.0,12000.0,0,13.53,407.4,1,9,1,...,2.0,81.2,33.3,0.0,0.0,18130.0,13605.0,7000.0,10030.0,1
4,11979581.0,24000.0,24000.0,24000.0,0,13.53,814.8,1,9,1,...,2.0,100.0,75.0,0.0,0.0,229072.0,61397.0,21500.0,58847.0,1


In [8]:
#Target variable
data['target'].value_counts()

1    143919
0     44264
Name: target, dtype: int64

In [9]:
X, y = data.drop(['target'],axis=1), data['target']
X.head()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,11981122.0,20800.0,20800.0,20800.0,0,13.53,706.16,1,9,1,...,0.0,3.0,90.2,50.0,0.0,0.0,43100.0,23473.0,15000.0,0.0
1,1319523.0,12000.0,12000.0,12000.0,0,6.62,368.45,0,1,1,...,0.0,2.0,95.5,0.0,0.0,0.0,333044.0,42603.0,52600.0,42769.0
2,11981072.0,12000.0,12000.0,12000.0,0,10.99,392.81,1,6,4,...,0.0,4.0,100.0,0.0,0.0,0.0,29700.0,7137.0,18100.0,0.0
3,11991209.0,12000.0,12000.0,12000.0,0,13.53,407.4,1,9,1,...,0.0,2.0,81.2,33.3,0.0,0.0,18130.0,13605.0,7000.0,10030.0
4,11979581.0,24000.0,24000.0,24000.0,0,13.53,814.8,1,9,1,...,0.0,2.0,100.0,75.0,0.0,0.0,229072.0,61397.0,21500.0,58847.0


In [10]:
#Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
len(X_train)

150546

In [12]:
#Brief overview of the columns
for col in X_train:
    print(col, ":" , X_train[col].max())

member_id : 12096968.0
loan_amnt : 35000.0
funded_amnt : 35000.0
funded_amnt_inv : 35000.0
term : 2
int_rate : 26.06
installment : 1408.13
grade : 7
sub_grade : 35
emp_length : 11
home_ownership : 5
annual_inc : 6100000.0
verification_status : 3
issue_d : 24
purpose : 13
zip_code : 839
addr_state : 49
dti : 34.99
delinq_2yrs : 29.0
earliest_cr_line : 614
fico_range_low : 845.0
fico_range_high : 850.0
inq_last_6mths : 8.0
mths_since_last_delinq : 152.0
mths_since_last_record : 121.0
open_acc : 53.0
pub_rec : 54.0
revol_bal : 2568995.0
revol_util : 128.1
total_acc : 105.0
initial_list_status : 2
out_prncp : 21447.0
out_prncp_inv : 21447.0
total_pymnt : 59829.51901
total_pymnt_inv : 59701.31
total_rec_prncp : 35000.0
total_rec_int : 25238.58
total_rec_late_fee : 367.6
recoveries : 39444.37
collection_recovery_fee : 6124.938
last_pymnt_d : 59
last_pymnt_amnt : 35760.2
next_pymnt_d : 3
last_credit_pull_d : 60
last_fico_range_high : 850.0
last_fico_range_low : 845.0
collections_12_mths_ex_me

In [13]:
# train decision tree classifier
decision_clf = DecisionTreeClassifier()
decision_clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [14]:
# train decision tree classifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [15]:
# train naive bayes classifier
bayes_clf = BernoulliNB()
bayes_clf.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [16]:
# train logistic regression classifier
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
#Cross validation results of the defined decision trees

print("Cross Validation Results")
print("="*40)
print("Decision Tree")
print(cross_val_score(decision_clf,X_train,y_train,cv=10))
print("="*40)
print("KNN Classifier")
print(cross_val_score(knn_clf,X_train,y_train,cv=10))
print("="*40)
print("Bayes Classifier")
print(cross_val_score(bayes_clf,X_train,y_train,cv=10))
print("="*40)
print("Logistic Regression")
print(cross_val_score(log_reg,X_train,y_train,cv=10))

Cross Validation Results
Decision Tree
[0.99754234 0.99674527 0.99701096 0.99628031 0.99628031 0.99621388
 0.99594819 0.99634649 0.99628006 0.9952169 ]
KNN Classifier
[0.78811026 0.78445699 0.78266357 0.78551976 0.78837595 0.78266357
 0.78266357 0.78404411 0.7845091  0.78210324]
Bayes Classifier
[0.98917303 0.98983726 0.98884092 0.98877449 0.98771172 0.98930588
 0.9893723  0.98970373 0.98897303 0.98970305]
Logistic Regression
[0.99329127 0.98997011 0.9922285  0.99262703 0.99229492 0.9922285
 0.99541681 0.9934901  0.99315796 0.99654554]


In [18]:
# decision tree analysis
model_decision = SelectFromModel(decision_clf,prefit=True)
X_train_decision_new = model_decision.transform(X_train)
X_test_decision_new = X_test.iloc[:,model_decision.get_support()]
X_train_decision_new.shape

(150546, 3)

In [19]:
decision_clf_trimmed = DecisionTreeClassifier()
decision_clf_trimmed.fit(X_train_decision_new,y_train)
decision_pred_train = decision_clf_trimmed.predict(X_train_decision_new)
decision_pred_test = decision_clf_trimmed.predict(X_test_decision_new)

In [20]:
X.iloc[:,model_decision.get_support()].head()

Unnamed: 0,out_prncp,total_rec_prncp,recoveries
0,0.0,20800.0,0.0
1,366.18,11633.82,0.0
2,0.0,12000.0,0.0
3,0.0,12000.0,0.0
4,0.0,24000.0,0.0


In [21]:
# bayes analysis
model_bayes = SelectFromModel(bayes_clf,prefit=True)
X_train_bayes_new = model_bayes.transform(X_train)
X_test_bayes_new = X_test.iloc[:,model_bayes.get_support()]
X_train_bayes_new.shape

(150546, 20)

In [22]:
bayes_clf_trimmed = BernoulliNB()
bayes_clf_trimmed.fit(X_train_bayes_new,y_train)
bayes_pred_train = bayes_clf_trimmed.predict(X_train_bayes_new)
bayes_pred_test = bayes_clf_trimmed.predict(X_test_bayes_new)

In [23]:
X.iloc[:,model_bayes.get_support()].head()

Unnamed: 0,term,delinq_2yrs,pub_rec,initial_list_status,out_prncp,out_prncp_inv,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,chargeoff_within_12_mths,delinq_amnt,num_accts_ever_120_pd,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,pub_rec_bankruptcies,tax_liens
0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,2,366.18,366.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,2.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15386.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,539.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# logistic regression analysis
model_log = SelectFromModel(log_reg,prefit=True)
X_train_log_new = model_log.transform(X_train)
X_test_log_new = X_test.iloc[:,model_log.get_support()]
X_train_log_new.shape

(150546, 17)

In [25]:
log_clf_trimmed = LogisticRegression()
log_clf_trimmed.fit(X_train_log_new,y_train)
log_pred = log_clf_trimmed.predict(X_test_log_new)
log_pred_train = log_clf_trimmed.predict(X_train_log_new)
log_pred_test = log_clf_trimmed.predict(X_test_log_new)

In [26]:
X.iloc[:,model_log.get_support()].head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,zip_code,earliest_cr_line,fico_range_low,fico_range_high,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,recoveries,last_pymnt_amnt,last_fico_range_high,last_fico_range_low
0,20800.0,20800.0,20800.0,74,349,685.0,689.0,0.0,0.0,23926.64001,23926.64,20800.0,3126.64,0.0,13334.93,644.0,640.0
1,12000.0,12000.0,12000.0,40,396,760.0,764.0,366.18,366.18,12895.75,12895.75,11633.82,1261.93,0.0,368.45,749.0,745.0
2,12000.0,12000.0,12000.0,248,108,720.0,724.0,0.0,0.0,13988.61,13988.61,12000.0,1988.61,0.0,3775.55,594.0,590.0
3,12000.0,12000.0,12000.0,728,561,660.0,664.0,0.0,0.0,13359.77686,13359.78,12000.0,1359.78,0.0,119.17,744.0,740.0
4,24000.0,24000.0,24000.0,444,240,660.0,664.0,0.0,0.0,28652.21,28652.21,24000.0,4652.21,0.0,10726.61,719.0,715.0


In [27]:
print("Training Set Comparison")
print("="*40)
print("Decision Tree")
print("Recall Score:    {:2.2f}".format(recall_score(y_train,decision_pred_train)))
print("Precision Score: {:2.2f}".format(precision_score(y_train,decision_pred_train)))
print("F1 Score:        {:2.2f}".format(f1_score(y_train,decision_pred_train)))
print("Confusion Matrix: ")
print(confusion_matrix(y_train,decision_pred_train))
print("="*40)
print("Bayes Classifier")
print("Recall Score:    {:2.2f}".format(recall_score(y_train,bayes_pred_train)))
print("Precision Score: {:2.2f}".format(precision_score(y_train,bayes_pred_train)))
print("F1 Score:        {:2.2f}".format(f1_score(y_train,bayes_pred_train)))
print("Confusion Matrix: ")
print(confusion_matrix(y_train,bayes_pred_train))
print("="*40)
print("Logistic Regression")
print("Recall Score:    {:2.2f}".format(recall_score(y_train,log_pred_train)))
print("Precision Score: {:2.2f}".format(precision_score(y_train,log_pred_train)))
print("F1 Score:        {:2.2f}".format(f1_score(y_train,log_pred_train)))
print("Confusion Matrix: ")
print(confusion_matrix(y_train,log_pred_train))

Training Set Comparison
Decision Tree
Recall Score:    1.00
Precision Score: 1.00
F1 Score:        1.00
Confusion Matrix: 
[[ 35336      1]
 [     0 115209]]
Bayes Classifier
Recall Score:    1.00
Precision Score: 0.99
F1 Score:        0.99
Confusion Matrix: 
[[ 33757   1580]
 [     0 115209]]
Logistic Regression
Recall Score:    1.00
Precision Score: 1.00
F1 Score:        1.00
Confusion Matrix: 
[[ 35165    172]
 [     0 115209]]


In [28]:
print("Test Set Comparison")
print("="*40)
print("Decision Tree")
print("Recall Score:    {:2.2f}".format(recall_score(y_test,decision_pred_test)))
print("Precision Score: {:2.2f}".format(precision_score(y_test,decision_pred_test)))
print("F1 Score:        {:2.2f}".format(f1_score(y_test,decision_pred_test)))
print("Confusion Matrix: ")
print(confusion_matrix(y_test,decision_pred_test))
print("="*40)
print("Bayes Classifier")
print("Recall Score:    {:2.2f}".format(recall_score(y_test,bayes_pred_test)))
print("Precision Score: {:2.2f}".format(precision_score(y_test,bayes_pred_test)))
print("F1 Score:        {:2.2f}".format(f1_score(y_test,bayes_pred_test)))
print("Confusion Matrix: ")
print(confusion_matrix(y_test,bayes_pred_test))
print("="*40)
print("Logistic Regression")
print("Recall Score:    {:2.2f}".format(recall_score(y_test,log_pred_test)))
print("Precision Score: {:2.2f}".format(precision_score(y_test,log_pred_test)))
print("F1 Score:        {:2.2f}".format(f1_score(y_test,log_pred_test)))
print("Confusion Matrix: ")
print(confusion_matrix(y_test,log_pred_test))

Test Set Comparison
Decision Tree
Recall Score:    1.00
Precision Score: 1.00
F1 Score:        1.00
Confusion Matrix: 
[[ 8811   116]
 [    5 28705]]
Bayes Classifier
Recall Score:    1.00
Precision Score: 0.99
F1 Score:        0.99
Confusion Matrix: 
[[ 8523   404]
 [    0 28710]]
Logistic Regression
Recall Score:    1.00
Precision Score: 1.00
F1 Score:        1.00
Confusion Matrix: 
[[ 8884    43]
 [    0 28710]]
