We will begin by importing the relevant packages i.e., Pandas, Sklearn

In [137]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB

We will then import the credit risk dataset using pandas

In [138]:
dataset = pd.read_csv('credit_risk_dataset.csv')
dataset

Unnamed: 0.1,Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,30786,41,40000,RENT,3.0,PERSONAL,A,9200,7.49,0,0.23,N,13
1,29460,44,28000,OWN,0.0,VENTURE,A,3500,8.94,0,0.13,N,12
2,7059,22,56000,RENT,0.0,DEBTCONSOLIDATION,B,7000,11.36,0,0.13,N,2
3,5377,24,45000,MORTGAGE,2.0,PERSONAL,A,7200,7.29,0,0.16,N,4
4,27170,28,55000,RENT,3.0,HOMEIMPROVEMENT,E,15000,17.06,0,0.27,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,9887,23,45000,RENT,7.0,EDUCATION,B,9000,10.65,0,0.20,N,2
24431,15196,26,125000,MORTGAGE,3.0,EDUCATION,D,24000,15.28,0,0.19,Y,4
24432,32218,43,138000,MORTGAGE,4.0,EDUCATION,A,9900,6.99,0,0.07,N,14
24433,25398,27,84780,MORTGAGE,5.0,VENTURE,A,18000,7.51,0,0.21,N,10


There are some empty cells on the person_emp_length and loan_int_rate.
Perform data cleaning to replacing empty fields with the mean of the columns

In [139]:
person_emp_length_mean = dataset["person_emp_length"].mean()
loan_int_rate_mean = dataset["loan_int_rate"].mean()

print("'person_emp_length' mean:" + str(person_emp_length_mean) + ", 'loan_int_rate' mean:" + str(loan_int_rate_mean))

dataset["person_emp_length"].fillna(person_emp_length_mean,inplace = True)
dataset["loan_int_rate"].fillna(loan_int_rate_mean,inplace = True)
dataset

'person_emp_length' mean:4.778072167549836, 'loan_int_rate' mean:11.006376791932013


Unnamed: 0.1,Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,30786,41,40000,RENT,3.0,PERSONAL,A,9200,7.49,0,0.23,N,13
1,29460,44,28000,OWN,0.0,VENTURE,A,3500,8.94,0,0.13,N,12
2,7059,22,56000,RENT,0.0,DEBTCONSOLIDATION,B,7000,11.36,0,0.13,N,2
3,5377,24,45000,MORTGAGE,2.0,PERSONAL,A,7200,7.29,0,0.16,N,4
4,27170,28,55000,RENT,3.0,HOMEIMPROVEMENT,E,15000,17.06,0,0.27,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,9887,23,45000,RENT,7.0,EDUCATION,B,9000,10.65,0,0.20,N,2
24431,15196,26,125000,MORTGAGE,3.0,EDUCATION,D,24000,15.28,0,0.19,Y,4
24432,32218,43,138000,MORTGAGE,4.0,EDUCATION,A,9900,6.99,0,0.07,N,14
24433,25398,27,84780,MORTGAGE,5.0,VENTURE,A,18000,7.51,0,0.21,N,10


First we need to map the "cb_person_default_on_file" field to where Y=1 and N=0
There exists several fields with categorical values. We are going to encoding of categorical values.

In [140]:
dataset["cb_person_default_on_file"] = dataset["cb_person_default_on_file"].replace(["Y"], 1)
dataset["cb_person_default_on_file"] = dataset["cb_person_default_on_file"].replace(["N"], 0)
dataset = pd.get_dummies(dataset)
dataset

Unnamed: 0.1,Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
0,30786,41,40000,3.0,9200,7.49,0,0.23,0,13,...,0,1,0,1,0,0,0,0,0,0
1,29460,44,28000,0.0,3500,8.94,0,0.13,0,12,...,0,0,1,1,0,0,0,0,0,0
2,7059,22,56000,0.0,7000,11.36,0,0.13,0,2,...,0,0,0,0,1,0,0,0,0,0
3,5377,24,45000,2.0,7200,7.29,0,0.16,0,4,...,0,1,0,1,0,0,0,0,0,0
4,27170,28,55000,3.0,15000,17.06,0,0.27,1,5,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,9887,23,45000,7.0,9000,10.65,0,0.20,0,2,...,0,0,0,0,1,0,0,0,0,0
24431,15196,26,125000,3.0,24000,15.28,0,0.19,1,4,...,0,0,0,0,0,0,1,0,0,0
24432,32218,43,138000,4.0,9900,6.99,0,0.07,0,14,...,0,0,0,1,0,0,0,0,0,0
24433,25398,27,84780,5.0,18000,7.51,0,0.21,0,10,...,0,0,1,1,0,0,0,0,0,0


We then need to split the training and testing data from the dataset using sklearn

In [141]:
target_dataset = dataset['cb_person_default_on_file']
dataset.drop('cb_person_default_on_file', inplace=True, axis=1)
train_feature_dataset, test_feature_dataset, train_target_dataset, test_target_dataset  = train_test_split(dataset, target_dataset, test_size=0.2)

Run Naive Bayes on the dataset

In [142]:
mnb = MultinomialNB()
gnb = GaussianNB()
predicted_dataset_mnb = mnb.fit(train_feature_dataset, train_target_dataset).predict(test_feature_dataset)
predicted_dataset_gnb = gnb.fit(train_feature_dataset, train_target_dataset).predict(test_feature_dataset)
print("MultinomialNB: Number of mislabeled points out of a total %d points : %d" % (test_feature_dataset.shape[0], (test_target_dataset != predicted_dataset_mnb).sum()))
print("GaussianNB: Number of mislabeled points out of a total %d points : %d" % (test_feature_dataset.shape[0], (test_target_dataset != predicted_dataset_gnb).sum()))

MultinomialNB: Number of mislabeled points out of a total 4887 points : 2284
GaussianNB: Number of mislabeled points out of a total 4887 points : 839
