We will begin by importing the relevant packages i.e., Pandas, Sklearn Packages

In [976]:
%matplotlib inline

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

We will then import the credit risk dataset using pandas

In [977]:
original_dataset = pd.read_csv('credit_risk_dataset.csv')
original_dataset

Unnamed: 0.1,Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,30786,41,40000,RENT,3.0,PERSONAL,A,9200,7.49,0,0.23,N,13
1,29460,44,28000,OWN,0.0,VENTURE,A,3500,8.94,0,0.13,N,12
2,7059,22,56000,RENT,0.0,DEBTCONSOLIDATION,B,7000,11.36,0,0.13,N,2
3,5377,24,45000,MORTGAGE,2.0,PERSONAL,A,7200,7.29,0,0.16,N,4
4,27170,28,55000,RENT,3.0,HOMEIMPROVEMENT,E,15000,17.06,0,0.27,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,9887,23,45000,RENT,7.0,EDUCATION,B,9000,10.65,0,0.20,N,2
24431,15196,26,125000,MORTGAGE,3.0,EDUCATION,D,24000,15.28,0,0.19,Y,4
24432,32218,43,138000,MORTGAGE,4.0,EDUCATION,A,9900,6.99,0,0.07,N,14
24433,25398,27,84780,MORTGAGE,5.0,VENTURE,A,18000,7.51,0,0.21,N,10


# Data Pre-processing
There are some empty cells on the person_emp_length and loan_int_rate.
Perform data cleaning to replacing empty fields with the mean of the columns
Drop the unnamed column

In [978]:
dataset = original_dataset.copy()
person_emp_length_mean = dataset["person_emp_length"].mean()
loan_int_rate_mean = dataset["loan_int_rate"].mean()

print("'person_emp_length' mean:" + str(person_emp_length_mean) + ", 'loan_int_rate' mean:" + str(loan_int_rate_mean))

dataset["person_emp_length"].fillna(person_emp_length_mean, inplace=True)
dataset["loan_int_rate"].fillna(loan_int_rate_mean, inplace=True)

dataset.drop(dataset.columns[dataset.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)
dataset

'person_emp_length' mean:4.778072167549836, 'loan_int_rate' mean:11.006376791932013


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,41,40000,RENT,3.0,PERSONAL,A,9200,7.49,0,0.23,N,13
1,44,28000,OWN,0.0,VENTURE,A,3500,8.94,0,0.13,N,12
2,22,56000,RENT,0.0,DEBTCONSOLIDATION,B,7000,11.36,0,0.13,N,2
3,24,45000,MORTGAGE,2.0,PERSONAL,A,7200,7.29,0,0.16,N,4
4,28,55000,RENT,3.0,HOMEIMPROVEMENT,E,15000,17.06,0,0.27,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...
24430,23,45000,RENT,7.0,EDUCATION,B,9000,10.65,0,0.20,N,2
24431,26,125000,MORTGAGE,3.0,EDUCATION,D,24000,15.28,0,0.19,Y,4
24432,43,138000,MORTGAGE,4.0,EDUCATION,A,9900,6.99,0,0.07,N,14
24433,27,84780,MORTGAGE,5.0,VENTURE,A,18000,7.51,0,0.21,N,10


There exists several fields with categorical values. We are going to encoding of categorical values.

In [979]:
dataset["cb_person_default_on_file"] = dataset["cb_person_default_on_file"].replace(["Y"], 1)
dataset["cb_person_default_on_file"] = dataset["cb_person_default_on_file"].replace(["N"], 0)
dataset = pd.get_dummies(dataset)
new_dataset = dataset.copy()
new_dataset

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
0,41,40000,3.0,9200,7.49,0,0.23,0,13,0,...,0,1,0,1,0,0,0,0,0,0
1,44,28000,0.0,3500,8.94,0,0.13,0,12,0,...,0,0,1,1,0,0,0,0,0,0
2,22,56000,0.0,7000,11.36,0,0.13,0,2,0,...,0,0,0,0,1,0,0,0,0,0
3,24,45000,2.0,7200,7.29,0,0.16,0,4,1,...,0,1,0,1,0,0,0,0,0,0
4,28,55000,3.0,15000,17.06,0,0.27,1,5,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,23,45000,7.0,9000,10.65,0,0.20,0,2,0,...,0,0,0,0,1,0,0,0,0,0
24431,26,125000,3.0,24000,15.28,0,0.19,1,4,1,...,0,0,0,0,0,0,1,0,0,0
24432,43,138000,4.0,9900,6.99,0,0.07,0,14,1,...,0,0,0,1,0,0,0,0,0,0
24433,27,84780,5.0,18000,7.51,0,0.21,0,10,1,...,0,0,1,1,0,0,0,0,0,0


We will remove the outliers form the non-categorical features. We will achieve this by computing the z-score of each column relative to the column mean and standard deviation.

In [980]:
dataset = dataset[(np.abs(stats.zscore(dataset)) < 3).all(axis=1)]
new_dataset = dataset.copy()
new_dataset

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
0,41,40000,3.0,9200,7.490000,0,0.23,0,13,0,...,0,1,0,1,0,0,0,0,0,0
2,22,56000,0.0,7000,11.360000,0,0.13,0,2,0,...,0,0,0,0,1,0,0,0,0,0
3,24,45000,2.0,7200,7.290000,0,0.16,0,4,1,...,0,1,0,1,0,0,0,0,0,0
5,22,75000,6.0,5000,7.740000,0,0.07,0,4,1,...,0,0,0,1,0,0,0,0,0,0
6,23,42000,0.0,10000,11.006377,0,0.24,0,4,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,23,45000,7.0,9000,10.650000,0,0.20,0,2,0,...,0,0,0,0,1,0,0,0,0,0
24431,26,125000,3.0,24000,15.280000,0,0.19,1,4,1,...,0,0,0,0,0,0,1,0,0,0
24432,43,138000,4.0,9900,6.990000,0,0.07,0,14,1,...,0,0,0,1,0,0,0,0,0,0
24433,27,84780,5.0,18000,7.510000,0,0.21,0,10,1,...,0,0,1,1,0,0,0,0,0,0


Normalize the data. We will use the min max scalar.

In [981]:
scaler = MinMaxScaler()
columns = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income",
           "cb_person_cred_hist_length"]
dataset[columns] = scaler.fit_transform(dataset[columns])
dataset

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
0,0.807692,0.139268,0.1875,0.313514,0.158378,0,0.458333,0,0.733333,0,...,0,1,0,1,0,0,0,0,0,0
2,0.076923,0.201303,0.0000,0.234234,0.454476,0,0.250000,0,0.000000,0,...,0,0,0,0,1,0,0,0,0,0
3,0.153846,0.158654,0.1250,0.241441,0.143076,0,0.312500,0,0.133333,1,...,0,1,0,1,0,0,0,0,0,0
5,0.076923,0.274969,0.3750,0.162162,0.177506,0,0.125000,0,0.133333,1,...,0,0,0,1,0,0,0,0,0,0
6,0.115385,0.147022,0.0000,0.342342,0.427420,0,0.479167,0,0.133333,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24430,0.115385,0.158654,0.4375,0.306306,0.400153,0,0.395833,0,0.000000,0,...,0,0,0,0,1,0,0,0,0,0
24431,0.230769,0.468828,0.1875,0.846847,0.754399,0,0.375000,1,0.133333,1,...,0,0,0,0,0,0,1,0,0,0
24432,0.884615,0.519231,0.2500,0.338739,0.120122,0,0.125000,0,0.800000,1,...,0,0,0,1,0,0,0,0,0,0
24433,0.269231,0.312888,0.3125,0.630631,0.159908,0,0.416667,0,0.533333,1,...,0,0,1,1,0,0,0,0,0,0


We then need to split the training and testing data from the dataset using sklearn

In [982]:
target_dataset = dataset['cb_person_default_on_file']
dataset.drop('cb_person_default_on_file', inplace=True, axis=1)
train_feature_dataset, test_feature_dataset, train_label_dataset, test_label_dataset = train_test_split(dataset, target_dataset, test_size=0.2)

Run Naive Bayes, Decision Tree and KNN on the dataset

# Multinomial Naiver Bayes

In [983]:
mnb = MultinomialNB()

mnb = mnb.fit(train_feature_dataset, train_label_dataset)
predicted_train_dataset_mnb = mnb.predict(train_feature_dataset)
# print("MultinomialNB - Training Dataset: Number of mislabeled points out of a total %d points : %d" % (
# train_feature_dataset.shape[0], (train_label_dataset != predicted_train_dataset_mnb).sum()))
# print("MultinomialNB - Training Dataset Accuracy:",
#       metrics.accuracy_score(train_label_dataset, predicted_train_dataset_mnb))
print("MultinomialNB - Training Dataset Classification Report:", "\n", classification_report(train_label_dataset, predicted_train_dataset_mnb))
print("MultinomialNB - Training Dataset Confusion Matrix:", "\n", confusion_matrix(train_label_dataset, predicted_train_dataset_mnb), "\n")

predicted_test_dataset_mnb = mnb.predict(test_feature_dataset)
# print("MultinomialNB - Test Dataset: Number of mislabeled points out of a total %d points : %d" % (
# test_feature_dataset.shape[0], (test_label_dataset != predicted_test_dataset_mnb).sum()))
# print("MultinomialNB - Test Dataset Accuracy:", metrics.accuracy_score(test_label_dataset, predicted_test_dataset_mnb))
print("MultinomialNB - Test Dataset Classification Report:", "\n", classification_report(test_label_dataset, predicted_test_dataset_mnb))
print("MultinomialNB - Test Dataset Confusion Matrix:", "\n", confusion_matrix(test_label_dataset, predicted_test_dataset_mnb), "\n")

MultinomialNB - Training Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.87      0.90     13694
           1       0.50      0.66      0.57      2682

    accuracy                           0.84     16376
   macro avg       0.72      0.77      0.74     16376
weighted avg       0.86      0.84      0.85     16376

MultinomialNB - Training Dataset Confusion Matrix: 
 [[11960  1734]
 [  913  1769]] 

MultinomialNB - Test Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.86      0.90      3438
           1       0.49      0.68      0.57       657

    accuracy                           0.83      4095
   macro avg       0.71      0.77      0.73      4095
weighted avg       0.86      0.83      0.84      4095

MultinomialNB - Test Dataset Confusion Matrix: 
 [[2969  469]
 [ 209  448]] 



# Gaussian Naive Bayes

In [984]:
gnb = GaussianNB()
gnb = gnb.fit(train_feature_dataset, train_label_dataset)
predicted_train_dataset_gnb = gnb.predict(train_feature_dataset)
# print("GaussianNB - Training Dataset: Number of mislabeled points out of a total %d points : %d" % (
# train_feature_dataset.shape[0], (train_label_dataset != predicted_train_dataset_gnb).sum()))
# print("GaussianNB - Training Dataset Accuracy:",
#       metrics.accuracy_score(train_label_dataset, predicted_train_dataset_gnb), "\n")
print("GaussianNB - Training Dataset Classification Report:", "\n", classification_report(train_label_dataset, predicted_train_dataset_gnb))
print("GaussianNB - Training Dataset Confusion Matrix:", "\n", confusion_matrix(train_label_dataset, predicted_train_dataset_gnb), "\n")

predicted_test_dataset_gnb = gnb.predict(test_feature_dataset)
# print("GaussianNB - Test Dataset: Number of mislabeled points out of a total %d points : %d" % (
# test_feature_dataset.shape[0], (test_label_dataset != predicted_test_dataset_gnb).sum()))
# print("GaussianNB - Test Dataset Accuracy:", metrics.accuracy_score(test_label_dataset, predicted_test_dataset_gnb),
#       "\n")
print("GaussianNB - Test Dataset Classification Report:", "\n", classification_report(test_label_dataset, predicted_test_dataset_gnb))
print("GaussianNB - Test Dataset Confusion Matrix:", "\n", confusion_matrix(test_label_dataset, predicted_test_dataset_gnb), "\n")

GaussianNB - Training Dataset Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.81      0.89     13694
           1       0.51      1.00      0.67      2682

    accuracy                           0.84     16376
   macro avg       0.75      0.90      0.78     16376
weighted avg       0.92      0.84      0.86     16376

GaussianNB - Training Dataset Confusion Matrix: 
 [[11081  2613]
 [    0  2682]] 

GaussianNB - Test Dataset Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.81      0.90      3438
           1       0.50      1.00      0.67       657

    accuracy                           0.84      4095
   macro avg       0.75      0.91      0.78      4095
weighted avg       0.92      0.84      0.86      4095

GaussianNB - Test Dataset Confusion Matrix: 
 [[2788  650]
 [   0  657]] 



# Decision Tree

In [985]:
clf = DecisionTreeClassifier()
clf = clf.fit(train_feature_dataset, train_label_dataset)
# tree.plot_tree(clf)
predicted_train_dataset_dt = clf.predict(train_feature_dataset)
# print("DecisionTreeClassifier - Train Dataset: Number of mislabeled points out of a total %d points : %d" % (
# train_feature_dataset.shape[0], (train_label_dataset != predicted_train_dataset_dt).sum()))
# print("DecisionTreeClassifier - Train Dataset Accuracy:",
#       metrics.accuracy_score(train_label_dataset, predicted_train_dataset_dt), "\n")
print("DecisionTreeClassifier - Training Dataset Classification Report:", "\n", classification_report(train_label_dataset, predicted_train_dataset_dt))
print("DecisionTreeClassifier - Training Dataset Confusion Matrix:", "\n", confusion_matrix(train_label_dataset, predicted_train_dataset_dt), "\n")

predicted_test_dataset_dt = clf.predict(test_feature_dataset)
# print("DecisionTreeClassifier - Test Dataset: Number of mislabeled points out of a total %d points : %d" % (
# test_feature_dataset.shape[0], (test_label_dataset != predicted_test_dataset_dt).sum()))
# print("DecisionTreeClassifier - Test Dataset Accuracy:",
#       metrics.accuracy_score(test_label_dataset, predicted_test_dataset_dt), "\n")
print("DecisionTreeClassifier - Test Dataset Classification Report:", "\n", classification_report(test_label_dataset, predicted_test_dataset_dt))
print("DecisionTreeClassifier - Test Dataset Confusion Matrix:", "\n", confusion_matrix(test_label_dataset, predicted_test_dataset_dt), "\n")

DecisionTreeClassifier - Training Dataset Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13694
           1       1.00      1.00      1.00      2682

    accuracy                           1.00     16376
   macro avg       1.00      1.00      1.00     16376
weighted avg       1.00      1.00      1.00     16376

DecisionTreeClassifier - Training Dataset Confusion Matrix: 
 [[13694     0]
 [    7  2675]] 

DecisionTreeClassifier - Test Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.91      0.90      3438
           1       0.50      0.48      0.49       657

    accuracy                           0.84      4095
   macro avg       0.70      0.69      0.70      4095
weighted avg       0.84      0.84      0.84      4095

DecisionTreeClassifier - Test Dataset Confusion Matrix: 
 [[3117  321]
 [ 342  315]] 



# KNN

In [1011]:
k = 5
# for k in range(2, 9):
#     print("K value:", k)
neigh = KNeighborsClassifier(n_neighbors=k)
neigh = neigh.fit(train_feature_dataset, train_label_dataset)

predicted_train_dataset_knn = neigh.predict(train_feature_dataset)
# print("KNeighborsClassifier - Train Dataset: Number of mislabeled points out of a total %d points : %d" % (
# train_feature_dataset.shape[0], (train_label_dataset != predicted_train_dataset_knn).sum()))
# print("KNeighborsClassifier - Train Dataset Accuracy:",
#       metrics.accuracy_score(train_label_dataset, predicted_train_dataset_knn), "\n")
print("KNeighborsClassifier - Train Dataset Classification Report:", "\n", classification_report(train_label_dataset, predicted_train_dataset_knn))
print("KNeighborsClassifier - Train Dataset Confusion Matrix:", "\n", confusion_matrix(train_label_dataset, predicted_train_dataset_knn), "\n")

predicted_test_dataset_knn = neigh.predict(test_feature_dataset)
# print("KNeighborsClassifier - Test Dataset: Number of mislabeled points out of a total %d points : %d" % (
# test_feature_dataset.shape[0], (test_label_dataset != predicted_test_dataset_knn).sum()))
# print("KNeighborsClassifier - Test Dataset Accuracy:",
#       metrics.accuracy_score(test_label_dataset, predicted_test_dataset_knn), "\n")
print("KNeighborsClassifier - Test Dataset Classification Report:", "\n", classification_report(test_label_dataset, predicted_test_dataset_knn))
print("KNeighborsClassifier - Test Dataset Confusion Matrix:", "\n", confusion_matrix(test_label_dataset, predicted_test_dataset_knn), "\n")

KNeighborsClassifier - Train Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     13704
           1       0.70      0.70      0.70      2672

    accuracy                           0.90     16376
   macro avg       0.82      0.82      0.82     16376
weighted avg       0.90      0.90      0.90     16376

KNeighborsClassifier - Train Dataset Confusion Matrix: 
 [[12890   814]
 [  805  1867]] 

KNeighborsClassifier - Test Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.91      0.90      3428
           1       0.51      0.50      0.50       667

    accuracy                           0.84      4095
   macro avg       0.70      0.70      0.70      4095
weighted avg       0.84      0.84      0.84      4095

KNeighborsClassifier - Test Dataset Confusion Matrix: 
 [[3105  323]
 [ 335  332]] 



# New Dataset
Predicting data using a new dataset.
First we pre-process the new dataset for the trained model to understand.

In [987]:
new_dataset = pd.read_csv('credit_risk_dataset.csv')

person_emp_length_mean = new_dataset["person_emp_length"].mean()
loan_int_rate_mean = new_dataset["loan_int_rate"].mean()
new_dataset["person_emp_length"].fillna(person_emp_length_mean, inplace=True)
new_dataset["loan_int_rate"].fillna(loan_int_rate_mean, inplace=True)

new_dataset.drop(new_dataset.columns[new_dataset.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)

new_dataset["cb_person_default_on_file"] = new_dataset["cb_person_default_on_file"].replace(["Y"], 1)
new_dataset["cb_person_default_on_file"] = new_dataset["cb_person_default_on_file"].replace(["N"], 0)
new_dataset = pd.get_dummies(new_dataset)

scaler = MinMaxScaler()
columns = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income",
           "cb_person_cred_hist_length"]
new_dataset[columns] = scaler.fit_transform(new_dataset[columns])

actual_labels = new_dataset["cb_person_default_on_file"]
features = new_dataset.drop(["cb_person_default_on_file"], axis=1)

# Multinomial Naive Bayes

In [988]:
predicted_labels_mnb = mnb.predict(features)
print("MultinomialNB - New Dataset Accuracy:", metrics.accuracy_score(actual_labels, predicted_labels_mnb),
      "\n")
print("MultinomialNB - New Dataset Classification Report:", "\n", classification_report(actual_labels, predicted_labels_mnb))
print("MultinomialNB - New Dataset Confusion Matrix:", "\n", confusion_matrix(actual_labels, predicted_labels_mnb), "\n")
result = pd.DataFrame(columns = ["actual", "predicted"])
result["actual"] = actual_labels
result["predicted"] = predicted_labels_mnb
result = result.replace([1], "Y")
result = result.replace([0], "N")
result

MultinomialNB - New Dataset Accuracy: 0.8253325148352773 

MultinomialNB - New Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.85      0.89     20120
           1       0.50      0.72      0.59      4315

    accuracy                           0.83     24435
   macro avg       0.72      0.79      0.74     24435
weighted avg       0.86      0.83      0.84     24435

MultinomialNB - New Dataset Confusion Matrix: 
 [[17040  3080]
 [ 1188  3127]] 



Unnamed: 0,actual,predicted
0,N,N
1,N,N
2,N,N
3,N,N
4,Y,Y
...,...,...
24430,N,N
24431,Y,N
24432,N,N
24433,N,N


# Gaussian Naive Bayes

In [989]:
predicted_labels_gnb = gnb.predict(features)
print("GaussianNB - New Dataset Accuracy:", metrics.accuracy_score(actual_labels, predicted_labels_gnb),
      "\n")
print("GaussianNB - New Dataset Classification Report:", "\n", classification_report(actual_labels, predicted_labels_gnb))
print("GaussianNB - New Dataset Confusion Matrix:", "\n", confusion_matrix(actual_labels, predicted_labels_gnb), "\n")
result = pd.DataFrame(columns = ["actual", "predicted"])
result["actual"] = actual_labels
result["predicted"] = predicted_labels_gnb
result = result.replace([1], "Y")
result = result.replace([0], "N")
result

GaussianNB - New Dataset Accuracy: 0.8276652342950686 

GaussianNB - New Dataset Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.79      0.88     20120
           1       0.51      1.00      0.67      4315

    accuracy                           0.83     24435
   macro avg       0.75      0.90      0.78     24435
weighted avg       0.91      0.83      0.85     24435

GaussianNB - New Dataset Confusion Matrix: 
 [[15909  4211]
 [    0  4315]] 



Unnamed: 0,actual,predicted
0,N,N
1,N,N
2,N,N
3,N,N
4,Y,Y
...,...,...
24430,N,N
24431,Y,Y
24432,N,N
24433,N,N


# Decision Tree

In [990]:
predicted_labels_dt = clf.predict(features)
print("DecisionTreeClassifier - New Dataset Accuracy:", metrics.accuracy_score(actual_labels, predicted_labels_dt),
      "\n")
print("DecisionTreeClassifier - New Dataset Classification Report:", "\n", classification_report(actual_labels, predicted_labels_dt))
print("DecisionTreeClassifier - New Dataset Confusion Matrix:", "\n", confusion_matrix(actual_labels, predicted_labels_dt), "\n")
result = pd.DataFrame(columns = ["actual", "predicted"])
result["actual"] = actual_labels
result["predicted"] = predicted_labels_dt
result = result.replace([1], "Y")
result = result.replace([0], "N")
result

DecisionTreeClassifier - New Dataset Accuracy: 0.8239819930427665 

DecisionTreeClassifier - New Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.88      0.89     20120
           1       0.50      0.58      0.54      4315

    accuracy                           0.82     24435
   macro avg       0.70      0.73      0.71     24435
weighted avg       0.83      0.82      0.83     24435

DecisionTreeClassifier - New Dataset Confusion Matrix: 
 [[17645  2475]
 [ 1826  2489]] 



Unnamed: 0,actual,predicted
0,N,N
1,N,N
2,N,N
3,N,N
4,Y,Y
...,...,...
24430,N,N
24431,Y,N
24432,N,N
24433,N,N


# KNN

In [991]:
predicted_labels_knn = neigh.predict(features)
print("KNeighborsClassifier - New Dataset Accuracy:", metrics.accuracy_score(actual_labels, predicted_labels_knn),
      "\n")
print("KNeighborsClassifier - New Dataset Classification Report:", "\n", classification_report(actual_labels, predicted_labels_knn))
print("KNeighborsClassifier - New Dataset Confusion Matrix:", "\n", confusion_matrix(actual_labels, predicted_labels_knn), "\n")
result = pd.DataFrame(columns = ["actual", "predicted"])
result["actual"] = actual_labels
result["predicted"] = predicted_labels_knn
result = result.replace([1], "Y")
result = result.replace([0], "N")
result

KNeighborsClassifier - New Dataset Accuracy: 0.8300798035604665 

KNeighborsClassifier - New Dataset Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.90      0.90     20120
           1       0.52      0.48      0.50      4315

    accuracy                           0.83     24435
   macro avg       0.71      0.69      0.70     24435
weighted avg       0.83      0.83      0.83     24435

KNeighborsClassifier - New Dataset Confusion Matrix: 
 [[18208  1912]
 [ 2240  2075]] 



Unnamed: 0,actual,predicted
0,N,N
1,N,N
2,N,N
3,N,N
4,Y,Y
...,...,...
24430,N,N
24431,Y,N
24432,N,N
24433,N,N
