# Import library

In [46]:
# import necessary library
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Read file and data pre-processing

In [47]:
# input file
file_path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv'
df = pd.read_csv(file_path, delimiter=',')
print(df.shape)
# check NaN
print('Is there any NaN? ' + str(df.isnull().values.any()))
# convert data type to int
# get_dummies for education and gender
df_dummies = pd.get_dummies(data=df, columns=['education', 'Gender'])
# for our y, replace the string value to integer: PAIDOFF=1, COLLECTION=0
df_dummies['loan_status'].replace(to_replace=['PAIDOFF', 'COLLECTION'], value=[1,0], inplace=True)
# convert datetime dtype
df_dummies['effective_date'] = pd.to_datetime(df_dummies['effective_date'])
df_dummies['due_date'] = pd.to_datetime(df_dummies['due_date'])
df_dummies['loan_age'] = df_dummies['due_date'] - df_dummies['effective_date']
df_dummies['loan_age_num'] = pd.to_numeric(df_dummies['loan_age'])

(346, 10)
Is there any NaN? False


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# # 1. define X and y as array
X = df_dummies[['Principal', 'terms',
                'age', 'education_Bechalor',
                'education_High School or Below', 'education_Master or Above',
                'education_college', 'Gender_female', 'Gender_male', 'loan_age_num']].values
y = df_dummies[['loan_status']].values
# 2. normalize the feature
X = StandardScaler().fit(X).transform(X)
# 3. train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
# prepare for final report
report_data = [[],[],[],[]]

# KNN Modeling & Evaluation

In [49]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss, accuracy_score
# 4. KNN model
def checkKNNScore(neighbors, X_train_set, X_test_set, y_train_set, y_test_set):
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X_train_set, y_train_set.ravel())
    y_hat_set = knn.predict(X_test_set)
    score = knn.score(X_test_set, y_test_set)
    return [neighbors, score, y_hat_set, knn]
tmp_score = 0
tmp_neighbor = 0
y_hat = 0
best_k = 0
neighbor_candidate = []
for i in range(1,21):
    result = checkKNNScore(i, X_train, X_test, y_train, y_test)
#     based on the highest accuracy score and ignore unpredicted label
    if(result[1]>tmp_score and len(set(result[2]))==2):
#     if(result[1]>tmp_score):     
        tmp_score = result[1]
        tmp_neighbor = result[0]
        neighbor_candidate.append(tmp_neighbor)
        y_hat = result[2]
        best_k = result[3]
print('Best mean accuracy score = '+ str(tmp_score) + ' with K(neighbor) = ' + str(tmp_neighbor))
# KNN accracy check
# jaccard index
jaccard_index = jaccard_score(y_test, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y_test, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y_test, y_hat)
print('log loss: ' + str(log_loss))
report_data[0] = [jaccard_index, f1_score, log_loss]
print('Classification Report:')
print(classification_report(y_test, y_hat))
print('Confusion Matrix of KNN:')
print(confusion_matrix(y_test, y_hat))

Best mean accuracy score = 0.8142857142857143 with K(neighbor) = 19
jaccard score: 0.8142857142857143
f1 score: 0.7437570303712036
log loss: 6.414481261471447
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.83      0.98      0.90        58

    accuracy                           0.81        70
   macro avg       0.41      0.49      0.45        70
weighted avg       0.68      0.81      0.74        70

Confusion Matrix of KNN:
[[ 0 12]
 [ 1 57]]


# Decision Tree Modeling & Evaluation

In [50]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
# 5. Decision Tree model
def buildDecisionTree(depth, X_train_set, X_test_set, y_train_set, y_test_set):
    loan_tree = DecisionTreeClassifier(criterion="entropy", max_depth = depth)
    loan_tree.fit(X_train_set, y_train_set.ravel())
    y_hat_set = loan_tree.predict(X_test_set)
    score = loan_tree.score(X_test_set, y_test_set)
    return [depth, score, y_hat_set, loan_tree]
tmp_score = 0
tmp_depth = 0
y_hat = 0
best_tree = 0
for i in range(1,21):
    result = buildDecisionTree(i, X_train, X_test, y_train, y_test)
    #     based on the highest accuracy score and ignore unpredicted label
    if(result[1]>tmp_score and len(set(result[2]))==2):
#     if(result[1]>tmp_score):
        tmp_score = result[1]
        tmp_depth = result[0]
        y_hat = result[2]
        best_tree = result[3]
print('Best mean accuracy = '+ str(tmp_score) + ' with depth = ' + str(tmp_depth))
# Decision Tree accracy check
# jaccard index
jaccard_index = jaccard_score(y_test, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y_test, y_hat, zero_division=1, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y_test, y_hat)
report_data[1] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y_test, y_hat))
print('Confusion Matrix of Decision Tree:')
print(confusion_matrix(y_test, y_hat))

Best mean accuracy = 0.7714285714285715 with depth = 3
jaccard score: 0.7714285714285715
f1 score: 0.7216589861751151
log loss: 7.894714535539047
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.82      0.93      0.87        58

    accuracy                           0.77        70
   macro avg       0.41      0.47      0.44        70
weighted avg       0.68      0.77      0.72        70

Confusion Matrix of Decision Tree:
[[ 0 12]
 [ 4 54]]


# SVM Modeling & Evaluation

In [51]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import svm
# 7. Support Vector Machine
svm_model = svm.SVC(kernel='poly', degree=3)
svm_model.fit(X_train, y_train.ravel())
y_hat = svm_model.predict(X_test)
# SVM accracy check
# jaccard index
jaccard_index = jaccard_score(y_test, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y_test, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y_test, y_hat)
report_data[2] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y_test, y_hat))
print('Confusion Matrix of SVM:')
print(confusion_matrix(y_test, y_hat))

jaccard score: 0.8115942028985508
f1 score: 0.765257142857143
log loss: 6.414469838651016
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.08      0.13        12
           1       0.84      0.97      0.90        58

    accuracy                           0.81        70
   macro avg       0.58      0.52      0.51        70
weighted avg       0.75      0.81      0.77        70

Confusion Matrix of SVM:
[[ 1 11]
 [ 2 56]]


# Logistic Regression Modeling & Evaluation

In [52]:
# 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
lr = LogisticRegression()
lr.fit(X_train, y_train.ravel())
y_hat = lr.predict(X_test)
# y_hat
# lr_score = lr.score(X_test, y_test)
# Logistic Regression accracy check
# jaccard index
jaccard_index = jaccard_score(y_test, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y_test, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y_test, y_hat)
report_data[3] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y_test, y_hat))
print('Confusion Matrix of Logistics Regression:')
print(confusion_matrix(y_test, y_hat))

jaccard score: 0.8142857142857143
f1 score: 0.7437570303712036
log loss: 6.414481261471447
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.83      0.98      0.90        58

    accuracy                           0.81        70
   macro avg       0.41      0.49      0.45        70
weighted avg       0.68      0.81      0.74        70

Confusion Matrix of Logistics Regression:
[[ 0 12]
 [ 1 57]]


# Final Report of Training Data

In [53]:
# 9. Display report
pd.DataFrame(columns=['jaccard_score', 'f1_score', 'log_loss'], data=report_data, index=['KNN', 'Decision Tree', 'SVM', 'Logistic Regression'])

Unnamed: 0,jaccard_score,f1_score,log_loss
KNN,0.814286,0.743757,6.414481
Decision Tree,0.771429,0.721659,7.894715
SVM,0.811594,0.765257,6.41447
Logistic Regression,0.814286,0.743757,6.414481


# Test with test dataset 

In [54]:
# import and read test file
file_path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv'
df = pd.read_csv(file_path, delimiter=',')
print(df.shape)
# check NaN
print('Is there any NaN? ' + str(df.isnull().values.any()))
# convert data type to int
# get_dummies for education and gender
df_dummies = pd.get_dummies(data=df, columns=['education', 'Gender'])
# for our y, replace the string value to integer: PAIDOFF=1, COLLECTION=0
df_dummies['loan_status'].replace(to_replace=['PAIDOFF', 'COLLECTION'], value=[1,0], inplace=True)
# convert datetime dtype
df_dummies['effective_date'] = pd.to_datetime(df_dummies['effective_date'])
df_dummies['due_date'] = pd.to_datetime(df_dummies['due_date'])
df_dummies['loan_age'] = df_dummies['due_date'] - df_dummies['effective_date']
df_dummies['loan_age_num'] = pd.to_numeric(df_dummies['loan_age'])

(54, 10)
Is there any NaN? False


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# # 1. define X and y as array
X = df_dummies[['Principal', 'terms',
                'age', 'education_Bechalor',
                'education_High School or Below', 'education_Master or Above',
                'education_college', 'Gender_female', 'Gender_male', 'loan_age_num']].values
y = df_dummies[['loan_status']].values
# 2. normalize the feature
X = StandardScaler().fit(X).transform(X)
# # 3. train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
# prepare for final report
report_data_test = [[],[],[],[]]

# KNN Modeling & Evaluation

In [56]:
# KNN Modeling & Evaluation
# now we use neighbor=19 according to training data
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss, accuracy_score
# use trained model to predict test data.
best_k.fit(X, y.ravel())
y_hat = best_k.predict(X)
score = best_k.score(X, y)
# KNN accracy check
# jaccard index
jaccard_index = jaccard_score(y, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y, y_hat)
print('log loss: ' + str(log_loss))
report_data_test[0] = [jaccard_index, f1_score, log_loss]
print('Classification Report:')
print(classification_report(y, y_hat))
print('Confusion Matrix of KNN:')
print(confusion_matrix(y, y_hat))

jaccard score: 0.7169811320754716
f1 score: 0.6491417079652374
log loss: 9.594297049819497
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.07      0.12        14
           1       0.75      0.95      0.84        40

    accuracy                           0.72        54
   macro avg       0.54      0.51      0.48        54
weighted avg       0.64      0.72      0.65        54

Confusion Matrix of KNN:
[[ 1 13]
 [ 2 38]]


# Decision Tree Modeling & Evaluation
Now we also loop for the best depth and score.

In [57]:
# Decision Tree Modeling & Evaluation
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
# use trained model to predict
best_tree.fit(X, y.ravel())
y_hat = best_tree.predict(X)
score = best_tree.score(X, y)
# Decision Tree accracy check
# jaccard index
jaccard_index = jaccard_score(y, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y, y_hat, zero_division=1, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y, y_hat)
report_data_test[1] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y, y_hat))
print('Confusion Matrix of Decision Tree:')
print(confusion_matrix(y, y_hat))

jaccard score: 0.8085106382978723
f1 score: 0.8201193520886615
log loss: 5.756566384003844
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.50      0.61        14
           1       0.84      0.95      0.89        40

    accuracy                           0.83        54
   macro avg       0.81      0.72      0.75        54
weighted avg       0.83      0.83      0.82        54

Confusion Matrix of Decision Tree:
[[ 7  7]
 [ 2 38]]


# SVM Modeling & Evaluation
Now we use kernal=poly with degree=3 according to training data

In [58]:
# SVM Modeling & Evaluation
# Now we use kernal=poly with degree=3 according to training data
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import svm
# use trained model to predict test data
svm_model.fit(X, y.ravel())
y_hat = svm_model.predict(X)
# SVM accracy check
# jaccard index
jaccard_index = jaccard_score(y, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y, y_hat)
report_data_test[2] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y, y_hat))
print('Confusion Matrix of SVM:')
print(confusion_matrix(y, y_hat))

jaccard score: 0.7843137254901961
f1 score: 0.7427039191745074
log loss: 7.035839553995365
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.21      0.35        14
           1       0.78      1.00      0.88        40

    accuracy                           0.80        54
   macro avg       0.89      0.61      0.62        54
weighted avg       0.84      0.80      0.74        54

Confusion Matrix of SVM:
[[ 3 11]
 [ 0 40]]


# Logistic Regression Modeling & Evaluation

In [59]:
# Logistic Regression Modeling & Evaluation
# 8. Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
# use trained model to predict test data
lr.fit(X, y.ravel())
y_hat = lr.predict(X)
# Logistic Regression accracy check
# jaccard index
jaccard_index = jaccard_score(y, y_hat)
print('jaccard score: ' + str(jaccard_index))
# f1_socre
f1_score = f1_score(y, y_hat, average='weighted')
print('f1 score: ' + str(f1_score))
# log loss
log_loss = log_loss(y, y_hat)
report_data_test[3] = [jaccard_index, f1_score, log_loss]
print('log loss: ' + str(log_loss))
print('Classification Report:')
print(classification_report(y, y_hat))
print('Confusion Matrix of Logistics Regression:')
print(confusion_matrix(y, y_hat))

jaccard score: 0.74
f1 score: 0.7288207747977863
log loss: 8.315038687187794
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.29      0.38        14
           1       0.79      0.93      0.85        40

    accuracy                           0.76        54
   macro avg       0.68      0.61      0.62        54
weighted avg       0.73      0.76      0.73        54

Confusion Matrix of Logistics Regression:
[[ 4 10]
 [ 3 37]]


# Final Report of Testing Dataset

In [60]:
# 9. Display report
print('Training data report:')
print(pd.DataFrame(columns=['jaccard_score', 'f1_score', 'log_loss'], data=report_data, index=['KNN', 'Decision Tree', 'SVM', 'Logistic Regression']))
print('\nTest data report:')
print(pd.DataFrame(columns=['jaccard_score', 'f1_score', 'log_loss'], data=report_data_test, index=['KNN', 'Decision Tree', 'SVM', 'Logistic Regression']))

Training data report:
                     jaccard_score  f1_score  log_loss
KNN                       0.814286  0.743757  6.414481
Decision Tree             0.771429  0.721659  7.894715
SVM                       0.811594  0.765257  6.414470
Logistic Regression       0.814286  0.743757  6.414481

Test data report:
                     jaccard_score  f1_score  log_loss
KNN                       0.716981  0.649142  9.594297
Decision Tree             0.808511  0.820119  5.756566
SVM                       0.784314  0.742704  7.035840
Logistic Regression       0.740000  0.728821  8.315039
