In [1]:
#this is a coursera project for classification algorithms kNN, logistic regression, decision trees and svm
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
#Read & pre-process training data

#some features are categorical we do one-hot-encoding for education and label encoding 0,1 for male&female

#X_trn:features
#Y_trn:label

data_trn = pd.read_csv('loan_train.csv')

#if it is weekend loan do not be paid
data_trn['effective_date'] = pd.to_datetime(data_trn['effective_date'])

data_trn['dayofweek'] = data_trn['effective_date'].dt.dayofweek

data_trn['weekend'] = data_trn['dayofweek'].apply(lambda x: 1 if (x>5)  else 0)


data_trn['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)
data_trn.head()

X_trn = data_trn[['Principal','terms','age','Gender', 'weekend']]
X_trn = pd.concat([X_trn,pd.get_dummies(data_trn['education'])], axis=1)


Y_trn =data_trn['loan_status']

#Normalization process for the distances

X_trn=preprocessing.StandardScaler().fit(X_trn).transform(X_trn.astype(float))



In [3]:
#Read& pre-process test data

#some features are categorical we do one-hot-encoding for education and label encoding  0,1 for male&female

#X_tst:features
#Y_tst:label

data_tst = pd.read_csv('loan_test.csv')

data_tst['effective_date'] = pd.to_datetime(data_tst['effective_date'])

data_tst['dayofweek'] = data_tst['effective_date'].dt.dayofweek

data_tst['weekend'] = data_tst['dayofweek'].apply(lambda x: 1 if (x>5)  else 0)
data_tst.head()



data_tst['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)


X_tst = data_tst[['Principal','terms','age','Gender', 'weekend']]
X_tst = pd.concat([X_tst,pd.get_dummies(data_tst['education'])], axis=1)


Y_tst = data_tst['loan_status']


#Normalization process for the distances

X_tst=preprocessing.StandardScaler().fit(X_tst).transform(X_tst.astype(float))


In [4]:
#finding the best k for kNN

#we do not use loan_test data, instead we split training data to find the best k value


from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

X, X_t, Y, Y_t = train_test_split( X_trn, Y_trn, test_size=0.2, random_state=4)
#train, test, train, test
K=10
accuracy=np.zeros(K-1)
for n in range(1, K):
    model_kNN = KNeighborsClassifier(n_neighbors = n).fit(X,Y)
    accuracy[n-1] = metrics.accuracy_score(Y_t, model_kNN.predict(X_t))
    
accuracy    


array([0.64285714, 0.54285714, 0.75714286, 0.71428571, 0.8       ,
       0.75714286, 0.75714286, 0.71428571, 0.72857143])

In [5]:
#Classification

# kNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

k = 5
model_kNN = KNeighborsClassifier(n_neighbors = k).fit(X_trn,Y_trn)

model_DT = DecisionTreeClassifier(criterion ='entropy', max_depth = 6).fit(X_trn, Y_trn)

model_LR = LogisticRegression(C=0.1, solver='liblinear').fit(X_trn, Y_trn)

model_SVM = svm.SVC(kernel = 'linear').fit(X_trn, Y_trn)





In [6]:
#Accuracy metrics
#kNN: jaccard similarity, f1 score

from sklearn import metrics

print("kNN Algorithm")
print("Jaccard Train set Accuracy: ", metrics.accuracy_score(Y_trn, model_kNN.predict(X_trn)))
print(" Jaccard Test set Accuracy: ", metrics.accuracy_score(Y_tst,  model_kNN.predict(X_tst)))

print("F1_score: paid off, collection", metrics.f1_score(Y_tst, model_kNN.predict(X_tst), average = None))


print("Decision Tree Algorithm")
print("Jaccard Train set Accuracy: ", metrics.accuracy_score(Y_trn, model_DT.predict(X_trn)))
print("Jaccard Test set Accuracy: ", metrics.accuracy_score(Y_tst,  model_DT.predict(X_tst)))
print("F1_score: paid off, collection", metrics.f1_score(Y_tst, model_DT.predict(X_tst), average = None))


print("Logistic Regression")
print("Jaccard Train set Accuracy: ", metrics.accuracy_score(Y_trn, model_LR.predict(X_trn)))
print("Jaccard Test set Accuracy: ", metrics.accuracy_score(Y_tst,  model_LR.predict(X_tst)))
print("F1_score: paid off, collection", metrics.f1_score(Y_tst, model_LR.predict(X_tst), average = None))
#print("Logloss:", metrics.log_loss(Y_tst, model_LR.predict(X_tst)))

print("SVM")
print("Jaccard Train set Accuracy: ", metrics.accuracy_score(Y_trn, model_SVM.predict(X_trn)))
print("Jaccard Test set Accuracy: ", metrics.accuracy_score(Y_tst,  model_SVM.predict(X_tst)))
print("F1_score: paid off, collection", metrics.f1_score(Y_tst, model_SVM.predict(X_tst), average = None))



kNN Algorithm
Jaccard Train set Accuracy:  0.8063583815028902
 Jaccard Test set Accuracy:  0.7222222222222222
F1_score: paid off, collection [0.28571429 0.82758621]
Decision Tree Algorithm
Jaccard Train set Accuracy:  0.7832369942196532
Jaccard Test set Accuracy:  0.7222222222222222
F1_score: paid off, collection [0.34782609 0.82352941]
Logistic Regression
Jaccard Train set Accuracy:  0.7456647398843931
Jaccard Test set Accuracy:  0.7407407407407407
F1_score: paid off, collection [0.         0.85106383]
SVM
Jaccard Train set Accuracy:  0.7514450867052023
Jaccard Test set Accuracy:  0.7407407407407407
F1_score: paid off, collection [0.         0.85106383]
