## Predict loan paid of or not

In [105]:
## uncomment to load train set data
# !wget -O loan_train.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv

In [98]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.externals.six import StringIO
import matplotlib.image as mpimg
from sklearn import tree
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_similarity_score
import wget
%matplotlib inline

In [3]:
df  = pd.read_csv('loan_train.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,09-08-16,10-07-16,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,09-08-16,10-07-16,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,09-08-16,9/22/2016,27,college,male
3,4,4,PAIDOFF,1000,30,09-09-16,10-08-16,28,college,female
4,6,6,PAIDOFF,1000,30,09-09-16,10-08-16,29,college,male


In [4]:
df.shape

(346, 10)

### Convert to date time object 

In [5]:
df['due_date'] = pd.to_datetime(df['due_date'])
df['effective_date'] = pd.to_datetime(df['effective_date'])
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male
3,4,4,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female
4,6,6,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male


In [6]:
df['loan_status'].value_counts()

PAIDOFF       260
COLLECTION     86
Name: loan_status, dtype: int64

# Pre-processing:  Feature selection/extraction

In [7]:
df['dayofweek'] = df['effective_date'].dt.dayofweek
bins = np.linspace(df.dayofweek.min(), df.dayofweek.max(), 10)

In [8]:
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,dayofweek,weekend
0,0,0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,3,0
1,2,2,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,3,0
2,3,3,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,male,3,0
3,4,4,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,female,4,1
4,6,6,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,male,4,1


In [9]:
# Convert Categorical features to numerical values
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

Gender  loan_status
female  PAIDOFF        0.865385
        COLLECTION     0.134615
male    PAIDOFF        0.731293
        COLLECTION     0.268707
Name: loan_status, dtype: float64

In [10]:
df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,dayofweek,weekend
0,0,0,PAIDOFF,1000,30,2016-09-08,2016-10-07,45,High School or Below,0,3,0
1,2,2,PAIDOFF,1000,30,2016-09-08,2016-10-07,33,Bechalor,1,3,0
2,3,3,PAIDOFF,1000,15,2016-09-08,2016-09-22,27,college,0,3,0
3,4,4,PAIDOFF,1000,30,2016-09-09,2016-10-08,28,college,1,4,1
4,6,6,PAIDOFF,1000,30,2016-09-09,2016-10-08,29,college,0,4,1


In [11]:
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

education             loan_status
Bechalor              PAIDOFF        0.750000
                      COLLECTION     0.250000
High School or Below  PAIDOFF        0.741722
                      COLLECTION     0.258278
Master or Above       COLLECTION     0.500000
                      PAIDOFF        0.500000
college               PAIDOFF        0.765101
                      COLLECTION     0.234899
Name: loan_status, dtype: float64

In [12]:
df[['Principal','terms','age','Gender','education']].head()

Unnamed: 0,Principal,terms,age,Gender,education
0,1000,30,45,0,High School or Below
1,1000,30,33,1,Bechalor
2,1000,15,27,0,college
3,1000,30,28,1,college
4,1000,30,29,0,college


In [13]:
Feature = df[['Principal','terms','age','Gender','weekend']]
Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)
Feature.drop(['Master or Above'], axis = 1,inplace=True)
Feature.head()

Unnamed: 0,Principal,terms,age,Gender,weekend,Bechalor,High School or Below,college
0,1000,30,45,0,0,0,1,0
1,1000,30,33,1,0,1,0,0
2,1000,15,27,0,0,0,0,1
3,1000,30,28,1,1,0,0,1
4,1000,30,29,0,1,0,0,1


In [14]:
X = Feature
X[0:5]

Unnamed: 0,Principal,terms,age,Gender,weekend,Bechalor,High School or Below,college
0,1000,30,45,0,0,0,1,0
1,1000,30,33,1,0,1,0,0
2,1000,15,27,0,0,0,0,1
3,1000,30,28,1,1,0,0,1
4,1000,30,29,0,1,0,0,1


In [62]:
# To use scikit-learn library, we have to convert the Pandas data frame to a Numpy array:
y = df['loan_status'].values
print(y[0:5])
print(X[:5])
print(type(y))

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']
[[1000 30 45 0 1]
 [1000 30 33 1 0]
 [1000 15 27 0 3]
 [1000 30 28 1 3]
 [1000 30 29 0 3]]
<class 'numpy.ndarray'>


In [63]:
X= preprocessing.StandardScaler().fit(X).transform(X)
print(X[0:5])
print(type(X))

[[ 0.51578458  0.92071769  2.33152555 -0.42056004 -0.64691859]
 [ 0.51578458  0.92071769  0.34170148  2.37778177 -1.5212695 ]
 [ 0.51578458 -0.95911111 -0.65321055 -0.42056004  1.10178323]
 [ 0.51578458  0.92071769 -0.48739188  2.37778177  1.10178323]
 [ 0.51578458  0.92071769 -0.3215732  -0.42056004  1.10178323]]
<class 'numpy.ndarray'>


# K Nearest Neighbor(KNN)

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=5)
# print(X_train[0:5], y[:5])
print('Test set = ', X_train.shape, y_train.shape)
print('Test set  = ', X_test.shape, y_test.shape)
# print(type(y_train))

k = 1
from sklearn.neighbors import KNeighborsClassifier
neighbor = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
# print(neighbor)

yhat = neighbor.predict(X_test)
# print(yhat[:5])

Test set =  (242, 8) (242,)
Test set  =  (104, 8) (104,)


In [18]:
from sklearn import metrics
print('Train set Accuracy = ', metrics.accuracy_score(y_train, neighbor.predict(X_train)))
print('Test set Accuracy = ', metrics.accuracy_score(y_test, yhat))

Train set Accuracy =  0.9297520661157025
Test set Accuracy =  0.6538461538461539


In [20]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])
    
print(mean_acc)

[0.65384615 0.64423077 0.69230769 0.67307692 0.69230769 0.68269231
 0.70192308 0.68269231 0.71153846]


# Decision Tree

In [64]:
X = df[['Principal', 'terms', 'age', 'Gender', 'education']].values
print(X[:5])
print(df[0:5])
print(df.size)
print(X[0:5])

[[1000 30 45 0 'High School or Below']
 [1000 30 33 1 'Bechalor']
 [1000 15 27 0 'college']
 [1000 30 28 1 'college']
 [1000 30 29 0 'college']]
   Unnamed: 0  Unnamed: 0.1 loan_status  Principal  terms effective_date  \
0           0             0     PAIDOFF       1000     30     2016-09-08   
1           2             2     PAIDOFF       1000     30     2016-09-08   
2           3             3     PAIDOFF       1000     15     2016-09-08   
3           4             4     PAIDOFF       1000     30     2016-09-09   
4           6             6     PAIDOFF       1000     30     2016-09-09   

    due_date  age             education  Gender  dayofweek  weekend  
0 2016-10-07   45  High School or Below       0          3        0  
1 2016-10-07   33              Bechalor       1          3        0  
2 2016-09-22   27               college       0          3        0  
3 2016-10-08   28               college       1          4        1  
4 2016-10-08   29               college       0 

In [22]:
le_gender = preprocessing.LabelEncoder()
le_gender.fit([1, 0])
X[:,3] = le_gender.transform(X[:,3]) 

le_education = preprocessing.LabelEncoder()
le_education.fit(['High School or Below', 'Bechalor', 'college', 'Master or Above'])
X[:,4] = le_education.transform(X[:,4])

In [23]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.2, random_state=5)
print('Train set', X_trainset.shape, y_trainset.shape)
print('Train set', X_testset.shape, y_testset.shape)

Train set (276, 5) (276,)
Train set (70, 5) (70,)


In [27]:
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
drugTree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
drugTree.fit(X_trainset,y_trainset)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [31]:
predTree = drugTree.predict(X_testset)
print (predTree [0:5])
print (y_testset [0:5])
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']
['PAIDOFF' 'COLLECTION' 'PAIDOFF' 'COLLECTION' 'PAIDOFF']
DecisionTrees's Accuracy:  0.7


# Support Vector Machine

In [36]:
# convert X data frame to numpy array
X = np.asarray(X)
print(X)

[[1000 30 45 0 1]
 [1000 30 33 1 0]
 [1000 15 27 0 3]
 ...
 [800 15 39 0 3]
 [1000 30 28 0 3]
 [1000 30 26 0 3]]


In [53]:
y = np.asarray(y)
y[:5]

array(['PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF'],
      dtype=object)

In [54]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape,  y_train.shape)
print('Test set:', X_test.shape,  y_test.shape)

Train set: (276, 5) (276,)
Test set: (70, 5) (70,)


In [55]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
yhat = clf.predict(X_test)
print(yhat[0:5])

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']


In [57]:
clf2 = svm.SVC(kernel='linear')
clf2.fit(X_train, y_train)
yhat2 = clf2.predict(X_test)
print(yhat[0:5])

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']


In [59]:
clf3 = svm.SVC(kernel='rbf')
clf3.fit(X_train, y_train)
yhat3 = clf3.predict(X_test)
print(yhat[0:5])

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']


In [60]:
# Evaluation
print("Avg F1-score: %.4f" % f1_score(y_test, yhat2, average='weighted'))
print("Jaccard score: %.4f" % jaccard_similarity_score(y_test, yhat2))

Avg F1-score: 0.6914
Jaccard score: 0.7857




# Logistic Regression

In [86]:
X = df[['Principal', 'terms', 'age', 'Gender', 'dayofweek', 'weekend']]
print(X[:5])

   Principal  terms  age  Gender  dayofweek  weekend
0       1000     30   45       0          3        0
1       1000     30   33       1          3        0
2       1000     15   27       0          3        0
3       1000     30   28       1          4        1
4       1000     30   29       0          4        1


In [87]:
X = preprocessing.StandardScaler().fit(X).transform(X)
print(X[0:5])

[[ 0.51578458  0.92071769  2.33152555 -0.42056004 -0.26122054 -1.20577805]
 [ 0.51578458  0.92071769  0.34170148  2.37778177 -0.26122054 -1.20577805]
 [ 0.51578458 -0.95911111 -0.65321055 -0.42056004 -0.26122054 -1.20577805]
 [ 0.51578458  0.92071769 -0.48739188  2.37778177  0.12175534  0.82934003]
 [ 0.51578458  0.92071769 -0.3215732  -0.42056004  0.12175534  0.82934003]]


In [91]:
y = df['loan_status'].values
print(y[:5])

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']


In [95]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=5)
print('Train set:', X_train.shape,  y_train.shape)
print('Test set:', X_test.shape,  y_test.shape)

Train set: (276, 6) (276,)
Test set: (70, 6) (70,)


In [97]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
print(LR)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [100]:
yhat = LR.predict(X_test)
print(yhat[:5])

['COLLECTION' 'PAIDOFF' 'COLLECTION' 'COLLECTION' 'PAIDOFF']


In [101]:
yhat_prob = LR.predict_proba(X_test)
print(yhat_prob[:5])

[[0.51157025 0.48842975]
 [0.48924184 0.51075816]
 [0.50636021 0.49363979]
 [0.51330642 0.48669358]
 [0.48893146 0.51106854]]


In [103]:
##Evaluaton
print(jaccard_similarity_score(y_test, yhat))

0.7


## Evaluation

In [65]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

First, download and load the test set:

In [106]:
## uncomment to load test data
# !wget -O loan_test.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv
## test Set
# url = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv'
# filename = wget.download(url)

In [69]:
df_test = pd.read_csv('loan_test.csv')
df_test.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,1,1,PAIDOFF,1000,30,9/8/2016,10/7/2016,50,Bechalor,female
1,5,5,PAIDOFF,300,7,9/9/2016,9/15/2016,35,Master or Above,male
2,21,21,PAIDOFF,1000,30,9/10/2016,10/9/2016,43,High School or Below,female
3,24,24,PAIDOFF,1000,30,9/10/2016,10/9/2016,26,college,male
4,35,35,PAIDOFF,800,15,9/11/2016,9/25/2016,29,Bechalor,male


In [72]:
y = df['loan_status'].values
print(y[:5])

['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']


In [73]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=5)
print('Train set', X_trainset.shape, y_trainset.shape)
print('Train set', X_testset.shape, y_testset.shape)

Train set (276, 5) (276,)
Train set (70, 5) (70,)


In [74]:
print(f1_score(y_test, yhat, average='weighted'))
print(f1_score(y_test, yhat, average='micro'))
print(f1_score(y_test, yhat, average='macro'))

0.5764705882352941
0.7
0.4117647058823529


In [75]:
print(jaccard_similarity_score(y_test, yhat))

0.7


