In [16]:
from __future__ import print_function
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


In [11]:
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')
y['cid'] = y['idLoan'].apply(lambda x:x[:-3].lower())

In [12]:
data = pd.merge(X, y, on='cid', how='inner')
y = data['label']
data = data.drop(['label','cid','idLoan','how_use_money'],axis=1)
X = StandardScaler().fit_transform(data)

In [13]:
data.shape, y.shape

((656, 39), (656,))

In [14]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=.3)

In [2]:
algos = ["Nearest Neighbors", "Linear SVM",
         "Decision Tree", "Random Forest", "Logistic Regression"]

classifiers = [
    KNeighborsClassifier(3 ),
    SVC(kernel="linear", C=0.01), 
    DecisionTreeClassifier(max_depth=5,min_samples_leaf=30),
    RandomForestClassifier(max_depth=10,min_samples_leaf=30, n_estimators=20),
    LogisticRegression(penalty='l1')]


In [17]:
print ('\n', '**** cross-validation metric: ****')
for name, clf in zip(algos, classifiers):
    acc = cross_val_score(clf, X_train, y_train, cv= 3, scoring='accuracy')
    pre = cross_val_score(clf, X_train, y_train, cv= 3, scoring='precision')
    rec = cross_val_score(clf, X_train, y_train, cv= 3, scoring='recall')
    f1score = cross_val_score(clf, X_train, y_train, cv= 3, scoring='f1')
    print()
    print(name, 'accuracy = %.3f'%np.mean(acc))
    print(name, 'precision= %.3f'%np.mean(pre))
    print(name, 'recall   = %.3f'%np.mean(rec))
    print(name, 'f1 score = %.3f'%np.mean(f1score))


 **** cross-validation metric: ****

Nearest Neighbors accuracy = 0.584
Nearest Neighbors precision= 0.616
Nearest Neighbors recall   = 0.663
Nearest Neighbors f1 score = 0.638

Linear SVM accuracy = 0.575
Linear SVM precision= 0.580
Linear SVM recall   = 0.855
Linear SVM f1 score = 0.691

Decision Tree accuracy = 0.521
Decision Tree precision= 0.574
Decision Tree recall   = 0.537
Decision Tree f1 score = 0.550

Random Forest accuracy = 0.575
Random Forest precision= 0.592
Random Forest recall   = 0.769
Random Forest f1 score = 0.655

Logistic Regression accuracy = 0.577
Logistic Regression precision= 0.604
Logistic Regression recall   = 0.694
Logistic Regression f1 score = 0.646


In [18]:
print ('\n', '**** out-sample test accuracy: ****')
for name, clf in zip(algos, classifiers):
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(name,': train= %.3f'%train_score, ', test= %.3f'%test_score)


 **** out-sample test accuracy: ****
Nearest Neighbors : train= 0.780 , test= 0.497
Linear SVM : train= 0.627 , test= 0.624
Decision Tree : train= 0.682 , test= 0.563
Random Forest : train= 0.678 , test= 0.599
Logistic Regression : train= 0.643 , test= 0.614
