In [1]:
#Library Imports

from sklearn.model_selection import train_test_split
import pickle as pk
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


#Allows to see up to 500 columns within notebook
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)





In [2]:
#Function to find ML Scores
def scores(ytest,ypredicted):
    tn, fp, fn, tp = confusion_matrix(ytest, ypredicted).ravel()
    recall = tp / (tp + fn)
    prec = tp / (tp + fp)
    F1 = 2 * recall * prec / (recall + prec)
    return("recall:",recall, "prec:",prec,"F1:", F1)


In [3]:
#Load Pickled dataset from previous Workbook
pickle_in = open("df_pickle.pkl","rb")
df = pk.load(pickle_in)
type(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 100002 to 111633
Data columns (total 15 columns):
AMT_CREDIT_app_train         10000 non-null float64
AMT_ANNUITY_app_train        10000 non-null float64
AMT_GOODS_PRICE_app_train    10000 non-null float64
DAYS_BIRTH                   10000 non-null int64
DAYS_REGISTRATION            10000 non-null float64
DAYS_ID_PUBLISH              10000 non-null int64
DAYS_LAST_PHONE_CHANGE       10000 non-null float64
AMT_CREDIT_prev_app          10000 non-null float64
AMT_DOWN_PAYMENT             10000 non-null float64
AMT_GOODS_PRICE_prev_app     10000 non-null float64
AMT_ANNUITY                  10000 non-null float64
AMT_CREDIT_SUM_LIMIT         10000 non-null float64
AMT_INSTALMENT               10000 non-null float64
TARGET                       10000 non-null int64
AMT_INCOME_TOTAL             10000 non-null float64
dtypes: float64(12), int64(3)
memory usage: 1.2 MB


In [4]:
#Create X and Y splits to train ML Algos
y = df.TARGET
x = df.drop(['TARGET'],axis=1)

In [5]:
#Resulted in lower F1/AUC Score
minmaxscaler = MinMaxScaler()
x_normed = minmaxscaler.fit_transform(x)
x_normed

array([[0.12400463, 0.16577233, 0.11854103, ..., 0.        , 0.        ,
        0.09262314],
       [0.42815586, 0.24825352, 0.38145897, ..., 0.        , 0.06146219,
        0.12797549],
       [0.0308642 , 0.03114981, 0.04559271, ..., 0.        , 0.        ,
        0.02191845],
       ...,
       [0.15518519, 0.38236982, 0.15197568, ..., 0.        , 0.        ,
        0.10440726],
       [0.104     , 0.18784381, 0.10638298, ..., 0.        , 0.00765902,
        0.17511195],
       [0.10649383, 0.1210894 , 0.09118541, ..., 0.        , 0.        ,
        0.03605939]])

In [6]:
#Use Train_Test_Split to create Test Set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2)


In [7]:
score = []
#Train Classifier: Random Forest
clf = RandomForestClassifier(n_estimators=1,class_weight={0:1,1:100000})

#Fit classifier to Train data set
clf.fit(x_train,y_train)

#Predict Proba

y_predict_test = clf.predict(x_test)

#auc score
auc_test = metrics.roc_auc_score(y_test, y_predict_test)
score.append(auc_test)
print(score)

scores(y_test,y_predict_test)

[0.5186141304347827]


('recall:', 0.10625, 'prec:', 0.11805555555555555, 'F1:', 0.1118421052631579)

In [8]:
#SVM
clf = svm.SVC(C=.3)
clf.fit(x_train,y_train)
y_predict_test_s= clf.predict(x_test)

auc_score=metrics.roc_auc_score(y_test,y_predict_test_s)
print(auc_score)

scores(y_test,y_predict_test_s)

0.5


  """


('recall:', 0.0, 'prec:', nan, 'F1:', nan)

In [9]:
#KNN

knn=neighbors.KNeighborsClassifier()
knn.fit(x_train,y_train)
y_predict_test_knn = knn.predict(x_test)

auc_score=metrics.roc_auc_score(y_test,y_predict_test_knn)
print(auc_score)

scores(y_test,y_predict_test_knn)

0.5019021739130435


('recall:', 0.0125, 'prec:', 0.1111111111111111, 'F1:', 0.022471910112359553)

In [10]:


param_grid = {'n_neighbors':np.arange(1,50)}

knn_cv = GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(x_train,y_train)
knn_cv.best_params_
knn_cv.best_score_

#y_gridsearch_test = knn_cv.predict(x_test)
#auc_score=metrics.roc_auc_score(y_test,y_gridsearch_test)
#print(auc_score)

0.923125