# Loading libraries

In [21]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,Ridge,Lasso
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import f1_score,accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from mypipes import *

import warnings
warnings.filterwarnings('ignore') # ignore the warnings.


import matplotlib.pyplot as plt
import seaborn as sb
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

def Threshold_Finder(Y_test,Y_predict):
    I = []
    roc_ = []
    for i in np.linspace(0.01,1,1000):
        Y_predict = Y_predict_prob[:,1] > i
        roc = f1_score(Y_test,Y_predict)
        I.append(i)
        roc_.append(roc)
    Temp = I[roc_.index(max(roc_))]
    return Temp

#  Loading the datasets

In [2]:
train_data = r'paydayloan_collections.csv'
#test_data = r''

train_data = pd.read_csv(train_data)
#test_data = pd.read_csv(test_data)

In [3]:
# Data Previews
train_data.transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29990,29991,29992,29993,29994,29995,29996,29997,29998,29999
payment,Success,Denied,Denied,Success,Success,Success,Success,Success,Denied,Denied,...,Success,Success,Denied,Success,Success,Success,Denied,Denied,Success,Success
var1,qw,qw,qw,wv,ma,kq,wv,qw,wv,qw,...,qw,wv,qw,wv,wv,qw,kq,kq,qw,qw
var2,hk,rv,zg,js,xn,py,py,rv,rv,rv,...,py,py,rv,bq,py,zg,bq,js,py,js
var3,3.11,3.35,4.15,6.23,1.28,-2.45,1.05,5.41,7.29,3.13,...,-3.2,7.75,4.82,2.2,2.51,3.85,3.32,2.98,-0.3,6.03
var4,16.06,11.18,29.19,15.7,20.71,22.45,23.02,17.92,26.83,34.21,...,7.38,26.77,26.04,-26.34,4.36,12.75,25.31,19.28,16.41,-6.99
var5,-4.6,-18.55,18.91,2.81,14.98,15.18,17.59,-14.59,33.92,22.55,...,40.39,45.77,14.51,18.82,53.03,47.62,15.9,16.2,22.8,-28.71
var6,22.34,6.68,16.4,4.46,11.19,-2.12,6.65,5.0,13.35,0.8,...,7.69,4.51,4.12,14.42,-0.73,3.34,10.96,-1.7,-9.99,11.82
var7,13.53,12.78,3.67,5.13,17.66,-8.24,-2.06,1.34,20.57,6.91,...,-5.24,6.97,12.07,2.44,14.0,17.22,10.13,9.45,26.89,4.71
var8,1.53,6.62,5.72,8.66,1.13,10.34,12.2,-8.54,4.46,11.18,...,7.09,15.35,11.31,4.12,15.72,7.0,10.32,-8.44,-1.1,5.0
var9,nv,nv,ch,ja,nv,ch,ch,ch,ch,ch,...,ch,ch,ch,ch,ch,ch,ch,ch,ch,ch


In [4]:
# get information of Data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 31 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   payment  30000 non-null  object 
 1   var1     30000 non-null  object 
 2   var2     30000 non-null  object 
 3   var3     30000 non-null  float64
 4   var4     30000 non-null  float64
 5   var5     30000 non-null  float64
 6   var6     30000 non-null  float64
 7   var7     30000 non-null  float64
 8   var8     30000 non-null  float64
 9   var9     30000 non-null  object 
 10  var10    30000 non-null  object 
 11  var11    30000 non-null  object 
 12  var12    30000 non-null  float64
 13  var13    30000 non-null  object 
 14  var14    30000 non-null  float64
 15  var15    30000 non-null  float64
 16  var16    30000 non-null  float64
 17  var17    30000 non-null  object 
 18  var18    30000 non-null  float64
 19  var19    30000 non-null  object 
 20  var20    30000 non-null  float64
 21  var21    300

In [5]:
p1 = pdPipeline([
    ('columns_selection',VarSelector(['var1','var2','var9','var10','var11','var13','var17','var19','var23','var29'])),
    ('data_impute',DataFrameImputer()),
    ('get_dummy',get_dummies_Pipe())
])
temp = pd.DataFrame(data = p1.fit_transform(train_data),columns = p1.get_feature_names())

train_data = pd.concat([train_data,temp],axis = 1)

In [6]:
Y_train = train_data['payment']
Y_train = pd.DataFrame(data = Y_train , columns = ['payment'] )
train_data.drop(columns = ['payment'],inplace = True)
X_train = train_data
X_train.drop(['var1','var2','var9','var10','var11','var13','var17','var19','var23','var29'],inplace = True,axis = 1)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 62 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   var3      30000 non-null  float64
 1   var4      30000 non-null  float64
 2   var5      30000 non-null  float64
 3   var6      30000 non-null  float64
 4   var7      30000 non-null  float64
 5   var8      30000 non-null  float64
 6   var12     30000 non-null  float64
 7   var14     30000 non-null  float64
 8   var15     30000 non-null  float64
 9   var16     30000 non-null  float64
 10  var18     30000 non-null  float64
 11  var20     30000 non-null  float64
 12  var21     30000 non-null  float64
 13  var22     30000 non-null  float64
 14  var24     30000 non-null  float64
 15  var25     30000 non-null  float64
 16  var26     30000 non-null  float64
 17  var27     30000 non-null  float64
 18  var28     30000 non-null  float64
 19  var30     30000 non-null  float64
 20  var1_qw   30000 non-null  in

In [7]:
Y_train

Unnamed: 0,payment
0,Success
1,Denied
2,Denied
3,Success
4,Success
...,...
29995,Success
29996,Denied
29997,Denied
29998,Success


In [8]:
Y_train = (Y_train['payment'] == 'Success').astype(int)
Y_train

0        1
1        0
2        0
3        1
4        1
        ..
29995    1
29996    0
29997    0
29998    1
29999    1
Name: payment, Length: 30000, dtype: int32

# Create train_test_spit (for 1 time validation)


In [9]:
X_train,X_test,Y_train,Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

# Logistic Regression

##  Fine a general model (default hyper parameter setting)


In [11]:
LG_1 = LogisticRegression()
LG_1.fit(X_train,Y_train)

In [12]:
LG_1.coef_

array([[-2.99737623e-03, -1.28821488e-03,  7.18588717e-04,
         1.85343114e-03, -3.02380059e-02,  3.63846415e-06,
        -6.61761564e-04, -1.17358829e-02, -1.48892533e-03,
         3.88722184e-02, -1.08395625e-03,  3.67299969e-05,
        -1.52608507e-04,  5.93376096e-04,  4.63411055e-03,
        -1.79805801e-01, -2.32678506e-04, -4.60354138e-03,
         8.97124529e-03,  5.27912942e-04,  5.34348292e-02,
         9.06743815e-02, -5.22277413e-03,  6.09814152e-02,
        -9.31232272e-03,  3.85081291e-02, -3.13914969e-03,
         5.91547790e-02,  3.10702547e-02, -2.13619736e-02,
         1.62998665e-02,  9.61086645e-02,  7.91959976e-02,
         5.08362435e-02,  6.56861650e-02,  6.56391366e-02,
         1.82348870e-02,  2.08283553e-02,  1.64141619e-02,
         5.36547911e-02,  6.33670856e-02,  1.90575236e-02,
        -1.25551492e+00,  5.58213184e-01,  4.61476643e-01,
         3.01210806e-01,  6.49266659e-02,  1.48167997e-02,
         6.40695943e-02,  3.48009998e-02,  4.37761309e-0

In [13]:
LG_1.intercept_

array([0.18151519])

In [14]:
Y_predict = LG_1.predict(X_test)
Y_predict

array([0, 0, 0, ..., 1, 1, 1])

In [15]:
confusion_matrix(Y_test,Y_predict)

array([[3417,  324],
       [ 534, 1725]])

In [16]:
accuracy_score(Y_test,Y_predict)

0.857

In [17]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89      3741
           1       0.84      0.76      0.80      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.84      6000
weighted avg       0.86      0.86      0.86      6000



In [18]:
f1_score(Y_test,Y_predict)

0.8008356545961003

In [19]:
Y_predict_prob = LG_1.predict_proba(X_test)
Y_predict_prob

array([[0.84955972, 0.15044028],
       [0.89009549, 0.10990451],
       [0.73465919, 0.26534081],
       ...,
       [0.24406615, 0.75593385],
       [0.07885338, 0.92114662],
       [0.34634105, 0.65365895]])

In [20]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89      3741
           1       0.81      0.83      0.82      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



##  Hyper parameter with Cross validation (select best hyper parameter settings with multifold validations)


In [21]:
param_grid = {'penalty':['l1', 'l2', 'elasticnet', None],
              'C':np.linspace(0.01,6,200),
              'class_weight':['balanced',None],
              'solver' : ['liblinear','saga']}

In [22]:
Log_Reg_model = LogisticRegression()

Log_Reg_Random_Search = RandomizedSearchCV(Log_Reg_model,
                                          param_distributions = param_grid,
                                          cv = 5,
                                          n_iter = 25,
                                          scoring = 'roc_auc')



In [23]:
Log_Reg_Random_Search.fit(X_train,Y_train)

In [24]:
Log_Reg_Random_Search.best_estimator_

In [None]:
param_grid = {'penalty':['l1', 'l2', 'elasticnet', None],
              'C':np.linspace(0.1,1,25),
              'class_weight':['balanced',None],
              'solver' : ['liblinear','saga']}
Log_Reg_model = LogisticRegression()

Log_Reg_Grid_Search = GridSearchCV(Log_Reg_model,
                             param_grid = param_grid,
                             cv = 5,
                             scoring = 'roc_auc')
Log_Reg_Grid_Search.fit(X_train,Y_train)

In [None]:
Log_Reg_Grid_Search.best_estimator_

In [None]:
Log_Reg_model = Log_Reg_Grid_Search.best_estimator_

In [None]:
Log_Reg_model.fit(X_train,Y_train)
Y_predict = Log_Reg_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3421,  320],
       [ 527, 1732]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3741
           1       0.84      0.77      0.80      2259

    accuracy                           0.86      6000
   macro avg       0.86      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8035258640686614

In [None]:
Y_predict_prob = Log_Reg_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict)

0.42522522522522527

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      3741
           1       0.80      0.84      0.82      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
dtree_model = DecisionTreeClassifier()
dtree_model_cv_scores = cross_val_score(dtree_model,X_train,Y_train,cv=5,scoring='roc_auc')
dtree_model_cv_scores.mean() #uncontrolled decision tree's validation performance in auc is less than logistic

0.7694991694046608

In [None]:
dtree_model.fit(X_train,Y_train)
Y_predict = dtree_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3089,  652],
       [ 685, 1574]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      3741
           1       0.71      0.70      0.70      2259

    accuracy                           0.78      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.78      0.78      0.78      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7018952062430324

In [None]:
Y_predict_prob = dtree_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict)

0.01

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      3741
           1       0.71      0.70      0.70      2259

    accuracy                           0.78      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.78      0.78      0.78      6000



In [None]:
%%time
rf_model = RandomForestClassifier()
rf_model_cv_scores = cross_val_score(rf_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
rf_model_cv_scores.mean()
#slower processing compared to logistic, dtree
#better validation result

CPU times: user 409 ms, sys: 152 ms, total: 561 ms
Wall time: 37.1 s


0.884275076264538

In [None]:
rf_model.fit(X_train,Y_train)
Y_predict = rf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3407,  334],
       [ 509, 1750]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      3741
           1       0.84      0.77      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8058945429426665

In [None]:
Y_predict_prob = rf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.40045045045045047

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      3741
           1       0.80      0.83      0.82      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
%%time
et_model = ExtraTreesClassifier()
et_model_cv_scores = cross_val_score(et_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
et_model_cv_scores.mean()
#faster processing compared to randomforest
#better validation result than logistic, dtree

CPU times: user 200 ms, sys: 55.6 ms, total: 255 ms
Wall time: 18.9 s


0.8613057313753556

In [None]:
et_model.fit(X_train,Y_train)
Y_predict = et_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3431,  310],
       [ 864, 1395]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.80      0.92      0.85      3741
           1       0.82      0.62      0.70      2259

    accuracy                           0.80      6000
   macro avg       0.81      0.77      0.78      6000
weighted avg       0.81      0.80      0.80      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7038345105953582

In [None]:
Y_predict_prob = et_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4202702702702703

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      3741
           1       0.74      0.76      0.75      2259

    accuracy                           0.81      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.81      0.81      0.81      6000



In [None]:
%%time
ada_model = AdaBoostClassifier()
ada_model_cv_scores = cross_val_score(ada_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
ada_model_cv_scores.mean()

CPU times: user 160 ms, sys: 36.5 ms, total: 196 ms
Wall time: 14.5 s


0.8810854317302965

In [None]:
ada_model.fit(X_train,Y_train)
Y_predict = ada_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3398,  343],
       [ 588, 1671]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      3741
           1       0.83      0.74      0.78      2259

    accuracy                           0.84      6000
   macro avg       0.84      0.82      0.83      6000
weighted avg       0.84      0.84      0.84      6000



In [None]:
f1_score(Y_test,Y_predict)

0.7821202901942429

In [None]:
Y_predict_prob = ada_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4975675675675676

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.85      0.87      3741
           1       0.77      0.84      0.80      2259

    accuracy                           0.84      6000
   macro avg       0.83      0.84      0.83      6000
weighted avg       0.85      0.84      0.84      6000



In [None]:
%%time
gb_model = GradientBoostingClassifier()
gb_model_cv_scores = cross_val_score(gb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
gb_model_cv_scores.mean()

CPU times: user 614 ms, sys: 125 ms, total: 739 ms
Wall time: 1min 6s


0.8889860046791135

In [None]:
gb_model.fit(X_train,Y_train)
Y_predict = gb_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3405,  336],
       [ 462, 1797]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90      3741
           1       0.84      0.80      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.85      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8183060109289617

In [None]:
Y_predict_prob = gb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4242342342342343

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3741
           1       0.82      0.84      0.83      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
%%time
xgb_model = XGBClassifier()
xgb_model_cv_scores = cross_val_score(xgb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
xgb_model_cv_scores.mean()

CPU times: user 108 ms, sys: 21.2 ms, total: 130 ms
Wall time: 5.64 s


0.8829256553207896

In [None]:
xgb_model.fit(X_train,Y_train)
Y_predict = xgb_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3379,  362],
       [ 433, 1826]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      3741
           1       0.83      0.81      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8212277940184395

In [None]:
Y_predict_prob = xgb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.42225225225225227

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3741
           1       0.82      0.83      0.82      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [None]:
%%time
lgb_model = LGBMClassifier()
lgb_model_cv_scores = cross_val_score(lgb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
lgb_model_cv_scores.mean()

CPU times: user 120 ms, sys: 15 ms, total: 135 ms
Wall time: 8.55 s


0.8891084378389884

In [None]:
lgb_model.fit(X_train,Y_train)
Y_predict = lgb_model.predict(X_test)

[LightGBM] [Info] Number of positive: 8986, number of negative: 15014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5184
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374417 -> initscore=-0.513315
[LightGBM] [Info] Start training from score -0.513315


In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3411,  330],
       [ 416, 1843]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3741
           1       0.85      0.82      0.83      2259

    accuracy                           0.88      6000
   macro avg       0.87      0.86      0.87      6000
weighted avg       0.88      0.88      0.88      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8316787003610107

In [None]:
Y_predict_prob = lgb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.35981981981981986

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3741
           1       0.82      0.85      0.84      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.87      0.87      6000
weighted avg       0.88      0.87      0.87      6000



In [None]:
%%time
cb_model = CatBoostClassifier()
cb_model_cv_scores = cross_val_score(cb_model,X_train,Y_train,cv=5,scoring='roc_auc',n_jobs=-1)
cb_model_cv_scores.mean()

CPU times: user 910 ms, sys: 148 ms, total: 1.06 s
Wall time: 1min 33s


0.8909371352020707

In [None]:
cb_model.fit(X_train,Y_train)
Y_predict = cb_model.predict(X_test)

Learning rate set to 0.040021
0:	learn: 0.6675296	total: 64.9ms	remaining: 1m 4s
1:	learn: 0.6452212	total: 78.3ms	remaining: 39s
2:	learn: 0.6254495	total: 91ms	remaining: 30.3s
3:	learn: 0.6059213	total: 104ms	remaining: 26s
4:	learn: 0.5880209	total: 118ms	remaining: 23.5s
5:	learn: 0.5718522	total: 133ms	remaining: 22.1s
6:	learn: 0.5597428	total: 147ms	remaining: 20.9s
7:	learn: 0.5461153	total: 161ms	remaining: 20s
8:	learn: 0.5334705	total: 177ms	remaining: 19.5s
9:	learn: 0.5231118	total: 190ms	remaining: 18.9s
10:	learn: 0.5126780	total: 207ms	remaining: 18.6s
11:	learn: 0.5032010	total: 221ms	remaining: 18.2s
12:	learn: 0.4956384	total: 237ms	remaining: 18s
13:	learn: 0.4877502	total: 251ms	remaining: 17.7s
14:	learn: 0.4801944	total: 264ms	remaining: 17.4s
15:	learn: 0.4732024	total: 278ms	remaining: 17.1s
16:	learn: 0.4675784	total: 291ms	remaining: 16.8s
17:	learn: 0.4622923	total: 305ms	remaining: 16.7s
18:	learn: 0.4569810	total: 319ms	remaining: 16.5s
19:	learn: 0.45215

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3422,  319],
       [ 428, 1831]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3741
           1       0.85      0.81      0.83      2259

    accuracy                           0.88      6000
   macro avg       0.87      0.86      0.87      6000
weighted avg       0.87      0.88      0.87      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8305738262644592

In [None]:
Y_predict_prob = cb_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.39054054054054055

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3741
           1       0.82      0.85      0.84      2259

    accuracy                           0.87      6000
   macro avg       0.87      0.87      0.87      6000
weighted avg       0.88      0.87      0.88      6000



In [None]:
roc_auc_score(Y_test,Y_predict_prob[:,1])

0.8837418747002546

In [None]:
accuracy_score(Y_test,Y_predict)

0.8746666666666667

In [None]:
%%time
dtree_clf_model = DecisionTreeClassifier()

dtree_clf_params = {
    'class_weight':[None,'balanced'],
    'criterion':['entropy','gini'],
    'max_depth':[None,5,10,15,20,30,50,70],
    'min_samples_leaf':[1,2,5,10,15,20],
    'min_samples_split':[2,5,10,15,20]
}

dtree_clf_random_search = RandomizedSearchCV(dtree_clf_model,param_distributions=dtree_clf_params,
                                             cv = 5, n_iter=10,scoring='roc_auc',n_jobs=-1)

dtree_clf_random_search.fit(X_train,Y_train)
dtree_clf_random_search.best_score_

CPU times: user 1.61 s, sys: 86.9 ms, total: 1.7 s
Wall time: 37.2 s


0.8724810232909943

In [None]:
dtree_clf_model = dtree_clf_random_search.best_estimator_

In [None]:
dtree_clf_model.fit(X_train,Y_train)
Y_predict = dtree_clf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3361,  380],
       [ 475, 1784]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3741
           1       0.82      0.79      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8066922903007009

In [None]:
Y_predict_prob = dtree_clf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4450450450450451

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      3741
           1       0.82      0.80      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
%%time
rf_clf_model = RandomForestClassifier()

rf_clf_params = {
    'n_estimators':[100,200,300,500,700,1000], #number of individual decision trees to be created
    'max_features': [5,10,20,25,30,35], #how many features would be available at a split
    'bootstrap': [True, False], #should different data subsets go in or not
    'class_weight':[None,'balanced'],
    'criterion':['entropy','gini'],
    'max_depth':[None,5,10,15,20,30,50,70],
    'min_samples_leaf':[1,2,5,10,15,20],
    'min_samples_split':[2,5,10,15,20]
}

rf_clf_random_search = RandomizedSearchCV(rf_clf_model,param_distributions=rf_clf_params,
                                             cv = 5, n_jobs=-1, n_iter=10,scoring='roc_auc')

rf_clf_random_search.fit(X_train,Y_train)
rf_clf_random_search.best_score_

CPU times: user 1min 39s, sys: 7.47 s, total: 1min 46s
Wall time: 1h 13min 26s


0.889465651526206

In [None]:
rf_clf_model = dtree_clf_random_search.best_estimator_

In [None]:
rf_clf_model.fit(X_train,Y_train)
Y_predict = rf_clf_model.predict(X_test)

In [None]:
confusion_matrix(Y_test,Y_predict)

array([[3363,  378],
       [ 475, 1784]])

In [None]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3741
           1       0.83      0.79      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.84      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [None]:
f1_score(Y_test,Y_predict)

0.8070572268717485

In [None]:
Y_predict_prob = rf_clf_model.predict_proba(X_test)

In [None]:
Threshold_Finder(Y_test,Y_predict_prob)

0.4450450450450451

In [None]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89      3741
           1       0.82      0.80      0.81      2259

    accuracy                           0.86      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.86      0.86      0.86      6000



In [10]:
%%time
cb_clf_model = CatBoostClassifier()

cb_clf_params = {
   'n_estimators':[50,100,200,500,700],
   'learning_rate': [0.01,.05,0.1,0.4,0.8,1],
   'max_depth':[1,2,3,4,5,6],
   'subsample':[0.5,0.8,1],
   'colsample_bylevel':[i/10.0 for i in range(5,10)],
   'l2_leaf_reg':[1e-5, 1e-2, 0.1, 1, 100]
}

cb_clf_random_search = RandomizedSearchCV(cb_clf_model,param_distributions=cb_clf_params,
                                             cv = 5, n_iter=2,scoring='roc_auc',n_jobs=-1)



Wall time: 0 ns


In [11]:
cb_clf_random_search.fit(X_train,Y_train)
cb_clf_random_search.best_score_

0:	learn: 0.4391944	total: 81.1ms	remaining: 8.03s
1:	learn: 0.4058549	total: 90.4ms	remaining: 4.43s
2:	learn: 0.3963906	total: 99ms	remaining: 3.2s
3:	learn: 0.3909068	total: 109ms	remaining: 2.62s
4:	learn: 0.3879280	total: 119ms	remaining: 2.26s
5:	learn: 0.3851715	total: 128ms	remaining: 2.01s
6:	learn: 0.3838744	total: 139ms	remaining: 1.85s
7:	learn: 0.3825268	total: 148ms	remaining: 1.7s
8:	learn: 0.3814508	total: 156ms	remaining: 1.57s
9:	learn: 0.3801633	total: 164ms	remaining: 1.48s
10:	learn: 0.3794438	total: 172ms	remaining: 1.39s
11:	learn: 0.3782878	total: 184ms	remaining: 1.35s
12:	learn: 0.3774227	total: 194ms	remaining: 1.29s
13:	learn: 0.3764315	total: 204ms	remaining: 1.25s
14:	learn: 0.3749392	total: 212ms	remaining: 1.2s
15:	learn: 0.3711877	total: 221ms	remaining: 1.16s
16:	learn: 0.3678521	total: 229ms	remaining: 1.12s
17:	learn: 0.3659683	total: 239ms	remaining: 1.09s
18:	learn: 0.3646994	total: 247ms	remaining: 1.05s
19:	learn: 0.3637430	total: 256ms	remaining

0.8644950772115386

In [12]:
cb_clf_model = cb_clf_random_search.best_estimator_

In [13]:
cb_clf_model.fit(X_train,Y_train)
Y_predict_prob = cb_clf_model.predict_proba(X_test)
Y_predict = cb_clf_model.predict(X_test)

0:	learn: 0.4391944	total: 9.56ms	remaining: 946ms
1:	learn: 0.4058549	total: 18.9ms	remaining: 925ms
2:	learn: 0.3963906	total: 27.5ms	remaining: 889ms
3:	learn: 0.3909068	total: 34.1ms	remaining: 819ms
4:	learn: 0.3879280	total: 41.5ms	remaining: 788ms
5:	learn: 0.3851715	total: 48.2ms	remaining: 756ms
6:	learn: 0.3838744	total: 55.5ms	remaining: 737ms
7:	learn: 0.3825268	total: 62ms	remaining: 713ms
8:	learn: 0.3814508	total: 67.9ms	remaining: 687ms
9:	learn: 0.3801633	total: 73.5ms	remaining: 661ms
10:	learn: 0.3794438	total: 78.8ms	remaining: 637ms
11:	learn: 0.3782878	total: 86.1ms	remaining: 631ms
12:	learn: 0.3774227	total: 92.4ms	remaining: 618ms
13:	learn: 0.3764315	total: 99.7ms	remaining: 612ms
14:	learn: 0.3749392	total: 106ms	remaining: 602ms
15:	learn: 0.3711877	total: 112ms	remaining: 590ms
16:	learn: 0.3678521	total: 118ms	remaining: 577ms
17:	learn: 0.3659683	total: 125ms	remaining: 569ms
18:	learn: 0.3646994	total: 130ms	remaining: 555ms
19:	learn: 0.3637430	total: 1

In [14]:
roc_auc_score(Y_test,Y_predict_prob[:,1])

0.8657049014432632

In [15]:
confusion_matrix(Y_test,Y_predict)

array([[3316,  425],
       [ 526, 1733]], dtype=int64)

In [16]:
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      3741
           1       0.80      0.77      0.78      2259

    accuracy                           0.84      6000
   macro avg       0.83      0.83      0.83      6000
weighted avg       0.84      0.84      0.84      6000



In [17]:
f1_score(Y_test,Y_predict)

0.7846954946796468

In [18]:
Y_predict_prob = cb_clf_model.predict_proba(X_test)

In [19]:
Threshold_Finder(Y_test,Y_predict_prob)

0.3588288288288289

In [20]:
Y_predict = Y_predict_prob[:,1] > Threshold_Finder(Y_test,Y_predict_prob)
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86      3741
           1       0.76      0.81      0.78      2259

    accuracy                           0.83      6000
   macro avg       0.82      0.83      0.82      6000
weighted avg       0.84      0.83      0.83      6000



In [22]:
temp = X_train.columns

In [23]:
scalar = StandardScaler()

X_train = scalar.fit_transform(X_train)
X_train = pd.DataFrame(data = X_train, columns = temp)
X_train.head()

Unnamed: 0,var3,var4,var5,var6,var7,var8,var12,var14,var15,var16,...,var23_tf,var23_ub,var23_qu,var23_ri,var23_fe,var23_da,var23_cz,var23_sy,var23_yv,var29_ev
0,1.795857,-0.92554,0.212231,-1.845218,2.16426,0.107453,0.164472,0.874381,0.814711,1.301922,...,1.690716,-0.517781,-0.444259,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,-1.995584
1,0.994893,-0.617507,-0.646956,0.710405,1.021448,-0.450844,-0.078446,-0.263682,-1.304482,-0.196447,...,-0.591465,1.93132,-0.444259,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
2,-0.341154,0.928364,0.358946,1.471601,0.108987,1.240615,-1.172287,-0.980612,-0.757384,2.261316,...,-0.591465,-0.517781,-0.444259,2.584094,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
3,-0.111833,-0.886323,-1.113731,-0.344039,0.197326,1.275405,0.40739,-0.925464,0.023932,0.517626,...,-0.591465,-0.517781,-0.444259,-0.386983,3.17013,-0.265377,-0.210141,-0.15186,-0.10363,0.501107
4,-0.18495,-1.549451,-0.526871,1.278182,0.636783,-0.097974,1.59783,-0.594573,-0.8532,-2.03407,...,-0.591465,-0.517781,2.250941,-0.386983,-0.315444,-0.265377,-0.210141,-0.15186,-0.10363,0.501107


In [24]:
knn_model = KNeighborsClassifier(10)
knn_cv_scores= cross_val_score(knn_model,X_train.values,Y_train,cv=5,scoring='roc_auc')
knn_cv_scores.mean()

0.7112238648031439

In [25]:
svm_model = SVC()
svm_cv_scores= cross_val_score(svm_model,X_train,Y_train,cv=5,scoring='roc_auc')
svm_cv_scores.mean()

0.8814048549704323

#### Bases on the above Model performance. 
#### We can Consider lgb_model, cb_model Model for predicting whether Person will default on the payday loan.
