# Loading libraries

In [59]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,Ridge,Lasso
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import f1_score,accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import class_weight

from mypipes import *

import warnings
warnings.filterwarnings('ignore') # ignore the warnings.


import matplotlib.pyplot as plt
import seaborn as sb
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

def Threshold_Finder(Y_test,Y_predict_):
    I = []
    roc_ = []
    for i in np.linspace(0.01,1,1000):
        Y_predict = Y_predict_ > i
        roc = f1_score(Y_test,Y_predict)
        I.append(i)
        roc_.append(roc)
    Temp = I[roc_.index(max(roc_))]
    return Temp

#  Loading the datasets

In [2]:
train_data = r'paydayloan_collections.csv'
#test_data = r''

train_data = pd.read_csv(train_data)
#test_data = pd.read_csv(test_data)

In [3]:
# Data Previews
train_data.transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29990,29991,29992,29993,29994,29995,29996,29997,29998,29999
payment,Success,Denied,Denied,Success,Success,Success,Success,Success,Denied,Denied,...,Success,Success,Denied,Success,Success,Success,Denied,Denied,Success,Success
var1,qw,qw,qw,wv,ma,kq,wv,qw,wv,qw,...,qw,wv,qw,wv,wv,qw,kq,kq,qw,qw
var2,hk,rv,zg,js,xn,py,py,rv,rv,rv,...,py,py,rv,bq,py,zg,bq,js,py,js
var3,3.11,3.35,4.15,6.23,1.28,-2.45,1.05,5.41,7.29,3.13,...,-3.2,7.75,4.82,2.2,2.51,3.85,3.32,2.98,-0.3,6.03
var4,16.06,11.18,29.19,15.7,20.71,22.45,23.02,17.92,26.83,34.21,...,7.38,26.77,26.04,-26.34,4.36,12.75,25.31,19.28,16.41,-6.99
var5,-4.6,-18.55,18.91,2.81,14.98,15.18,17.59,-14.59,33.92,22.55,...,40.39,45.77,14.51,18.82,53.03,47.62,15.9,16.2,22.8,-28.71
var6,22.34,6.68,16.4,4.46,11.19,-2.12,6.65,5.0,13.35,0.8,...,7.69,4.51,4.12,14.42,-0.73,3.34,10.96,-1.7,-9.99,11.82
var7,13.53,12.78,3.67,5.13,17.66,-8.24,-2.06,1.34,20.57,6.91,...,-5.24,6.97,12.07,2.44,14.0,17.22,10.13,9.45,26.89,4.71
var8,1.53,6.62,5.72,8.66,1.13,10.34,12.2,-8.54,4.46,11.18,...,7.09,15.35,11.31,4.12,15.72,7.0,10.32,-8.44,-1.1,5.0
var9,nv,nv,ch,ja,nv,ch,ch,ch,ch,ch,...,ch,ch,ch,ch,ch,ch,ch,ch,ch,ch


In [4]:
# get information of Data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 31 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   payment  30000 non-null  object 
 1   var1     30000 non-null  object 
 2   var2     30000 non-null  object 
 3   var3     30000 non-null  float64
 4   var4     30000 non-null  float64
 5   var5     30000 non-null  float64
 6   var6     30000 non-null  float64
 7   var7     30000 non-null  float64
 8   var8     30000 non-null  float64
 9   var9     30000 non-null  object 
 10  var10    30000 non-null  object 
 11  var11    30000 non-null  object 
 12  var12    30000 non-null  float64
 13  var13    30000 non-null  object 
 14  var14    30000 non-null  float64
 15  var15    30000 non-null  float64
 16  var16    30000 non-null  float64
 17  var17    30000 non-null  object 
 18  var18    30000 non-null  float64
 19  var19    30000 non-null  object 
 20  var20    30000 non-null  float64
 21  var21    300

In [5]:
p1 = pdPipeline([
    ('columns_selection',VarSelector(['var1','var2','var9','var10','var11','var13','var17','var19','var23','var29'])),
    ('data_impute',DataFrameImputer()),
    ('get_dummy',get_dummies_Pipe())
])
temp = pd.DataFrame(data = p1.fit_transform(train_data),columns = p1.get_feature_names())

train_data = pd.concat([train_data,temp],axis = 1)

In [6]:
Y_train = train_data['payment']
Y_train = pd.DataFrame(data = Y_train , columns = ['payment'] )
train_data.drop(columns = ['payment'],inplace = True)
X_train = train_data
X_train.drop(['var1','var2','var9','var10','var11','var13','var17','var19','var23','var29'],inplace = True,axis = 1)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 62 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   var3      30000 non-null  float64
 1   var4      30000 non-null  float64
 2   var5      30000 non-null  float64
 3   var6      30000 non-null  float64
 4   var7      30000 non-null  float64
 5   var8      30000 non-null  float64
 6   var12     30000 non-null  float64
 7   var14     30000 non-null  float64
 8   var15     30000 non-null  float64
 9   var16     30000 non-null  float64
 10  var18     30000 non-null  float64
 11  var20     30000 non-null  float64
 12  var21     30000 non-null  float64
 13  var22     30000 non-null  float64
 14  var24     30000 non-null  float64
 15  var25     30000 non-null  float64
 16  var26     30000 non-null  float64
 17  var27     30000 non-null  float64
 18  var28     30000 non-null  float64
 19  var30     30000 non-null  float64
 20  var1_qw   30000 non-null  in

In [7]:
Y_train

Unnamed: 0,payment
0,Success
1,Denied
2,Denied
3,Success
4,Success
...,...
29995,Success
29996,Denied
29997,Denied
29998,Success


In [8]:
Y_train = (Y_train['payment'] == 'Success').astype('float32')
Y_train

0        1.0
1        0.0
2        0.0
3        1.0
4        1.0
        ... 
29995    1.0
29996    0.0
29997    0.0
29998    1.0
29999    1.0
Name: payment, Length: 30000, dtype: float32

# Create train_test_spit (for 1 time validation)


In [9]:
X_train,X_test,Y_train,Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)


In [10]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# FFN USING KERAS

In [11]:
scalar = StandardScaler()

X_train_ = scalar.fit_transform(X_train)
X_train_

array([[ 1.7958571 , -0.9255405 ,  0.21223104, ..., -0.15186016,
        -0.10362964, -1.9955837 ],
       [ 0.99489325, -0.6175071 , -0.64695626, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-0.34115422,  0.928364  ,  0.35894606, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       ...,
       [-0.36441875,  0.6288872 , -0.19625974, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-1.4046746 , -0.45778608,  0.39060038, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-0.36774224,  0.44420975,  0.28106657, ..., -0.15186016,
        -0.10362964,  0.5011065 ]], dtype=float32)

In [12]:
X_test_ = scalar.transform(X_test)
X_test_

array([[ 1.9487381 , -1.4225296 , -0.03798841, ..., -0.15186016,
        -0.10362964, -1.9955837 ],
       [-2.893603  , -0.36509088, -0.08069656, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-1.2285291 , -1.2699388 , -1.1036822 , ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       ...,
       [ 1.443566  , -0.6068115 ,  0.21373841, ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-1.201941  ,  0.2602454 ,  1.3221403 , ..., -0.15186016,
        -0.10362964,  0.5011065 ],
       [-0.37106574, -0.7907759 ,  0.8151695 , ..., -0.15186016,
        -0.10362964,  0.5011065 ]], dtype=float32)

In [13]:
cw = class_weight.compute_class_weight(class_weight='balanced',classes = np.unique(Y_train),y = Y_train)

In [14]:
cw_dict =dict(enumerate(cw))

In [15]:
cw_dict

{0: 0.7992540295723991, 1: 1.3354106387714222}

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.regularizers import L1,L2

In [17]:
model_seq = Sequential()

model_seq.add(Dense(32,input_dim = 62,activation = 'relu',kernel_regularizer = L2(0.01),name = 'h1'))
model_seq.add(BatchNormalization(name = 'bn_h1'))
model_seq.add(Dropout(0.2,name = 'dr_h1'))

model_seq.add(Dense(16,input_dim = 32,activation = 'relu',kernel_regularizer = L2(0.01),name = 'h2'))
model_seq.add(BatchNormalization(name = 'bn_h2'))
model_seq.add(Dropout(0.2,name = 'dr_h2'))

model_seq.add(Dense(8,input_dim = 16,activation = 'relu',kernel_regularizer = L2(0.01),name = 'h3'))
model_seq.add(BatchNormalization(name = 'bn_h3'))
model_seq.add(Dropout(0.2,name = 'dr_h3'))

model_seq.add(Dense(1,input_dim = 8,activation = 'sigmoid',name = 'output'))

In [38]:
model_seq.compile(optimizer= 'adam',loss = 'binary_crossentropy',metrics =['AUC'])

In [39]:
model_seq.summary()

In [40]:
model_seq.fit(x = X_train_,y = Y_train,validation_data = (X_test_,Y_test),epochs = 1000,batch_size = 1000,verbose = 1)

Epoch 1/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - AUC: 0.9238 - loss: 0.3279 - val_AUC: 0.8761 - val_loss: 0.4240
Epoch 2/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.9192 - loss: 0.3394 - val_AUC: 0.8787 - val_loss: 0.4101
Epoch 3/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.9198 - loss: 0.3321 - val_AUC: 0.8817 - val_loss: 0.4085
Epoch 4/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.9237 - loss: 0.3266 - val_AUC: 0.8787 - val_loss: 0.4207
Epoch 5/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.9195 - loss: 0.3365 - val_AUC: 0.8802 - val_loss: 0.4089
Epoch 6/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - AUC: 0.9219 - loss: 0.3308 - val_AUC: 0.8771 - val_loss: 0.4159
Epoch 7/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step -

<keras.src.callbacks.history.History at 0x28d74152a90>

In [41]:
y_pred = model_seq.predict(X_test_)

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [42]:
roc_auc_score(Y_test,y_pred)

0.8801693046637886

In [60]:
Y_predict = y_pred[:,0] > Threshold_Finder(Y_test,y_pred)   
print(classification_report(Y_test,Y_predict))

              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90      3741
         1.0       0.83      0.83      0.83      2259

    accuracy                           0.87      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.87      0.87      0.87      6000



In [61]:
confusion_matrix(Y_test,Y_predict)

array([[3366,  375],
       [ 393, 1866]], dtype=int64)