In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [2]:
df = pd.read_csv('Loan_Default.csv')

In [3]:
df

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.376900,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,173555,2019,cf,Sex Not Available,nopre,type1,p3,l1,nopc,nob/c,...,CIB,659,EXP,55-64,to_inst,71.792763,south,direct,0,48.0
148666,173556,2019,cf,Male,nopre,type1,p1,l1,nopc,nob/c,...,CIB,569,CIB,25-34,not_inst,74.428934,south,direct,0,15.0
148667,173557,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,CIB,702,EXP,45-54,not_inst,61.332418,North,direct,0,49.0
148668,173558,2019,cf,Female,nopre,type1,p4,l1,nopc,nob/c,...,EXP,737,EXP,55-64,to_inst,70.683453,North,direct,0,29.0


In [4]:
df['Status'].value_counts()

0    112031
1     36639
Name: Status, dtype: int64

In [5]:
df0s = df[df['Status']==0].iloc[random.sample(range(1, 100000), 36639)]
df1s = df[df['Status']==1]
df = pd.concat([df0s, df1s])

In [6]:
df['Status'].value_counts()

0    36639
1    36639
Name: Status, dtype: int64

In [7]:
df.columns

Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

In [8]:
df = df.drop('ID', axis = 1)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73278 entries, 83112 to 148663
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       73278 non-null  int64  
 1   loan_limit                 71573 non-null  object 
 2   Gender                     73278 non-null  object 
 3   approv_in_adv              72810 non-null  object 
 4   loan_type                  73278 non-null  object 
 5   loan_purpose               73216 non-null  object 
 6   Credit_Worthiness          73278 non-null  object 
 7   open_credit                73278 non-null  object 
 8   business_or_commercial     73278 non-null  object 
 9   loan_amount                73278 non-null  int64  
 10  rate_of_interest           36839 non-null  float64
 11  Interest_rate_spread       36639 non-null  float64
 12  Upfront_charges            35736 non-null  float64
 13  term                       73258 non-null

In [10]:
df.isnull().sum()

year                             0
loan_limit                    1705
Gender                           0
approv_in_adv                  468
loan_type                        0
loan_purpose                    62
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              37542
term                            20
Neg_ammortization               55
interest_only                    0
lump_sum_payment                 0
property_value               15097
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        3878
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_application      200
LTV                 

In [11]:
num = list(df.select_dtypes(exclude=['object']))
cat = list(df.select_dtypes(include=['object']))

In [12]:
df[num]=df[num].fillna(df[num].mean()) 
df[cat]=df[cat].fillna(df[cat].mode().iloc[0])

In [13]:
df.isnull().sum()

year                         0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status                       0
dtir1   

In [14]:
le = LabelEncoder()
df[cat] = df[cat].apply(le.fit_transform)

In [15]:
df

Unnamed: 0,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
83112,2019,0,1,0,1,3,0,0,0,316500,...,0,505,1,3,0,90.948276,3,1,0,38.223873
61894,2019,0,1,0,0,2,0,0,1,476500,...,1,538,1,1,0,79.682274,0,1,0,36.000000
57543,2019,0,2,1,0,3,0,0,1,216500,...,1,684,0,4,1,70.292208,3,1,0,36.000000
23636,2019,0,3,1,0,2,0,0,1,296500,...,3,744,0,3,0,78.439153,3,1,0,42.000000
13637,2019,0,2,0,2,0,0,0,1,686500,...,1,546,0,0,0,101.253687,3,1,0,44.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148651,2019,0,1,0,2,2,0,0,1,446500,...,3,897,1,2,1,87.893701,0,1,1,37.000000
148652,2019,0,2,0,0,0,0,0,1,96500,...,2,608,1,3,1,73.691714,0,1,1,38.223873
148658,2019,0,3,0,0,3,0,0,1,386500,...,2,669,1,0,1,73.691714,3,1,1,38.223873
148661,2019,0,3,0,1,3,0,0,0,346500,...,3,585,0,0,1,96.787710,3,1,1,38.223873


In [16]:
X, y = df.drop('Status',axis=1), df['Status']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
#model.add(BatchNormalization())

model = Sequential()
model.add(Dense(64, activation='relu', input_dim = X.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu', kernel_regularizer = 'l2'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu', kernel_regularizer = 'l2'))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'sigmoid'))

In [20]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [21]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy', f1_m, recall_m, precision_m] )

In [22]:
model.fit(
    X_train, 
    y_train, 
    batch_size = 32, 
    epochs = 10, 
    verbose = 1
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ea37f2d8b0>

In [23]:
y_pred = model.predict(X_test)
y_pred = np.where(y_pred >= 0.7,1,0)



In [24]:
model.evaluate(X_test, y_test.to_numpy())



[0.024519313126802444,
 0.9976118803024292,
 0.997509777545929,
 0.9980201721191406,
 0.9971665143966675]

In [25]:
confusion_matrix(y_test, y_pred)

array([[7250,   15],
       [  14, 7377]], dtype=int64)