In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
plt.rc("font", size=13)
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split, DataLoader

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("data/bank_data.csv", header=0)

In [3]:
data

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.90,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.70,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100509,,,,,,,,,,,,,,,,,,,
100510,,,,,,,,,,,,,,,,,,,
100511,,,,,,,,,,,,,,,,,,,
100512,,,,,,,,,,,,,,,,,,,


In [4]:
data.drop(['Loan ID', 'Customer ID'], axis = 1, inplace = True)
data.drop_duplicates(inplace = True)

In [67]:
data.isnull().sum()

Loan Status                         1
Current Loan Amount                 1
Term                                1
Credit Score                    19155
Annual Income                   19155
Years in current job             3803
Home Ownership                      1
Purpose                             1
Monthly Debt                        1
Years of Credit History             1
Months since last delinquent    48338
Number of Open Accounts             1
Number of Credit Problems           1
Current Credit Balance              1
Maximum Open Credit                 3
Bankruptcies                      191
Tax Liens                          10
dtype: int64

In [68]:
data.drop(['Months since last delinquent'], axis = 1, inplace = True)
data.dropna(axis = 0, subset = ['Credit Score'], inplace = True)
data.dropna(axis = 0, subset = ['Years in current job'], inplace = True)
data.dropna(axis = 0, subset = ['Bankruptcies'], inplace = True)
data.dropna(axis = 0, subset = ['Maximum Open Credit'], inplace = True)

In [69]:
data.isnull().sum()

Loan Status                  0
Current Loan Amount          0
Term                         0
Credit Score                 0
Annual Income                0
Years in current job         0
Home Ownership               0
Purpose                      0
Monthly Debt                 0
Years of Credit History      0
Number of Open Accounts      0
Number of Credit Problems    0
Current Credit Balance       0
Maximum Open Credit          0
Bankruptcies                 0
Tax Liens                    0
dtype: int64

In [70]:
le = LabelEncoder()
data['Loan Status'] = le.fit_transform(data['Loan Status'])
data['Term'] = le.fit_transform(data['Term'])
data['Years in current job'] = le.fit_transform(data['Years in current job'])
data['Home Ownership'] = le.fit_transform(data['Home Ownership'])
data['Purpose'] = le.fit_transform(data['Purpose'])

In [71]:
X = data.loc[:, data.columns != 'Loan Status'].values
y = data.loc[ : , ["Loan Status"]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [79]:

classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_depth = 3, random_state = 0)
classifier.fit(X_train, y_train)

In [80]:
y_pred = classifier.predict(X_test)
rftrain = classifier.score(X_train , y_train)
rftest = classifier.score(X_test , y_test)

In [81]:
print(rftrain, rftest)

0.8241591346866203 0.8202696695806786


In [82]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.27      0.42      3309
           1       0.81      1.00      0.89     10189

    accuracy                           0.82     13498
   macro avg       0.90      0.63      0.66     13498
weighted avg       0.85      0.82      0.78     13498

