In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("/content/loan_approval_dataset.csv")
pd.set_option('display.max_columns', None)
data.head(10)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,6,0,Graduate,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,7,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,8,2,Graduate,Yes,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,9,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,10,5,Not Graduate,No,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


**Data Preprocessing**

In [3]:
data.count().isnull()

Unnamed: 0,0
loan_id,False
no_of_dependents,False
education,False
self_employed,False
income_annum,False
loan_amount,False
loan_term,False
cibil_score,False
residential_assets_value,False
commercial_assets_value,False


In [4]:
data.duplicated().sum()

np.int64(0)

In [5]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [6]:
target = ' loan_status'
X = data.drop(columns=[target])
y = data[target]

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [8]:
cat_cols = X.select_dtypes(include=['object']).columns

In [9]:
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=cat_cols)
X_encoded = encoder.fit_transform(X)

In [10]:
X_encoded.head()

Unnamed: 0,loan_id,no_of_dependents,education_0,education_1,self_employed_0,self_employed_1,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,1,2,0,1,0,1,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,2,0,1,0,1,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,3,0,1,0,1,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,4,3,0,1,0,1,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,5,1,0,1,0,9800000,24200000,20,382,12400000,8200000,29400000,5000000


In [11]:
print(data.columns)

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_test)

In [15]:
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

    Approved     0.9777    0.9887    0.9831       531
    Rejected     0.9811    0.9628    0.9719       323

    accuracy                         0.9789       854
   macro avg     0.9794    0.9758    0.9775       854
weighted avg     0.9789    0.9789    0.9789       854



**Address class imbalance**

In [16]:
from imblearn.over_sampling import SMOTE

In [17]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [18]:
clm = RandomForestClassifier(random_state=42)
clm.fit(X_train_res, y_train_res)

In [19]:
y_pred = clm.predict(X_test)

In [20]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[526   5]
 [ 15 308]]
