In [2]:
# Import required libraries

%pip install seaborn

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Global plot styling: white grid background with slightly enlarged fonts.
# `set_theme` is seaborn's preferred entry point; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid", font_scale=1.1)


In [14]:
# Load the raw loan-approval CSV into `df` and preview its structure.

# Relative path: resolved against the notebook's current working directory.
file_path = "data/loan_approval_dataset.csv"

df = pd.read_csv(file_path)

# Quick structural check: (rows, columns) and the column labels as parsed.
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)

# Last expression of the cell, so the first five rows render as the cell output.
df.head()

Shape: (4269, 13)

Columns:
 Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'credit_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,credit_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# Name of the label column used by the rest of the notebook.
TARGET_COLUMN = "loan_status"

# Drop rows where the target is missing; rows with gaps in other columns are kept.
# NOTE: `df` is rebound to the filtered frame, so the unfiltered data is only
# recoverable by re-running the load cell.
df = df.dropna(subset=[TARGET_COLUMN])

print("Shape after dropping missing target rows:", df.shape)
# Class balance of the target (last expression, rendered as the cell output).
df[TARGET_COLUMN].value_counts()

Shape after dropping missing target rows: (4269, 13)


loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

In [15]:
# Convert string labels to 0/1 if needed: "Approved" -> 1, "Rejected" -> 0.
#
# The raw labels carry stray whitespace (e.g. ' Approved'), so string values are
# stripped before the lookup. Non-string values are passed through untouched:
# labels that are already 0/1 map to themselves, and anything else (including
# NaN) becomes NaN and is reported by the check below instead of failing with an
# AttributeError inside `.strip()`.

y_raw = df[TARGET_COLUMN]

print("Unique target values before mapping:", y_raw.unique())

mapping = {
    "Approved": 1,
    "Rejected": 0,
}

# Identity entries for the encoded values keep this cell re-runnable after the
# target column has already been replaced by its 0/1 encoding.
_target_lookup = {**mapping, **{code: code for code in mapping.values()}}

y = (
    y_raw
    .map(lambda label: label.strip() if isinstance(label, str) else label)
    .map(_target_lookup)  # keys absent from the dict become NaN
)

# Fail loudly if any label was not covered by the mapping.
if y.isna().any():
    raise ValueError("Some target values were not mapped. Update the mapping dict to cover all values.")

# Class balance after encoding (last expression, rendered as the cell output).
y.value_counts()


Unique target values before mapping: [' Approved' ' Rejected']


loan_status
1    2656
0    1613
Name: count, dtype: int64

In [16]:
# Replace the target column in `df` with the encoded labels held in `y`.
# NOTE: this overwrites the column in the in-memory frame, so any cell that reads
# df[TARGET_COLUMN] after this point sees the encoded values, not the raw strings.
df[TARGET_COLUMN] = y

# Persist the frame next to the raw data; index=False leaves the DataFrame index
# out of the written file.
output_path = "data/preprocessed_loan_approval_dataset.csv"
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")

Cleaned dataset saved to: data/preprocessed_loan_approval_dataset.csv
