**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
from xgboost import XGBClassifier

In [4]:
# Read train.csv into a DataFrame and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="train.csv")
df1.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


**Exploratory Data Analysis (EDA)**

In [None]:
# Shape of the dataset as a (rows, columns) tuple
df1.shape

In [None]:
# Number of missing values in each column
df1.isna().sum()

In [None]:
# Descriptive analysis of the numerical variables
# (count, mean, std, min, quartiles and max per column)
df1.describe()

In [5]:
# Drop the 'id' column: it is a row identifier, not a feature.
# `columns=` already selects the axis, so `axis=1` is not needed, and
# errors='ignore' keeps this cell re-runnable once 'id' is gone.
df1 = df1.drop(columns='id', errors='ignore')

In [None]:
# Class counts of the target column, to see whether loan_status is balanced
target_counts = df1['loan_status'].value_counts()
target_counts

**Handling Categorical Columns**

Person_Home_Ownership

In [None]:
# Frequency of each person_home_ownership value
ownership_counts = df1['person_home_ownership'].value_counts()
ownership_counts

In [None]:
# Replace person_home_ownership with one indicator column per category,
# each named 'ownership_<value>'
df1 = pd.get_dummies(
    data=df1,
    prefix='ownership',
    columns=['person_home_ownership'],
)

# Preview the widened dataframe
df1.head()

Loan_Intent

In [None]:
# Frequency of each loan_intent value
intent_counts = df1['loan_intent'].value_counts()
intent_counts

In [None]:
# Replace loan_intent with one indicator column per category,
# each named 'intent_<value>'
df1 = pd.get_dummies(
    data=df1,
    prefix='intent',
    columns=['loan_intent'],
)

# Preview the widened dataframe
df1.head()

In [None]:
# New (rows, columns) shape after one-hot encoding
df1.shape

Loan_Grade

In [None]:
# Frequency of each loan_grade value
grade_counts = df1['loan_grade'].value_counts()
grade_counts

In [None]:
# Ordinal codes for the loan grade: 'A' gets the largest code (6), 'G' the smallest (0)
loan_grade_mapping = dict(zip('ABCDEFG', range(6, -1, -1)))

# Translate every grade letter to its code; a grade missing from the mapping becomes NaN
encoded_grade = df1['loan_grade'].map(loan_grade_mapping)
df1['loan_grade_encoded'] = encoded_grade

In [None]:
# Remove the original letter-grade column in place; loan_grade_encoded replaces it
del df1['loan_grade']

In [None]:
# Show five randomly chosen rows
df1.sample(5)

cb_person_default_on_file

In [None]:
# Frequency of each cb_person_default_on_file value
default_flag_counts = df1['cb_person_default_on_file'].value_counts()
default_flag_counts

In [None]:
# Label-encode cb_person_default_on_file: learn the classes from the training
# column, then replace the column with the integer codes
le = LabelEncoder()
default_flag = df1['cb_person_default_on_file']
df1['cb_person_default_on_file'] = le.fit(default_flag).transform(default_flag)

In [None]:
# Show which integer code the fitted encoder assigned to each original label
label_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_mapping)

In [None]:
# Show five randomly chosen rows
df1.sample(5)

**Test Data**

In [6]:
# Read the test file into a DataFrame and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer="test (1).csv")
df_test.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [7]:
# Handling the 'id' column: keep the ids for the submission file and remove
# them from the feature frame.
# NOTE: the name `id` shadows the Python builtin; it is kept because the
# submission cell at the end of the notebook reads it.
# The membership check makes this cell safe to re-run: once 'id' has been
# popped, the previously saved ids are left untouched instead of raising KeyError.
if 'id' in df_test.columns:
    id = df_test.pop('id')
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
6623,32,40000,RENT,2.0,MEDICAL,A,10000,8.94,0.25,N,10
30335,24,92000,RENT,7.0,HOMEIMPROVEMENT,A,18000,6.91,0.19,N,4
9780,27,37200,RENT,2.0,VENTURE,B,5000,9.99,0.13,N,5
5073,21,85000,MORTGAGE,5.0,EDUCATION,A,9600,6.03,0.11,N,3
12712,23,80000,MORTGAGE,3.0,EDUCATION,A,10000,7.88,0.13,N,3


In [None]:
# One-hot encode person_home_ownership and loan_intent on the test set
df_test = pd.get_dummies(df_test, columns=['person_home_ownership', 'loan_intent'], prefix=['ownership', 'intent'])

# get_dummies only creates indicator columns for categories present in the frame
# it is given. A category that occurs in train but not in test would therefore
# leave the test set one feature short. Add any such column, filled with zeros,
# using the dtype of the matching train column. This is a no-op when df1 has not
# been one-hot encoded.
for dummy_col in df1.columns:
    if dummy_col.startswith(('ownership_', 'intent_')) and dummy_col not in df_test.columns:
        df_test[dummy_col] = pd.Series(0, index=df_test.index, dtype=df1[dummy_col].dtype)

# Display the updated dataframe
df_test.head()

In [None]:
# Encode the test loan grades with the same custom mapping used for train;
# a grade missing from the mapping becomes NaN
encoded_test_grade = df_test['loan_grade'].map(loan_grade_mapping)
df_test['loan_grade_encoded'] = encoded_test_grade
df_test.head()

In [None]:
# Remove the original letter-grade column in place, then show five random rows
del df_test['loan_grade']
df_test.sample(5)

In [None]:
# Label encode cb_person_default_on_file with the encoder already fitted on the
# training column. Using transform (not fit_transform) keeps the train mapping
# instead of re-learning it from the test data; a label the encoder has not seen
# raises ValueError rather than silently receiving a different code.
df_test['cb_person_default_on_file'] = le.transform(df_test['cb_person_default_on_file'])

df_test.sample(n=5)

**XGBoost**

In [14]:
# Convert raw string columns to category type - only for plain vanilla XGBoost
# (the model below is created with enable_categorical=True).
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
for col in categorical_columns:
    # The encoding cells earlier in the notebook may already have removed this
    # column (one-hot / ordinal encoding); skip it instead of raising KeyError.
    if col not in df1.columns or col not in df_test.columns:
        continue
    # A numeric column has already been label-encoded; leave it as it is.
    if pd.api.types.is_numeric_dtype(df1[col]) or pd.api.types.is_numeric_dtype(df_test[col]):
        continue
    # Build one dtype from the training values and apply it to both frames so a
    # given category gets the same integer code in train and test. Test values
    # not seen in train become NaN.
    shared_dtype = pd.CategoricalDtype(categories=sorted(df1[col].dropna().unique()))
    df1[col] = df1[col].astype(shared_dtype)
    df_test[col] = df_test[col].astype(shared_dtype)

In [15]:
# Target (y) is the loan_status column; features (X) are every other column of df1
y = df1['loan_status']
X = df1.drop(columns=['loan_status'])

In [16]:
# Hold out 20% of the training data for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Build the classifier with native categorical support and log-loss as the
# evaluation metric, then fit it on the training split
xgb_params = {"enable_categorical": True, "eval_metric": "logloss"}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

In [18]:
# Score the model on the validation split:
# accuracy from the hard class predictions, ROC AUC from the positive-class probability
y_val_pred = model.predict(X_val)
val_positive_proba = model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, val_positive_proba)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation ROC AUC Score: {roc_auc}")

Validation Accuracy: 0.9531929405746441
Validation ROC AUC Score: 0.9537336983106897


In [19]:
# Make predictions on the unseen test data.
# Select exactly the training feature columns, in training order. This keeps the
# cell re-runnable (the prediction column written below is never fed back into
# the model) and raises KeyError if the test frame is missing a training feature.
df_test['loan_status_prediction'] = model.predict(df_test[X.columns])

In [20]:
# Pair each saved test id with its predicted loan_status and write the result to CSV
submission = pd.DataFrame(
    {
        'id': id,
        'loan_status': df_test['loan_status_prediction'],
    }
)
submission.to_csv(path_or_buf='submission_v3.csv', index=False)
print("Submission file created: 'submission_v3.csv'")

Submission file created: 'submission_v3.csv'
