**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
# Silence warnings for the rest of the session. No category or module filter
# is given, so every warning category is hidden, not just a specific one.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
import lightgbm as lgb

In [4]:
# Read the training set from the working directory and preview its first five rows
df1 = pd.read_csv(filepath_or_buffer="train.csv")
df1.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


**EDA**

In [5]:
# (rows, columns) of the raw training frame. The recorded run shows 58645 rows
# and 13 columns, which still include the 'id' column and the loan_status target.
df1.shape

(58645, 13)

In [6]:
# Count the missing entries in every column of the training frame
df1.isna().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min, max) of the numeric columns.
# NOTE(review): the recorded output shows a maximum of 123 for both person_age
# and person_emp_length -- presumably data-entry outliers; confirm whether they
# should be capped or removed before modelling.
df1.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [8]:
# Remove the 'id' column from the training frame (the describe() output above
# shows it simply running from 0 to 58644)
df1 = df1.drop(columns=['id'])

In [9]:
# Check whether the target column loan_status is balanced.
# The recorded output shows 50295 rows with status 0 against 8350 with status 1
# (about 14% positives), i.e. the classes are clearly imbalanced.
df1['loan_status'].value_counts()

loan_status
0    50295
1     8350
Name: count, dtype: int64

**Handling Categorical Columns**

Person_Home_Ownership

In [10]:
# Frequency of each home-ownership category. The recorded output lists four
# levels (RENT, MORTGAGE, OWN, OTHER), with OTHER occurring only 89 times.
df1['person_home_ownership'].value_counts()

person_home_ownership
RENT        30594
MORTGAGE    24824
OWN          3138
OTHER          89
Name: count, dtype: int64

In [11]:
# Replace person_home_ownership with one indicator column per category,
# each named ownership_<CATEGORY>
df1 = pd.get_dummies(
    df1,
    prefix='ownership',
    columns=['person_home_ownership'],
)

# Preview the frame with the new indicator columns
df1.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT
0,37,35000,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,False,False,False,True
1,22,56000,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,False,False,True,False
2,29,28800,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,False,False,True,False
3,30,70000,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,False,False,False,True
4,22,60000,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,False,False,False,True


Loan_Intent

In [12]:
# Frequency of each loan purpose. The recorded output lists six categories,
# ranging from EDUCATION (12271 rows) down to HOMEIMPROVEMENT (6280 rows).
df1['loan_intent'].value_counts()

loan_intent
EDUCATION            12271
MEDICAL              10934
PERSONAL             10016
VENTURE              10011
DEBTCONSOLIDATION     9133
HOMEIMPROVEMENT       6280
Name: count, dtype: int64

In [13]:
# Replace loan_intent with one indicator column per category,
# each named intent_<CATEGORY>
df1 = pd.get_dummies(
    df1,
    prefix='intent',
    columns=['loan_intent'],
)

# Preview the frame with the new indicator columns
df1.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE
0,37,35000,0.0,B,6000,11.49,0.17,N,14,0,False,False,False,True,False,True,False,False,False,False
1,22,56000,6.0,C,4000,13.35,0.07,N,2,0,False,False,True,False,False,False,False,True,False,False
2,29,28800,8.0,A,6000,8.9,0.21,N,10,0,False,False,True,False,False,False,False,False,True,False
3,30,70000,14.0,B,12000,11.11,0.17,N,5,0,False,False,False,True,False,False,False,False,False,True
4,22,60000,2.0,A,6000,6.92,0.1,N,3,0,False,False,False,True,False,False,False,True,False,False


In [14]:
# Shape after one-hot encoding. Starting from 13 columns, 'id' was dropped and
# the two encoded source columns were replaced by 4 + 6 indicator columns,
# which gives the 20 columns shown in the recorded output.
df1.shape

(58645, 20)

Loan_Grade

In [15]:
# Frequency of each loan grade. The recorded output lists grades A to G, with
# counts falling from A (20984 rows) to G (33 rows).
df1['loan_grade'].value_counts()

loan_grade
A    20984
B    20400
C    11036
D     5034
E     1009
F      149
G       33
Name: count, dtype: int64

In [16]:
# Ordinal codes for loan_grade: G -> 0, F -> 1, ... A -> 6, so that grade A
# carries the highest label
loan_grade_mapping = {grade: rank for rank, grade in enumerate('GFEDCBA')}

# Store the encoded grade in a new column, looked up through the mapping above
df1['loan_grade_encoded'] = df1['loan_grade'].map(loan_grade_mapping)

In [17]:
# The letter grade is now represented by loan_grade_encoded, so drop the original column
df1 = df1.drop(columns=['loan_grade'])

In [18]:
# Spot-check five random rows after encoding loan_grade. No random_state is
# passed, so the rows shown change from run to run.
df1.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
58407,26,90000,0.0,28000,7.9,0.32,N,4,0,True,False,False,False,True,False,False,False,False,False,6
2375,26,68000,5.0,15000,7.88,0.23,N,4,0,True,False,False,False,True,False,False,False,False,False,6
51767,45,34000,14.0,3000,6.76,0.09,N,12,0,False,False,False,True,True,False,False,False,False,False,6
7225,23,45000,2.0,4000,13.48,0.09,Y,2,0,False,False,False,True,True,False,False,False,False,False,4
46599,22,125000,6.0,7500,8.0,0.06,N,2,0,False,False,False,True,False,False,False,False,False,True,6


cb_person_default_on_file

In [19]:
# Frequency of the default-on-file flag. The recorded output shows two
# values: N (49943 rows) and Y (8702 rows).
df1['cb_person_default_on_file'].value_counts()

cb_person_default_on_file
N    49943
Y     8702
Name: count, dtype: int64

In [20]:
# Label-encode the default-on-file flag: learn the classes from the training
# column, then overwrite the column with the corresponding integer codes
le = LabelEncoder()
le.fit(df1['cb_person_default_on_file'])
df1['cb_person_default_on_file'] = le.transform(df1['cb_person_default_on_file'])

In [21]:
# Show which integer code the fitted encoder assigns to each original label
label_mapping = {
    original: encoded
    for original, encoded in zip(le.classes_, le.transform(le.classes_))
}
print(label_mapping)

{'N': 0, 'Y': 1}


In [22]:
# Spot-check five random rows after label-encoding cb_person_default_on_file.
# No random_state is passed, so the rows shown change from run to run.
df1.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
41336,25,28000,9.0,9600,12.22,0.34,1,2,1,False,False,False,True,True,False,False,False,False,False,4
44643,22,91234,6.0,12000,7.88,0.13,0,7,0,True,False,False,False,False,False,False,True,False,False,6
33054,27,35000,3.0,6500,7.51,0.18,0,10,0,False,False,False,True,False,True,False,False,False,False,6
13610,22,24000,3.0,3000,12.53,0.13,1,3,0,False,False,False,True,False,False,False,False,True,False,4
16927,24,68004,5.0,12000,11.86,0.16,0,2,0,False,False,False,True,False,False,False,True,False,False,5


**Test Data**

In [23]:
# Read the test set from the working directory and preview its first five rows
df_test = pd.read_csv(filepath_or_buffer="test (1).csv")
df_test.head(n=5)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [24]:
# Set the test ids aside for the submission file and remove them from the
# feature frame in one step.
# NOTE: the name `id` shadows the Python builtin of the same name; it is kept
# because the submission cell at the end of the notebook reads this variable.
id = df_test.pop('id')
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
19603,25,60000,MORTGAGE,9.0,PERSONAL,A,12000,5.42,0.2,N,3
11035,23,24000,RENT,4.0,EDUCATION,A,5000,5.79,0.21,N,3
857,25,50000,RENT,1.0,MEDICAL,A,17000,7.66,0.34,N,3
30195,30,45000,RENT,6.0,DEBTCONSOLIDATION,A,14000,7.88,0.31,N,10
17140,27,90000,MORTGAGE,11.0,MEDICAL,B,20000,11.12,0.22,N,8


In [25]:
# One-hot encode person_home_ownership and loan_intent on the test frame,
# using the same 'ownership' / 'intent' prefixes as for the training frame
df_test = pd.get_dummies(
    df_test,
    columns=['person_home_ownership', 'loan_intent'],
    prefix={'person_home_ownership': 'ownership', 'loan_intent': 'intent'},
)

# Preview the frame with the new indicator columns
df_test.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE
0,23,69000,3.0,F,25000,15.76,0.36,N,2,False,False,False,True,False,False,True,False,False,False
1,26,96000,6.0,C,10000,12.68,0.1,Y,4,True,False,False,False,False,False,False,False,True,False
2,26,30000,5.0,E,4000,17.19,0.13,Y,2,False,False,False,True,False,False,False,False,False,True
3,33,50000,4.0,A,7000,8.9,0.14,N,7,False,False,False,True,True,False,False,False,False,False
4,26,102000,8.0,D,15000,16.32,0.15,Y,4,True,False,False,False,False,False,True,False,False,False


In [26]:
# Encode the test loan grades with the mapping defined for the training data
df_test = df_test.assign(
    loan_grade_encoded=df_test['loan_grade'].map(loan_grade_mapping)
)
df_test.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
0,23,69000,3.0,F,25000,15.76,0.36,N,2,False,False,False,True,False,False,True,False,False,False,1
1,26,96000,6.0,C,10000,12.68,0.1,Y,4,True,False,False,False,False,False,False,False,True,False,4
2,26,30000,5.0,E,4000,17.19,0.13,Y,2,False,False,False,True,False,False,False,False,False,True,2
3,33,50000,4.0,A,7000,8.9,0.14,N,7,False,False,False,True,True,False,False,False,False,False,6
4,26,102000,8.0,D,15000,16.32,0.15,Y,4,True,False,False,False,False,False,True,False,False,False,3


In [27]:
# The letter grade is now represented by loan_grade_encoded, so drop the original column
df_test = df_test.drop(columns=['loan_grade'])
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
11055,27,80004,2.0,15000,13.16,0.19,N,8,False,False,False,True,False,False,False,True,False,False,4
5156,24,111239,3.0,13000,8.49,0.12,N,4,True,False,False,False,False,False,False,False,False,True,6
19745,27,72000,11.0,10000,10.37,0.14,N,8,True,False,False,False,False,False,False,False,False,True,5
36303,29,112500,7.0,6400,7.49,0.06,N,8,True,False,False,False,False,False,True,False,False,False,6
38342,24,55000,1.0,3200,11.83,0.06,N,2,False,False,False,True,True,False,False,False,False,False,5


In [28]:
# Label Encode cb_person_default_on_file
# Reuse the encoder that was fitted on the training column: transform() applies
# the same label -> integer mapping that was used for df1. Re-fitting on the
# test column (fit_transform) would instead derive a fresh mapping from
# whatever labels the test set happens to contain. transform() raises an error
# if the test column holds a label that was not seen during fitting.
df_test['cb_person_default_on_file'] = le.transform(df_test['cb_person_default_on_file'])

# Spot-check five random rows (unseeded, so they differ between runs)
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
28400,30,100000,14.0,15000,6.03,0.15,0,10,True,False,False,False,False,True,False,False,False,False,6
17081,23,114000,0.0,15000,17.14,0.13,0,2,False,False,True,False,True,False,False,False,False,False,2
12392,27,53000,8.0,15600,9.99,0.28,0,7,True,False,False,False,False,False,True,False,False,False,5
31701,24,27000,0.0,11500,15.99,0.41,0,4,False,False,False,True,False,False,True,False,False,False,3
18033,25,50000,4.0,10000,11.71,0.2,0,2,False,False,False,True,False,False,False,False,True,False,5


**LightGBM**

In [None]:
# Disabled experiment: the code is wrapped in a bare string literal, so none of
# it executes.
# NOTE(review): by this point the encoding cells above have already removed
# person_home_ownership, loan_intent and loan_grade from both frames, so the
# snippet would raise a KeyError if it were re-enabled as written.
"""# Convert specific object columns to category type - Only for Plain Vanilla XG Boost
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
df1[categorical_columns] = df1[categorical_columns].astype('category')
df_test[categorical_columns] = df_test[categorical_columns].astype('category')"""

In [29]:
# Target vector: the loan_status column of the training frame
y = df1['loan_status']
# Feature matrix: every remaining column of the training frame
X = df1.drop(columns='loan_status')

In [30]:
# Hold out 20% of the training rows for validation. The split is seeded for
# reproducibility; no stratify argument is passed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [31]:
# Wrap both splits as LightGBM datasets. No categorical_feature argument is
# passed here; the categorical columns were already encoded in the cells above.
# The validation set is created with the training set as its reference.
train_data = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)
val_data = lgb.Dataset(data=X_val, label=y_val, free_raw_data=False, reference=train_data)

In [32]:
# LightGBM training configuration: binary objective evaluated with AUC,
# gradient-boosted trees, and the is_unbalance flag switched on
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    is_unbalance=True,
)

In [33]:
# Boost for at most 1000 rounds while evaluating on both the training and the
# validation set (reported as "training" and "valid_1" in the recorded log).
# The early-stopping callback is configured with a patience of 50 rounds.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(50)],
)

[LightGBM] [Info] Number of positive: 6708, number of negative: 40208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 850
[LightGBM] [Info] Number of data points in the train set: 46916, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142979 -> initscore=-1.790765
[LightGBM] [Info] Start training from score -1.790765
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[248]	training's auc: 0.979502	valid_1's auc: 0.959202


In [34]:
# Score the held-out validation rows and report the area under the ROC curve
y_val_pred = model.predict(X_val)
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f"Validation AUC-ROC: {roc_auc}")

Validation AUC-ROC: 0.9592017474766124


In [None]:
# Disabled variant of the validation cell: the code is wrapped in a bare string
# literal, so none of it executes.
# NOTE(review): if re-enabled, accuracy_score would receive the raw output of
# model.predict; presumably these are continuous scores rather than class
# labels, so they would need thresholding first -- confirm before reviving.
"""# Validate the model
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print(f"Validation ROC AUC Score: {roc_auc}")"""

In [35]:
# Make prediction on the unseen test data.
# Select exactly the training feature columns, in training order, instead of
# passing the whole frame: this keeps the cell re-runnable (on a second run
# df_test already contains loan_status_prediction, which must not be fed to
# the model) and makes the feature alignment with X explicit rather than
# relying on the two frames happening to share the same column order.
df_test['loan_status_prediction'] = model.predict(df_test[X.columns])

In [36]:
# Pair the saved test ids with the predicted scores and write them to disk
# without the index column
submission = pd.DataFrame(
    {
        'id': id,
        'loan_status': df_test['loan_status_prediction'],
    }
)
submission.to_csv(path_or_buf='submission_v8.csv', index=False)
print("Submission file created: 'submission_v8.csv'")

Submission file created: 'submission_v8.csv'
