**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [38]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [4]:
df1 = pd.read_csv("train.csv")
df1.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


**EDA**

In [5]:
#Shape of the dataset
df1.shape

(58645, 13)

In [6]:
#Checking for null values
df1.isnull().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [7]:
#Descriptive Analysis of Numerical Variables
df1.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [8]:
#Dropping 'id' column
df1.drop(columns='id', axis=1, inplace=True)

In [9]:
#Checking if the target column - loan_status is balanced
df1['loan_status'].value_counts()

loan_status
0    50295
1     8350
Name: count, dtype: int64

**Handling Categorical Columns**

Person_Home_Ownership

In [10]:
df1['person_home_ownership'].value_counts()

person_home_ownership
RENT        30594
MORTGAGE    24824
OWN          3138
OTHER          89
Name: count, dtype: int64

In [11]:
#One Hot Encode the column
df1 = pd.get_dummies(df1, columns=['person_home_ownership'], prefix='ownership')

# Display the updated dataframe
df1.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT
0,37,35000,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,False,False,False,True
1,22,56000,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,False,False,True,False
2,29,28800,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,False,False,True,False
3,30,70000,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,False,False,False,True
4,22,60000,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,False,False,False,True


Loan_Intent

In [12]:
df1['loan_intent'].value_counts()

loan_intent
EDUCATION            12271
MEDICAL              10934
PERSONAL             10016
VENTURE              10011
DEBTCONSOLIDATION     9133
HOMEIMPROVEMENT       6280
Name: count, dtype: int64

In [13]:
#One Hot Encode the column
df1 = pd.get_dummies(df1, columns=['loan_intent'], prefix='intent')

# Display the updated dataframe
df1.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE
0,37,35000,0.0,B,6000,11.49,0.17,N,14,0,False,False,False,True,False,True,False,False,False,False
1,22,56000,6.0,C,4000,13.35,0.07,N,2,0,False,False,True,False,False,False,False,True,False,False
2,29,28800,8.0,A,6000,8.9,0.21,N,10,0,False,False,True,False,False,False,False,False,True,False
3,30,70000,14.0,B,12000,11.11,0.17,N,5,0,False,False,False,True,False,False,False,False,False,True
4,22,60000,2.0,A,6000,6.92,0.1,N,3,0,False,False,False,True,False,False,False,True,False,False


In [14]:
#New shape after OHE
df1.shape

(58645, 20)

Loan_Grade

In [15]:
df1['loan_grade'].value_counts()

loan_grade
A    20984
B    20400
C    11036
D     5034
E     1009
F      149
G       33
Name: count, dtype: int64

In [16]:
# Define custom mapping where A has the highest label
loan_grade_mapping = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1, 'G': 0}

# Map the loan_grade column using the custom mapping
df1['loan_grade_encoded'] = df1['loan_grade'].map(loan_grade_mapping)

In [17]:
df1.drop(columns='loan_grade', axis=1, inplace=True)

In [18]:
df1.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
2931,21,54000,5.0,6500,12.87,0.12,Y,3,0,True,False,False,False,False,False,False,True,False,False,4
2642,24,38000,0.0,5000,11.58,0.13,N,3,0,False,False,False,True,False,False,False,False,False,True,5
1923,25,78500,7.0,12000,10.99,0.15,N,4,1,False,False,False,True,False,False,False,True,False,False,5
23454,27,80000,6.0,10000,7.29,0.13,N,5,0,False,False,False,True,False,False,False,False,False,True,6
18648,27,122000,9.0,15000,8.49,0.12,N,8,0,True,False,False,False,False,False,False,False,True,False,6


cb_person_default_on_file

In [19]:
df1['cb_person_default_on_file'].value_counts()

cb_person_default_on_file
N    49943
Y     8702
Name: count, dtype: int64

In [20]:
# Label Encode the value
le = LabelEncoder()

# Fit and transform
df1['cb_person_default_on_file'] = le.fit_transform(df1['cb_person_default_on_file'])

In [21]:
# To check the mapping of the original labels to the encoded values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{'N': 0, 'Y': 1}


In [22]:
df1.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
32061,22,65000,6.0,10000,6.99,0.15,0,2,0,True,False,False,False,False,False,False,False,False,True,6
39891,24,70000,4.0,4000,10.25,0.06,0,3,0,True,False,False,False,False,False,False,True,False,False,5
50072,23,36000,2.0,5600,5.42,0.16,0,4,0,True,False,False,False,True,False,False,False,False,False,6
38778,37,75000,13.0,20000,15.33,0.27,0,17,0,True,False,False,False,False,False,False,False,True,False,3
28453,22,40000,3.0,5600,13.85,0.14,0,2,0,False,False,False,True,True,False,False,False,False,False,4


**Test Data**

In [23]:
df_test = pd.read_csv("test (1).csv")
df_test.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4


In [24]:
#Handling 'id' column
id = df_test['id']
df_test.drop(columns='id', axis=1, inplace=True)
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
26902,33,53000,MORTGAGE,3.0,DEBTCONSOLIDATION,A,9600,7.66,0.18,N,6
13845,41,115000,MORTGAGE,2.0,EDUCATION,A,20000,8.9,0.17,N,11
22581,29,52800,RENT,3.0,EDUCATION,B,14000,10.37,0.28,N,8
32846,24,36000,RENT,3.0,PERSONAL,B,15000,11.58,0.42,N,3
8226,34,64000,RENT,13.0,EDUCATION,B,5125,12.69,0.08,N,6


In [25]:
#One Hot Encode the column - person_home_ownership and loan_intent
df_test = pd.get_dummies(df_test, columns=['person_home_ownership', 'loan_intent'], prefix=['ownership','intent'])

# Display the updated dataframe
df_test.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE
0,23,69000,3.0,F,25000,15.76,0.36,N,2,False,False,False,True,False,False,True,False,False,False
1,26,96000,6.0,C,10000,12.68,0.1,Y,4,True,False,False,False,False,False,False,False,True,False
2,26,30000,5.0,E,4000,17.19,0.13,Y,2,False,False,False,True,False,False,False,False,False,True
3,33,50000,4.0,A,7000,8.9,0.14,N,7,False,False,False,True,True,False,False,False,False,False
4,26,102000,8.0,D,15000,16.32,0.15,Y,4,True,False,False,False,False,False,True,False,False,False


In [26]:
# Map the loan_grade column using the custom mapping
df_test['loan_grade_encoded'] = df_test['loan_grade'].map(loan_grade_mapping)
df_test.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
0,23,69000,3.0,F,25000,15.76,0.36,N,2,False,False,False,True,False,False,True,False,False,False,1
1,26,96000,6.0,C,10000,12.68,0.1,Y,4,True,False,False,False,False,False,False,False,True,False,4
2,26,30000,5.0,E,4000,17.19,0.13,Y,2,False,False,False,True,False,False,False,False,False,True,2
3,33,50000,4.0,A,7000,8.9,0.14,N,7,False,False,False,True,True,False,False,False,False,False,6
4,26,102000,8.0,D,15000,16.32,0.15,Y,4,True,False,False,False,False,False,True,False,False,False,3


In [27]:
#Drop the loan_grade column
df_test.drop(columns='loan_grade', axis=1, inplace=True)
df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
11539,23,30000,2.0,2000,5.42,0.07,N,2,False,False,False,True,False,True,False,False,False,False,6
34805,25,65000,5.0,2000,5.42,0.03,N,2,True,False,False,False,False,False,False,False,True,False,6
39056,23,40000,7.0,5000,5.42,0.13,N,2,True,False,False,False,False,False,False,True,False,False,6
1272,22,74476,5.0,5000,5.42,0.06,N,2,True,False,False,False,False,True,False,False,False,False,6
31621,23,60000,5.0,4800,8.94,0.08,N,2,False,False,True,False,False,False,False,False,False,True,6


In [28]:
#Label Encode cb_person_default_on_file
# Fit and transform
df_test['cb_person_default_on_file'] = le.fit_transform(df_test['cb_person_default_on_file'])

df_test.sample(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT,intent_DEBTCONSOLIDATION,intent_EDUCATION,intent_HOMEIMPROVEMENT,intent_MEDICAL,intent_PERSONAL,intent_VENTURE,loan_grade_encoded
27054,24,63000,2.0,5000,10.74,0.08,0,3,False,False,False,True,False,False,True,False,False,False,5
37587,30,54396,8.0,3600,11.49,0.07,0,7,False,False,False,True,False,False,False,True,False,False,5
8900,27,69996,0.0,10000,12.84,0.14,1,9,False,False,True,False,False,False,False,False,True,False,4
12379,37,85000,21.0,10000,6.62,0.12,0,17,True,False,False,False,False,False,False,False,False,True,6
3461,29,96000,4.0,14400,5.99,0.15,0,10,False,False,False,True,False,False,True,False,False,False,6


**XG Boost**

In [29]:
"""# Convert specific object columns to category type - Only for Plain Vanilla XG Boost
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
df1[categorical_columns] = df1[categorical_columns].astype('category')
df_test[categorical_columns] = df_test[categorical_columns].astype('category')"""

"# Convert specific object columns to category type - Only for Plain Vanilla XG Boost\ncategorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']\ndf1[categorical_columns] = df1[categorical_columns].astype('category')\ndf_test[categorical_columns] = df_test[categorical_columns].astype('category')"

In [30]:
# Separate features (X) and target (y) from df1
X = df1.drop(columns=['loan_status'])
y = df1['loan_status']

In [31]:
# Split the train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Define the XGBoost model
model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

In [33]:
# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],           # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1],        # Step size shrinkage
    'max_depth': [3, 5, 7],                    # Maximum depth of trees
    'min_child_weight': [1, 3, 5],             # Minimum sum of weights for a child node
    'subsample': [0.6, 0.8, 1.0],              # Subsample ratio of the training instances
    'colsample_bytree': [0.6, 0.8, 1.0],       # Subsample ratio of columns when constructing each tree
    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction required to make a split
    'reg_alpha': [0, 0.01, 0.1],               # L1 regularization
    'reg_lambda': [1, 1.5, 2]                  # L2 regularization
}

In [40]:
# Set up RandomizedSearchCV with 5-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=model, 
    param_distributions=param_grid, 
    scoring='roc_auc', 
    cv=5, 
    verbose=1, 
    n_jobs=-1, 
    n_iter=20  # Number of parameter settings sampled
)

In [41]:
# Fit the search to the training data
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [43]:
# Output the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best AUC-ROC Score:", random_search.best_score_)

Best Parameters: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best AUC-ROC Score: 0.9547451651526642


In [46]:
# Validate the model
y_val_pred = random_search.best_estimator_.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, random_search.best_estimator_.predict_proba(X_val)[:, 1])
print(f"Validation Accuracy: {accuracy}")
print(f"Validation ROC AUC Score: {roc_auc}")

Validation Accuracy: 0.953875010657345
Validation ROC AUC Score: 0.9584811289165502


In [48]:
# Make prediction on the unseen test data
df_test['loan_status_prediction'] = random_search.best_estimator_.predict(df_test)

In [49]:
submission = pd.DataFrame({'id': id, 'loan_status': df_test['loan_status_prediction']})
submission.to_csv('submission_v6.csv', index=False)
print("Submission file created: 'submission_v6.csv'")

Submission file created: 'submission_v6.csv'
