# Gold Layer Exploration and Logistic Regression Modeling

This script:
1. Loads and explores the gold layer feature store
2. Performs EDA on features and target variable
3. Prepares data for modeling (feature selection, train/test split)
4. Trains a logistic regression model
5. Evaluates model performance with metrics and visualizations

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import pyspark
from pyspark.sql.functions import col
import pyspark.sql.functions as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    roc_curve,
    precision_recall_curve,
    average_precision_score
)

In [2]:
# Display all columns
pd.set_option('display.max_columns', None)

In [3]:
# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [4]:
# Initialize Spark
spark = pyspark.sql.SparkSession.builder \
    .appName("explore_gold") \
    .master("local[*]") \
    .config("spark.sql.parquet.mergeSchema", "true") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 11:16:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Configuration
FEATURE_DIR = "/app/datamart/gold/feature_store/"
LABEL_DIR = "/app/datamart/gold/label_store/"
APPLICATION_DIR = "/app/datamart/gold/application_store/"

In [6]:
spark.read.parquet(APPLICATION_DIR).printSchema()

                                                                                

root
 |-- Customer_ID: string (nullable = true)
 |-- loan_amt: integer (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- application_date: date (nullable = true)



In [7]:
application = spark.read.parquet(APPLICATION_DIR)

In [8]:
application.groupBy("Customer_ID").count().filter("count > 1").show()



+-----------+-----+
|Customer_ID|count|
+-----------+-----+
+-----------+-----+



                                                                                

# LOAD GOLD LAYER DATA

In [9]:
# Get all parquet files
feature_files = spark.read.parquet(FEATURE_DIR)
feature_files.show()

                                                                                

+-----------+--------+------+-------------+----------------+------------------+--------------------+------------------+-------------------------------------------------+------------------------------------------------+-------------------------------------------------+------------------------------------------------+--------------------------------------------------+-------------------------------------------------+-------------------+---------------+--------------+----------------------+--------------------------------+--------------------------+-----------------------------+--------------------------+-------------------------+------------------------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+--------------------------+-------------+----------------+----+--------------+--------------+--------------+--------------+-----------+-----------+------------+-----------+-----------+------------+-----

In [10]:
# Get all parquet files
label_files_pattern = LABEL_DIR + "gold_label_store_*.parquet"

label_files = spark.read.parquet(label_files_pattern)
label_files.show()

                                                                                

+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|30dpd_6mob|   2023-07-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|30dpd_6mob|   2023-07-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|30dpd_6mob|   2023-07-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|30dpd_6mob|   2023-07-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|30dpd_6mob|   2023-07-01|
|CUS_0x144

In [11]:
# Convert to Pandas for easier analysis and modeling
print("\nConverting to Pandas...")
feature_pd = feature_files.toPandas()
print(f"✓ Pandas DataFrame shape: {feature_pd.shape}")


Converting to Pandas...


                                                                                

✓ Pandas DataFrame shape: (12500, 164)


In [12]:
# Convert to Pandas for easier analysis and modeling
print("\nConverting to Pandas...")
label_pd = label_files.toPandas()
print(f"✓ Pandas DataFrame shape: {label_pd.shape}")


Converting to Pandas...




✓ Pandas DataFrame shape: (21474, 5)


                                                                                

# BASIC EXPLORATION

In [13]:
feature_pd.head(10)

Unnamed: 0,Customer_ID,loan_amt,tenure,Annual_Income,Outstanding_Debt,Num_of_Loan_active,DTI,log_Annual_Income,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Large_value_payments,Credit_Mix_Standard,Credit_Mix_Good,Credit_Mix_Bad,Type_of_Loan_Auto_Loan,Type_of_Loan_Credit_Builder_Loan,Type_of_Loan_Personal_Loan,Type_of_Loan_Home_Equity_Loan,Type_of_Loan_Mortgage_Loan,Type_of_Loan_Student_Loan,Type_of_Loan_Debt_Consolidation_Loan,Type_of_Loan_Payday_Loan,Credit_History_Age_Year,Credit_History_Age_Month,Num_of_Delayed_Payment_3m,Num_of_Delayed_Payment_6m,Num_of_Delayed_Payment_12m,max_dpd_prior,ever_30dpd_prior,Age,age_band_18_24,age_band_25_34,age_band_35_44,age_band_45_54,age_band_55,fe_1_sum_7d,fe_1_mean_7d,fe_1_std_7d,fe_2_sum_7d,fe_2_mean_7d,fe_2_std_7d,fe_3_sum_7d,fe_3_mean_7d,fe_3_std_7d,fe_4_sum_7d,fe_4_mean_7d,fe_4_std_7d,fe_5_sum_7d,fe_5_mean_7d,fe_5_std_7d,fe_6_sum_7d,fe_6_mean_7d,fe_6_std_7d,fe_7_sum_7d,fe_7_mean_7d,fe_7_std_7d,fe_8_sum_7d,fe_8_mean_7d,fe_8_std_7d,fe_9_sum_7d,fe_9_mean_7d,fe_9_std_7d,fe_10_sum_7d,fe_10_mean_7d,fe_10_std_7d,fe_11_sum_7d,fe_11_mean_7d,fe_11_std_7d,fe_12_sum_7d,fe_12_mean_7d,fe_12_std_7d,fe_13_sum_7d,fe_13_mean_7d,fe_13_std_7d,fe_14_sum_7d,fe_14_mean_7d,fe_14_std_7d,fe_15_sum_7d,fe_15_mean_7d,fe_15_std_7d,fe_16_sum_7d,fe_16_mean_7d,fe_16_std_7d,fe_17_sum_7d,fe_17_mean_7d,fe_17_std_7d,fe_18_sum_7d,fe_18_mean_7d,fe_18_std_7d,fe_19_sum_7d,fe_19_mean_7d,fe_19_std_7d,fe_20_sum_7d,fe_20_mean_7d,fe_20_std_7d,fe_1_sum_30d,fe_1_mean_30d,fe_1_std_30d,fe_2_sum_30d,fe_2_mean_30d,fe_2_std_30d,fe_3_sum_30d,fe_3_mean_30d,fe_3_std_30d,fe_4_sum_30d,fe_4_mean_30d,fe_4_std_30d,fe_5_sum_30d,fe_5_mean_30d,fe_5_std_30d,fe_6_sum_30d,fe_6_mean_30d,fe_6_std_30d,fe_7_sum_30d,fe_7_mean_30d,fe_7_std_30d,fe_8_sum_30d,fe_8_mean_30d,fe_8_std_30d,fe_9_sum_30d,fe_9_mean_30d,fe_9_std_30d,fe_10_sum_30d,fe_10_mean_30d,fe_10_std_30d,fe_11_sum_30d,fe_11_mean_30d,fe_11_std_30d,fe_12_sum_30d,fe_12_mean_30d,fe_12_std_30d,fe_13_sum_30d,fe_13_mean_30d,fe_13_std_30d,fe_14_sum_30d,fe_14_mean_30d,fe_14_std_30d,fe_15_sum_30d,fe_15_mean_30d,fe_15_std_30d,fe_16_sum_30d,fe_16_mean_30d,fe_16_std_30d,fe_17_sum_30d,fe_17_mean_30d,fe_17_std_30d,fe_18_sum_30d,fe_18_mean_30d,fe_18_std_30d,fe_19_sum_30d,fe_19_mean_30d,fe_19_std_30d,fe_20_sum_30d,fe_20_mean_30d,fe_20_std_30d,estimated_EMI,EMI_to_income,requested_amount,requested_tenure,snapshot_date,application_date
0,CUS_0x1000,10000,10,30625.94,1562.91,2.0,0.051032,10.329603,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,10,9,26.0,26.0,26.0,26.0,0,18.0,1,0,0,0,0,58,58.0,0.0,15,15.0,0.0,18,18.0,0.0,-101,-101.0,0.0,48,48.0,0.0,-33,-33.0,0.0,238,238.0,0.0,119,119.0,0.0,75,75.0,0.0,52,52.0,0.0,39,39.0,0.0,131,131.0,0.0,-34,-34.0,0.0,145,145.0,0.0,-61,-61.0,0.0,39,39.0,0.0,52,52.0,0.0,80,80.0,0.0,4,4.0,0.0,31,31.0,0.0,150,75.0,24.041631,121,60.5,64.346717,124,62.0,62.225397,-62,-31.0,98.994949,336,168.0,169.705627,159,79.5,159.099026,492,246.0,11.313708,213,106.5,17.67767,245,122.5,67.175144,66,33.0,26.870058,203,101.5,88.388348,100,50.0,114.551299,105,52.5,122.329473,265,132.5,17.67767,8,4.0,91.923882,211,105.5,94.045202,138,69.0,24.041631,155,77.5,3.535534,68,34.0,42.426407,93,46.5,21.92031,1055.820766,0.413697,10000,10,2023-05-01,2023-05-01
1,CUS_0x108a,10000,10,36982.36,4882.12,9.0,0.132012,10.518196,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,1,7,9,15.0,15.0,15.0,15.0,0,38.0,0,0,1,0,0,77,77.0,0.0,80,80.0,0.0,44,44.0,0.0,-95,-95.0,0.0,161,161.0,0.0,126,126.0,0.0,50,50.0,0.0,-122,-122.0,0.0,105,105.0,0.0,112,112.0,0.0,170,170.0,0.0,232,232.0,0.0,268,268.0,0.0,-50,-50.0,0.0,12,12.0,0.0,123,123.0,0.0,137,137.0,0.0,105,105.0,0.0,31,31.0,0.0,416,416.0,0.0,140,70.0,9.899495,144,72.0,11.313708,275,137.5,132.228968,-16,-8.0,123.03658,344,172.0,15.556349,454,227.0,142.83557,96,48.0,2.828427,82,41.0,230.516811,309,154.5,70.003571,384,192.0,113.137085,288,144.0,36.769553,441,220.5,16.263456,398,199.0,97.580736,29,14.5,91.216775,150,75.0,89.095454,314,157.0,48.083261,195,97.5,55.861436,534,267.0,229.102597,88,44.0,18.384776,580,290.0,178.190909,1055.820766,0.342592,10000,10,2023-05-01,2023-05-01
2,CUS_0x10f9,10000,10,150131.68,1138.36,0.0,0.007582,11.919268,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,31,11,0.0,0.0,0.0,0.0,0,54.0,0,0,0,1,0,269,269.0,0.0,109,109.0,0.0,92,92.0,0.0,64,64.0,0.0,158,158.0,0.0,-1,-1.0,0.0,206,206.0,0.0,-15,-15.0,0.0,207,207.0,0.0,49,49.0,0.0,64,64.0,0.0,28,28.0,0.0,106,106.0,0.0,-59,-59.0,0.0,31,31.0,0.0,4,4.0,0.0,20,20.0,0.0,166,166.0,0.0,221,221.0,0.0,147,147.0,0.0,455,227.5,58.689863,122,61.0,67.882251,312,156.0,90.509668,225,112.5,68.589358,158,79.0,111.722871,-20,-10.0,12.727922,237,118.5,123.743687,97,48.5,89.802561,402,201.0,8.485281,268,134.0,120.208153,261,130.5,94.045202,-59,-29.5,81.31728,373,186.5,113.844192,41,20.5,112.429978,112,56.0,35.355339,126,63.0,83.4386,222,111.0,128.693434,47,23.5,201.525433,349,174.5,65.760931,127,63.5,118.086832,1055.820766,0.084392,10000,10,2023-05-01,2023-05-01
3,CUS_0x1119,10000,10,56301.9,2161.23,2.0,0.038386,10.938484,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,6,18.0,18.0,18.0,18.0,0,36.0,0,0,1,0,0,-79,-79.0,0.0,30,30.0,0.0,91,91.0,0.0,122,122.0,0.0,195,195.0,0.0,284,284.0,0.0,29,29.0,0.0,-61,-61.0,0.0,260,260.0,0.0,94,94.0,0.0,55,55.0,0.0,-113,-113.0,0.0,42,42.0,0.0,-14,-14.0,0.0,71,71.0,0.0,4,4.0,0.0,92,92.0,0.0,118,118.0,0.0,94,94.0,0.0,80,80.0,0.0,186,93.0,243.244733,271,135.5,149.199531,370,185.0,132.936075,102,51.0,100.409163,272,136.0,83.4386,493,246.5,53.033009,136,68.0,55.154329,42,21.0,115.965512,436,218.0,59.39697,7,3.5,127.986327,216,108.0,74.953319,-313,-156.5,61.51829,79,39.5,3.535534,63,31.5,64.346717,238,119.0,67.882251,162,81.0,108.894444,263,131.5,55.861436,256,128.0,14.142136,30,15.0,111.722871,146,73.0,9.899495,1055.820766,0.225034,10000,10,2023-05-01,2023-05-01
4,CUS_0x1192,10000,10,16319.375,1275.32,2.0,0.078148,9.700108,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,23,9,13.0,13.0,13.0,13.0,0,0.0,0,0,0,0,1,191,191.0,0.0,144,144.0,0.0,28,28.0,0.0,127,127.0,0.0,245,245.0,0.0,-37,-37.0,0.0,149,149.0,0.0,244,244.0,0.0,100,100.0,0.0,219,219.0,0.0,98,98.0,0.0,190,190.0,0.0,139,139.0,0.0,68,68.0,0.0,-37,-37.0,0.0,235,235.0,0.0,-25,-25.0,0.0,89,89.0,0.0,230,230.0,0.0,12,12.0,0.0,336,168.0,32.526912,247,123.5,28.991378,152,76.0,67.882251,299,149.5,31.819805,269,134.5,156.270599,-46,-23.0,19.79899,219,109.5,55.861436,172,86.0,223.445743,184,92.0,11.313708,299,149.5,98.287843,154,77.0,29.698485,216,108.0,115.965512,211,105.5,47.376154,140,70.0,2.828427,-31,-15.5,30.405592,514,257.0,31.112698,58,29.0,76.367532,-92,-46.0,190.918831,476,238.0,11.313708,21,10.5,2.12132,1055.820766,0.776369,10000,10,2023-05-01,2023-05-01
5,CUS_0x11b1,10000,10,34819.83,905.89,1.0,0.026016,10.457942,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,18,10,12.0,12.0,12.0,12.0,0,28.0,0,1,0,0,0,-67,-67.0,0.0,-2,-2.0,0.0,189,189.0,0.0,83,83.0,0.0,149,149.0,0.0,-20,-20.0,0.0,246,246.0,0.0,49,49.0,0.0,128,128.0,0.0,94,94.0,0.0,36,36.0,0.0,-16,-16.0,0.0,19,19.0,0.0,58,58.0,0.0,128,128.0,0.0,40,40.0,0.0,181,181.0,0.0,41,41.0,0.0,253,253.0,0.0,61,61.0,0.0,-64,-32.0,49.497475,165,82.5,119.501046,295,147.5,58.689863,187,93.5,14.849242,215,107.5,58.689863,69,34.5,77.074639,460,230.0,22.627417,227,113.5,91.216775,329,164.5,51.618795,145,72.5,30.405592,117,58.5,31.819805,-10,-5.0,15.556349,-45,-22.5,58.689863,158,79.0,29.698485,134,67.0,86.267027,6,3.0,52.325902,285,142.5,54.447222,99,49.5,12.020815,386,193.0,84.852814,82,41.0,28.284271,1055.820766,0.363869,10000,10,2023-05-01,2023-05-01
6,CUS_0x12a7,10000,10,26450.59,952.9,4.0,0.036026,10.183034,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,28,8,18.0,18.0,18.0,18.0,0,37.0,0,0,1,0,0,224,224.0,0.0,110,110.0,0.0,3,3.0,0.0,-63,-63.0,0.0,-49,-49.0,0.0,183,183.0,0.0,143,143.0,0.0,31,31.0,0.0,8,8.0,0.0,225,225.0,0.0,105,105.0,0.0,123,123.0,0.0,75,75.0,0.0,221,221.0,0.0,62,62.0,0.0,-53,-53.0,0.0,159,159.0,0.0,115,115.0,0.0,104,104.0,0.0,33,33.0,0.0,377,188.5,50.204581,268,134.0,33.941125,188,94.0,128.693434,109,54.5,166.170094,122,61.0,155.563492,198,99.0,118.793939,246,123.0,28.284271,42,21.0,14.142136,59,29.5,30.405592,451,225.5,0.707107,214,107.0,2.828427,313,156.5,47.376154,268,134.0,83.4386,348,174.0,66.468037,249,124.5,88.388348,236,118.0,241.830519,257,128.5,43.133514,240,120.0,7.071068,14,7.0,137.178716,249,124.5,129.400541,1055.820766,0.479001,10000,10,2023-05-01,2023-05-01
7,CUS_0x12a9,10000,10,44822.21,730.64,4.0,0.016301,10.710459,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,24,1,7.0,7.0,7.0,7.0,0,44.0,0,0,1,0,0,211,211.0,0.0,-41,-41.0,0.0,48,48.0,0.0,140,140.0,0.0,68,68.0,0.0,23,23.0,0.0,86,86.0,0.0,164,164.0,0.0,262,262.0,0.0,173,173.0,0.0,252,252.0,0.0,75,75.0,0.0,-163,-163.0,0.0,29,29.0,0.0,38,38.0,0.0,104,104.0,0.0,29,29.0,0.0,205,205.0,0.0,88,88.0,0.0,74,74.0,0.0,302,151.0,84.852814,28,14.0,77.781746,155,77.5,41.7193,197,98.5,58.689863,261,130.5,88.388348,31,15.5,10.606602,255,127.5,58.689863,318,159.0,7.071068,359,179.5,116.672619,474,237.0,90.509668,301,150.5,143.542677,54,27.0,67.882251,-67,-33.5,183.140656,56,28.0,1.414214,195,97.5,84.145707,294,147.0,60.811183,83,41.5,17.67767,100,50.0,219.203102,263,131.5,61.51829,172,86.0,16.970563,1055.820766,0.282669,10000,10,2023-05-01,2023-05-01
8,CUS_0x1340,10000,10,40600.61,1198.96,2.0,0.029531,10.611538,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,21,3,0.0,0.0,0.0,0.0,0,42.0,0,0,1,0,0,48,48.0,0.0,119,119.0,0.0,122,122.0,0.0,174,174.0,0.0,120,120.0,0.0,82,82.0,0.0,29,29.0,0.0,56,56.0,0.0,72,72.0,0.0,116,116.0,0.0,167,167.0,0.0,14,14.0,0.0,-32,-32.0,0.0,172,172.0,0.0,152,152.0,0.0,5,5.0,0.0,6,6.0,0.0,82,82.0,0.0,3,3.0,0.0,208,208.0,0.0,-53,-26.5,105.35891,349,174.5,78.488853,165,82.5,55.861436,108,54.0,169.705627,59,29.5,127.986327,241,120.5,54.447222,98,49.0,28.284271,86,43.0,18.384776,213,106.5,48.790368,198,99.0,24.041631,332,166.0,1.414214,115,57.5,61.51829,266,133.0,233.345238,74,37.0,190.918831,169,84.5,95.459415,57,28.5,33.234019,155,77.5,101.11627,164,82.0,0.0,197,98.5,135.057395,182,91.0,165.462987,1055.820766,0.312061,10000,10,2023-05-01,2023-05-01
9,CUS_0x13d1,10000,10,22020.37,1343.64,3.0,0.061018,9.999723,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,26,4,6.0,6.0,6.0,6.0,0,22.0,1,0,0,0,0,-43,-43.0,0.0,70,70.0,0.0,147,147.0,0.0,-127,-127.0,0.0,117,117.0,0.0,105,105.0,0.0,84,84.0,0.0,-1,-1.0,0.0,-4,-4.0,0.0,-32,-32.0,0.0,155,155.0,0.0,32,32.0,0.0,112,112.0,0.0,118,118.0,0.0,-1,-1.0,0.0,214,214.0,0.0,158,158.0,0.0,-9,-9.0,0.0,204,204.0,0.0,114,114.0,0.0,120,60.0,145.663997,84,42.0,39.59798,334,167.0,28.284271,123,61.5,266.579257,212,106.0,15.556349,323,161.5,79.903066,153,76.5,10.606602,148,74.0,106.066017,87,43.5,67.175144,-137,-68.5,51.618795,175,87.5,95.459415,144,72.0,56.568542,176,88.0,33.941125,351,175.5,81.31728,266,133.0,189.504617,376,188.0,36.769553,343,171.5,19.091883,147,73.5,116.672619,292,146.0,82.024387,304,152.0,53.740115,1055.820766,0.575369,10000,10,2023-05-01,2023-05-01


In [14]:
label_pd.head(10)

Unnamed: 0,loan_id,Customer_ID,label,label_def,snapshot_date
0,CUS_0x1037_2023_01_01,CUS_0x1037,0,30dpd_6mob,2023-07-01
1,CUS_0x1069_2023_01_01,CUS_0x1069,0,30dpd_6mob,2023-07-01
2,CUS_0x114a_2023_01_01,CUS_0x114a,0,30dpd_6mob,2023-07-01
3,CUS_0x1184_2023_01_01,CUS_0x1184,0,30dpd_6mob,2023-07-01
4,CUS_0x1297_2023_01_01,CUS_0x1297,1,30dpd_6mob,2023-07-01
5,CUS_0x12fb_2023_01_01,CUS_0x12fb,0,30dpd_6mob,2023-07-01
6,CUS_0x1325_2023_01_01,CUS_0x1325,0,30dpd_6mob,2023-07-01
7,CUS_0x1341_2023_01_01,CUS_0x1341,0,30dpd_6mob,2023-07-01
8,CUS_0x1375_2023_01_01,CUS_0x1375,1,30dpd_6mob,2023-07-01
9,CUS_0x13a8_2023_01_01,CUS_0x13a8,0,30dpd_6mob,2023-07-01


In [15]:
label_pd[label_pd['label'] == 1]

Unnamed: 0,loan_id,Customer_ID,label,label_def,snapshot_date
4,CUS_0x1297_2023_01_01,CUS_0x1297,1,30dpd_6mob,2023-07-01
8,CUS_0x1375_2023_01_01,CUS_0x1375,1,30dpd_6mob,2023-07-01
17,CUS_0x15ad_2023_01_01,CUS_0x15ad,1,30dpd_6mob,2023-07-01
18,CUS_0x1630_2023_01_01,CUS_0x1630,1,30dpd_6mob,2023-07-01
22,CUS_0x1733_2023_01_01,CUS_0x1733,1,30dpd_6mob,2023-07-01
...,...,...,...,...,...
13445,CUS_0xc075_2023_09_01,CUS_0xc075,1,30dpd_6mob,2024-03-01
13448,CUS_0xc122_2023_09_01,CUS_0xc122,1,30dpd_6mob,2024-03-01
13450,CUS_0xc1fe_2023_09_01,CUS_0xc1fe,1,30dpd_6mob,2024-03-01
13454,CUS_0xc2b7_2023_09_01,CUS_0xc2b7,1,30dpd_6mob,2024-03-01


In [16]:
df = spark.read.parquet("/app/datamart/silver/lms_loan_daily")
df.select("snapshot_date").distinct().show()



+-------------+
|snapshot_date|
+-------------+
|   2025-02-01|
|   2024-08-01|
|   2025-03-01|
|   2024-05-01|
|   2023-08-01|
|   2025-01-01|
|   2025-04-01|
|   2023-10-01|
|   2023-09-01|
|   2024-10-01|
|   2024-04-01|
|   2023-12-01|
|   2024-03-01|
|   2023-07-01|
|   2023-11-01|
|   2025-05-01|
|   2024-12-01|
|   2023-06-01|
|   2024-09-01|
|   2023-05-01|
+-------------+
only showing top 20 rows



                                                                                

In [17]:
df.groupBy("mob").count().orderBy("mob").show()
df.select(F.max("dpd"), F.min("dpd")).show()

                                                                                

+---+-----+
|mob|count|
+---+-----+
|  0|12500|
|  1|12500|
|  2|12500|
|  3|12500|
|  4|12500|
|  5|12500|
|  6|12500|
|  7|12500|
|  8|12500|
|  9|12500|
| 10|12500|
+---+-----+





+--------+--------+
|max(dpd)|min(dpd)|
+--------+--------+
|     306|       0|
+--------+--------+



                                                                                

In [18]:
snapshot_date = "2023-07-01"  # pick one that exists
df = df.filter(F.col("snapshot_date") == F.lit(snapshot_date))
df = df.filter(F.col("mob") == 6)
df = df.withColumn("label", F.when(F.col("dpd") >= 30, 1).otherwise(0))
df.groupBy("label").count().show()




+-----+-----+
|label|count|
+-----+-----+
|    1|  140|
|    0|  390|
+-----+-----+



                                                                                

In [19]:
# Missing values
print("\n--- Missing Values Summary ---")
missing = feature_pd.isnull().sum()
missing_pct = (missing / len(feature_pd)) * 100
missing_df = pd.DataFrame({
    'missing_count': missing,
    'missing_pct': missing_pct
})
missing_df = missing_df[missing_df['missing_count'] > 0].sort_values('missing_pct', ascending=False)

if len(missing_df) > 0:
    print(f"\nFeatures with missing values: {len(missing_df)}")
    print("\nTop 10 features with most missing values:")
    print(missing_df.head(10))
else:
    print("✓ No missing values found!")


--- Missing Values Summary ---
✓ No missing values found!


In [20]:
# Missing values
print("\n--- Missing Values Summary ---")
missing = label_pd.isnull().sum()
missing_pct = (missing / len(label_pd)) * 100
missing_df = pd.DataFrame({
    'missing_count': missing,
    'missing_pct': missing_pct
})
missing_df = missing_df[missing_df['missing_count'] > 0].sort_values('missing_pct', ascending=False)

if len(missing_df) > 0:
    print(f"\nFeatures with missing values: {len(missing_df)}")
    print("\nTop 10 features with most missing values:")
    print(missing_df.head(10))
else:
    print("✓ No missing values found!")


--- Missing Values Summary ---
✓ No missing values found!


# FEATURE ANALYSIS

In [21]:
# Identify feature types
numeric_features = feature_pd.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = feature_pd.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove non-feature columns
exclude_cols = ['loan_id', 'Customer_ID', 'application_date', 'snapshot_date', 'default_label']
numeric_features = [f for f in numeric_features if f not in exclude_cols]
categorical_features = [f for f in categorical_features if f not in exclude_cols]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


Numeric features: 161
Categorical features: 0


In [22]:
feature_pd.describe()

Unnamed: 0,loan_amt,tenure,Annual_Income,Outstanding_Debt,Num_of_Loan_active,DTI,log_Annual_Income,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Large_value_payments,Credit_Mix_Standard,Credit_Mix_Good,Credit_Mix_Bad,Type_of_Loan_Auto_Loan,Type_of_Loan_Credit_Builder_Loan,Type_of_Loan_Personal_Loan,Type_of_Loan_Home_Equity_Loan,Type_of_Loan_Mortgage_Loan,Type_of_Loan_Student_Loan,Type_of_Loan_Debt_Consolidation_Loan,Type_of_Loan_Payday_Loan,Credit_History_Age_Year,Credit_History_Age_Month,Num_of_Delayed_Payment_3m,Num_of_Delayed_Payment_6m,Num_of_Delayed_Payment_12m,max_dpd_prior,ever_30dpd_prior,Age,age_band_18_24,age_band_25_34,age_band_35_44,age_band_45_54,age_band_55,fe_1_sum_7d,fe_1_mean_7d,fe_1_std_7d,fe_2_sum_7d,fe_2_mean_7d,fe_2_std_7d,fe_3_sum_7d,fe_3_mean_7d,fe_3_std_7d,fe_4_sum_7d,fe_4_mean_7d,fe_4_std_7d,fe_5_sum_7d,fe_5_mean_7d,fe_5_std_7d,fe_6_sum_7d,fe_6_mean_7d,fe_6_std_7d,fe_7_sum_7d,fe_7_mean_7d,fe_7_std_7d,fe_8_sum_7d,fe_8_mean_7d,fe_8_std_7d,fe_9_sum_7d,fe_9_mean_7d,fe_9_std_7d,fe_10_sum_7d,fe_10_mean_7d,fe_10_std_7d,fe_11_sum_7d,fe_11_mean_7d,fe_11_std_7d,fe_12_sum_7d,fe_12_mean_7d,fe_12_std_7d,fe_13_sum_7d,fe_13_mean_7d,fe_13_std_7d,fe_14_sum_7d,fe_14_mean_7d,fe_14_std_7d,fe_15_sum_7d,fe_15_mean_7d,fe_15_std_7d,fe_16_sum_7d,fe_16_mean_7d,fe_16_std_7d,fe_17_sum_7d,fe_17_mean_7d,fe_17_std_7d,fe_18_sum_7d,fe_18_mean_7d,fe_18_std_7d,fe_19_sum_7d,fe_19_mean_7d,fe_19_std_7d,fe_20_sum_7d,fe_20_mean_7d,fe_20_std_7d,fe_1_sum_30d,fe_1_mean_30d,fe_1_std_30d,fe_2_sum_30d,fe_2_mean_30d,fe_2_std_30d,fe_3_sum_30d,fe_3_mean_30d,fe_3_std_30d,fe_4_sum_30d,fe_4_mean_30d,fe_4_std_30d,fe_5_sum_30d,fe_5_mean_30d,fe_5_std_30d,fe_6_sum_30d,fe_6_mean_30d,fe_6_std_30d,fe_7_sum_30d,fe_7_mean_30d,fe_7_std_30d,fe_8_sum_30d,fe_8_mean_30d,fe_8_std_30d,fe_9_sum_30d,fe_9_mean_30d,fe_9_std_30d,fe_10_sum_30d,fe_10_mean_30d,fe_10_std_30d,fe_11_sum_30d,fe_11_mean_30d,fe_11_std_30d,fe_12_sum_30d,fe_12_mean_30d,fe_12_std_30d,fe_13_sum_30d,fe_13_mean_30d,fe_13_std_30d,fe_14_sum_30d,fe_14_mean_30d,fe_14_std_30d,fe_15_sum_30d,fe_15_mean_30d,fe_15_std_30d,fe_16_sum_30d,fe_16_mean_30d,fe_16_std_30d,fe_17_sum_30d,fe_17_mean_30d,fe_17_std_30d,fe_18_sum_30d,fe_18_mean_30d,fe_18_std_30d,fe_19_sum_30d,fe_19_mean_30d,fe_19_std_30d,fe_20_sum_30d,fe_20_mean_30d,fe_20_std_30d,estimated_EMI,EMI_to_income,requested_amount,requested_tenure
count,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0
mean,10000.0,10.0,161620.6,1426.220376,3.09944,0.060092,10.587421,0.11112,0.104,0.13488,0.25616,0.17936,0.13464,0.35976,0.24256,0.1888,0.3056,0.31728,0.31104,0.314,0.3136,0.3104,0.3104,0.31944,18.23592,5.77768,33.02496,33.02496,33.02496,33.02496,0.0084,31.91848,0.18352,0.28288,0.28,0.15336,0.10024,73.98736,73.98736,0.0,74.1084,74.1084,0.0,74.54432,74.54432,0.0,75.6216,75.6216,0.0,76.84872,76.84872,0.0,72.02368,72.02368,0.0,78.41112,78.41112,0.0,79.0764,79.0764,0.0,81.41592,81.41592,0.0,84.48312,84.48312,0.0,72.0544,72.0544,0.0,71.6512,71.6512,0.0,73.05752,73.05752,0.0,70.80248,70.80248,0.0,71.65224,71.65224,0.0,71.31016,71.31016,0.0,72.23728,72.23728,0.0,73.86392,73.86392,0.0,70.98464,70.98464,0.0,70.6088,70.6088,0.0,102.80936,73.95392,22.124749,102.66832,73.98592,22.089224,103.5508,74.67364,21.986891,104.85424,75.75732,22.61616,106.73496,76.77224,22.564513,100.52376,72.5394,22.229797,108.04856,78.15592,21.826633,110.97192,79.8784,22.083567,112.62448,81.05968,22.380212,117.51864,84.63484,22.008727,99.57104,71.717,22.270413,99.42728,71.71272,22.580805,101.26416,72.81356,23.257081,98.14808,70.81944,22.539736,99.39816,71.56016,22.318213,99.45712,71.6988,22.682854,100.37296,72.65684,22.717644,101.81376,73.6218,23.204586,99.34056,71.25344,22.357133,98.44296,71.051,23.412984,1055.821,0.45176,10000.0,10.0
std,0.0,0.0,1297842.0,1155.169458,65.105277,0.087094,0.9505,0.314293,0.305273,0.341609,0.436529,0.383669,0.341352,0.479949,0.428648,0.391365,0.46068,0.465436,0.462938,0.464135,0.463974,0.462676,0.462676,0.466278,8.313547,3.451065,238.695905,238.695905,238.695905,238.695905,0.091269,13.502249,0.387108,0.450417,0.449017,0.360349,0.300332,97.023167,97.023167,0.0,96.289183,96.289183,0.0,97.524654,97.524654,0.0,97.512704,97.512704,0.0,97.247981,97.247981,0.0,95.453906,95.453906,0.0,97.944203,97.944203,0.0,98.915177,98.915177,0.0,99.001045,99.001045,0.0,100.676935,100.676935,0.0,95.743018,95.743018,0.0,95.849441,95.849441,0.0,96.962021,96.962021,0.0,96.263896,96.263896,0.0,96.649254,96.649254,0.0,96.579905,96.579905,0.0,96.862312,96.862312,0.0,98.080376,98.080376,0.0,97.804965,97.804965,0.0,99.622062,99.622062,0.0,126.450768,89.001899,47.988126,126.256427,89.001321,47.644424,127.922865,89.834996,47.981725,128.024081,90.013636,48.820879,128.264888,89.616873,48.625882,125.258388,88.069505,47.51279,128.311274,90.464705,47.73745,130.687879,92.05443,47.907587,131.446026,91.61377,47.862014,133.78646,93.660134,47.133474,124.887928,87.884696,47.859219,125.012227,88.209868,48.854221,125.845783,88.692572,49.891682,124.625648,88.614491,48.630594,126.278383,89.424685,47.983837,125.711055,89.144639,48.340382,125.036239,88.980612,49.026048,127.377181,90.374623,49.735078,127.380671,89.808167,48.781916,127.564944,91.258833,50.32452,2.273828e-13,0.36922,0.0,0.0
min,10000.0,10.0,7005.93,0.23,-100.0,2e-06,8.854512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,-3.0,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-307.0,-307.0,0.0,-301.0,-301.0,0.0,-256.0,-256.0,0.0,-292.0,-292.0,0.0,-258.0,-258.0,0.0,-259.0,-259.0,0.0,-240.0,-240.0,0.0,-348.0,-348.0,0.0,-244.0,-244.0,0.0,-264.0,-264.0,0.0,-298.0,-298.0,0.0,-289.0,-289.0,0.0,-248.0,-248.0,0.0,-293.0,-293.0,0.0,-271.0,-271.0,0.0,-277.0,-277.0,0.0,-309.0,-309.0,0.0,-299.0,-299.0,0.0,-278.0,-278.0,0.0,-313.0,-313.0,0.0,-307.0,-307.0,0.0,-301.0,-301.0,0.0,-324.0,-247.0,0.0,-259.0,-229.0,0.0,-347.0,-258.0,0.0,-259.0,-259.0,0.0,-286.0,-236.0,0.0,-348.0,-348.0,0.0,-258.0,-244.0,0.0,-264.0,-264.0,0.0,-298.0,-298.0,0.0,-316.0,-242.0,0.0,-252.0,-248.0,0.0,-293.0,-293.0,0.0,-368.0,-271.0,0.0,-273.0,-258.0,0.0,-498.0,-262.0,0.0,-349.0,-299.0,0.0,-340.0,-278.0,0.0,-283.0,-283.0,0.0,1055.821,0.000532,10000.0,10.0
25%,10000.0,10.0,19453.33,566.0725,1.0,0.009291,9.875773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,3.0,9.0,9.0,9.0,9.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1055.821,0.174299,10000.0,10.0
50%,10000.0,10.0,37572.38,1166.155,3.0,0.027842,10.534024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,5.0,14.0,14.0,14.0,14.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,51.0,51.0,0.0,53.0,53.0,0.0,52.0,52.0,0.0,53.0,53.0,0.0,55.0,55.0,0.0,49.0,49.0,0.0,58.0,58.0,0.0,58.0,58.0,0.0,61.0,61.0,0.0,65.0,65.0,0.0,50.0,50.0,0.0,49.0,49.0,0.0,51.0,51.0,0.0,49.0,49.0,0.0,50.0,50.0,0.0,47.0,47.0,0.0,49.0,49.0,0.0,50.0,50.0,0.0,47.0,47.0,0.0,44.0,44.0,0.0,74.0,59.0,0.0,73.0,59.0,0.0,72.0,58.5,0.0,77.0,61.0,0.0,78.0,61.5,0.0,71.0,56.5,0.0,80.0,64.0,0.0,84.0,65.0,0.0,86.0,68.0,0.0,92.0,73.0,0.0,70.0,55.0,0.0,70.0,56.0,0.0,73.0,57.0,0.0,69.0,54.0,0.0,70.0,55.5,0.0,71.0,55.5,0.0,70.0,56.0,0.0,71.0,56.0,0.0,69.0,55.0,0.0,65.0,52.0,0.0,1055.821,0.337212,10000.0,10.0
75%,10000.0,10.0,72690.21,1945.9625,5.0,0.069197,11.193962,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,25.0,9.0,18.0,18.0,18.0,18.0,0.0,42.0,0.0,1.0,1.0,0.0,0.0,143.0,143.0,0.0,141.0,141.0,0.0,142.0,142.0,0.0,144.0,144.0,0.0,147.0,147.0,0.0,140.0,140.0,0.0,149.0,149.0,0.0,150.0,150.0,0.0,150.0,150.0,0.0,157.0,157.0,0.0,139.0,139.0,0.0,140.0,140.0,0.0,143.0,143.0,0.0,136.0,136.0,0.0,139.0,139.0,0.0,138.0,138.0,0.0,140.0,140.0,0.0,142.0,142.0,0.0,138.0,138.0,0.0,138.0,138.0,0.0,183.0,137.0,12.727922,183.25,136.0,13.435029,185.0,138.0,12.197592,186.0,139.0,12.727922,188.0,142.0,13.435029,178.0,135.0,12.727922,191.0,143.0,12.727922,196.0,145.0,12.020815,197.0,145.125,12.727922,204.0,152.0,12.727922,178.0,134.0,12.727922,179.0,134.0,12.904699,181.0,135.125,12.197592,177.0,132.0,12.727922,180.0,133.0,12.727922,179.0,134.0,13.435029,182.0,135.5,13.435029,182.0,137.5,14.142136,179.0,134.0,12.020815,179.0,134.0,12.727922,1055.821,0.651295,10000.0,10.0
max,10000.0,10.0,23834700.0,4998.07,1495.0,0.683252,16.986653,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,33.0,11.0,4293.0,4293.0,4293.0,4293.0,1.0,56.0,1.0,1.0,1.0,1.0,1.0,460.0,460.0,0.0,560.0,560.0,0.0,465.0,465.0,0.0,506.0,506.0,0.0,504.0,504.0,0.0,459.0,459.0,0.0,508.0,508.0,0.0,480.0,480.0,0.0,473.0,473.0,0.0,489.0,489.0,0.0,493.0,493.0,0.0,490.0,490.0,0.0,485.0,485.0,0.0,494.0,494.0,0.0,452.0,452.0,0.0,498.0,498.0,0.0,493.0,493.0,0.0,474.0,474.0,0.0,500.0,500.0,0.0,510.0,510.0,0.0,690.0,438.0,386.080303,750.0,560.0,347.896536,692.0,448.0,382.544769,768.0,506.0,365.574206,683.0,423.0,391.737157,702.0,459.0,316.783838,695.0,508.0,381.837662,747.0,480.0,392.444264,744.0,473.0,340.118362,900.0,489.0,373.35238,784.0,493.0,351.43207,659.0,490.0,367.695526,726.0,473.0,409.414826,685.0,494.0,375.473701,733.0,430.0,343.653896,731.0,476.0,363.452886,667.0,493.0,373.35238,712.0,474.0,437.699098,730.0,480.0,398.808225,733.0,467.0,382.544769,1055.821,1.808446,10000.0,10.0


In [24]:
feature_pd.dtypes

Customer_ID          object
loan_amt              int32
tenure                int32
Annual_Income       float64
Outstanding_Debt    float64
                     ...   
EMI_to_income       float64
requested_amount      int32
requested_tenure      int32
snapshot_date        object
application_date     object
Length: 164, dtype: object

In [25]:
feature_pd.columns.values

array(['Customer_ID', 'loan_amt', 'tenure', 'Annual_Income',
       'Outstanding_Debt', 'Num_of_Loan_active', 'DTI',
       'log_Annual_Income',
       'Payment_Behaviour_High_spent_Small_value_payments',
       'Payment_Behaviour_Low_spent_Large_value_payments',
       'Payment_Behaviour_Low_spent_Medium_value_payments',
       'Payment_Behaviour_Low_spent_Small_value_payments',
       'Payment_Behaviour_High_spent_Medium_value_payments',
       'Payment_Behaviour_High_spent_Large_value_payments',
       'Credit_Mix_Standard', 'Credit_Mix_Good', 'Credit_Mix_Bad',
       'Type_of_Loan_Auto_Loan', 'Type_of_Loan_Credit_Builder_Loan',
       'Type_of_Loan_Personal_Loan', 'Type_of_Loan_Home_Equity_Loan',
       'Type_of_Loan_Mortgage_Loan', 'Type_of_Loan_Student_Loan',
       'Type_of_Loan_Debt_Consolidation_Loan', 'Type_of_Loan_Payday_Loan',
       'Credit_History_Age_Year', 'Credit_History_Age_Month',
       'Num_of_Delayed_Payment_3m', 'Num_of_Delayed_Payment_6m',
       'Num_of_Delayed

In [33]:
# Key numeric features
key_numeric = ['DTI', 'log_Annual_Income', 'Annual_Income', 'Age', 
               'Credit_History_Age_Year', 'Num_of_Loan_active',
               'Num_of_Delayed_Payment_3m', 'Num_of_Delayed_Payment_6m',
               'Num_of_Delayed_Payment_12m', 'max_dpd_prior', 'ever_30dpd_prior', 
               'EMI_to_income','fe_1_sum_7d', 'fe_1_mean_7d',
       'fe_1_std_7d', 'fe_2_sum_7d', 'fe_2_mean_7d', 'fe_2_std_7d',
       'fe_3_sum_7d', 'fe_3_mean_7d', 'fe_3_std_7d', 'fe_4_sum_7d',
       'fe_4_mean_7d', 'fe_4_std_7d', 'fe_5_sum_7d', 'fe_5_mean_7d',
       'fe_5_std_7d', 'fe_6_sum_7d', 'fe_6_mean_7d', 'fe_6_std_7d',
       'fe_7_sum_7d', 'fe_7_mean_7d', 'fe_7_std_7d', 'fe_8_sum_7d',
       'fe_8_mean_7d', 'fe_8_std_7d', 'fe_9_sum_7d', 'fe_9_mean_7d',
       'fe_9_std_7d', 'fe_10_sum_7d', 'fe_10_mean_7d', 'fe_10_std_7d',
       'fe_11_sum_7d', 'fe_11_mean_7d', 'fe_11_std_7d', 'fe_12_sum_7d',
       'fe_12_mean_7d', 'fe_12_std_7d', 'fe_13_sum_7d', 'fe_13_mean_7d',
       'fe_13_std_7d', 'fe_14_sum_7d', 'fe_14_mean_7d', 'fe_14_std_7d',
       'fe_15_sum_7d', 'fe_15_mean_7d', 'fe_15_std_7d', 'fe_16_sum_7d',
       'fe_16_mean_7d', 'fe_16_std_7d', 'fe_17_sum_7d', 'fe_17_mean_7d',
       'fe_17_std_7d', 'fe_18_sum_7d', 'fe_18_mean_7d', 'fe_18_std_7d',
       'fe_19_sum_7d', 'fe_19_mean_7d', 'fe_19_std_7d', 'fe_20_sum_7d',
       'fe_20_mean_7d', 'fe_20_std_7d', 'fe_1_sum_30d', 'fe_1_mean_30d',
       'fe_1_std_30d', 'fe_2_sum_30d', 'fe_2_mean_30d', 'fe_2_std_30d',
       'fe_3_sum_30d', 'fe_3_mean_30d', 'fe_3_std_30d', 'fe_4_sum_30d',
       'fe_4_mean_30d', 'fe_4_std_30d', 'fe_5_sum_30d', 'fe_5_mean_30d',
       'fe_5_std_30d', 'fe_6_sum_30d', 'fe_6_mean_30d', 'fe_6_std_30d',
       'fe_7_sum_30d', 'fe_7_mean_30d', 'fe_7_std_30d', 'fe_8_sum_30d',
       'fe_8_mean_30d', 'fe_8_std_30d', 'fe_9_sum_30d', 'fe_9_mean_30d',
       'fe_9_std_30d', 'fe_10_sum_30d', 'fe_10_mean_30d', 'fe_10_std_30d',
       'fe_11_sum_30d', 'fe_11_mean_30d', 'fe_11_std_30d',
       'fe_12_sum_30d', 'fe_12_mean_30d', 'fe_12_std_30d',
       'fe_13_sum_30d', 'fe_13_mean_30d', 'fe_13_std_30d',
       'fe_14_sum_30d', 'fe_14_mean_30d', 'fe_14_std_30d',
       'fe_15_sum_30d', 'fe_15_mean_30d', 'fe_15_std_30d',
       'fe_16_sum_30d', 'fe_16_mean_30d', 'fe_16_std_30d',
       'fe_17_sum_30d', 'fe_17_mean_30d', 'fe_17_std_30d',
       'fe_18_sum_30d', 'fe_18_mean_30d', 'fe_18_std_30d',
       'fe_19_sum_30d', 'fe_19_mean_30d', 'fe_19_std_30d',
       'fe_20_sum_30d', 'fe_20_mean_30d', 'fe_20_std_30d',
       'estimated_EMI']
key_numeric = [f for f in key_numeric if f in feature_pd.columns]

print("\n--- Key Numeric Features Statistics ---")
feature_pd[key_numeric].describe()


--- Key Numeric Features Statistics ---


Unnamed: 0,DTI,log_Annual_Income,Annual_Income,Age,Credit_History_Age_Year,Num_of_Loan_active,Num_of_Delayed_Payment_3m,Num_of_Delayed_Payment_6m,Num_of_Delayed_Payment_12m,max_dpd_prior,ever_30dpd_prior,EMI_to_income,fe_1_sum_7d,fe_1_mean_7d,fe_1_std_7d,fe_2_sum_7d,fe_2_mean_7d,fe_2_std_7d,fe_3_sum_7d,fe_3_mean_7d,fe_3_std_7d,fe_4_sum_7d,fe_4_mean_7d,fe_4_std_7d,fe_5_sum_7d,fe_5_mean_7d,fe_5_std_7d,fe_6_sum_7d,fe_6_mean_7d,fe_6_std_7d,fe_7_sum_7d,fe_7_mean_7d,fe_7_std_7d,fe_8_sum_7d,fe_8_mean_7d,fe_8_std_7d,fe_9_sum_7d,fe_9_mean_7d,fe_9_std_7d,fe_10_sum_7d,fe_10_mean_7d,fe_10_std_7d,fe_11_sum_7d,fe_11_mean_7d,fe_11_std_7d,fe_12_sum_7d,fe_12_mean_7d,fe_12_std_7d,fe_13_sum_7d,fe_13_mean_7d,fe_13_std_7d,fe_14_sum_7d,fe_14_mean_7d,fe_14_std_7d,fe_15_sum_7d,fe_15_mean_7d,fe_15_std_7d,fe_16_sum_7d,fe_16_mean_7d,fe_16_std_7d,fe_17_sum_7d,fe_17_mean_7d,fe_17_std_7d,fe_18_sum_7d,fe_18_mean_7d,fe_18_std_7d,fe_19_sum_7d,fe_19_mean_7d,fe_19_std_7d,fe_20_sum_7d,fe_20_mean_7d,fe_20_std_7d,fe_1_sum_30d,fe_1_mean_30d,fe_1_std_30d,fe_2_sum_30d,fe_2_mean_30d,fe_2_std_30d,fe_3_sum_30d,fe_3_mean_30d,fe_3_std_30d,fe_4_sum_30d,fe_4_mean_30d,fe_4_std_30d,fe_5_sum_30d,fe_5_mean_30d,fe_5_std_30d,fe_6_sum_30d,fe_6_mean_30d,fe_6_std_30d,fe_7_sum_30d,fe_7_mean_30d,fe_7_std_30d,fe_8_sum_30d,fe_8_mean_30d,fe_8_std_30d,fe_9_sum_30d,fe_9_mean_30d,fe_9_std_30d,fe_10_sum_30d,fe_10_mean_30d,fe_10_std_30d,fe_11_sum_30d,fe_11_mean_30d,fe_11_std_30d,fe_12_sum_30d,fe_12_mean_30d,fe_12_std_30d,fe_13_sum_30d,fe_13_mean_30d,fe_13_std_30d,fe_14_sum_30d,fe_14_mean_30d,fe_14_std_30d,fe_15_sum_30d,fe_15_mean_30d,fe_15_std_30d,fe_16_sum_30d,fe_16_mean_30d,fe_16_std_30d,fe_17_sum_30d,fe_17_mean_30d,fe_17_std_30d,fe_18_sum_30d,fe_18_mean_30d,fe_18_std_30d,fe_19_sum_30d,fe_19_mean_30d,fe_19_std_30d,fe_20_sum_30d,fe_20_mean_30d,fe_20_std_30d,estimated_EMI
count,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0
mean,0.060092,10.587421,161620.6,31.91848,18.23592,3.09944,33.02496,33.02496,33.02496,33.02496,0.0084,0.45176,73.98736,73.98736,0.0,74.1084,74.1084,0.0,74.54432,74.54432,0.0,75.6216,75.6216,0.0,76.84872,76.84872,0.0,72.02368,72.02368,0.0,78.41112,78.41112,0.0,79.0764,79.0764,0.0,81.41592,81.41592,0.0,84.48312,84.48312,0.0,72.0544,72.0544,0.0,71.6512,71.6512,0.0,73.05752,73.05752,0.0,70.80248,70.80248,0.0,71.65224,71.65224,0.0,71.31016,71.31016,0.0,72.23728,72.23728,0.0,73.86392,73.86392,0.0,70.98464,70.98464,0.0,70.6088,70.6088,0.0,102.80936,73.95392,22.124749,102.66832,73.98592,22.089224,103.5508,74.67364,21.986891,104.85424,75.75732,22.61616,106.73496,76.77224,22.564513,100.52376,72.5394,22.229797,108.04856,78.15592,21.826633,110.97192,79.8784,22.083567,112.62448,81.05968,22.380212,117.51864,84.63484,22.008727,99.57104,71.717,22.270413,99.42728,71.71272,22.580805,101.26416,72.81356,23.257081,98.14808,70.81944,22.539736,99.39816,71.56016,22.318213,99.45712,71.6988,22.682854,100.37296,72.65684,22.717644,101.81376,73.6218,23.204586,99.34056,71.25344,22.357133,98.44296,71.051,23.412984,1055.821
std,0.087094,0.9505,1297842.0,13.502249,8.313547,65.105277,238.695905,238.695905,238.695905,238.695905,0.091269,0.36922,97.023167,97.023167,0.0,96.289183,96.289183,0.0,97.524654,97.524654,0.0,97.512704,97.512704,0.0,97.247981,97.247981,0.0,95.453906,95.453906,0.0,97.944203,97.944203,0.0,98.915177,98.915177,0.0,99.001045,99.001045,0.0,100.676935,100.676935,0.0,95.743018,95.743018,0.0,95.849441,95.849441,0.0,96.962021,96.962021,0.0,96.263896,96.263896,0.0,96.649254,96.649254,0.0,96.579905,96.579905,0.0,96.862312,96.862312,0.0,98.080376,98.080376,0.0,97.804965,97.804965,0.0,99.622062,99.622062,0.0,126.450768,89.001899,47.988126,126.256427,89.001321,47.644424,127.922865,89.834996,47.981725,128.024081,90.013636,48.820879,128.264888,89.616873,48.625882,125.258388,88.069505,47.51279,128.311274,90.464705,47.73745,130.687879,92.05443,47.907587,131.446026,91.61377,47.862014,133.78646,93.660134,47.133474,124.887928,87.884696,47.859219,125.012227,88.209868,48.854221,125.845783,88.692572,49.891682,124.625648,88.614491,48.630594,126.278383,89.424685,47.983837,125.711055,89.144639,48.340382,125.036239,88.980612,49.026048,127.377181,90.374623,49.735078,127.380671,89.808167,48.781916,127.564944,91.258833,50.32452,2.273828e-13
min,2e-06,8.854512,7005.93,0.0,0.0,-100.0,-3.0,-3.0,-3.0,-3.0,0.0,0.000532,-307.0,-307.0,0.0,-301.0,-301.0,0.0,-256.0,-256.0,0.0,-292.0,-292.0,0.0,-258.0,-258.0,0.0,-259.0,-259.0,0.0,-240.0,-240.0,0.0,-348.0,-348.0,0.0,-244.0,-244.0,0.0,-264.0,-264.0,0.0,-298.0,-298.0,0.0,-289.0,-289.0,0.0,-248.0,-248.0,0.0,-293.0,-293.0,0.0,-271.0,-271.0,0.0,-277.0,-277.0,0.0,-309.0,-309.0,0.0,-299.0,-299.0,0.0,-278.0,-278.0,0.0,-313.0,-313.0,0.0,-307.0,-307.0,0.0,-301.0,-301.0,0.0,-324.0,-247.0,0.0,-259.0,-229.0,0.0,-347.0,-258.0,0.0,-259.0,-259.0,0.0,-286.0,-236.0,0.0,-348.0,-348.0,0.0,-258.0,-244.0,0.0,-264.0,-264.0,0.0,-298.0,-298.0,0.0,-316.0,-242.0,0.0,-252.0,-248.0,0.0,-293.0,-293.0,0.0,-368.0,-271.0,0.0,-273.0,-258.0,0.0,-498.0,-262.0,0.0,-349.0,-299.0,0.0,-340.0,-278.0,0.0,-283.0,-283.0,0.0,1055.821
25%,0.009291,9.875773,19453.33,24.0,12.0,1.0,9.0,9.0,9.0,9.0,0.0,0.174299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1055.821
50%,0.027842,10.534024,37572.38,33.0,18.0,3.0,14.0,14.0,14.0,14.0,0.0,0.337212,51.0,51.0,0.0,53.0,53.0,0.0,52.0,52.0,0.0,53.0,53.0,0.0,55.0,55.0,0.0,49.0,49.0,0.0,58.0,58.0,0.0,58.0,58.0,0.0,61.0,61.0,0.0,65.0,65.0,0.0,50.0,50.0,0.0,49.0,49.0,0.0,51.0,51.0,0.0,49.0,49.0,0.0,50.0,50.0,0.0,47.0,47.0,0.0,49.0,49.0,0.0,50.0,50.0,0.0,47.0,47.0,0.0,44.0,44.0,0.0,74.0,59.0,0.0,73.0,59.0,0.0,72.0,58.5,0.0,77.0,61.0,0.0,78.0,61.5,0.0,71.0,56.5,0.0,80.0,64.0,0.0,84.0,65.0,0.0,86.0,68.0,0.0,92.0,73.0,0.0,70.0,55.0,0.0,70.0,56.0,0.0,73.0,57.0,0.0,69.0,54.0,0.0,70.0,55.5,0.0,71.0,55.5,0.0,70.0,56.0,0.0,71.0,56.0,0.0,69.0,55.0,0.0,65.0,52.0,0.0,1055.821
75%,0.069197,11.193962,72690.21,42.0,25.0,5.0,18.0,18.0,18.0,18.0,0.0,0.651295,143.0,143.0,0.0,141.0,141.0,0.0,142.0,142.0,0.0,144.0,144.0,0.0,147.0,147.0,0.0,140.0,140.0,0.0,149.0,149.0,0.0,150.0,150.0,0.0,150.0,150.0,0.0,157.0,157.0,0.0,139.0,139.0,0.0,140.0,140.0,0.0,143.0,143.0,0.0,136.0,136.0,0.0,139.0,139.0,0.0,138.0,138.0,0.0,140.0,140.0,0.0,142.0,142.0,0.0,138.0,138.0,0.0,138.0,138.0,0.0,183.0,137.0,12.727922,183.25,136.0,13.435029,185.0,138.0,12.197592,186.0,139.0,12.727922,188.0,142.0,13.435029,178.0,135.0,12.727922,191.0,143.0,12.727922,196.0,145.0,12.020815,197.0,145.125,12.727922,204.0,152.0,12.727922,178.0,134.0,12.727922,179.0,134.0,12.904699,181.0,135.125,12.197592,177.0,132.0,12.727922,180.0,133.0,12.727922,179.0,134.0,13.435029,182.0,135.5,13.435029,182.0,137.5,14.142136,179.0,134.0,12.020815,179.0,134.0,12.727922,1055.821
max,0.683252,16.986653,23834700.0,56.0,33.0,1495.0,4293.0,4293.0,4293.0,4293.0,1.0,1.808446,460.0,460.0,0.0,560.0,560.0,0.0,465.0,465.0,0.0,506.0,506.0,0.0,504.0,504.0,0.0,459.0,459.0,0.0,508.0,508.0,0.0,480.0,480.0,0.0,473.0,473.0,0.0,489.0,489.0,0.0,493.0,493.0,0.0,490.0,490.0,0.0,485.0,485.0,0.0,494.0,494.0,0.0,452.0,452.0,0.0,498.0,498.0,0.0,493.0,493.0,0.0,474.0,474.0,0.0,500.0,500.0,0.0,510.0,510.0,0.0,690.0,438.0,386.080303,750.0,560.0,347.896536,692.0,448.0,382.544769,768.0,506.0,365.574206,683.0,423.0,391.737157,702.0,459.0,316.783838,695.0,508.0,381.837662,747.0,480.0,392.444264,744.0,473.0,340.118362,900.0,489.0,373.35238,784.0,493.0,351.43207,659.0,490.0,367.695526,726.0,473.0,409.414826,685.0,494.0,375.473701,733.0,430.0,343.653896,731.0,476.0,363.452886,667.0,493.0,373.35238,712.0,474.0,437.699098,730.0,480.0,398.808225,733.0,467.0,382.544769,1055.821


In [34]:
label_pd.head()

Unnamed: 0,loan_id,Customer_ID,label,label_def,snapshot_date
0,CUS_0x1037_2023_01_01,CUS_0x1037,0,30dpd_6mob,2023-07-01
1,CUS_0x1069_2023_01_01,CUS_0x1069,0,30dpd_6mob,2023-07-01
2,CUS_0x114a_2023_01_01,CUS_0x114a,0,30dpd_6mob,2023-07-01
3,CUS_0x1184_2023_01_01,CUS_0x1184,0,30dpd_6mob,2023-07-01
4,CUS_0x1297_2023_01_01,CUS_0x1297,1,30dpd_6mob,2023-07-01


In [30]:
df = pd.merge(
    feature_pd,
    label_pd,
    left_on=["Customer_ID", "snapshot_date"],
    right_on=["Customer_ID", "snapshot_date"],
    how="inner"
)
df.head()

Unnamed: 0,Customer_ID,loan_amt,tenure,Annual_Income,Outstanding_Debt,Num_of_Loan_active,DTI,log_Annual_Income,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Large_value_payments,Credit_Mix_Standard,Credit_Mix_Good,Credit_Mix_Bad,Type_of_Loan_Auto_Loan,Type_of_Loan_Credit_Builder_Loan,Type_of_Loan_Personal_Loan,Type_of_Loan_Home_Equity_Loan,Type_of_Loan_Mortgage_Loan,Type_of_Loan_Student_Loan,Type_of_Loan_Debt_Consolidation_Loan,Type_of_Loan_Payday_Loan,Credit_History_Age_Year,Credit_History_Age_Month,Num_of_Delayed_Payment_3m,Num_of_Delayed_Payment_6m,Num_of_Delayed_Payment_12m,max_dpd_prior,ever_30dpd_prior,Age,age_band_18_24,age_band_25_34,age_band_35_44,age_band_45_54,age_band_55,fe_1_sum_7d,fe_1_mean_7d,fe_1_std_7d,fe_2_sum_7d,fe_2_mean_7d,fe_2_std_7d,fe_3_sum_7d,fe_3_mean_7d,fe_3_std_7d,fe_4_sum_7d,fe_4_mean_7d,fe_4_std_7d,fe_5_sum_7d,fe_5_mean_7d,fe_5_std_7d,fe_6_sum_7d,fe_6_mean_7d,fe_6_std_7d,fe_7_sum_7d,fe_7_mean_7d,fe_7_std_7d,fe_8_sum_7d,fe_8_mean_7d,fe_8_std_7d,fe_9_sum_7d,fe_9_mean_7d,fe_9_std_7d,fe_10_sum_7d,fe_10_mean_7d,fe_10_std_7d,fe_11_sum_7d,fe_11_mean_7d,fe_11_std_7d,fe_12_sum_7d,fe_12_mean_7d,fe_12_std_7d,fe_13_sum_7d,fe_13_mean_7d,fe_13_std_7d,fe_14_sum_7d,fe_14_mean_7d,fe_14_std_7d,fe_15_sum_7d,fe_15_mean_7d,fe_15_std_7d,fe_16_sum_7d,fe_16_mean_7d,fe_16_std_7d,fe_17_sum_7d,fe_17_mean_7d,fe_17_std_7d,fe_18_sum_7d,fe_18_mean_7d,fe_18_std_7d,fe_19_sum_7d,fe_19_mean_7d,fe_19_std_7d,fe_20_sum_7d,fe_20_mean_7d,fe_20_std_7d,fe_1_sum_30d,fe_1_mean_30d,fe_1_std_30d,fe_2_sum_30d,fe_2_mean_30d,fe_2_std_30d,fe_3_sum_30d,fe_3_mean_30d,fe_3_std_30d,fe_4_sum_30d,fe_4_mean_30d,fe_4_std_30d,fe_5_sum_30d,fe_5_mean_30d,fe_5_std_30d,fe_6_sum_30d,fe_6_mean_30d,fe_6_std_30d,fe_7_sum_30d,fe_7_mean_30d,fe_7_std_30d,fe_8_sum_30d,fe_8_mean_30d,fe_8_std_30d,fe_9_sum_30d,fe_9_mean_30d,fe_9_std_30d,fe_10_sum_30d,fe_10_mean_30d,fe_10_std_30d,fe_11_sum_30d,fe_11_mean_30d,fe_11_std_30d,fe_12_sum_30d,fe_12_mean_30d,fe_12_std_30d,fe_13_sum_30d,fe_13_mean_30d,fe_13_std_30d,fe_14_sum_30d,fe_14_mean_30d,fe_14_std_30d,fe_15_sum_30d,fe_15_mean_30d,fe_15_std_30d,fe_16_sum_30d,fe_16_mean_30d,fe_16_std_30d,fe_17_sum_30d,fe_17_mean_30d,fe_17_std_30d,fe_18_sum_30d,fe_18_mean_30d,fe_18_std_30d,fe_19_sum_30d,fe_19_mean_30d,fe_19_std_30d,fe_20_sum_30d,fe_20_mean_30d,fe_20_std_30d,estimated_EMI,EMI_to_income,requested_amount,requested_tenure,snapshot_date,application_date,loan_id,label,label_def
0,CUS_0x1000,10000,10,30625.94,1562.91,2.0,0.051032,10.329603,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,10,9,26.0,26.0,26.0,26.0,0,18.0,1,0,0,0,0,58,58.0,0.0,15,15.0,0.0,18,18.0,0.0,-101,-101.0,0.0,48,48.0,0.0,-33,-33.0,0.0,238,238.0,0.0,119,119.0,0.0,75,75.0,0.0,52,52.0,0.0,39,39.0,0.0,131,131.0,0.0,-34,-34.0,0.0,145,145.0,0.0,-61,-61.0,0.0,39,39.0,0.0,52,52.0,0.0,80,80.0,0.0,4,4.0,0.0,31,31.0,0.0,150,75.0,24.041631,121,60.5,64.346717,124,62.0,62.225397,-62,-31.0,98.994949,336,168.0,169.705627,159,79.5,159.099026,492,246.0,11.313708,213,106.5,17.67767,245,122.5,67.175144,66,33.0,26.870058,203,101.5,88.388348,100,50.0,114.551299,105,52.5,122.329473,265,132.5,17.67767,8,4.0,91.923882,211,105.5,94.045202,138,69.0,24.041631,155,77.5,3.535534,68,34.0,42.426407,93,46.5,21.92031,1055.820766,0.413697,10000,10,2023-05-01,2023-05-01,CUS_0x1000_2023_05_01,0,30dpd_0mob
1,CUS_0x108a,10000,10,36982.36,4882.12,9.0,0.132012,10.518196,0,0,1,0,0,0,0,0,0,0,1,1,1,0,1,1,1,7,9,15.0,15.0,15.0,15.0,0,38.0,0,0,1,0,0,77,77.0,0.0,80,80.0,0.0,44,44.0,0.0,-95,-95.0,0.0,161,161.0,0.0,126,126.0,0.0,50,50.0,0.0,-122,-122.0,0.0,105,105.0,0.0,112,112.0,0.0,170,170.0,0.0,232,232.0,0.0,268,268.0,0.0,-50,-50.0,0.0,12,12.0,0.0,123,123.0,0.0,137,137.0,0.0,105,105.0,0.0,31,31.0,0.0,416,416.0,0.0,140,70.0,9.899495,144,72.0,11.313708,275,137.5,132.228968,-16,-8.0,123.03658,344,172.0,15.556349,454,227.0,142.83557,96,48.0,2.828427,82,41.0,230.516811,309,154.5,70.003571,384,192.0,113.137085,288,144.0,36.769553,441,220.5,16.263456,398,199.0,97.580736,29,14.5,91.216775,150,75.0,89.095454,314,157.0,48.083261,195,97.5,55.861436,534,267.0,229.102597,88,44.0,18.384776,580,290.0,178.190909,1055.820766,0.342592,10000,10,2023-05-01,2023-05-01,CUS_0x108a_2023_05_01,0,30dpd_0mob
2,CUS_0x10f9,10000,10,150131.68,1138.36,0.0,0.007582,11.919268,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,31,11,0.0,0.0,0.0,0.0,0,54.0,0,0,0,1,0,269,269.0,0.0,109,109.0,0.0,92,92.0,0.0,64,64.0,0.0,158,158.0,0.0,-1,-1.0,0.0,206,206.0,0.0,-15,-15.0,0.0,207,207.0,0.0,49,49.0,0.0,64,64.0,0.0,28,28.0,0.0,106,106.0,0.0,-59,-59.0,0.0,31,31.0,0.0,4,4.0,0.0,20,20.0,0.0,166,166.0,0.0,221,221.0,0.0,147,147.0,0.0,455,227.5,58.689863,122,61.0,67.882251,312,156.0,90.509668,225,112.5,68.589358,158,79.0,111.722871,-20,-10.0,12.727922,237,118.5,123.743687,97,48.5,89.802561,402,201.0,8.485281,268,134.0,120.208153,261,130.5,94.045202,-59,-29.5,81.31728,373,186.5,113.844192,41,20.5,112.429978,112,56.0,35.355339,126,63.0,83.4386,222,111.0,128.693434,47,23.5,201.525433,349,174.5,65.760931,127,63.5,118.086832,1055.820766,0.084392,10000,10,2023-05-01,2023-05-01,CUS_0x10f9_2023_05_01,0,30dpd_0mob
3,CUS_0x1119,10000,10,56301.9,2161.23,2.0,0.038386,10.938484,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,6,18.0,18.0,18.0,18.0,0,36.0,0,0,1,0,0,-79,-79.0,0.0,30,30.0,0.0,91,91.0,0.0,122,122.0,0.0,195,195.0,0.0,284,284.0,0.0,29,29.0,0.0,-61,-61.0,0.0,260,260.0,0.0,94,94.0,0.0,55,55.0,0.0,-113,-113.0,0.0,42,42.0,0.0,-14,-14.0,0.0,71,71.0,0.0,4,4.0,0.0,92,92.0,0.0,118,118.0,0.0,94,94.0,0.0,80,80.0,0.0,186,93.0,243.244733,271,135.5,149.199531,370,185.0,132.936075,102,51.0,100.409163,272,136.0,83.4386,493,246.5,53.033009,136,68.0,55.154329,42,21.0,115.965512,436,218.0,59.39697,7,3.5,127.986327,216,108.0,74.953319,-313,-156.5,61.51829,79,39.5,3.535534,63,31.5,64.346717,238,119.0,67.882251,162,81.0,108.894444,263,131.5,55.861436,256,128.0,14.142136,30,15.0,111.722871,146,73.0,9.899495,1055.820766,0.225034,10000,10,2023-05-01,2023-05-01,CUS_0x1119_2023_05_01,0,30dpd_0mob
4,CUS_0x1192,10000,10,16319.375,1275.32,2.0,0.078148,9.700108,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,23,9,13.0,13.0,13.0,13.0,0,0.0,0,0,0,0,1,191,191.0,0.0,144,144.0,0.0,28,28.0,0.0,127,127.0,0.0,245,245.0,0.0,-37,-37.0,0.0,149,149.0,0.0,244,244.0,0.0,100,100.0,0.0,219,219.0,0.0,98,98.0,0.0,190,190.0,0.0,139,139.0,0.0,68,68.0,0.0,-37,-37.0,0.0,235,235.0,0.0,-25,-25.0,0.0,89,89.0,0.0,230,230.0,0.0,12,12.0,0.0,336,168.0,32.526912,247,123.5,28.991378,152,76.0,67.882251,299,149.5,31.819805,269,134.5,156.270599,-46,-23.0,19.79899,219,109.5,55.861436,172,86.0,223.445743,184,92.0,11.313708,299,149.5,98.287843,154,77.0,29.698485,216,108.0,115.965512,211,105.5,47.376154,140,70.0,2.828427,-31,-15.5,30.405592,514,257.0,31.112698,58,29.0,76.367532,-92,-46.0,190.918831,476,238.0,11.313708,21,10.5,2.12132,1055.820766,0.776369,10000,10,2023-05-01,2023-05-01,CUS_0x1192_2023_05_01,0,30dpd_0mob


In [32]:
# Correlation with target
print("\n--- Top 15 Features Correlated with Default ---")
correlations = []
for col_name in key_numeric:
    if col_name in df.columns:
        try:
            corr = df[['default_label', col_name]].corr().iloc[0, 1]
            if not np.isnan(corr):
                correlations.append((col_name, corr))
        except:
            pass

correlations.sort(key=lambda x: abs(x[1]), reverse=True)

print("\nFeature                              Correlation")
print("-" * 60)
for feat, corr in correlations[:15]:
    direction = "↑ Risk+" if corr > 0 else "↓ Risk-"
    print(f"{feat:35s} {corr:+.4f}  {direction}")


--- Top 15 Features Correlated with Default ---

Feature                              Correlation
------------------------------------------------------------


In [None]:
df[key_numeric].nunique().sort_values()

In [None]:
df[key_numeric].dtypes

In [None]:
# Visualization: Target distribution
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
df['default_label'].value_counts().plot(kind='bar', color=['green', 'red'], alpha=0.7, edgecolor='black')
plt.xlabel('Default Label')
plt.ylabel('Count')
plt.title('Target Variable Distribution')
plt.xticks([0, 1], ['Non-Default (0)', 'Default (1)'], rotation=0)
plt.grid(True, alpha=0.3)

plt.subplot(1, 3, 2)
if 'income_band' in df.columns:
    default_by_income = df.groupby('income_band')['default_label'].mean() * 100
    default_by_income.plot(kind='bar', color='coral', alpha=0.7, edgecolor='black')
    plt.xlabel('Income Band')
    plt.ylabel('Default Rate (%)')
    plt.title('Default Rate by Income Band')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

plt.subplot(1, 3, 3)
if 'age_band' in df.columns:
    default_by_age = df.groupby('age_band')['default_label'].mean() * 100
    default_by_age.plot(kind='bar', color='skyblue', alpha=0.7, edgecolor='black')
    plt.xlabel('Age Band')
    plt.ylabel('Default Rate (%)')
    plt.title('Default Rate by Age Band')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('gold_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved: gold_target_distribution.png")

In [None]:
# Visualization: Feature distributions by default status
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

plot_features = ['DTI', 'Annual_Income', 'Age', 'Num_of_Delayed_Payment_12m', 
                'Credit_History_Age_Year', 'EMI_to_income']
plot_features = [f for f in plot_features if f in df.columns]

for idx, feature in enumerate(plot_features[:6]):
    ax = axes[idx]
    
    non_default = df[df['default_label'] == 0][feature].dropna()
    default = df[df['default_label'] == 1][feature].dropna()
    
    # Handle outliers for better visualization
    if feature in ['DTI', 'EMI_to_income']:
        non_default = non_default.clip(upper=non_default.quantile(0.95))
        default = default.clip(upper=default.quantile(0.95))
    
    ax.hist(non_default, bins=50, alpha=0.5, label='Non-Default', 
           color='green', edgecolor='black', density=True)
    ax.hist(default, bins=50, alpha=0.5, label='Default', 
           color='red', edgecolor='black', density=True)
    
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.set_title(f'{feature} Distribution by Default Status')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('gold_feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved: gold_feature_distributions.png")


# DATA PREPARATION FOR MODELING

In [None]:
# Select features for modeling
print("\n--- Feature Selection ---")

# Strategy: Use numeric features with low missing values
# Filter out features with >50% missing values
valid_features = []
for feat in numeric_features:
    missing_pct = df[feat].isnull().sum() / len(df) * 100
    if missing_pct < 50:
        valid_features.append(feat)

print(f"Total numeric features: {len(numeric_features)}")
print(f"Valid features (< 50% missing): {len(valid_features)}")

In [None]:
# Create modeling dataset
print("\n--- Creating Modeling Dataset ---")

# Select features and target
X = df[valid_features].copy()
y = df['default_label'].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Handle missing values (simple imputation with median)
print("\n--- Handling Missing Values ---")
missing_before = X.isnull().sum().sum()
X = X.fillna(X.median())
missing_after = X.isnull().sum().sum()
print(f"Missing values before imputation: {missing_before:,}")
print(f"Missing values after imputation: {missing_after:,}")

In [None]:
# Handle infinite values
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

In [None]:
# Remove constant features (zero variance)
print("\n--- Removing Constant Features ---")
constant_features = [col for col in X.columns if X[col].nunique() <= 1]
if constant_features:
    print(f"Removing {len(constant_features)} constant features")
    X = X.drop(columns=constant_features)
    print(f"Features after removal: {len(X.columns)}")

In [None]:
# Feature importance using correlation (for feature selection)
print("\n--- Feature Selection by Correlation ---")
correlations_with_target = []
for col_name in X.columns:
    corr = np.corrcoef(X[col_name], y)[0, 1]
    if not np.isnan(corr):
        correlations_with_target.append((col_name, abs(corr)))

correlations_with_target.sort(key=lambda x: x[1], reverse=True)

# Select top 20 most correlated features
top_n_features = 20
selected_features = [feat for feat, corr in correlations_with_target[:top_n_features]]

print(f"\nSelecting top {top_n_features} features by correlation:")
for i, (feat, corr) in enumerate(correlations_with_target[:top_n_features], 1):
    print(f"  {i:2d}. {feat:40s} {corr:.4f}")

X_selected = X[selected_features].copy()

print(f"\n✓ Final feature matrix shape: {X_selected.shape}")

In [None]:
# Train-test split
print("\n--- Train-Test Split ---")
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, 
    test_size=0.3, 
    random_state=42, 
    stratify=y
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")
print(f"Training default rate: {y_train.mean()*100:.2f}%")
print(f"Test default rate: {y_test.mean()*100:.2f}%")


In [None]:
# Feature scaling
print("\n--- Feature Scaling (Standardization) ---")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features standardized (mean=0, std=1)")

# LOGISTIC REGRESSION MODEL

In [None]:
# Calculate class weights for imbalanced data
class_weight_ratio = (len(y_train) - y_train.sum()) / y_train.sum()
print(f"\nClass imbalance ratio: {class_weight_ratio:.2f}:1 (non-default:default)")

In [None]:
# Train logistic regression
print("\nTraining Logistic Regression with class weights...")

lr_model = LogisticRegression(
    class_weight='balanced',  # Handle class imbalance
    max_iter=1000,
    random_state=42,
    solver='lbfgs'
)

lr_model.fit(X_train_scaled, y_train)

print("✓ Model trained successfully")

In [None]:
# Feature coefficients
print("\n--- Feature Coefficients (Top 10) ---")
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'coefficient': lr_model.coef_[0]
})
feature_importance['abs_coefficient'] = feature_importance['coefficient'].abs()
feature_importance = feature_importance.sort_values('abs_coefficient', ascending=False)

print("\nFeature                              Coefficient   Impact")
print("-" * 70)
for idx, row in feature_importance.head(10).iterrows():
    impact = "↑ Increases Risk" if row['coefficient'] > 0 else "↓ Decreases Risk"
    print(f"{row['feature']:35s} {row['coefficient']:+8.4f}   {impact}")

# MODEL EVALUATION

In [None]:
# Predictions
y_train_pred = lr_model.predict(X_train_scaled)
y_test_pred = lr_model.predict(X_test_scaled)

y_train_proba = lr_model.predict_proba(X_train_scaled)[:, 1]
y_test_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Training metrics
print("\n--- Training Set Performance ---")
print(classification_report(y_train, y_train_pred, 
                          target_names=['Non-Default', 'Default']))

train_auc = roc_auc_score(y_train, y_train_proba)
print(f"ROC-AUC Score: {train_auc:.4f}")

# Test metrics
print("\n--- Test Set Performance ---")
print(classification_report(y_test, y_test_pred, 
                          target_names=['Non-Default', 'Default']))

test_auc = roc_auc_score(y_test, y_test_proba)
print(f"ROC-AUC Score: {test_auc:.4f}")

# Check for overfitting
auc_diff = train_auc - test_auc
if auc_diff > 0.05:
    print(f"\n⚠️  WARNING: Possible overfitting detected (AUC diff: {auc_diff:.4f})")
else:
    print(f"\n✓ Model generalizes well (AUC diff: {auc_diff:.4f})")

# Confusion matrix
print("\n--- Confusion Matrix (Test Set) ---")
cm = confusion_matrix(y_test, y_test_pred)
print("\n                 Predicted")
print("               Non-Default  Default")
print(f"Actual Non-Default  {cm[0,0]:6d}     {cm[0,1]:6d}")
print(f"       Default      {cm[1,0]:6d}     {cm[1,1]:6d}")

# VISUALIZATIONS

In [None]:
# Create comprehensive visualization
fig = plt.figure(figsize=(18, 12))

# 1. Confusion Matrix
ax1 = plt.subplot(2, 3, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Non-Default', 'Default'],
            yticklabels=['Non-Default', 'Default'])
plt.title('Confusion Matrix (Test Set)')
plt.ylabel('Actual')
plt.xlabel('Predicted')

# 2. ROC Curve
ax2 = plt.subplot(2, 3, 2)
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
plt.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {test_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, alpha=0.3)

# 3. Precision-Recall Curve
ax3 = plt.subplot(2, 3, 3)
precision, recall, _ = precision_recall_curve(y_test, y_test_proba)
avg_precision = average_precision_score(y_test, y_test_proba)
plt.plot(recall, precision, linewidth=2, label=f'PR Curve (AP = {avg_precision:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True, alpha=0.3)

# 4. Feature Coefficients
ax4 = plt.subplot(2, 3, 4)
top_features = feature_importance.head(15).sort_values('coefficient')
colors = ['red' if x > 0 else 'green' for x in top_features['coefficient']]
plt.barh(range(len(top_features)), top_features['coefficient'], color=colors, alpha=0.7, edgecolor='black')
plt.yticks(range(len(top_features)), top_features['feature'], fontsize=8)
plt.xlabel('Coefficient')
plt.title('Top 15 Feature Coefficients')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.grid(True, alpha=0.3)

# 5. Predicted Probability Distribution
ax5 = plt.subplot(2, 3, 5)
plt.hist(y_test_proba[y_test == 0], bins=50, alpha=0.5, label='Non-Default', 
         color='green', edgecolor='black', density=True)
plt.hist(y_test_proba[y_test == 1], bins=50, alpha=0.5, label='Default', 
         color='red', edgecolor='black', density=True)
plt.xlabel('Predicted Probability')
plt.ylabel('Density')
plt.title('Predicted Probability Distribution')
plt.legend()
plt.grid(True, alpha=0.3)

# 6. Calibration plot (optional)
ax6 = plt.subplot(2, 3, 6)
from sklearn.calibration import calibration_curve
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, y_test_proba, n_bins=10
)
plt.plot(mean_predicted_value, fraction_of_positives, marker='o', linewidth=2, label='Model')
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Perfect Calibration')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Fraction of Positives')
plt.title('Calibration Plot')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('logistic_regression_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✓ Saved: logistic_regression_evaluation.png")

# MODEL SUMMARY

In [None]:
print(f"""
📊 DATASET:
   • Total applications: {len(df):,}
   • Training samples: {len(X_train):,}
   • Test samples: {len(X_test):,}
   • Features used: {len(selected_features)}
   • Default rate: {df['default_label'].mean()*100:.2f}%

🎯 MODEL PERFORMANCE (Test Set):
   • ROC-AUC: {test_auc:.4f}
   • Precision (Default class): {classification_report(y_test, y_test_pred, output_dict=True)['1']['precision']:.4f}
   • Recall (Default class): {classification_report(y_test, y_test_pred, output_dict=True)['1']['recall']:.4f}
   • F1-Score (Default class): {classification_report(y_test, y_test_pred, output_dict=True)['1']['f1-score']:.4f}

🔝 TOP 5 RISK FACTORS (Positive Coefficients):
""")

for idx, row in feature_importance[feature_importance['coefficient'] > 0].head(5).iterrows():
    print(f"   {row['feature']:35s} +{row['coefficient']:.4f}")

print(f"""
🛡️  TOP 5 PROTECTIVE FACTORS (Negative Coefficients):
""")

for idx, row in feature_importance[feature_importance['coefficient'] < 0].head(5).iterrows():
    print(f"   {row['feature']:35s} {row['coefficient']:.4f}")

print("\n" + "="*80)
print("✅ EXPLORATION AND MODELING COMPLETED")
print("="*80)

# 💡 Next Steps:
   1. Try more advanced models (Random Forest, XGBoost, LightGBM)
   2. Perform hyperparameter tuning
   3. Implement cross-validation
   4. Feature engineering (polynomial features, interactions)
   5. Ensemble methods
   6. Deploy model to production

In [None]:
# Cleanup
spark.stop()
print("\n✓ Spark session stopped")