## Load Packages

In [9]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
import numpy as np
import scipy.stats as stats

from dotenv import load_dotenv
import os

from pyod.models.auto_encoder import AutoEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# Load environment variables from .env file
load_dotenv()

# Resolve the dataset directory from DATA_DIR. Fail fast with an actionable
# message when the variable is missing; otherwise `None + str` would raise an
# opaque TypeError on the concatenation.
_data_root = os.getenv("DATA_DIR")
if not _data_root:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; "
        "define it in the environment or in a .env file."
    )

# os.path.join works whether or not DATA_DIR ends with a separator. The final
# "" component keeps a trailing separator on the result, because later cells
# build file paths with `data_dir + "<filename>"`.
data_dir = os.path.join(_data_root, "mortgage_prob_default", "")

## Load Data

In [11]:
# Read the loan dataset from the configured data directory, report its
# dimensions, and preview the first rows.
# NOTE(review): the rendered outputs show 'Unnamed: 0.1' / 'Unnamed: 0' columns
# with 16000 unique values each — presumably leftover index columns from earlier
# CSV exports; confirm and consider excluding them before modeling.
loan_csv_path = data_dir + "XYZloan_default_llm.csv"
df = pd.read_csv(loan_csv_path)
print(f"Shape of data: {df.shape}")
df.head()

Shape of data: (16000, 32)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,AP001,AP002,AP003,AP006,AP007,AP008,CR004,CR009,...,TD005,TD006,TD009,TD010,TD013,TD014,TD022,TD024,loan_default,reason
0,4,76031,33,1,3,h5,4,3,4,63100,...,4,1,4,1,4,1,10.0,0.0,1,I’d really appreciate if we could move faster ...
1,5,23312,34,1,3,h5,5,5,3,53370,...,3,1,6,2,7,2,15.0,10.0,1,We’re trying to align the closing date with a ...
2,9,66033,36,2,1,ios,2,2,3,5400,...,4,2,4,2,5,2,25.0,0.0,1,It would really help to close this week so I c...
3,10,41847,28,1,1,ios,5,5,3,2000,...,4,4,7,4,7,4,25.0,6.0,1,There are some logistics around my move that m...
4,13,28275,35,2,4,h5,3,3,4,27704,...,4,1,4,1,7,1,25.0,0.0,1,I’d like to close by Friday if possible—the se...


In [23]:
# Read the variable dictionary workbook: skip the first three rows of the
# sheet and keep only the variable code ('Var') and its 'description'.
glossary = pd.read_excel(
    data_dir + "Variables_Dictionary_2019.xlsx",
    skiprows=3,
    usecols=['Var', 'description'],
)
glossary.head()

Unnamed: 0,Var,description
0,AP001,YR_AGE
1,AP002,CODE_GENDER
2,AP003,CODE_EDUCATION
3,AP004,LOAN_TERM
4,AP005,DATE_APPLIED


In [24]:
# Lookup table: variable code -> human-readable description from the glossary.
var_to_desc = glossary.set_index('Var')['description'].to_dict()

# Rename the columns that have a glossary entry; columns without one
# (e.g. TD022, TD024, loan_default, reason in the preview) keep their name.
df = df.rename(columns=var_to_desc)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,YR_AGE,CODE_GENDER,CODE_EDUCATION,OS_TYPE,LEVEL_APPL_CITY,FLAG_IP_CITY_NOT_APPL_CITY,CNT_QUERY_TIME_LAST_1MON,AMT_LOAN_TOTAL,...,TD_CNT_QUERY_LAST_1MON_P2P,TD_CNT_QUERY_LAST_1MON_SMALL_LOAN,TD_CNT_QUERY_LAST_3MON_P2P,TD_CNT_QUERY_LAST_3MON_SMALL_LOAN,TD_CNT_QUERY_LAST_6MON_P2P,TD_CNT_QUERY_LAST_6MON_SMALL_LOAN,TD022,TD024,loan_default,reason
0,4,76031,33,1,3,h5,4,3,4,63100,...,4,1,4,1,4,1,10.0,0.0,1,I’d really appreciate if we could move faster ...
1,5,23312,34,1,3,h5,5,5,3,53370,...,3,1,6,2,7,2,15.0,10.0,1,We’re trying to align the closing date with a ...
2,9,66033,36,2,1,ios,2,2,3,5400,...,4,2,4,2,5,2,25.0,0.0,1,It would really help to close this week so I c...
3,10,41847,28,1,1,ios,5,5,3,2000,...,4,4,7,4,7,4,25.0,6.0,1,There are some logistics around my move that m...
4,13,28275,35,2,4,h5,3,3,4,27704,...,4,1,4,1,7,1,25.0,0.0,1,I’d like to close by Friday if possible—the se...


### Check Data Structure

Here we review the data types, null counts, and unique counts per column to better understand what we are working with.

In [26]:
# Per-column overview: dtype, number of nulls, number of distinct values, and a
# small reproducible sample of the non-null values.

def _preview_values(col, max_items=10, seed=42):
    """Return up to `max_items` randomly sampled non-null values of `col`, joined by '|'."""
    non_null = col.dropna()
    picked = non_null.sample(n=min(max_items, non_null.shape[0]), random_state=seed)
    return "|".join(picked.astype(str))


sample_values = df.apply(_preview_values, axis=0).rename("sample_values")

# Every piece is indexed by column name, so a column-wise concat lines them up.
summary_parts = [
    df.dtypes.rename("dtype"),
    df.isnull().sum().rename("null_count"),
    df.nunique().rename("unique_count"),
    sample_values,
]
summary_df = pd.concat(summary_parts, axis=1)
summary_df

Unnamed: 0,dtype,null_count,unique_count,sample_values
Unnamed: 0.1,int64,0,16000,7002|1952|3714|1588|6369|8025|5405|8523|5928|6490
Unnamed: 0,int64,0,16000,32353|42230|64717|59473|63539|48711|53192|7927...
YR_AGE,int64,0,37,37|28|28|27|29|49|36|24|25|46
CODE_GENDER,int64,0,2,2|1|1|1|2|1|2|1|1|1
CODE_EDUCATION,int64,0,4,1|1|3|3|3|1|4|1|3|1
OS_TYPE,object,0,4,android|h5|h5|h5|h5|android|android|ios|h5|h5
LEVEL_APPL_CITY,int64,0,5,4|4|4|5|3|3|4|3|4|2
FLAG_IP_CITY_NOT_APPL_CITY,int64,0,5,4|1|4|5|5|3|4|2|4|2
CNT_QUERY_TIME_LAST_1MON,int64,0,4,4|3|1|4|4|2|4|2|1|2
AMT_LOAN_TOTAL,int64,0,6989,141511|0|81400|120374|11488|0|0|210510|12000|0


- Several columns in the dataframe use `int64` as their dtype but should be treated as **categorical variables** due to low cardinality or meaningful grouping.

- Examples of categorical variables stored as integers:
  - `CODE_GENDER` (`int64`, unique_count = 2): values such as 1 and 2 likely represent gender categories.
  - `CODE_EDUCATION` (`int64`, unique_count = 4): encoded education levels (e.g. 1 = high school, 4 = postgraduate).
  - `LEVEL_APPL_CITY` (`int64`, unique_count = 5): application-level classification for cities.
  - `FLAG_IP_CITY_NOT_APPL_CITY` (`int64`, unique_count = 5): named like a flag, but with 5 distinct values it is categorical rather than binary.
  - `CNT_QUERY_TIME_LAST_1MON` (`int64`, unique_count = 4): counts grouped into small fixed buckets.
  - `MONTH_CREDIT_CARD_MOB_MAX` (`int64`, unique_count = 5): likely the maximum months-on-book (MOB) across the applicant's credit cards, recorded as a few discrete levels — confirm against the glossary.
  - `SCORE_DEBIT_CARD_TOTAL_AMT`, `SCORE_DEBIT_CARD_UTILITY_AMT`, and `SCORE_SINGLE_DEBIT_CARD_LIMIT`: all use low-range integer scores and may reflect scoring tiers.

- Even some float64 columns such as `TD022` (unique_count = 5) suggest discretized bins that behave categorically.

- Recommendation: During preprocessing, consider converting such columns to `category` dtype or encoding them accordingly to support downstream modeling and visualization tasks.


In [29]:
# Frequency of each distinct `reason` text, most frequent first (top 20).
(
    df.groupby('reason')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(20)
)

Unnamed: 0,reason,count
286,Our family're looking to buy our family's firs...,556
381,We're looking to buy my first home and are rea...,556
383,We're looking to buy our first home and are re...,554
287,Our family're looking to buy our first home an...,546
289,Our family’ve outgrown our current apartment a...,535
288,Our family’ve outgrown my current apartment an...,533
382,We're looking to buy our family's first home a...,533
285,Our family're looking to buy my first home and...,521
290,Our family’ve outgrown our family's current ap...,505
380,We have outgrown our family's current apartmen...,205


In [None]:
# Columns to treat as categorical. The list is assembled from small groups
# (split by name prefix, for readability only); element order is preserved.
categorical_cols = (
    # CODE_* / OS / city-related columns
    [
        "CODE_GENDER",
        "CODE_EDUCATION",
        "OS_TYPE",
        "LEVEL_APPL_CITY",
        "FLAG_IP_CITY_NOT_APPL_CITY",
    ]
    # query-count, credit-card and debit-card score columns
    + [
        "CNT_QUERY_TIME_LAST_1MON",
        "MONTH_CREDIT_CARD_MOB_MAX",
        "SCORE_DEBIT_CARD_TOTAL_AMT",
        "SCORE_DEBIT_CARD_UTILITY_AMT",
        "SCORE_SINGLE_DEBIT_CARD_LIMIT",
    ]
    # column still under its short code
    + ["MB007"]
    # TD_CNT_QUERY_* columns
    + [
        "TD_CNT_QUERY_LAST_7Day_P2P",
        "TD_CNT_QUERY_LAST_1MON_P2P",
        "TD_CNT_QUERY_LAST_1MON_SMALL_LOAN",
        "TD_CNT_QUERY_LAST_3MON_P2P",
        "TD_CNT_QUERY_LAST_3MON_SMALL_LOAN",
        "TD_CNT_QUERY_LAST_6MON_P2P",
        "TD_CNT_QUERY_LAST_6MON_SMALL_LOAN",
    ]
    # remaining short-code TD columns and the free-text `reason` column
    + ["TD022", "TD024", "reason"]
)

# Target column name, kept as a one-element list.
target_col = ["loan_default"]