## Imports


In [1]:
import warnings
from typing import Optional

import numpy as np
import pandas as pd
import pingouin as pg
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import polars as pl
import polars.selectors as cs
from category_encoders import WOEEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from optuna import Study, Trial, create_study
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import RandomSampler
from optuna.trial._frozen import FrozenTrial
from pandas import DataFrame, Series
from plotly.subplots import make_subplots
from polars import DataFrame
from skimpy import skim
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble._forest import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    fbeta_score,
    make_scorer,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RepeatedStratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from polars.series.series import Series
import scipy.stats as stats
from numpy.typing import NDArray
from optuna.study.study import Study


warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"

#set trials globally to optimize run time
num_trials = 1

### Loan_functions.py

In [2]:
from loan_functions import (
    calculate_value_counts,
    clear,
    column_comparison,
    column_description,
    create_formatted_df,
    int_range,
    lower_column_names,
    lower_column_values,
    make_subplot,
    plot_histogram,
    null_count_comparison,
    objective_1,calculate_model_statistics, instantiate_model, 
    create_encoder_mapping, 
    encode_feature
)

We'll read in the csv datasets one by one to save on memory, clearing them each time. Let's start with the training set, followed by our other datasets.

### Data Exploration


Let's start by reading in the training set and looking at balance. I'll keep the EDA of this notebook focused on the training set to have a more focused analysis and discussion of modeling and model performance, but a more detailed exploration of the supporting datasets can be found in the [notebook] found in this repository.

In [3]:
application_train = create_formatted_df("application_train.csv")

print(
    f"Training set has {application_train.shape[0]} rows and {application_train.shape[1]} columns."
)

application_train.sample(5)

Training set has 307511 rows and 122 columns.


sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,name_type_suite,name_income_type,name_education_type,name_family_status,name_housing_type,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,own_car_age,flag_mobil,flag_emp_phone,flag_work_phone,flag_cont_mobile,flag_phone,flag_email,occupation_type,cnt_fam_members,region_rating_client,region_rating_client_w_city,weekday_appr_process_start,hour_appr_process_start,reg_region_not_live_region,reg_region_not_work_region,live_region_not_work_region,…,nonlivingarea_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,def_30_cnt_social_circle,obs_60_cnt_social_circle,def_60_cnt_social_circle,days_last_phone_change,flag_document_2,flag_document_3,flag_document_4,flag_document_5,flag_document_6,flag_document_7,flag_document_8,flag_document_9,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,flag_document_15,flag_document_16,flag_document_17,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
i64,i64,str,str,str,str,i64,f64,f64,f64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i64,str,f64,i64,i64,str,i64,i64,i64,i64,…,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
444280,1,"""cash loans""","""m""","""y""","""y""",2,157500.0,2013840.0,55507.5,1800000.0,"""unaccompanied""","""state servant""","""higher education""","""married""","""house / apartment""",0.01885,-15006,-1621,-1497.0,-3623,6.0,1,1,0,1,0,0,"""drivers""",4.0,2,2,"""monday""",11,0,0,0,…,0.0044,"""not specified""","""block of flats""",0.273,"""stone, brick""","""no""",0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,,,,,,
240731,0,"""cash loans""","""f""","""n""","""n""",1,119250.0,495000.0,22081.5,495000.0,"""unaccompanied""","""commercial associate""","""higher education""","""separated""","""house / apartment""",0.025164,-8644,-717,-1487.0,-762,,1,1,1,1,1,0,"""core staff""",2.0,2,2,"""monday""",11,0,0,0,…,,,,0.0724,,"""no""",0.0,0.0,0.0,0.0,-3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
397628,0,"""cash loans""","""f""","""n""","""n""",0,135000.0,225000.0,10953.0,225000.0,"""unaccompanied""","""working""","""secondary / secondary special""","""married""","""house / apartment""",0.022625,-13967,-4850,-7355.0,-4142,,1,1,1,1,1,0,"""high skill tech staff""",2.0,2,2,"""sunday""",11,0,0,0,…,0.0526,"""reg oper account""","""block of flats""",0.0517,"""block""","""no""",3.0,0.0,3.0,0.0,-953.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
192387,0,"""cash loans""","""f""","""n""","""y""",0,292500.0,1182051.0,39195.0,967500.0,"""unaccompanied""","""commercial associate""","""secondary / secondary special""","""single / not married""","""house / apartment""",0.018634,-16930,-4211,-9259.0,-487,,1,1,0,1,0,0,"""sales staff""",1.0,2,2,"""monday""",9,0,1,1,…,,,,,,,1.0,1.0,1.0,0.0,-1539.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
265449,0,"""revolving loans""","""f""","""n""","""y""",0,62100.0,135000.0,6750.0,135000.0,"""unaccompanied""","""pensioner""","""secondary / secondary special""","""married""","""house / apartment""",0.025164,-22444,365243,-285.0,-2302,,1,0,0,1,0,0,,2.0,2,2,"""monday""",8,0,0,0,…,,,,,,,4.0,1.0,4.0,1.0,-474.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0


Let's now take a look at balance of the target variable since that is our modeling variable of interest. 

In [4]:
plot_histogram(application_train, "target", title="Target Variable Distribution")

In [5]:
description = pl.read_csv("HomeCredit_columns_description.csv", encoding="latin1")
description = lower_column_names(description)
description = lower_column_values(description)

# Delete training spaces for some entries in the row column
description = description.with_columns(
    pl.col("row").map_elements(lambda x: x.split(" ")[0], return_dtype=pl.String)
)

null_df = null_count_comparison(application_train, description, "amt_annuity")
null_df

table,feature,null_count
str,str,i64
"""application_{train|test}.csv""","""amt_annuity""",12
"""bureau.csv""","""amt_annuity""",42
"""previous_application.csv""","""amt_annuity""",93


### Null values 

In [6]:
null_df = (
    application_train.null_count()
    .transpose(include_header=True)
    .rename(mapping={"column": "feature", "column_0": "null_count"})
    .sort(by="null_count", descending=True)
    .with_columns(
        pl.col("null_count")
        .map_elements(lambda x: x / len(application_train), return_dtype=pl.Float32)
        .alias("percentage")
    )
)

px.histogram(
    null_df,
    x="percentage",
    text_auto=True,
    title="Null value percentages across dataset",
).update_layout(bargap=0.2)

The graph above shows that we have a pretty wide spread of null value prevalence within this dataset. Thankfully none of these null values are in the target variable, but we will have to make a choice in our analysis of how to treat null values within this dataset since some classifiers are more tolerant of null values than others. LightGBM and XGBoost can handle null values when fitting, but commonly used sklearn classifiers do not.

#### Look for categorical anomalies 

## Correlations 


In [7]:
corr: DataFrame = (
    application_train.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().corr()
)
corr = pd.DataFrame(corr["target"])

px.histogram(
    corr, "target", title="Correlations with Target Variable"
).update_layout(bargap=0.2)

The graph above shows the distribution of correlations as a graphical distribution rather than in tabular form because we have so many variables that we're working with. None of our variables are correlated with target, save for target itself.

## Hypothesis testing

### Is there difference in proportions of clients that default when broken down by their region of residence? 
$H_0$: $p_1$ = $p_2$ = $p_3$<br>
$H_1$: $p_1$ $\neq$ $p_2$ $\neq$ $p_3$


In [8]:
region_default: DataFrame = pd.crosstab(
    application_train["region_rating_client"].to_pandas(),
    application_train["target"].to_pandas(),
    rownames=["region_rating_client"],
    colnames=["target"],
)
region_default["default_proportion"] = region_default.iloc[:, 1] / region_default.sum(
    axis=1
)
region_default["total"] = region_default.iloc[:, :-1].sum(axis=1)

# pooled sample proportion
p1_default_prop:float = region_default.default_proportion.iloc[0]
p2_default_prop:float = region_default.default_proportion.iloc[1]
p3_default_prop:float = region_default.default_proportion.iloc[2]

p1_population:float = region_default.total.iloc[0]
p2_population:float = region_default.total.iloc[1]
p3_population:float = region_default.total.iloc[2]

p:float = (
    p1_default_prop * p1_population
    + p2_default_prop * p2_population
    + p3_default_prop * p3_population
) / (p1_population + p2_population + p3_population)

# standard error
se:float = np.sqrt(
    (p * (1 - p)) * ((1 / p1_population) + (1 / p2_population) + (1 / p2_population))
)

# test statistic
z:float = (p1_default_prop - p2_default_prop - p3_default_prop) / se

if np.abs(z) < 1.64485:
    print(
        f"Fail to reject the null hypothesis. We can assume the default percentage to be the same across {'region_rating_client'}."
    )
else:
    print(
        f"z = {z:.3f}. Reject null hypothesis. The proportion of credit defaults across values of {'region_rating_client'} is not equal."
    )

z = -82.387. Reject null hypothesis. The proportion of credit defaults across values of region_rating_client is not equal.


Because both gender and region are not equal across their values when compared to our target variable, we they are more likely to have a significant relationship with default risk. We should include these variables in our predictor pool. This raises the question of what variables we should use to predict our models, given that we feasibly cannot include all of them. Let's move on to feature selection so we can restrict our feature set to one that is both significant and predictive of the target variable without being so onerous to run calculations on. 


### Are younger homeowners are more likely to default on credit payments? 

$H_0$: $\mu_d$ =  $\mu_n$<br>
$H_1$:  $\mu_d$ $\neq$ $\mu_n$

s.t. <br>
$\mu_d$: the average age of clients who default<br>
$\mu_n$: the average age of clients who do not default

In [9]:
age_df: DataFrame = application_train[["target", "days_birth"]]
age_df = age_df.with_columns((pl.col("days_birth") // -365)).rename(
    {"days_birth": "age"}
)

age_default: Series = age_df.filter(pl.col("target") == 1)["age"]
age_no_default: Series = age_df.filter(pl.col("target") == 0)["age"]

t_stat, p_value = stats.ttest_ind(age_no_default, age_default)

print("Two-sample t-test results:\n")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}\n")

if p_value < 0.05:
    print(
        "Reject the null hypothesis: There is a significant difference in the average ages \nof clients who default vs. those who don't.\n"
    )
else:
    print(
        "Fail to reject the null hypothesis: There is no significant difference in the average ages of clients who \ndefault vs. those who don't.\n"
    )

Two-sample t-test results:

t-statistic: 43.517
p-value: 0.000

Reject the null hypothesis: There is a significant difference in the average ages 
of clients who default vs. those who don't.



## Preparing data for modeling


Before we start creating models, it's important to lay out what our north star assessment criteria are for model assessment, and secondarily how we will choose what a high-performing model is. 

Given that our business problem is designing predictive classifiers of credit clients that are likely to default on payments, and that our training data is highly imbalanced such that our positive class is the minority class at a ratio of 10:1, we will need to be very precise in our metric selection. 

Therefore each model that we create will be judged on its ROC-AUC score, F1 score, Matthews Correlation Coefficient, Balanced Accuracy, Precision, and Recall. This sounds like a long list of metrics, so our primary determinants will be the ROC-AUC and MCC since ROC-AUC can be less powerful for highly imbalanced datasets, and MCC gives a pretty comprehensive view of a model's performance across all 4 quadrants of the confusion matrix.

## Modeling

Now that we've defined our metrics set and evaluation method, our workflow for the next section is as follows: 
1. Create pipelines to impute missing values, scale data, and fit to our classifier of choice
2. Calculate model statistics for each classifier



In [10]:
application_train: DataFrame = create_formatted_df("application_train.csv")

x: DataFrame = application_train.drop(["sk_id_curr", "target"]).to_pandas()
y: DataFrame = pl.DataFrame(application_train["target"]).to_pandas()


x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)

y_train: NDArray = np.array(y_train).ravel()

numerical_columns: list[str] = [
    *x.select_dtypes(exclude=["object", "category"]).columns
]

categorical_columns: list[str] = [
    *x.select_dtypes(include=["object", "category"]).columns
]

In [22]:
classifiers: list[str] = ["logistic_regression", "extra_trees", "random_forest"]

model_performance = pd.DataFrame()
model_specifications = dict()

for classifier in classifiers:
    study = create_study(
        direction="maximize",
        pruner=SuccessiveHalvingPruner(reduction_factor=2),
        sampler=RandomSampler(seed=42),
    )
    study.optimize(
        lambda trial: objective_1(
            classifier, trial, x_train, np.array(y_train).ravel()
        ),
        n_trials=num_trials,
    )

    model_specifications[classifier] = study.best_params

    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions: NDArray = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
model_performance


[I 2024-12-30 17:16:04,408] A new study created in memory with name: no-name-917b7550-eec0-4915-9729-ffd90447609e


These models are performing really poorly so far, with both the Matthews Correlation Coefficient and ROC-AUC pointing to the modeling being no better than random guessing then it comes to predicting on defaults. Let's see how well the lightgbm and xgboost classifiers perform. 

In [14]:
classifiers: list[str] = ["lightgbm",
                          "xgboost"]

for classifier in classifiers:
    #study = create_study(study_name="optimization", direction="maximize")

    study = create_study(
        direction="maximize",
        pruner=SuccessiveHalvingPruner(reduction_factor=2),
        sampler=RandomSampler(seed=42),
)

    study.optimize(
        lambda trial: objective_1(classifier, trial, x_train, np.array(y_train).ravel()),
        n_trials=num_trials#4,
    )  # n_trials=100 is the original value

    model_specifications[classifier]= study.best_params


    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions: NDArray = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-30 17:18:48,903] A new study created in memory with name: no-name-bc7ecfc9-b777-4ce3-a23a-c5cfc31bc67e
[I 2024-12-30 17:18:59,297] Trial 0 finished with value: 0.682842118157641 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 21, 'max_depth': 10, 'n_estimators': 85}. Best is trial 0 with value: 0.682842118157641.
[I 2024-12-30 17:19:06,355] A new study created in memory with name: no-name-1a72ac57-331e-457c-a45e-34b63caa7088
[I 2024-12-30 17:19:20,310] Trial 0 finished with value: 0.5 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'objective': 'binary:logistic', 'booster': 'dart', 'max_leaves': 1, 

Unnamed: 0,lightgbm,xgboost
roc_auc,0.685468,0.5
matthews_correlation,0.213548,0.0
f_beta,0.515052,0.0
precision,0.162056,0.0
recall,0.679511,0.0
balanced_accuracy,0.685468,0.5


We're still not getting anywhere with these either, which means that our data could use a little more polishing before we feed them to models. One way to strengthen the models is with SMOTE oversampling, so we'll start there. 

## SMOTE Oversampling 
We'll use the combination of undersampling and oversampling originally put forth in the SMOTE paper to see how much that improves our model performance. We'll start by numerically encoding the categorical variables across application train before applying PCA and the conducting our under- and oversampling. 

In [31]:
numerical_train: DataFrame = application_train.clone()

# numerically encode features
encoder_mapping_key = dict()
for col in numerical_train.columns:
    try:
        key: dict[str, int] = create_encoder_mapping(numerical_train, col)
        numerical_train = encode_feature(numerical_train, col, key)
        encoder_mapping_key[col] = key
    except:
        pass


# fill missing values
for col in numerical_train.columns:
    median: float = numerical_train[col].median()
    numerical_train = numerical_train.with_columns(pl.col(col).fill_null(value=median))


x: DataFrame = numerical_train.drop("target").to_pandas()
y: Series = numerical_train["target"].to_pandas()

In [33]:
oversampling = SMOTE(sampling_strategy=0.1)
undersampling = RandomUnderSampler(sampling_strategy=0.5)

steps: list = [("oversample", oversampling), ("undersample", undersampling)]
pipeline = ImbPipeline(steps=steps)
smote_x, smote_y = pipeline.fit_resample(x, y)

pca = PCA(n_components=smote_x.shape[1])
smote_x: NDArray = pca.fit_transform(smote_x)

x_train, x_test, y_train, y_test = train_test_split(
    smote_x, smote_y, stratify=smote_y, random_state=15
)

Now that we've created our smote dataset, we can fit the models. However our previous model specifications aren't as helpful since the dataset is new, and models may perform better on different hyperparameters. Let's create another study and run it again! 

In [34]:
study: Study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective_1("lightgbm", trial, x, y), n_trials=num_trials)


model_specifications["SMOTE_lgbm"] = study.best_params

smote_lgbm = LGBMClassifier(**study.best_params,device='gpu')
smote_lgbm.fit(x_train, y_train)
predictions: NDArray = smote_lgbm.predict(x_test)
model_performance["SMOTE_lgbm"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-30 17:12:28,202] A new study created in memory with name: no-name-52450ac0-782f-4efb-9ca4-d6ceb7734339
[I 2024-12-30 17:12:38,164] Trial 0 finished with value: 0.6847532748880383 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 21, 'max_depth': 10, 'n_estimators': 85}. Best is trial 0 with value: 0.6847532748880383.


Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,SMOTE_lgbm
roc_auc,0.555503,0.5,0.599777,0.685943,0.5,0.629475
matthews_correlation,0.063039,0.0,0.199945,0.214525,0.0,0.316966
f_beta,0.342875,0.0,0.263943,0.514958,0.0,0.370683
precision,0.10368,0.0,0.264968,0.162933,0.0,0.651042
recall,0.461063,0.0,0.263829,0.677632,0.0,0.353757
balanced_accuracy,0.555503,0.5,0.599777,0.685943,0.5,0.629475


In [35]:
study: Study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective_1("xgboost", trial, x, y), n_trials=num_trials)
smote_xgb: XGBClassifier = XGBClassifier(**study.best_params, tree_method='gpu_hist',predictor='gpu_predictor', device='gpu')
#smote_xgb: XGBClassifier = XGBClassifier(**study.best_params)
smote_xgb.fit(x_train, y_train)
predictions = smote_xgb.predict(x_test)
model_performance["SMOTE_xgb"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-30 17:13:10,822] A new study created in memory with name: no-name-8c0c967d-aa3b-4729-89df-dfd9c8696380
[I 2024-12-30 17:13:19,595] Trial 0 finished with value: 0.5 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'objective': 'binary:logistic', 'booster': 'dart', 'max_leaves': 1, 'max_depth': 7, 'grow_policy': 'depthwise', 'n_estimators': 81, 'learning_rate': 0.14809892204552141}. Best is trial 0 with value: 0.5.


Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,SMOTE_lgbm,SMOTE_xgb
roc_auc,0.555503,0.5,0.599777,0.685943,0.5,0.629475,0.5
matthews_correlation,0.063039,0.0,0.199945,0.214525,0.0,0.316966,0.0
f_beta,0.342875,0.0,0.263943,0.514958,0.0,0.370683,0.0
precision,0.10368,0.0,0.264968,0.162933,0.0,0.651042,0.0
recall,0.461063,0.0,0.263829,0.677632,0.0,0.353757,0.0
balanced_accuracy,0.555503,0.5,0.599777,0.685943,0.5,0.629475,0.5
