## Imports


In [84]:
import warnings
from typing import Optional
import numpy as np
import pandas as pd
import pingouin as pg
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import polars as pl
import polars.selectors as cs
from category_encoders import WOEEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from optuna import Study, Trial, create_study
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import RandomSampler
from optuna.trial._frozen import FrozenTrial
from pandas import DataFrame, Series
from plotly.subplots import make_subplots
from polars import DataFrame
from skimpy import skim
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble._forest import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    fbeta_score,
    make_scorer,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RepeatedStratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from polars.series.series import Series
import scipy.stats as stats
from numpy.typing import NDArray


warnings.filterwarnings("ignore")
pio.templates.default = "plotly_dark"

#set trials globally to optimize run time
num_trials = 15



### Loan_functions.py

In [85]:
from loan_functions import (
    calculate_value_counts,
    clear,
    column_comparison,
    column_description,
    create_formatted_df,
    int_range,
    lower_column_names,
    lower_column_values,
    make_subplot,
    plot_histogram,
    null_count_comparison,
    objective_1,calculate_model_statistics, instantiate_model,
    create_encoder_mapping,
    encode_feature
)

We'll read in the csv datasets one by one to save on memory, clearing them each time. Let's start with the training set, followed by our other datasets.

### Data Exploration


Let's start by reading in the training set and looking at balance. I'll keep the EDA of this notebook focused on the training set to have a more focused analysis and discussion of modeling and model performance, but a more detailed exploration of the supporting datasets can be found in the [notebook] found in this repository.

In [86]:
application_train = create_formatted_df("application_train.csv")

print(
    f"Training set has {application_train.shape[0]} rows and {application_train.shape[1]} columns."
)

application_train.sample(5)

Training set has 38848 rows and 122 columns.


sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,name_type_suite,name_income_type,name_education_type,name_family_status,name_housing_type,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,own_car_age,flag_mobil,flag_emp_phone,flag_work_phone,flag_cont_mobile,flag_phone,flag_email,occupation_type,cnt_fam_members,region_rating_client,region_rating_client_w_city,weekday_appr_process_start,hour_appr_process_start,reg_region_not_live_region,reg_region_not_work_region,live_region_not_work_region,…,nonlivingarea_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,def_30_cnt_social_circle,obs_60_cnt_social_circle,def_60_cnt_social_circle,days_last_phone_change,flag_document_2,flag_document_3,flag_document_4,flag_document_5,flag_document_6,flag_document_7,flag_document_8,flag_document_9,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,flag_document_15,flag_document_16,flag_document_17,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
i64,i64,str,str,str,str,i64,f64,f64,f64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i64,str,f64,i64,i64,str,i64,i64,i64,i64,…,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
128244,0,"""cash loans""","""f""","""n""","""y""",0,67500.0,544491.0,16047.0,454500.0,"""unaccompanied""","""pensioner""","""secondary / secondary special""","""separated""","""house / apartment""",0.030755,-22898,365243,-2028.0,-4668,,1,0,0,1,0,0,,1.0,2,2,"""tuesday""",11,0,0,0,…,,,,,,,5.0,2.0,5.0,1.0,-1616.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
137835,0,"""cash loans""","""f""","""n""","""y""",0,135000.0,562491.0,23962.5,454500.0,"""unaccompanied""","""pensioner""","""secondary / secondary special""","""widow""","""municipal apartment""",0.005084,-22477,365243,-383.0,-4996,,1,0,0,1,0,0,,1.0,2,2,"""tuesday""",11,0,0,0,…,,"""reg oper account""","""block of flats""",0.1877,"""panel""","""no""",5.0,0.0,5.0,0.0,-1400.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0
106922,0,"""cash loans""","""m""","""n""","""n""",0,81000.0,130320.0,9873.0,112500.0,"""unaccompanied""","""commercial associate""","""secondary / secondary special""","""civil marriage""","""house / apartment""",0.011703,-18516,-2395,-10871.0,-2044,,1,1,0,1,0,0,"""low-skill laborers""",2.0,2,2,"""thursday""",13,0,0,0,…,0.0,"""not specified""","""block of flats""",0.0441,"""stone, brick""","""no""",0.0,0.0,0.0,0.0,-605.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0
116454,0,"""cash loans""","""f""","""n""","""n""",0,189000.0,900000.0,45954.0,900000.0,"""unaccompanied""","""working""","""secondary / secondary special""","""married""","""house / apartment""",0.028663,-12398,-324,-2495.0,-3762,,1,1,0,1,1,0,"""accountants""",2.0,2,2,"""thursday""",19,0,0,0,…,0.0039,"""not specified""","""block of flats""",0.0088,"""stone, brick""","""no""",0.0,0.0,0.0,0.0,-1751.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
106307,0,"""cash loans""","""m""","""y""","""y""",2,202500.0,358344.0,23031.0,283500.0,"""unaccompanied""","""working""","""secondary / secondary special""","""married""","""house / apartment""",0.031329,-16152,-1689,-748.0,-4533,12.0,1,1,0,1,0,0,"""laborers""",4.0,2,2,"""tuesday""",13,0,0,0,…,,,,,,,0.0,0.0,0.0,0.0,-1892.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0


Let's now take a look at balance of the target variable since that is our modeling variable of interest.

In [87]:
plot_histogram(application_train, "target", title="Target Variable Distribution")

We can see here that there is a pretty high level of imbalance in the target variable, so we're going to have to be selective with our choices in metrics and our assessment of model performance.

With this in mind, let's see if we can use some of the other csv files in our dataset to plug the holes left by missing values in the training dataset.

In [88]:
description = pl.read_csv("HomeCredit_columns_description.csv", encoding="latin1")
description = lower_column_names(description)
description = lower_column_values(description)

description = description.with_columns(
    pl.col("row").map_elements(lambda x: x.split(" ")[0], return_dtype=pl.String)
)

null_df = null_count_comparison(application_train, description, "amt_annuity")
null_df

table,feature,null_count
str,str,i64
"""application_{train|test}.csv""","""amt_annuity""",0
"""bureau.csv""","""amt_annuity""",0
"""previous_application.csv""","""amt_annuity""",0


When we join the application dataset with the others, we cna see that there are no gains to be had by imputing application_train with matching data from the other tables. Let's proceed with our analysis and make due with the data available to us here.

### Null values

In [89]:
null_df = (
    application_train.null_count()
    .transpose(include_header=True)
    .rename(mapping={"column": "feature", "column_0": "null_count"})
    .sort(by="null_count", descending=True)
    .with_columns(
        pl.col("null_count")
        .map_elements(lambda x: x / len(application_train), return_dtype=pl.Float32)
        .alias("percentage")
    )
)

px.histogram(
    null_df,
    x="percentage",
    text_auto=True,
    title="Null value percentages across dataset",
).update_layout(bargap=0.2)

The graph above shows that we have a pretty wide spread of null value prevalence within this dataset. Thankfully none of these null values are in the target variable, but we will have to make a choice in our analysis of how to treat null values within this dataset since some classifiers are more tolerant of null values than others. LightGBM and XGBoost can handle null values when fitting, but commonly used sklearn classifiers do not.

## Correlations


In [90]:
corr: DataFrame = (
    application_train.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().corr()
)
corr = pd.DataFrame(corr["target"])

px.histogram(
    corr, "target", title="Correlations with Target Variable"
).update_layout(bargap=0.2)

The graph above shows the distribution of correlations as a graphical distribution rather than in tabular form because we have so many variables that we're working with. None of our variables are correlated with target, save for target itself.

## Hypothesis testing

### Is there difference in proportions of clients that default when broken down by their region of residence?
$H_0$: $p_1$ = $p_2$ = $p_3$<br>
$H_1$: $p_1$ $\neq$ $p_2$ $\neq$ $p_3$


In [91]:
region_default: DataFrame = pd.crosstab(
    application_train["region_rating_client"].to_pandas(),
    application_train["target"].to_pandas(),
    rownames=["region_rating_client"],
    colnames=["target"],
)
region_default["default_proportion"] = region_default.iloc[:, 1] / region_default.sum(
    axis=1
)
region_default["total"] = region_default.iloc[:, :-1].sum(axis=1)


# pooled sample proportion
p1_default_prop:float = region_default.default_proportion.iloc[0]
p2_default_prop:float = region_default.default_proportion.iloc[1]
p3_default_prop:float = region_default.default_proportion.iloc[2]

p1_population:float = region_default.total.iloc[0]
p2_population:float = region_default.total.iloc[1]
p3_population:float = region_default.total.iloc[2]

p:float = (
    p1_default_prop * p1_population
    + p2_default_prop * p2_population
    + p3_default_prop * p3_population
) / (p1_population + p2_population + p3_population)


# standard error
se:float = np.sqrt(
    (p * (1 - p)) * ((1 / p1_population) + (1 / p2_population) + (1 / p2_population))
)

# test statistic
z:float = (p1_default_prop - p2_default_prop - p3_default_prop) / se

if np.abs(z) < 1.64485:
    print(
        f"Fail to reject the null hypothesis. We can assume the default percentage to be the same across {'region_rating_client'}."
    )
else:
    print(
        f"z = {z:.3f}. Reject null hypothesis. The proportion of credit defaults across values of {'region_rating_client'} is not equal."
    )

z = -31.020. Reject null hypothesis. The proportion of credit defaults across values of region_rating_client is not equal.


Because both gender and region are not equal across their values when compared to our target variable, we they are more likely to have a significant relationship with default risk. We should include these variables in our predictor pool. This raises the question of what variables we should use to predict our models, given that we feasibly cannot include all of them. Let's move on to feature selection so we can restrict our feature set to one that is both significant and predictive of the target variable without being so onerous to run calculations on.


### Are younger homeowners are more likely to default on credit payments?

$H_0$: $\mu_d$ =  $\mu_n$<br>
$H_1$:  $\mu_d$ $\neq$ $\mu_n$

s.t. <br>
$\mu_d$: the average age of clients who default<br>
$\mu_n$: the average age of clients who do not default

In [92]:
age_df: DataFrame = application_train[["target", "days_birth"]]
age_df = age_df.with_columns((pl.col("days_birth") // -365)).rename(
    {"days_birth": "age"}
)

age_default: Series = age_df.filter(pl.col("target") == 1)["age"]
age_no_default: Series = age_df.filter(pl.col("target") == 0)["age"]

t_stat, p_value = stats.ttest_ind(age_no_default, age_default)

print("Two-sample t-test results:\n")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}\n")

if p_value < 0.05:
    print(
        "Reject the null hypothesis: There is a significant difference in the average ages \nof clients who default vs. those who don't.\n"
    )
else:
    print(
        "Fail to reject the null hypothesis: There is no significant difference in the average ages of clients who \ndefault vs. those who don't.\n"
    )

Two-sample t-test results:

t-statistic: 14.964
p-value: 0.000

Reject the null hypothesis: There is a significant difference in the average ages 
of clients who default vs. those who don't.



## Preparing data for modeling


Before we start creating models, it's important to lay out what our north star assessment criteria are for model assessment, and secondarily how we will choose what a high-performing model is.

Given that our business problem is designing predictive classifiers of credit clients that are likely to default on payments, and that our training data is highly imbalanced such that our positive class is the minority class at a ratio of 10:1, we will need to be very precise in our metric selection.

Therefore each model that we create will be judged on its ROC-AUC score, F1 score, Matthews Correlation Coefficient, Balanced Accuracy, Precision, and Recall. This sounds like a long list of metrics, so our primary determinants will be the ROC-AUC and MCC since ROC-AUC can be less powerful for highly imbalanced datasets, and MCC gives a pretty comprehensive view of a model's performance across all 4 quadrants of the confusion matrix.

## Modeling

Now that we've defined our metrics set and evaluation method, our workflow for the next section is as follows:
1. Create pipelines to impute missing values, scale data, and fit to our classifier of choice
2. Calculate model statistics for each classifier



In [93]:
application_train: DataFrame = create_formatted_df("application_train.csv")

x: DataFrame = application_train.drop(["sk_id_curr", "target"]).to_pandas()
y: DataFrame = pl.DataFrame(application_train["target"]).to_pandas()
x_columns = x.columns

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)
y_train: NDArray = np.array(y_train).ravel()

numerical_columns: list[str] = [*x.select_dtypes(exclude=["object", "category"]).columns]

categorical_columns: list[str] = [*x.select_dtypes(include=["object", "category"]).columns]

In [94]:
classifiers: list[str] = ["logistic_regression", "extra_trees", "random_forest"]

model_performance = pd.DataFrame()
model_specifications = dict()

for classifier in classifiers:
    study = create_study(
        direction="maximize",
        study_name=classifier,
        pruner=SuccessiveHalvingPruner(reduction_factor=2),
        sampler=RandomSampler(seed=42),
    )
    study.optimize(
        lambda trial: objective_1(
            classifier, trial, x_train, np.array(y_train).ravel()
        ),
        n_trials=num_trials,
    )

    model_specifications[classifier] = study.best_params

    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions: NDArray = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-25 18:43:07,303] A new study created in memory with name: logistic_regression
[I 2024-12-25 18:43:18,213] Trial 0 finished with value: 0.5449981276424636 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'solver': 'sag', 'penalty': None, 'C': 1.3949386065204183}. Best is trial 0 with value: 0.5449981276424636.
[I 2024-12-25 18:43:20,193] Trial 1 finished with value: 0.49983091031146487 and parameters: {'numerical_strategy': 'constant', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'most_frequent', 'sigma': 0.32619291333341227, 'regularization': 4.7444276862666666, 'randomized': True, 'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 9.093204020787821}. Best is trial 0 with value: 0.5449981276424636.
[I 2024-12-25 18:43:32,721] Trial 2 finished with value: 0.5018961382618043 and para

Unnamed: 0,logistic_regression,extra_trees,random_forest
roc_auc,0.544172,0.49986,0.657051
matthews_correlation,0.04946,-0.00475,0.180946
f_beta,0.337566,0.0,0.474497
precision,0.097648,0.0,0.149745
recall,0.464324,0.0,0.625133
balanced_accuracy,0.544172,0.49986,0.657051


These models are performing really poorly so far, with both the Matthews Correlation Coefficient and ROC-AUC pointing to the modeling being no better than random guessing then it comes to predicting on defaults. Let's see how well the lightgbm and xgboost classifiers perform.

In [95]:
classifiers: list[str] = ["lightgbm",
                          "xgboost"]

for classifier in classifiers:

    study = create_study(
        direction="maximize",
        study_name=classifier,
        pruner=SuccessiveHalvingPruner(reduction_factor=2),
        sampler=RandomSampler(seed=42),
)


    study.optimize(
        lambda trial: objective_1(classifier, trial, x_train, np.array(y_train).ravel()),
        n_trials=num_trials,
    )

    model_specifications[classifier]= study.best_params


    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions: NDArray = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-25 18:59:15,699] A new study created in memory with name: lightgbm
[I 2024-12-25 18:59:18,855] Trial 0 finished with value: 0.6688017696233804 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 21, 'max_depth': 10, 'n_estimators': 85}. Best is trial 0 with value: 0.6688017696233804.
[I 2024-12-25 18:59:21,613] Trial 1 finished with value: 0.6601785263991166 and parameters: {'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'constant', 'sigma': 0.23320561318726862, 'regularization': 3.0377242595071916, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 52, 'max_depth': 5, 'n_estimators': 46}. Best is trial 0 with value: 0.6688017696233804.
[I 2024-12-25 18:59:23,994] Trial 2 finished with value: 0.65986108584141

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost
roc_auc,0.544172,0.49986,0.657051,0.683305,0.506129
matthews_correlation,0.04946,-0.00475,0.180946,0.212442,0.065844
f_beta,0.337566,0.0,0.474497,0.508958,0.015328
precision,0.097648,0.0,0.149745,0.163228,0.433333
recall,0.464324,0.0,0.625133,0.665602,0.013845
balanced_accuracy,0.544172,0.49986,0.657051,0.683305,0.506129


In [96]:
model_performance

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost
roc_auc,0.544172,0.49986,0.657051,0.683305,0.506129
matthews_correlation,0.04946,-0.00475,0.180946,0.212442,0.065844
f_beta,0.337566,0.0,0.474497,0.508958,0.015328
precision,0.097648,0.0,0.149745,0.163228,0.433333
recall,0.464324,0.0,0.625133,0.665602,0.013845
balanced_accuracy,0.544172,0.49986,0.657051,0.683305,0.506129


## SMOTE Oversampling
One way to strengthen our model performance could be to oversample/undersample on our dataset to rebalance the proportion of positive and negative target classes, and then train our classifier on that. We'll use the method put forth in the original SMOTE paper and then retrain on the models shown above.

In [97]:
# encode the categorical features to start
numerical_train = application_train.clone()


encoder_mapping_key = dict()
for col in numerical_train.columns:
    try:
        key: dict[str, int] = create_encoder_mapping(numerical_train, col)
        numerical_train = encode_feature(numerical_train, col, key)
        encoder_mapping_key[col] = key
    except:
        pass

x = numerical_train.drop("target").to_pandas()
y = numerical_train["target"].to_pandas()

In [98]:
oversampling = SMOTE(sampling_strategy=0.1)
undersampling = RandomUnderSampler(sampling_strategy=0.5)

steps = [("oversample", oversampling), ("undersample", undersampling)]
pipeline = ImbPipeline(steps=steps)
smote_x, smote_y = pipeline.fit_resample(x, y)

pca = PCA(n_components=smote_x.shape[1])
smote_x = pd.DataFrame(pca.fit_transform(smote_x),columns=x.columns)

x_train, x_test, y_train, y_test = train_test_split(
    smote_x, smote_y, stratify=smote_y, random_state=15
)

Now that we've created our smote dataset, we can fit the models. However our previous model specifications aren't as helpful since the dataset is new, and models may perform better on different hyperparameters. Let's create another study and run it again!

In [99]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    study_name='SMOTE_random_forest',
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective_1("random_forest", trial, x, y), n_trials=num_trials)


model_specifications["SMOTE_random_forest"] = study.best_params

smote_lgbm = LGBMClassifier(**study.best_params)
smote_lgbm.fit(x_train, y_train)
predictions: NDArray = smote_lgbm.predict(x_test)
model_performance["SMOTE_random_forest"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-25 19:02:14,690] A new study created in memory with name: SMOTE_random_forest
[I 2024-12-25 19:05:04,501] Trial 0 finished with value: 0.5373791295125593 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'criterion': 'log_loss', 'min_samples_split': 35, 'max_depth': 29, 'n_estimators': 68, 'min_samples_leaf': 13, 'class_weight': 'balanced_subsample', 'max_features': None}. Best is trial 0 with value: 0.5373791295125593.
[I 2024-12-25 19:05:24,511] Trial 1 finished with value: 0.5687456004503089 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'sigma': 1.5237642320976799, 'regularization': 0.48836057003191935, 'randomized': True, 'criterion': 'entropy', 'min_samples_split': 59, 'max_depth': 18, 'n_estimators': 111, 

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,SMOTE_random_forest
roc_auc,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338
matthews_correlation,0.04946,-0.00475,0.180946,0.212442,0.065844,0.263216
f_beta,0.337566,0.0,0.474497,0.508958,0.015328,0.564419
precision,0.097648,0.0,0.149745,0.163228,0.433333,0.487643
recall,0.464324,0.0,0.625133,0.665602,0.013845,0.574468
balanced_accuracy,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338


In [100]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    study_name='SMOTE_lgbm',
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective_1("lightgbm", trial, x, y), n_trials=num_trials)


model_specifications["SMOTE_lgbm"] = study.best_params

smote_lgbm = LGBMClassifier(**study.best_params)
smote_lgbm.fit(x_train, y_train)
predictions: NDArray = smote_lgbm.predict(x_test)
model_performance["SMOTE_lgbm"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-25 19:14:52,478] A new study created in memory with name: SMOTE_lgbm
[I 2024-12-25 19:14:55,468] Trial 0 finished with value: 0.6287530605924255 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 21, 'max_depth': 10, 'n_estimators': 85}. Best is trial 0 with value: 0.6287530605924255.
[I 2024-12-25 19:14:57,709] Trial 1 finished with value: 0.6293767721765933 and parameters: {'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'constant', 'sigma': 0.23320561318726862, 'regularization': 3.0377242595071916, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 52, 'max_depth': 5, 'n_estimators': 46}. Best is trial 1 with value: 0.6293767721765933.
[I 2024-12-25 19:14:59,026] Trial 2 finished with value: 0.624755083713

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,SMOTE_random_forest,SMOTE_lgbm
roc_auc,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338,0.582867
matthews_correlation,0.04946,-0.00475,0.180946,0.212442,0.065844,0.263216,0.246325
f_beta,0.337566,0.0,0.474497,0.508958,0.015328,0.564419,0.239779
precision,0.097648,0.0,0.149745,0.163228,0.433333,0.487643,0.657895
recall,0.464324,0.0,0.625133,0.665602,0.013845,0.574468,0.223964
balanced_accuracy,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338,0.582867


In [101]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    study_name='SMOTE_xgb',
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective_1("xgboost", trial, x, y), n_trials=num_trials)

smote_xgb: XGBClassifier = XGBClassifier(**study.best_params)
smote_xgb.fit(x_train, y_train)
predictions = smote_xgb.predict(x_test)
model_performance["SMOTE_xgb"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-25 19:15:25,007] A new study created in memory with name: SMOTE_xgb
[I 2024-12-25 19:16:46,013] Trial 0 finished with value: 0.5 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'sigma': 0.10390188698471643, 'regularization': 4.8495492608099715, 'randomized': True, 'objective': 'binary:logistic', 'booster': 'dart', 'max_leaves': 1, 'max_depth': 7, 'grow_policy': 'depthwise', 'n_estimators': 81, 'learning_rate': 0.14809892204552141}. Best is trial 0 with value: 0.5.
[I 2024-12-25 19:16:46,876] Trial 1 finished with value: 0.5 and parameters: {'numerical_strategy': 'constant', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'most_frequent', 'sigma': 0.32619291333341227, 'regularization': 4.7444276862666666, 'randomized': True, 'objective': 'binary:hinge', 'booster': 'gbtree', 'max_leaves': 1, 'max_depth': 7, 'grow_policy': 'depthwise', 'n_estimators': 51, 'learning_rate': 

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,SMOTE_random_forest,SMOTE_lgbm,SMOTE_xgb
roc_auc,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338,0.582867,0.507279
matthews_correlation,0.04946,-0.00475,0.180946,0.212442,0.065844,0.263216,0.246325,0.081779
f_beta,0.337566,0.0,0.474497,0.508958,0.015328,0.564419,0.239779,0.01862
precision,0.097648,0.0,0.149745,0.163228,0.433333,0.487643,0.657895,0.789474
recall,0.464324,0.0,0.625133,0.665602,0.013845,0.574468,0.223964,0.016797
balanced_accuracy,0.544172,0.49986,0.657051,0.683305,0.506129,0.636338,0.582867,0.507279


From our model_performance breakdown, we can see that our best performing model overall is the SMOTE_LGBM classifier. This model is still performing below the matthews correlation threshold of 0.5, so we still have some work to do improving the model's performance.

That said, the precision and balanced accuracy aren't bad, and the roc_auc is solid even though some of the models might be performing better on that metric specifically. Given that that metrics is class-blind in its measurement of model performance, that could mean that this classifier is performing better across classes.

In the next section, we'll proceed to make further improvements to the model specifications in order to boost our performance for this particular use case.
