## Imports


In [1]:
#!pip install skimpy polars plotly pingouin xgboost lightgbm scikit-optimize optuna category_encoders sklego
import polars as pl
import pandas as pd
import numpy as np
import optuna
from polars import DataFrame
from skimpy import skim
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
import plotly.io as pio
import polars.selectors as cs
import pingouin as pg
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from skopt import BayesSearchCV
from polars import DataFrame
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from pandas import DataFrame, Series
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from optuna.trial._frozen import FrozenTrial
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (
    accuracy_score,
    make_scorer,
    recall_score,
    precision_score,
    fbeta_score,
    roc_auc_score,
    make_scorer,
    matthews_corrcoef,
    balanced_accuracy_score,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble._forest import ExtraTreesClassifier

from optuna import Study

from sklearn.impute import SimpleImputer
from optuna import Trial,create_study

from category_encoders import WOEEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from optuna import create_study
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import RandomSampler




pio.templates.default = "plotly_dark"

## Helper functions


In [2]:
def calculate_partial_correlation(df, feature1, feature2, control):
    return pg.partial_corr(
        data=df.to_pandas(), x=feature1, y=feature2, covar=control
    )  # ['r'].values[0]


def create_encoder_mapping(df, feature) -> dict[str, int]:
    """Creates dictionary for mapping to encode categorical features

    Args:
        df (polars dataframe): dataframe of features
        feature (string): name of feature of interest

    Returns:
        encoding_key: dictionary of feature values and numbers for encoding
    """
    df: DataFrame = (
        df.group_by(feature)
        .agg(pl.len().alias("values"))
        .sort("values", descending=True)
    )

    options: List = df[feature].to_list()

    numbers_to_encode = list(range(0, len(options)))
    encoding_key = {options[i]: numbers_to_encode[i] for i in range(len(options))}

    if df[feature].str.contains("Yes").to_list()[0] == True:
        encoding_key: dict[str, int] = {"Yes": 1, "No": 0}

    return encoding_key


def encode_feature(df, feature, encoding_key) -> DataFrame:
    """Encode features using supplied encoding key

    Args:
        df (polars): Dataframe to be modified
        feature (string): feature to be encoded
        encoding_key (dict): dictionary of values and numerical codes

    Returns:
        df: input dataframe with feature replaced by numerical values
    """
    df: DataFrame = df.with_columns(
        df.select(pl.col(feature).replace(encoding_key)).cast({feature: pl.Int64})
    )
    return df


def retrieve_csv_columns(csv):
    df = pl.read_csv(csv).head(1)
    columns = set(lower_column_names(df).columns)
    return columns


def find_common_key(table1, table2) -> str:
    """For two names of csv files given as strings, return the common key column between them"""
    sk_id_set = set(
        [
            "bureau.csv",
            "previous_application.csv",
            "credit_card_balance.csv",
            "installments_payments.csv",
            "pos_cash_balance.csv",
            "application_test.csv",
            "application_train.csv",
        ]
    )
    sk_id_bureau_set = set(["bureau.csv", "bureau_balance.csv"])
    sk_id_prev_set = set(
        [
            "pos_cash_balance.csv",
            "installments_payments.csv",
            "credit_card_balance.csv",
            "previous_application.csv",
        ]
    )

    if (table1 in sk_id_set) & (table2 in sk_id_set):
        return "sk_id_curr"
    elif (table1 in sk_id_bureau_set) & (table2 in sk_id_bureau_set):
        return "sk_id_bureau"
    elif (table1 in sk_id_prev_set) & (table2 in sk_id_set):
        return "sk_id_prev"
    else:
        print("no common key found for these tables")


def tables_with_feature(description_df, feature):
    tables = set(
        description_df.filter(pl.col("row") == feature)
        .select("table")
        .unique()
        .to_series()
    )

    if len(tables) == 0:
        print("no results found.")
    else:
        return tables


def calculate_null_count(train, table, feature, mode="null_count"):
    #nulls = []
    if table == "application_{train|test}.csv":
        if mode == "null_count":
            return train[feature].null_count()
        elif mode == "series":
            return train[feature]
        # nulls.append(train[feature].null_count())
    else:
        alternate_df = create_formatted_df(table)
        key = find_common_key("application_train.csv", table)
        temp_df = train.join(alternate_df, on=key, how="inner")
    if mode == "null_count":
        return temp_df[feature].null_count()
    elif mode == "series":
        return temp_df[feature].to_series()


def null_count_comparison(train, description, feature):
    tables = tables_with_feature(description, feature)

    nulls = []
    for table in tables:
        null_count = calculate_null_count(train, table, feature, mode="null_count")
        nulls.append(null_count)
    df = pl.DataFrame(
        data={"table": list(tables), "feature": feature, "null_count": nulls},
        schema={"table": pl.String, "feature": pl.String, "null_count": pl.Int64},
    )

    return df


def replace_feature(train, null_df, feature) -> DataFrame:
    """Replace feature with fewer null values in train, else return training set unchanged"""
    null_min = null_df["null_count"].min()
    table = null_df.filter(pl.col("null_count") == null_min).select("table").item()
    if table == "application_{train|test}.csv":
        return train
    else:
        key = find_common_key("application_train.csv", table)
        alternate_df = create_formatted_df(table)[[key, feature]]
        alternate_feature = train.join(alternate_df, on=key, how="inner")[
            feature
        ].to_series()
        train = train.with_columns(alternate_feature.alias(feature))
        return train


def hypothesis_test_multiple_proportions(train_df, feature):
    """Conducts hypothesis test comparing target proportion and feature proportion"""

    # Determine the number of values in feature
    feature_default_df = pd.crosstab(
        train_df[feature].to_pandas(),
        train_df["target"].to_pandas(),
        rownames=[feature],
        colnames=["target"],
    )
    feature_default_df["default_proportion"] = feature_default_df.iloc[
        :, 1
    ] / feature_default_df.sum(axis=1)
    feature_default_df["total"] = feature_default_df.iloc[:, :-1].sum(axis=1)

    # pooled sample proportion
    p1_default_prop = feature_default_df.default_proportion.iloc[0]
    p2_default_prop = feature_default_df.default_proportion.iloc[1]
    p1_population = feature_default_df.total.iloc[0]
    p2_population = feature_default_df.total.iloc[1]

    p = (p1_default_prop * p1_population + p2_default_prop * p2_population) / (
        p1_population + p2_population
    )

    # standard error
    se = np.sqrt((p * (1 - p)) * ((1 / p1_population) + (1 / p2_population)))

    # test statistic
    z = (p1_default_prop - p2_default_prop) / se

    if np.abs(z) < 1.64485:
        print(
            f"Fail to reject the null hypothesis. We can assume the default percentage to be the same across {feature}."
        )
    else:
        print(
            f"z = {z:.3f}. Reject null hypothesis. The proportion of credit defaults across values of {feature} is not equal."
        )


def np_to_df(array, feature_list, designation="pd"):
    if designation == "pd":
        return pd.DataFrame(array, columns=feature_list)
    if designation == "pl":
        df = pl.DataFrame(array)
        df.columns = feature_list
        return df


def conduct_grid_search_tuning(
    model, grid, x_train, y_train, refit, scoring=make_scorer(fbeta_score, beta=2), cv=5
):
    """Conducts gridsearch for specified model and hyperparameter settings

    Args:
        model (string): string specifying model to test, must be 'knn', 'logistic_regression','decision_tree', or 'random_forest'
        grid (dictionary): grid of lists specifying options for hyperparameters to tune
        xy (list): x and y for model fitting, should be in [x_train,y_train] format
        scoring(string/callable): string defines scoring method to be used within grid search
    """

    grid_search = GridSearchCV(
        model, grid, cv=cv, scoring=scoring, refit=refit, n_jobs=-1
    )
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_

    return best_params  # , grid_search





def skopt_bayesian_search(classifier, x_train, y_train, params):
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
    search = BayesSearchCV(estimator=classifier, search_spaces=params, n_jobs=-1, cv=cv)
    search.fit(x_train, y_train)
    return search.best_params_

### Loan_functions.py

In [3]:
from loan_functions import (
    create_formatted_df,
    make_subplot,
    lower_column_names,
    lower_column_values,
    column_description,
    plot_histogram,
    column_comparison,
    int_range,
    clear,
    calculate_value_counts,
)

We'll read in the csv datasets one by one to save on memory, clearing them each time. Let's start with the training set, followed by our other datasets.

### Data Exploration


Let's start by reading in the training set and looking at balance. I'll keep the EDA of this notebook focused on the training set to have a more focused analysis and discussion of modeling and model performance, but a more detailed exploration of the supporting datasets can be found in the [notebook] found in this repository.

In [4]:
application_train = create_formatted_df("application_train.csv")

print(
    f"Training set has {application_train.shape[0]} rows and {application_train.shape[1]} columns."
)

application_train.sample(5)

Training set has 307511 rows and 122 columns.


sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,name_type_suite,name_income_type,name_education_type,name_family_status,name_housing_type,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,own_car_age,flag_mobil,flag_emp_phone,flag_work_phone,flag_cont_mobile,flag_phone,flag_email,occupation_type,cnt_fam_members,region_rating_client,region_rating_client_w_city,weekday_appr_process_start,hour_appr_process_start,reg_region_not_live_region,reg_region_not_work_region,live_region_not_work_region,…,nonlivingarea_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,def_30_cnt_social_circle,obs_60_cnt_social_circle,def_60_cnt_social_circle,days_last_phone_change,flag_document_2,flag_document_3,flag_document_4,flag_document_5,flag_document_6,flag_document_7,flag_document_8,flag_document_9,flag_document_10,flag_document_11,flag_document_12,flag_document_13,flag_document_14,flag_document_15,flag_document_16,flag_document_17,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
i64,i64,str,str,str,str,i64,f64,f64,f64,f64,str,str,str,str,str,f64,i64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i64,str,f64,i64,i64,str,i64,i64,i64,i64,…,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
311717,0,"""cash loans""","""m""","""n""","""n""",0,67500.0,161730.0,8257.5,135000.0,"""unaccompanied""","""working""","""secondary / secondary special""","""civil marriage""","""house / apartment""",0.01452,-11548,-113,-6156.0,-2596,,1,1,0,1,1,0,"""laborers""",2.0,2,2,"""saturday""",15,0,0,0,…,,,,,,,0.0,0.0,0.0,0.0,-2452.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
121800,0,"""cash loans""","""f""","""n""","""y""",1,90000.0,148500.0,17752.5,148500.0,"""spouse, partner""","""commercial associate""","""secondary / secondary special""","""married""","""house / apartment""",0.028663,-14634,-874,-5078.0,-3135,,1,1,0,1,0,0,"""core staff""",3.0,2,2,"""monday""",14,0,0,0,…,0.0,,"""block of flats""",0.1569,"""stone, brick""","""no""",0.0,0.0,0.0,0.0,-704.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
136700,0,"""cash loans""","""m""","""y""","""y""",1,180000.0,360000.0,22023.0,360000.0,"""unaccompanied""","""state servant""","""incomplete higher""","""married""","""house / apartment""",0.04622,-10083,-1036,-2019.0,-2726,1.0,1,1,0,1,0,0,"""core staff""",3.0,1,1,"""friday""",11,0,0,0,…,0.0,"""reg oper account""","""block of flats""",0.0097,"""stone, brick""","""no""",1.0,0.0,1.0,0.0,-460.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
134033,0,"""cash loans""","""m""","""y""","""n""",1,157500.0,490495.5,27387.0,454500.0,"""unaccompanied""","""working""","""secondary / secondary special""","""separated""","""with parents""",0.006671,-13769,-268,-7186.0,-4677,19.0,1,1,1,1,0,0,"""low-skill laborers""",2.0,2,2,"""tuesday""",12,0,0,0,…,,,,,,,1.0,0.0,1.0,0.0,-813.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
267388,0,"""revolving loans""","""f""","""y""","""n""",0,90000.0,270000.0,13500.0,270000.0,"""unaccompanied""","""pensioner""","""secondary / secondary special""","""single / not married""","""house / apartment""",0.018634,-23627,365243,-15919.0,-4376,1.0,1,0,0,1,0,0,,1.0,2,2,"""saturday""",10,0,0,0,…,0.0005,"""org spec account""","""block of flats""",0.2339,"""panel""","""no""",1.0,1.0,1.0,1.0,-327.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,6.0


Let's now take a look at balance of the target variable since that is our modeling variable of interest. 

In [5]:
plot_histogram(application_train, "target", title="Target Variable Distribution")

In [6]:
description = pl.read_csv("HomeCredit_columns_description.csv", encoding="latin1")
description = lower_column_names(description)
description = lower_column_values(description)

# Delete training spaces for some entries in the row column
description = description.with_columns(
    pl.col("row").map_elements(lambda x: x.split(" ")[0], return_dtype=pl.String)
)

null_df = null_count_comparison(application_train, description, "amt_annuity")
null_df

table,feature,null_count
str,str,i64
"""bureau.csv""","""amt_annuity""",42
"""application_{train|test}.csv""","""amt_annuity""",12
"""previous_application.csv""","""amt_annuity""",93


### Null values 

In [7]:
null_df = (
    application_train.null_count()
    .transpose(include_header=True)
    .rename(mapping={"column": "feature", "column_0": "null_count"})
    .sort(by="null_count", descending=True)
    .with_columns(
        pl.col("null_count")
        .map_elements(lambda x: x / len(application_train), return_dtype=pl.Float32)
        .alias("percentage")
    )
)

px.histogram(
    null_df,
    x="percentage",
    text_auto=True,
    title="Null value percentages across dataset",
).update_layout(bargap=0.2)

The graph above shows that we have a pretty wide spread of null value prevalence within this dataset. Thankfully none of these null values are in the target variable, but we will have to make a choice in our analysis of how to treat null values within this dataset since some classifiers are more tolerant of null values than others. LightGBM and XGBoost can handle null values when fitting, but commonly used sklearn classifiers do not.

#### Look for categorical anomalies 

## Correlations 


In [8]:
from typing import Any
from numpy import dtype
from numpy._typing._array_like import NDArray
from pandas import Index
from pandas.core.frame import DataFrame
from pandas.io.formats.style import Styler


corr: DataFrame = (
    application_train.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().corr()
)
corr = pd.DataFrame(corr["target"])

px.histogram(
    corr, "target", title="Correlations with Target Variable"
).update_layout(bargap=0.2)

The graph above shows the distribution of correlations as a graphical distribution rather than in tabular form because we have so many variables that we're working with. None of our variables are correlated with target, save for target itself.

## Hypothesis testing

### Is there difference in proportions of clients that default when broken down by their region of residence? 
$H_0$: $p_1$ = $p_2$ = $p_3$<br>
$H_1$: $p_1$ $\neq$ $p_2$ $\neq$ $p_3$


In [9]:
region_default = pd.crosstab(
    application_train["region_rating_client"].to_pandas(),
    application_train["target"].to_pandas(),
    rownames=["region_rating_client"],
    colnames=["target"],
)
region_default["default_proportion"] = region_default.iloc[:, 1] / region_default.sum(
    axis=1
)
region_default["total"] = region_default.iloc[:, :-1].sum(axis=1)

# pooled sample proportion
p1_default_prop = region_default.default_proportion.iloc[0]
p2_default_prop = region_default.default_proportion.iloc[1]
p3_default_prop = region_default.default_proportion.iloc[2]

p1_population = region_default.total.iloc[0]
p2_population = region_default.total.iloc[1]
p3_population = region_default.total.iloc[2]

p = (
    p1_default_prop * p1_population
    + p2_default_prop * p2_population
    + p3_default_prop * p3_population
) / (p1_population + p2_population + p3_population)

# standard error
se = np.sqrt(
    (p * (1 - p)) * ((1 / p1_population) + (1 / p2_population) + (1 / p2_population))
)

# test statistic
z = (p1_default_prop - p2_default_prop - p3_default_prop) / se

if np.abs(z) < 1.64485:
    print(
        f"Fail to reject the null hypothesis. We can assume the default percentage to be the same across {'region_rating_client'}."
    )
else:
    print(
        f"z = {z:.3f}. Reject null hypothesis. The proportion of credit defaults across values of {'region_rating_client'} is not equal."
    )

z = -82.387. Reject null hypothesis. The proportion of credit defaults across values of region_rating_client is not equal.


Because both gender and region are not equal across their values when compared to our target variable, we they are more likely to have a significant relationship with default risk. We should include these variables in our predictor pool. This raises the question of what variables we should use to predict our models, given that we feasibly cannot include all of them. Let's move on to feature selection so we can restrict our feature set to one that is both significant and predictive of the target variable without being so onerous to run calculations on. 


### Are younger homeowners are more likely to default on credit payments? 

$H_0$: $\mu_d$ =  $\mu_n$<br>
$H_1$:  $\mu_d$ $\neq$ $\mu_n$

s.t. <br>
$\mu_d$: the average age of clients who default<br>
$\mu_n$: the average age of clients who do not default

In [10]:
import scipy.stats as stats

age_df: DataFrame = application_train[["target", "days_birth"]]
age_df = age_df.with_columns((pl.col("days_birth") // -365)).rename(
    {"days_birth": "age"}
)

age_default = age_df.filter(pl.col("target") == 1)["age"]
age_no_default = age_df.filter(pl.col("target") == 0)["age"]

t_stat, p_value = stats.ttest_ind(age_no_default, age_default)

print("Two-sample t-test results:\n")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.3f}\n")

if p_value < 0.05:
    print(
        "Reject the null hypothesis: There is a significant difference in the average ages \nof clients who default vs. those who don't.\n"
    )
else:
    print(
        "Fail to reject the null hypothesis: There is no significant difference in the average ages of clients who \ndefault vs. those who don't.\n"
    )

Two-sample t-test results:

t-statistic: 43.517
p-value: 0.000

Reject the null hypothesis: There is a significant difference in the average ages 
of clients who default vs. those who don't.



## Preparing data for modeling


In [11]:
def calculate_model_statistics(y_true, y_predict, beta=3.0, title="statistics"):
    """Uses actual y and predicted y values to return a dataframe of accuracy, precision, recall, and f-beta values as well as false negative and false posititive rates for a given classifier

    Args:
        y_true (numpy array or data series): dependent variable values from the dataset
        y_predict (_type_): dependent variable values arising from model
        beta (float, optional): Beta value to determine weighting between precision and recall in the f-beta score.Defaults to beta value set in global scope of this notebook.
        title (str, optional): _description_. Defaults to "statistics".

    Returns:
        model_statistics: pandas dataframe of statistics
    """

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_predict).ravel()

    # calculate statistics from confusion matrix
    # accuracy: float = accuracy_score(y_true, y_predict)
    roc_auc: float = roc_auc_score(y_true, y_predict)
    mcc: float = matthews_corrcoef(y_true, y_predict)
    f_beta: float = fbeta_score(y_true, y_predict, beta=beta)

    precision: float = precision_score(y_true, y_predict, zero_division=0)
    recall: float = recall_score(y_true, y_predict)
    balanced_accuracy = balanced_accuracy_score(y_true,y_predict)
    # false_negative_rate: float = fn / (tn + fp + fn + tp)
    # false_positive_rate: float = fp / (tn + fp + fn + tp)

    return pd.DataFrame(
        data={
            title: [
                roc_auc,
                mcc,
                f_beta,
                precision,
                recall,
                balanced_accuracy
                # accuracy,
                # false_negative_rate,
                # false_positive_rate,
            ]
        },
        index=[
            "roc_auc",
            "matthews_correlation",
            "f_beta",
            "precision",
            "recall",
            "balanced_accuracy"
            # "accuracy",
            # "false_negative_rate",
            # "false_positive_rate",
        ],
    )

Before we start creating models, it's important to lay out what our north star assessment criteria are for model assessment, and secondarily how we will choose what a high-performing model is. 

Given that our business problem is designing predictive classifiers of credit clients that are likely to default on payments, and that our training data is highly imbalanced such that our positive class is the minority class at a ratio of 10:1, we will need to be very precise in our metric selection. 

Therefore each model that we create will be judged on its ROC-AUC score, F1 score, Matthews Correlation Coefficient, Balanced Accuracy, Precision, and Recall. This sounds like a long list of metrics, so our primary determinants will be the ROC-AUC and MCC since ROC-AUC can be less powerful for highly imbalanced datasets, and MCC gives a pretty comprehensive view of a model's performance across all 4 quadrants of the confusion matrix.

## Modeling

Now that we've defined our metrics set and evaluation method, our workflow for the next section is as follows: 
1. Create pipelines to impute missing values, scale data, and fit to our classifier of choice
2. Calculate model statistics for each classifier


Let's setup some initial arrays/dataframes, as well as functions we'll need to use in our pipeline.

In [12]:
application_train = create_formatted_df("application_train.csv")

x = (
    application_train
    # .drop(cs.matches("flag_document_"))
    .drop(["sk_id_curr", "target"]).to_pandas()
)
y = pl.DataFrame(application_train["target"]).to_pandas()

# x_columns = x.columns
# y_columns = y.columns


x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)
y_train = np.array(y_train).ravel()

numerical_columns = [*x.select_dtypes(exclude=["object", "category"]).columns]

categorical_columns = [*x.select_dtypes(include=["object", "category"]).columns]

In [36]:
def instantiate_numerical_simple_imputer(
    trial: Trial, fill_value: int = -1
) -> SimpleImputer:
    strategy = trial.suggest_categorical(
        "numerical_strategy", ["mean", "median", "most_frequent", "constant"]
    )
    return SimpleImputer(strategy=strategy, fill_value=fill_value)


def instantiate_categorical_simple_imputer(
    trial: Trial, fill_value: str = "missing"
) -> SimpleImputer:
    strategy = trial.suggest_categorical(
        "categorical_strategy", ["most_frequent", "constant"]
    )
    return SimpleImputer(strategy=strategy, fill_value=fill_value)


def instantiate_woe_encoder(trial: Trial) -> WOEEncoder:
    params = {
        "sigma": trial.suggest_float("sigma", 0.001, 5),
        "regularization": trial.suggest_float("regularization", 0, 5),
        "randomized": trial.suggest_categorical("randomized", [True, False]),
    }
    return WOEEncoder(**params)


def instantiate_robust_scaler(trial: Trial) -> RobustScaler:
    params = {
        "with_centering": trial.suggest_categorical("with_centering", [True, False]),
        "with_scaling": trial.suggest_categorical("with_scaling", [True, False]),
    }
    return RobustScaler(**params)


def instantiate_extra_trees(trial: Trial, warm_start=False) -> ExtraTreesClassifier:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "max_features": trial.suggest_float("max_features", 0, 1),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "n_jobs": -1,
        "random_state": 42,
    }
    return ExtraTreesClassifier(**params, warm_start=warm_start)


def instantiate_logistic_regression(trial) -> LogisticRegression:
    solver = trial.suggest_categorical(
        "solver", ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]
    )
    if solver == "newton-cholesky":
        penalty = trial.suggest_categorical("penalty", ["l2", None])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    elif solver == "lbfgs":
        penalty = trial.suggest_categorical("penalty", ["l2", None])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    elif solver == "liblinear":
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    elif solver == "newton-cg":
        penalty = trial.suggest_categorical("penalty", ["l2", None])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    elif solver == "sag":
        penalty = trial.suggest_categorical("penalty", ["l2", None])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    elif solver == "saga":
        penalty = trial.suggest_categorical("penalty", ["l2", None])
        params = {
            "solver": solver,
            "penalty": penalty,
            "C": trial.suggest_float("C", 0.0, 10.0),
        }
    return LogisticRegression(**params)


#def instantiate_lgbm_classifier(trial):
#    params = {
#        "boosting_type": trial.suggest_categorical(
#            "boosting_type", ["gbdt", "dart", "rf"]
#        ),
#        "num_leaves": trial.suggest_int("num_leaves", 2, 64),
#        "max_depth": trial.suggest_int("max_depth", -1, 20),
#        "n_estimators": trial.suggest_int("n_estimators", 35, 150),
#        "class_weight": "balanced",    }
#    return LGBMClassifier(**params)



def instantiate_lgbm_classifier(trial):
    params = {
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["gbdt", "dart"]#, #"rf"]
        ),
        "num_leaves": trial.suggest_int("num_leaves", 2, 64),
        "max_depth": trial.suggest_int("max_depth", -1, 20),
        "n_estimators": trial.suggest_int("n_estimators", 35, 150),
        "class_weight": "balanced",
        # Add the following conditional logic for bagging parameters
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 10) if params["boosting_type"] == "rf" else 1,  # Ensure bagging_freq > 0 for 'rf'
        #"bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 0.99) if params["boosting_type"] == "rf" else 1.0,  # Ensure 0 < bagging_fraction < 1 for 'rf'
        #"feature_fraction": trial.suggest_float("feature_fraction", 0.1, 0.99) if params["boosting_type"] != "rf" else 1.0,  # Ensure 0 < feature_fraction < 1 for non-'rf'
    }
    return LGBMClassifier(**params)

def instantiate_xgboost(trial):
    params = {
         "objective": trial.suggest_categorical(
    "objective", ["binary:hinge", "binary:logistic"]),
    "booster": trial.suggest_categorical("booster", ["gbtree", "dart", "gblinear"]),
    "max_leaves": trial.suggest_int("max_leaves",1, 10, 10),
    "max_depth": trial.suggest_int("max_depth",3, 15, 4),
    "grow_policy": trial.suggest_categorical("grow_policy",["depthwise"]),
    "n_estimators": trial.suggest_int("n_estimators",50, 100),
    "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
}
    return XGBClassifier(**params)
    


def instantiate_random_forest(trial):
    params = {
        "criterion": trial.suggest_categorical(
            "criterion", ["gini", "entropy", "log_loss"]
        ),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 64),
        "max_depth": trial.suggest_int("max_depth", 2, 64),
        "n_estimators": trial.suggest_int("n_estimators", 35, 150),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample"]
        ),
        "max_features": trial.suggest_categorical(
            "max_features", ["sqrt", "log2", None]
        ),
    }
    return RandomForestClassifier(**params)


#def instantiate_random_forest(trial):

    # "criterion": ["gini", "entropy", "log_loss"],
    # "max_depth": list(range(1, 11)),
    # "max_features": ["sqrt", "log2", None],

    #n_estimators = trial.suggest_int("n_estimators", 10, 100)
    #max_depth = trial.suggest_int("max_depth", 2, 32)
    #criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    
   # clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion=criterion)
    #return clf


def model_selector(clf_string, trial: Trial):
    if clf_string == "logistic_regression":
        model = instantiate_logistic_regression(trial)
    elif clf_string == "random_forest":
        model = instantiate_random_forest(trial)
    elif clf_string == "extra_trees":
        model = instantiate_extra_trees(trial)
    elif clf_string == "lightgbm":
        model = instantiate_lgbm_classifier(trial)
    elif clf_string=='xgboost':
        model = instantiate_xgboost(trial)

    return model

def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_numerical_simple_imputer(trial)),
    ('scaler', instantiate_robust_scaler(trial))
  ])
  return pipeline

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_categorical_simple_imputer(trial)),
    ('encoder', instantiate_woe_encoder(trial))
  ])
  return pipeline

def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> ColumnTransformer:

  numerical_pipeline = instantiate_numerical_pipeline(trial)
  categorical_pipeline = instantiate_categorical_pipeline(trial)

  processor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ('categorical_pipeline', categorical_pipeline, categorical_columns)
  ])

  return processor

def instantiate_model(classifier, trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> Pipeline:

  processor = instantiate_processor(
    trial, numerical_columns, categorical_columns
  )
  
  clf = model_selector(classifier,trial)

  model = Pipeline([
    ('processor', processor),
    ('classifier', clf)
  ])

  return model

Now let's define the objective function

In [14]:
def objective(classifier, trial : Trial, X : DataFrame, y : np.ndarray | Series, numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, random_state : int=42) -> float:
  if numerical_columns is None:
    numerical_columns = [
      *x.select_dtypes(exclude=['object', 'category']).columns
    ]

  if categorical_columns is None:
    categorical_columns = [
      *x.select_dtypes(include=['object', 'category']).columns
    ]

  model = instantiate_model(classifier,trial, numerical_columns, categorical_columns)

  kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
  roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
  scores = cross_val_score(model, X, y, scoring=roc_auc_scorer, cv=kf)

  return np.min([np.mean(scores), np.median([scores])])

In [15]:
classifiers: list[str] = ["logistic_regression", "extra_trees", "random_forest"]

model_performance = pd.DataFrame()
model_specifications = dict()

for classifier in classifiers:
    study: Study = create_study(study_name="optimization", direction="maximize")

    study.optimize(
        lambda trial: objective(classifier, trial, x_train, np.array(y_train).ravel()),
        n_trials=1,
    )  # n_trials=100 is the original value

    model_specifications[classifier]= study.best_params


    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
    model_performance

[I 2024-12-17 22:12:19,503] A new study created in memory with name: optimization

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Setting penalty=None will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Setting penalty=None will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/mod

These models are performing really poorly so far, with both the Matthews Correlation Coefficient and ROC-AUC pointing to the modeling being no better than random guessing then it comes to predicting on defaults. Let's see how well the lightgbm and xgboost classifiers perform. 

In [16]:
classifiers: list[str] = ["lightgbm",
                          "xgboost"]

for classifier in classifiers:
    study = create_study(study_name="optimization", direction="maximize")

    study.optimize(
        lambda trial: objective(classifier, trial, x_train, np.array(y_train).ravel()),
        n_trials=2,
    )  # n_trials=100 is the original value

    model_specifications[classifier]= study.best_params


    best_trial: FrozenTrial = study.best_trial
    model: Pipeline = instantiate_model(
        classifier,
        trial=best_trial,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
    )
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    model_performance[classifier] = calculate_model_statistics(y_test, predictions)
    model_performance

[I 2024-12-17 22:21:23,375] A new study created in memory with name: optimization

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.



[LightGBM] [Info] Number of positive: 13937, number of negative: 158268
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11418
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 13897, number of negative: 158308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11463
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.00

[I 2024-12-17 22:21:33,628] Trial 0 finished with value: 0.7474595659640969 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'sigma': 4.068667775232698, 'regularization': 1.7958582050557452, 'randomized': False, 'boosting_type': 'gbdt', 'num_leaves': 16, 'max_depth': 19, 'n_estimators': 39}. Best is trial 0 with value: 0.7474595659640969.

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.



[LightGBM] [Info] Number of positive: 13937, number of negative: 158268
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15337
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 13897, number of negative: 158308
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15383
[LightGBM] [Info] Number of data points in the train set: 172205, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[Light

[I 2024-12-17 22:21:45,260] Trial 1 finished with value: 0.7435881215375002 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'sigma': 4.0972922880022065, 'regularization': 2.2163619286056373, 'randomized': True, 'boosting_type': 'dart', 'num_leaves': 50, 'max_depth': 18, 'n_estimators': 52}. Best is trial 0 with value: 0.7474595659640969.


[LightGBM] [Info] Number of positive: 17377, number of negative: 197880
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11522
[LightGBM] [Info] Number of data points in the train set: 215257, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


[I 2024-12-17 22:21:47,713] A new study created in memory with name: optimization

suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The distribution is specified by [1, 10] and step=10, but the range is not divisible by `step`. It will be replaced by [1, 1].


suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Parameters: { "grow_policy", "max_depth", "max_leaves" } are not used.



Parameters: { "grow_policy", "max_depth", "max_leaves" } are not used.



Parameters: { "grow_policy", "max_depth", "max_leaves" } are not used.



Parameters: { "grow_policy", "max_depth", "max_leaves" } are not used.



Parameters: { "grow_policy", "max_depth", "max_leaves" } are

In [17]:
model_performance

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost
roc_auc,0.500772,0.5,0.614881,0.68387,0.5
matthews_correlation,0.023344,0.0,0.208138,0.212613,0.0
f_beta,0.001939,0.0,0.304293,0.511215,0.0
precision,0.433333,0.0,0.250323,0.162757,0.0
recall,0.001745,0.0,0.311762,0.670784,0.0
balanced_accuracy,0.500772,0.5,0.614881,0.68387,0.5


Let's implement the above code with successive halving


In [18]:



def instantiate_extra_trees(trial: Trial, warm_start=False) -> ExtraTreesClassifier:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "max_features": trial.suggest_float("max_features", 0, 1),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "n_jobs": -1,
        "random_state": 42,
    }
    return ExtraTreesClassifier(**params, warm_start=warm_start)

In [19]:
from typing import Optional
from pandas import DataFrame
from optuna import Trial
from sklearn.linear_model import SGDClassifier

def objective(classifier_string: str, trial : Trial, X : DataFrame, y : DataFrame, seed : int=42) -> Optional[float]:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=seed
    )
    #model = instantiate_model(classifier=classifier_string,trial=trial, numerical_columns=numerical_columns, categorical_columns=categorical_columns)
    model = model_selector(classifier_string, trial) #instantiate_extra_trees(trial, warm_start=True)
    n_estimators = model.get_params().get('n_estimators')
    min_estimators = 45
    
    for num_estimators in range(min_estimators, n_estimators + 1):
        model.set_params(n_estimators=num_estimators)
        model.fit(X_train, y_train)
        
        score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        trial.report(score, num_estimators)
    
        if trial.should_prune():
            pass#raise TrialPruned()

    kfold = KFold(shuffle=True, random_state=seed)
    roc_auc = make_scorer(roc_auc_score, needs_proba=True)
    scores = cross_val_score(model, X, y, cv=kfold, scoring=roc_auc)
    
    return np.min([np.mean(scores), np.median(scores)])

In [29]:
numerical_train = application_train.clone()
encoder_mapping_key = dict()
for col in numerical_train.columns:
    try:
        key: dict[str, int] = create_encoder_mapping(numerical_train, col)
        numerical_train = encode_feature(numerical_train, col, key)
        encoder_mapping_key[col] = key
    except:
        pass

x = numerical_train.drop("target")
y = numerical_train["target"]
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True)

study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(
    lambda trial: objective("lightgbm", trial, x, y), n_trials=2
)  # 30 trials is 12m 35.9s

[I 2024-12-17 22:31:14,834] A new study created in memory with name: no-name-14823518-2141-453e-a685-51890930e2d7


[LightGBM] [Info] Number of positive: 18634, number of negative: 211999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12208
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 18634, number of negative: 211999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12208
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 116
[LightGBM] [


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.



[LightGBM] [Info] Number of positive: 19876, number of negative: 226132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12259
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 19888, number of negative: 226121
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12240
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0000

[I 2024-12-17 22:31:28,271] Trial 0 finished with value: 0.7101552933890023 and parameters: {'boosting_type': 'dart', 'num_leaves': 48, 'max_depth': 12, 'n_estimators': 53}. Best is trial 0 with value: 0.7101552933890023.


[LightGBM] [Info] Number of positive: 18634, number of negative: 211999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12208
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 18634, number of negative: 211999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12208
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 116
[LightGBM] [


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.



[LightGBM] [Info] Number of positive: 19876, number of negative: 226132
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12259
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 19888, number of negative: 226121
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12240
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 117
[LightGBM] [

[I 2024-12-17 22:33:18,123] Trial 1 finished with value: 0.7145099764014973 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 56, 'max_depth': 12, 'n_estimators': 117}. Best is trial 1 with value: 0.7145099764014973.


In [30]:
model_specifications['lgbm_with_nulls']= study.best_params


lbgm_no_nulls=LGBMClassifier(**study.best_params)
lbgm_no_nulls.fit(x_train, y_train)
predictions = lbgm_no_nulls.predict(x_test)
model_performance["lgbm_with_nulls"] = calculate_model_statistics(y_test, predictions)
model_performance


{'boosting_type': 'gbdt',
 'num_leaves': 56,
 'max_depth': 12,
 'n_estimators': 117}

In [38]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(
    lambda trial: objective("xgboost", trial, x, y), n_trials=2
)

xgb_no_nulls=XGBClassifier(**study.best_params)
xgb_no_nulls.fit(x_train, y_train)
predictions = xgb_no_nulls.predict(x_test)
model_performance["xgboost_with_nulls"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-17 22:45:53,107] A new study created in memory with name: no-name-285246cd-a2c1-4856-b0e9-1be569cf21d8

suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The distribution is specified by [1, 10] and step=10, but the range is not divisible by `step`. It will be replaced by [1, 1].


suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.

[I 2024-12-17 22:46:25,140] Trial 0 finished with value: 0.5 and parameters: {'objective': 'binary:logistic', 'booster': 'gbtree', 'max_leaves': 1, 'max_depth': 3, 'grow_policy': 'depthwise', 'n_estimators': 52, 'learning_rate': 0.8675143843171859}. Best is trial 0 with value: 0.5.

suggest_int() got {'step'} as positional 

## SMOTE Oversampling 
One way to strengthen our model performance could be to oversample/undersample on our dataset to rebalance the proportion of positive and negative target classes, and then train our classifier on that. We'll use the method put forth in the original SMOTE paper and then retrain on the models shown above. 

In [56]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.decomposition import PCA

x = np.array(x)
y = np.array(y)

oversampling = SMOTE(sampling_strategy=0.1)
undersampling = RandomUnderSampler(sampling_strategy=0.5)

steps = [("oversample", oversampling), ("undersample", undersampling)]
pipeline = ImbPipeline(steps=steps)
smote_x, smote_y = pipeline.fit_resample(x, y)

pca = PCA(n_components=smote_x.shape[1])
smote_x = pca.fit_transform(smote_x)

x_train, x_test, y_train, y_test = train_test_split(
    smote_x, smote_y, stratify=smote_y, random_state=15
)

Now that we've created our smote dataset, we can fit the models. However our previous model specifications aren't as helpful since the dataset is new, and models may perform better on different hyperparameters. Let's create another study and run it again! 

In [61]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective("lightgbm", trial, x, y), n_trials=2)


model_specifications["SMOTE_lgbm"] = study.best_params

smote_lgbm = LGBMClassifier(**study.best_params)
smote_lgbm.fit(x_train, y_train)
predictions = smote_lgbm.predict(x_test)
model_performance["SMOTE_lgbm"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-17 23:13:37,310] A new study created in memory with name: no-name-f274b13d-68ed-4537-8fcf-2a06fcac1e27

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.

[I 2024-12-17 23:13:49,852] Trial 0 finished with value: 0.7101552933890023 and parameters: {'boosting_type': 'dart', 'num_leaves': 48, 'max_depth': 12, 'n_estimators': 53}. Best is trial 0 with value: 0.7101552933890023.

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.

[I 2024-12-17 23:15:34,958] Trial 1 finished with value: 0.7145099764014973 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 56, 'max_depth': 12, 'n_estimators': 117}. Best is trial 1 with value: 0.7145099764014973.


Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,xgb_with_nulls,xgboost_with_nulls,SMOTE_lgbm
roc_auc,0.500772,0.5,0.614881,0.503748,0.5,0.5,0.5,0.628378
matthews_correlation,0.023344,0.0,0.208138,0.053637,0.0,0.0,0.0,0.309718
f_beta,0.001939,0.0,0.304293,0.009254,0.0,0.0,0.0,0.375642
precision,0.433333,0.0,0.250323,0.464286,0.0,0.0,0.0,0.63666
recall,0.001745,0.0,0.311762,0.008345,0.0,0.0,0.0,0.359276
balanced_accuracy,0.500772,0.5,0.614881,0.503748,0.5,0.5,0.5,0.628378


In [62]:
study = create_study(
    direction="maximize",
    pruner=SuccessiveHalvingPruner(reduction_factor=2),
    sampler=RandomSampler(seed=42),
)
study.optimize(lambda trial: objective("xgboost", trial, x, y), n_trials=2)

smote_xgb = XGBClassifier(**study.best_params)
smote_xgb.fit(x_train, y_train)
predictions = smote_xgb.predict(x_test)
model_performance["SMOTE_xgb"] = calculate_model_statistics(y_test, predictions)
model_performance

[I 2024-12-17 23:15:35,747] A new study created in memory with name: no-name-c01bd11f-aa94-4fff-a5d6-651b669d953d

suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The distribution is specified by [1, 10] and step=10, but the range is not divisible by `step`. It will be replaced by [1, 1].


suggest_int() got {'step'} as positional arguments but they were expected to be given as keyword arguments.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.

[I 2024-12-17 23:15:54,091] Trial 0 finished with value: 0.5 and parameters: {'objective': 'binary:logistic', 'booster': 'gbtree', 'max_leaves': 1, 'max_depth': 3, 'grow_policy': 'depthwise', 'n_estimators': 52, 'learning_rate': 0.8675143843171859}. Best is trial 0 with value: 0.5.

suggest_int() got {'step'} as positional 

Unnamed: 0,logistic_regression,extra_trees,random_forest,lightgbm,xgboost,xgb_with_nulls,xgboost_with_nulls,SMOTE_lgbm,SMOTE_xgb
roc_auc,0.500772,0.5,0.614881,0.503748,0.5,0.5,0.5,0.628378,0.5
matthews_correlation,0.023344,0.0,0.208138,0.053637,0.0,0.0,0.0,0.309718,0.0
f_beta,0.001939,0.0,0.304293,0.009254,0.0,0.0,0.0,0.375642,0.0
precision,0.433333,0.0,0.250323,0.464286,0.0,0.0,0.0,0.63666,0.0
recall,0.001745,0.0,0.311762,0.008345,0.0,0.0,0.0,0.359276,0.0
balanced_accuracy,0.500772,0.5,0.614881,0.503748,0.5,0.5,0.5,0.628378,0.5


In [None]:
break

## Code for later

SMote, but with previous lgbm no nulls model specs


In [None]:
smote_lgbm = LGBMClassifier(**model_specifications["lgbm_with_nulls"], verbose=-1)
smote_lgbm.fit(x_train, y_train)

y_predict = smote_lgbm.predict(x_test)

smote_performance: tuple[DataFrame] = calculate_model_statistics(y_test, y_predict)
model_performance["SMOTE_lgbm"] = smote_performance


pd.concat(
    [model_performance["lightgbm"], smote_performance],
    axis=1,
)

In [None]:
def instantiate_sgd_classifier(trial, random_state=0):
    params = {
        "loss": trial.suggest_categorical(
            "loss",
            [
                "hinge",
                "log_loss",
                "modified_huber",
                "squared_hinge",
                "perceptron",
                "squared_error",
                "huber",
                "epsilon_insensitive",
                "squared_epsilon_insensitive",
            ],
        ), "penalty": trial.suggest_categorical('penalty',['l2','l1', 'elasticnet', None]),
        "alpha": trial.suggest_float("alpha",0.0, 1000),
        "l1_ratio": trial.suggest_float("l1_ratio",0.0, 1.0)}
    return SGDClassifier(**params)


In [69]:
# encode the features manually to start
encoder_mapping_key = dict()
for col in application_train.columns:
    try:
        key: dict[str, int] = create_encoder_mapping(application_train, col)
        numerical_train = encode_feature(application_train, col, key)
        encoder_mapping_key[col] = key
    except:
        pass

x = numerical_train.drop("target")
y = numerical_train["target"]



In [125]:
from optuna import Trial
from sklego.preprocessing import ColumnSelector


def choose_columns(trial : Trial, columns : list[str]) -> list[str]:
  choose = lambda column: trial.suggest_categorical(column, [True, False])
  choices = [*filter(choose, columns)]
  return choices


def instantiate_column_selector(trial : Trial, columns : list[str]) -> ColumnSelector:
  choose = lambda column: trial.suggest_categorical(column, [True, False])
  choices = [*filter(choose, columns)]
  selector = ColumnSelector(choices)
  return selector




In [None]:
from optuna.visualization import plot_param_importances

plot_param_importances(study)

In [126]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

Classifier = (
    RandomForestClassifier
    | ExtraTreesClassifier
    | SVC
    | LogisticRegression
    | KNeighborsClassifier
)


def instantiate_learner(trial: Trial) -> Classifier:
    algorithm = trial.suggest_categorical(
        "algorithm", ["logistic", "forest", "extra_forest", "lgbm"]
    )
    if algorithm == "logistic":
        model = instantiate_logistic_regression(trial)
    elif algorithm == "forest":
        model = instantiate_random_forest(trial)
    elif algorithm == "extra_forest":
        model = instantiate_extra_trees(trial)
    elif algorithm == "lgbm":
        model = instantiate_lgbm_classifier(trial)
    # elif algorithm=='knn':
    #  model = instantiate_knn(trial)

    return model


def instantiate_scaler(trial):
    scaler_type = trial.suggest_categorical(
        "scaler_type", ["standard", "minmax", "robust"]
    )

    if scaler_type == "standard":
        params = {
            "with_mean": trial.suggest_categorical("with_mean", [True, False]),
            "with_std": trial.suggest_categorical("with_std", [True, False]),
        }
        scaler = StandardScaler(**params)

    elif scaler_type == "minmax":
        scaler = MinMaxScaler()

    elif scaler_type == "robust":
        params = {
            "with_centering": trial.suggest_categorical(
                "with_centering", [True, False]
            ),
            "with_scaling": trial.suggest_categorical("with_std", [True, False]),
        }
        scaler = RobustScaler(**params)

    return scaler

#def instantiate_encoder(trial):

In [None]:
def objective(trial : Trial, x : DataFrame, y : DataFrame, seed : int=42):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, shuffle=True, random_state=seed
    )
    
    model = instantiate_extra_trees(trial, warm_start=True)
    n_estimators = model.get_params().get('n_estimators')
    min_estimators = 100
    
    for num_estimators in range(min_estimators, n_estimators + 1):
        model.set_params(n_estimators=num_estimators)
        model.fit(x_train, y_train)
        
        score = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
        trial.report(score, num_estimators)
    
        if trial.should_prune():
            raise TrialPruned()

    kfold = KFold(shuffle=True, random_state=seed)
    roc_auc = make_scorer(roc_auc_score, needs_proba=True)
    scores = cross_val_score(model, x, y, cv=kfold, scoring=roc_auc)
    
    return np.min([np.mean(scores), np.median(scores)])


In [136]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_numerical_simple_imputer(trial)),
    ('scaler', instantiate_scaler(trial))
  ])
  return pipeline

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_categorical_simple_imputer(trial)),
    ('encoder', instantiate_woe_encoder(trial))#instantiate_encoder(trial))
  ])
  return pipeline

def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> ColumnTransformer:

  numerical_pipeline = instantiate_numerical_pipeline(trial)
  categorical_pipeline = instantiate_categorical_pipeline(trial)

  selected_numerical_columns = choose_columns(trial,numerical_columns)
  selected_categorical_columns = choose_columns(trial,categorical_columns)

  processor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, selected_numerical_columns),
    ('categorical_pipeline', categorical_pipeline, selected_categorical_columns)
  ])

  return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> Pipeline:

  processor = instantiate_processor(
    trial, numerical_columns, categorical_columns
  )

  learner = instantiate_learner(trial)

  model = Pipeline([
    ('processor', processor),
    ('model', learner)
  ])

  return model

from optuna import create_study

study = create_study(study_name='optimization', direction='maximize')

study.optimize(lambda trial: objective(trial, x_train, np.array(y_train).ravel()), n_trials=5) #n_trials=100 is the original value

[I 2024-12-08 15:11:00,130] A new study created in memory with name: optimization
[W 2024-12-08 15:11:00,514] Trial 0 failed with parameters: {'n_estimators': 807, 'max_depth': 7, 'max_features': 0.21296783552104626, 'bootstrap': False} because of the following error: ValueError("could not convert string to float: 'cash loans'").
Traceback (most recent call last):
  File "/home/lemuelrobinson/Documents/Credit_Capstone/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_187908/244823179.py", line 52, in <lambda>
    study.optimize(lambda trial: objective(trial, x_train, np.array(y_train).ravel()), n_trials=5) #n_trials=100 is the original value
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_187908/1609094541.py", line 12, in objective
    model.fit(x_train, y_train)
  File "/home/lemuelrobinson/Documen

ValueError: could not convert string to float: 'cash loans'

In [None]:
break

In [None]:
study = create_study(study_name='optimization', direction='maximize')

study.optimize(lambda trial: objective(classifier='random_forest',trial=trial, X=x_train, y=np.array(y_train).ravel()), n_trials=1) #n_trials=100 is the original value

best_trial: FrozenTrial = study.best_trial
model: Pipeline = instantiate_model('random_forest',trial=best_trial, numerical_columns=numerical_columns, categorical_columns=categorical_columns)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
calculate_model_statistics(y_true=y_test, y_predict=predictions)

#study.best_params


In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer



from optuna import create_study
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import RandomSampler

study = create_study(
  direction="maximize",
  pruner=SuccessiveHalvingPruner(reduction_factor=2),
  sampler=RandomSampler(seed=42) 
)
study.optimize(lambda trial: objective(trial, x, y), n_trials=60)

In [None]:
def iterative_bayesian_search(classifier, x_train, y_train, parameter_grid):
    final_parameters = dict()
    # define iteration dictionary

    for key, value in parameter_grid.items():

        #final_parameters[key]=value
        iteration_grid = {key: value}
        iteration_grid = {**final_parameters, **iteration_grid}

        # do bayesian search
        model_parameters = skopt_bayesian_search(
            classifier, x_train, y_train, iteration_grid
        )

        # isolate iteration_grid parameter
        final_parameters[key] = model_parameters[key]

    return final_parameters

In [None]:
def restrict_x_columns(x_train,x_test, columns):
    x_train=x_train[:,columns]
    x_test=x_test[:,columns]
    return x_train,x_test

from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix


Let's first implement bayesian optimization on a random forest model to see how well the data performs there.

In [None]:

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

model = TPOTClassifier(
    generations=3,
    population_size=50,
    cv=cv,
    scoring="accuracy",
    verbosity=2,
    random_state=15,
    n_jobs=-1,
).fit(x_train, y_train)


best_pipeline = model.fitted_pipeline_
tpot_classifier = clone(best_pipeline)
tpot_classifier_params = tpot_classifier.steps[-1][1].get_params()
tpot_classifier_name = str(type(best_pipeline.steps[-1][1])).split(".")[-1][:-2]

# Fit the new classifier on the training data
tpot_classifier.fit(training_x, training_y)
predictions = tpot_classifier.predict(validation_x)
tpot_accuracy = tpot_classifier.score(validation_x, validation_y)
print(f"\n{tpot_classifier_name} accuracy is {tpot_accuracy:.3f}")

model_stats_df = calculate_model_statistics(
    y_true=validation_y, y_predict=predictions, title=tpot_classifier_name
)

In [None]:
params = {
    "boosting_type": ["gbdt"],
    "num_leaves": int_range(2, 15),
    "max_depth": int_range(1, 15),
    "n_estimators": np.linspace(50, 150, 10, dtype=int),#[50],
    "n_estimators": np.linspace(50, 300, 10, dtype=int),
    "reg_alpha": np.linspace(0, 1, 10),
    "reg_lambda": np.linspace(0, 1, 10),
    "subsample": np.linspace(0.1, 1, 20),
}

n_iter = 200
classifier = LGBMClassifier(verbose=-1)
metric = "precision"


random_search = RandomizedSearchCV(
    classifier, param_distributions=params, n_iter=n_iter, scoring=metric
)
random_search.fit(x_train, y_train)
#print(random_search.best_score_, random_search.best_params_)
lgbm_params=random_search.best_params_




predictions = random_search.predict(x_test)

lgbm_metric: float = random_search.score(x_test, y_test)
print(f"LGBM classifier metric is {lgbm_metric:.3f}")
calculate_model_statistics(y_true=y_test, y_predict=predictions)




lgbm_classifier: LGBMClassifier = LGBMClassifier(**lgbm_params).fit(
    x_train, y_train
)
#predictions = lgbm_classifier.predict(x_test)

#lgbm_accuracy: float = lgbm_classifier.score(x_test, y_test)
#print(f"LGBM classifier accuracy is {lgbm_accuracy:.3f}")
calculate_model_statistics(y_true=y_test, y_predict=predictions)

In [None]:
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import time


# Declare xgboost search space for Hyperopt
xgboost_space = {
    "max_depth": hp.choice("x_max_depth", [2, 3, 4, 5, 6]),
    "min_child_weight": hp.choice(
        "x_min_child_weight", np.round(np.arange(0.0, 0.2, 0.01), 5)
    ),
    "learning_rate": hp.choice(
        "x_learning_rate", np.round(np.arange(0.005, 0.3, 0.01), 5)
    ),
    "subsample": hp.choice("x_subsample", np.round(np.arange(0.1, 1.0, 0.05), 5)),
    "colsample_bylevel": hp.choice(
        "x_colsample_bylevel", np.round(np.arange(0.1, 1.0, 0.05), 5)
    ),
    "colsample_bytree": hp.choice(
        "x_colsample_bytree", np.round(np.arange(0.1, 1.0, 0.05), 5)
    ),
    "n_estimators": hp.choice("x_n_estimators", np.arange(25, 100, 5)),
}

best_score = 1.0


def objective(space):

    global best_score
    model = XGBClassifier(**space, n_jobs=-1)
    kfold = KFold(n_splits=3, random_state=0, shuffle=True)
    score = -cross_val_score(
        model, x_train, y_train, cv=kfold, scoring="neg_log_loss", verbose=False
    ).mean()

    if score < best_score:
        best_score = score

    return score


start = time.time()

xgb_best_params = fmin(
    objective, space=xgboost_space, algo=tpe.suggest, max_evals=200, trials=Trials()
)

print("Hyperopt search took %.2f seconds for 200 candidates" % ((time.time() - start)))
print("Best score: %.2f " % (-best_score))
print("Best space: ", xgb_best_params)
xgb = XGBClassifier(**xgb_best_params)
xgb.fit(x_train, y_train)

predictions = xgb.predict(x_test)

xgb_metric: float = random_search.score(x_test, y_test)
print(f"XGB classifier metric is {xgb_metric:.3f}")
calculate_model_statistics(y_true=y_test, y_predict=predictions)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

params = {
    "n_estimators": np.linspace(35, 150, 50, dtype=int),
    "learning_rate": np.linspace(0.01, 0.3, 12),
}

adaboost_params = skopt_bayesian_search(
    AdaBoostClassifier(algorithm="SAMME"), x_train, y_train, params
)
adaboost_classifier: AdaBoostClassifier = AdaBoostClassifier(
    algorithm="SAMME", **adaboost_params
).fit(x_train, y_train)

predictions = adaboost_classifier.predict(x_test)
adaboost_accuracy: float = adaboost_classifier.score(x_test, y_test)
print(f"AdaBoost classifier accuracy is {adaboost_accuracy:.3f}")
model_stats_df['AdaboostClassifier'] = calculate_model_statistics(y_true=y_test, y_predict=predictions)

In [None]:
break

### Null value tolerant machine learning models
Several supervised learning classification models can handle null or missing values to varying degrees. Here are some of the main classification models that are relatively tolerant of null values:
Decision Trees and Random Forests
Decision trees and random forest models are generally quite tolerant of missing values:
During training, these models can work around missing values by using surrogate splits.
For prediction, there are strategies like sending samples with missing values down both branches and averaging the results.
Random forests in particular tend to be robust to missing data, as the ensemble nature helps mitigate issues with individual trees1.
Naive Bayes
Naive Bayes classifiers can handle missing values naturally:
For categorical features, missing values can be treated as a separate category.
For numerical features, missing values can be ignored when calculating means and variances.
This makes Naive Bayes models quite tolerant of null values without requiring imputation2.
K-Nearest Neighbors (KNN)
KNN can work with missing data by using:
Partial distance calculations that ignore missing features
Imputation of missing values based on nearest neighbors
While not inherently null-tolerant, KNN can be adapted to handle missing data reasonably well3.
Support Vector Machines (SVM)
SVMs don't directly handle missing values, but can be made more robust by:
Using kernels that can handle missing data
Imputing missing values before training
With appropriate preprocessing, SVMs can work effectively even with some missing data1.
Gradient Boosting Models
Gradient boosting models like XGBoost and LightGBM have built-in methods for handling missing values:
They can learn the best direction to take for missing values at each split.
This allows them to handle missing data both during training and prediction without explicit imputation1.
While these models can work with missing data, it's generally recommended to investigate the reason for missing values and consider imputation strategies where appropriate. The performance impact of missing data can vary depending on the specific dataset and problem.


In [None]:
px.violin(bureau, x="credit_day_overdue").show()
bureau.to_pandas().value_counts("credit_day_overdue")

In [None]:
# pos_cash_balance: DataFrame = create_formatted_df("POS_CASH_balance.csv")
corr = pos_cash_balance.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().corr()
features = pos_cash_balance.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().columns
clear(pos_cash_balance)

mask = np.tril(np.ones_like(corr, dtype=bool))
masked_corr = corr.where(mask)
masked_corr.columns = features
masked_corr.index = features

styled_corr = masked_corr.style.background_gradient(cmap="GnBu").format("{:.3f}")
styled_corr

In [None]:
# Print all columns in training set
train_desc = description.filter(pl.col("table") == "application_{train|test}.csv")[
    ["row", "description"]
].to_pandas()
for row in range(len(train_desc)):
    print(train_desc.iloc[row, 0], ": ", train_desc.iloc[row, 1])

In [None]:
# for all variables in training set
# Calculate correlation of target with variable x.
# if absolute value of corr is >.3 and <.7, add to some correlation bucket
# elif absolute value is >.7, add to strong correlation bucket
# else put variable in no correlation bucket


def group_correlations(df, feature_of_interest):
    no_corr = dict()
    weak_corr = dict()
    strong_corr = dict()

    for feature in df.select(cs.numeric()).columns:
        if feature == feature_of_interest:
            continue

        # pg.partial_corr(data=application_train.to_pandas(), x='target', y='amt_goods_price', covar='target')#['r'].values[0]

        corr_df = df[[feature_of_interest, feature]]
        # corr=pg.partial_corr(data=application_train.to_pandas(), x=feature_of_interest, y=feature, covar='target')#['r'].values[0]
        corr = corr_df.to_pandas().corr().iloc[0, 1]
        if np.abs(corr) >= 0.7:
            strong_corr[feature] = corr
        elif np.abs(corr) <= 0.3:
            no_corr[feature] = corr
        else:
            weak_corr[feature] = corr

    index = [feature_of_interest]
    strong_corr = pd.DataFrame(
        data=strong_corr, index=index
    )  # columns=strong_corr.keys(), )
    weak_corr = pd.DataFrame(data=weak_corr, index=index)
    no_corr = pd.DataFrame(data=no_corr, index=index)
    return strong_corr, weak_corr, no_corr


strong, weak, no = group_correlations(application_train, "target")

no

In [None]:
# partial correlation
pg.partial_corr(
    data=application_train.to_pandas(), x="target", y="amt_goods_price", covar="target"
)["r"].values[0]

#### NB: The important point is for BorutaPy, multicollinearity should be removed before running it.

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


# Set x to only features that have no null values
x = application_train[x_features_no_nulls].drop("target")  # .to_pandas()
y = pl.DataFrame(application_train["target"])  # .to_pandas()

# Conduct PCA to remove multicolinearity from training set
pca = PCA(n_components=len(x.columns))  # , svd_solver='full')
pca.fit_transform(x)


x_features_no_nulls = list(
    null_df.filter(pl.col("null_count") == 0).select("features").to_series()
)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

boruta = BorutaPy(
    estimator=RandomForestRegressor(),
    # n_estimators=54,  # "auto",
    max_iter=100,
)
boruta.fit(np.array(x_train), np.array(y_train))

In [None]:
feature_df = pd.DataFrame(x_train.columns, columns=["features"])
feature_df["rank"] = boruta.ranking_
feature_df["included_features"] = boruta.support_
# feature_df.with_columns(boruta.ranking_).alias("rank")

In [None]:
# Full training set correlations

# pos_cash_balance: DataFrame = create_formatted_df("POS_CASH_balance.csv")
corr = application_train.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().corr()
features = application_train.select(cs.by_dtype(pl.NUMERIC_DTYPES)).to_pandas().columns
# clear(pos_cash_balance)

mask = np.tril(np.ones_like(corr, dtype=bool))
masked_corr = corr.where(mask)
masked_corr.columns = features
masked_corr.index = features

styled_corr = masked_corr.style.background_gradient(cmap="GnBu").format("{:.3f}")
styled_corr

In [None]:
# Setting numeric & categorical features for further analysis
response = "target"
# cat_feats = [c for c in x.select(cs.string())]
# bin_feats = [c for c in x.columns if '_bin' in c]
# cat_feats = cat_feats + bin_feats˛
num_feats = [c for c in x.select(cs.numeric())]

# x_train, x_test, y_train, y_test
x_train = x_train.to_pandas()
x_test = x_test.to_pandas()
y_train = y_train.to_pandas()
y_test = y_test.to_pandas()


# x[num_feats] = x[num_feats].astype('float')
# x[cat_feats] = x[cat_feats].astype('object')

# x.replace(-1, np.nan, inplace=True)
#!pip install autofeatselect
from autofeatselect import CorrelationCalculator, FeatureSelector, AutoFeatureSelect

# Create AutoFeatureSelect class
feat_selector = AutoFeatureSelect(
    modeling_type="classification",
    X_train=x_train,
    y_train=y_train,
    X_test=x_test,
    y_test=y_test,
    numeric_columns=num_feats,
    categorical_columns=[],  # cat_feats,
    seed=24,
)

# Detect Correlated Features
corr_features = feat_selector.calculate_correlated_features(
    static_features=None, num_threshold=0.9, cat_threshold=0.9
)
# Drop Correlated Features
feat_selector.drop_correlated_features()

# Determine Selection Methods to Apply
# Options: 'lgbm', 'xgb', 'rf','perimp', 'rfecv', 'boruta', 'lassocv'
# Note: Hyperparameters of all methods can be changed
selection_methods = ["lgbm", "xgb", "rf", "perimp", "rfecv", "boruta"]
final_importance_df = feat_selector.apply_feature_selection(
    selection_methods=selection_methods,
    lgbm_hyperparams=None,
    xgb_hyperparams=None,
    rf_hyperparams=None,
    lassocv_hyperparams=None,
    perimp_hyperparams=None,
    rfecv_hyperparams=None,
    boruta_hyperparams=None,
)

# Print Results
final_importance_df.head()

In [None]:
from autofeatselect import CorrelationCalculator, FeatureSelector, AutoFeatureSelect

# Static features will not be removed even if they are correlated with other features.
static_features = ["sk_id_curr"]

# Detect correlated features
corr_df_num, num_remove_list = CorrelationCalculator.numeric_correlations(
    application_train,
    features=application_train.columns,
    static_features=static_features,
    threshold=0.9,
)

#### Logistic Regression


In [None]:
x1=application_train.select(feature_set1)

# scale data
scaled_x = StandardScaler().fit_transform(x)

# x_features_no_nulls = list(null_df.filter(pl.col("null_count") == 0).select("features").to_series())
x_train, x_test, y_train, y_test = train_test_split(
    scaled_x, y, test_size=0.2, stratify=y
)
print(x_train.shape)
print(y_train.shape)
y_train=y_train.to_numpy().ravel()

In [None]:
# Specify the different options for hyperparameters
grid = {
    "C": np.logspace(-3, 3, 5),
    # "penalty": ["l1", "l2", None],
    "solver": ["saga", "newton-cholesky", "liblinear", "sag", "saga", "lbfgs"],
    "class_weight": [None, "balanced"],
    "tol": np.linspace(0, 0.5, 10),
}

regression = LogisticRegression(max_iter=1000)

regression_params = skopt_bayesian_search(
    regression, x_train, y_train, grid
)  # ,np=True)

# tuned_regression = LogisticRegression(**regression_params).fit(x_train, y_train)


print(regression_params)
