In [55]:
import datetime

import pandas as pd
import polars as pl
import pathlib
import numpy as np
import lightgbm as lgb
from dateutil.relativedelta import relativedelta 
import mlflow
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_sample_weight

In [56]:
# Display options: print polars floats in full and pandas floats with three decimals;
# route pandas' `.plot` accessor through plotly.
pl.Config(set_fmt_float="full")
pd.options.display.float_format = '{:.3f}'.format
pd.options.plotting.backend = "plotly"

# Local MLflow file store. Resolved relative to the working directory (same convention as
# the dataset path used by this notebook) instead of a user-specific absolute path, so the
# notebook runs unchanged on another machine.
# NOTE(review): assumes the store lives in the same `../data` directory as the dataset — confirm.
MLFLOW_TRACKING_DIR = pathlib.Path("../data/mlflow_runs").resolve()
mlflow.set_tracking_uri(MLFLOW_TRACKING_DIR.as_uri())

In [57]:
# Random seed for the stochastic steps of the notebook (assumed; confirm where it is consumed).
SEED = 666
# Parquet file read by the data-loading cell; relative to the notebook's working directory.
INPUT_PATH = pathlib.Path("../data/datalake/silver/preprocessed.parquet")

## Helper functions

In [58]:
def get_learning_curves(model: lgb.LGBMModel, metric: str = "binary_logloss") -> go.Figure:
    """Plot the per-iteration train and validation curves of a fitted model.

    Evaluation-set names are read from ``model.evals_result_``: a name containing
    ``"train"`` selects the training curve, otherwise a name containing ``"valid"``
    selects the validation curve. When several names match, the last one wins.

    Args:
        model: Fitted LightGBM model exposing ``evals_result_``.
        metric: Metric key to read inside each evaluation-set entry.

    Returns:
        A Plotly figure with one line per curve; the x axis is the 1-based iteration.
    """
    history = model.evals_result_
    train_key, valid_key = "", ""
    for name in history:
        if "train" in name:
            train_key = name
        elif "valid" in name:
            valid_key = name

    curves = pd.DataFrame(
        {
            "train": history[train_key][metric],
            "valid": history[valid_key][metric],
        }
    )
    # The frame index is 0-based; iterations are displayed starting at 1.
    iterations = curves.index + 1

    fig = go.Figure()
    for label, values in curves.items():
        fig.add_trace(go.Scatter(x=iterations, y=values, mode='lines', name=f"{label}"))

    fig.update_layout(
        title=f'Learning curves {metric}',
        xaxis_title='Iteration',
        yaxis_title=metric,
        margin=dict(l=0, r=0, b=0, t=30),
    )
    return fig

def split_train_validation(
    dataframe: pl.LazyFrame | pl.DataFrame, test_months: float = 0.2
) -> tuple[pl.LazyFrame, pl.LazyFrame] | tuple[pl.DataFrame, pl.DataFrame] | None:
    """Split a frame chronologically on ``date_decision`` into train and validation parts.

    The covered period is measured in 30-day months; the most recent
    ``round(total_months * test_months)`` calendar months form the validation part.

    Args:
        dataframe: Eager or lazy frame containing a ``date_decision`` date column.
        test_months: Fraction (not a number of months) of the covered period that is
            assigned to the validation part.

    Returns:
        ``(train, validation)`` of the same frame type as the input, both without the
        ``date_decision`` column; ``None`` when ``dataframe`` is neither a
        ``pl.LazyFrame`` nor a ``pl.DataFrame``.

    Raises:
        ValueError: If ``date_decision`` has no non-null value, so no date range exists.
    """
    if not isinstance(dataframe, (pl.LazyFrame, pl.DataFrame)):
        return None

    date_col = pl.col("date_decision")
    # Fetch both bounds in a single query: for a LazyFrame this evaluates the pipeline
    # once instead of twice. `.lazy()` lets eager and lazy inputs share the same path.
    min_date, max_date = (
        dataframe.lazy()
        .select(date_col.min().alias("min_date"), date_col.max().alias("max_date"))
        .collect()
        .row(0)
    )
    if min_date is None or max_date is None:
        raise ValueError("Cannot split: 'date_decision' contains no non-null values.")

    total_months: int = (max_date - min_date).days // 30 + 1
    # Kept in its own variable so the `test_months` fraction is not rebound with a new type.
    n_test_months: int = round(total_months * test_months)

    start_test_date: datetime.date = max_date - relativedelta(months=n_test_months)
    train = dataframe.filter(date_col < start_test_date).drop("date_decision")
    validation = dataframe.filter(date_col >= start_test_date).drop("date_decision")
    return train, validation

def convert_object_to_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Cast every ``object``-dtype column of ``data`` to ``category``.

    The frame is modified in place and also returned, so the call can be chained.
    """
    object_columns = [name for name in data.columns if data[name].dtype == "object"]
    for name in object_columns:
        data[name] = data[name].astype("category")
    return data

def get_columns_with_more_x_perc_nulls(dataframe: pl.LazyFrame, threshold: float = 0.5) -> list[str]:
    """Return the names of the columns whose fraction of null values exceeds ``threshold``.

    The counts come from ``describe()``. The fraction is computed as
    ``null_count / (count + null_count)`` because ``describe`` reports ``count`` as the
    number of non-null values; dividing by ``count`` alone would give the
    null-to-non-null ratio instead of the share of nulls.

    Args:
        dataframe: Frame to inspect.
        threshold: Exclusive lower bound on the null fraction, in ``[0, 1]``.

    Returns:
        Column names with a null fraction strictly greater than ``threshold``.
    """
    describe = dataframe.describe(percentiles=[]).filter(pl.col("statistic").is_in(["count", "null_count"]))

    null_count = pl.col("null_count").cast(pl.Float32)
    non_null_count = pl.col("count").cast(pl.Float32)

    return (
        # One row per original column; the first transposed row is the "statistic"
        # header row itself and is sliced off.
        describe.transpose(include_header=True, column_names=["count", "null_count"])
        .slice(1)
        .with_columns((null_count / (non_null_count + null_count)).alias("perc_null"))
        .filter(pl.col("perc_null") > threshold)
        .select("column")
        .to_series()
        .to_list()
    )

def binarize_columns_with_more_x_perc_nulls(columns: list[str]) -> list[pl.Expr]:
    """Build one presence-flag expression per column name.

    Each expression is ``False`` where the column is null and ``True`` otherwise. It keeps
    the original column name, so applying it in ``with_columns`` replaces the column.
    """
    return [
        pl.when(pl.col(name).is_null()).then(pl.lit(False)).otherwise(pl.lit(True)).alias(name)
        for name in columns
    ]

def optimize_datatypes(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    """Re-type columns by dtype family, keeping every column name unchanged.

    Float columns are cast to ``Float32`` and integer columns to ``Int32``, each followed
    by ``shrink_dtype()``; string columns are cast to ``Categorical``. Columns of any other
    dtype are left untouched.
    """
    float_types = tuple(pl.FLOAT_DTYPES)
    integer_types = tuple(pl.INTEGER_DTYPES)

    def _retype(name: str, dtype) -> pl.Expr | None:
        # Returns the replacement expression for one column, or None to leave it as is.
        if isinstance(dtype, float_types):
            return pl.col(name).cast(pl.Float32).shrink_dtype().alias(name)
        if isinstance(dtype, integer_types):
            return pl.col(name).cast(pl.Int32).shrink_dtype().alias(name)
        if isinstance(dtype, pl.String):
            return pl.col(name).cast(pl.Categorical).alias(name)
        return None

    candidates = (_retype(name, dtype) for name, dtype in zip(dataframe.columns, dataframe.dtypes))
    return dataframe.with_columns(*[expr for expr in candidates if expr is not None])

def fill_nulls(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    """Impute nulls column by column according to the column dtype.

    Numeric columns are filled with their median and then passed through
    ``shrink_dtype()``; boolean and categorical columns are filled with the first value
    returned by ``mode()``. Columns of any other dtype are left as they are.
    """
    numeric_types = tuple(pl.NUMERIC_DTYPES)
    fills = []

    for name, dtype in zip(dataframe.columns, dataframe.dtypes):
        column = pl.col(name)
        if isinstance(dtype, numeric_types):
            filled = column.fill_null(column.median()).shrink_dtype()
        elif isinstance(dtype, (pl.Boolean, pl.Categorical)):
            filled = column.fill_null(column.mode().first())
        else:
            continue
        fills.append(filled.alias(name))

    return dataframe.with_columns(*fills)

## Load and preprocess data

In [59]:
# Lazily scan the dataset and make sure the decision date is a proper Date column.
data: pl.LazyFrame = pl.scan_parquet(INPUT_PATH).with_columns(
    pl.col("date_decision").cast(pl.Date)
)
# Remove every column whose name contains "num_group", then compact the dtypes.
data = optimize_datatypes(
    data.drop([name for name in data.columns if "num_group" in name])
)

In [60]:
# Calendar features derived from the decision date; WEEK_NUM and MONTH are dropped afterwards.
data = data.with_columns(
    year=pl.col("date_decision").dt.year(),
    month=pl.col("date_decision").dt.month(),
    day_of_year=pl.col("date_decision").dt.ordinal_day(),
    # 1 when the ISO weekday is 6 or 7, else 0.
    decision_is_on_weekend=(
        pl.when(pl.col("date_decision").dt.weekday().is_in([6, 7]))
        .then(pl.lit(1))
        .otherwise(pl.lit(0))
    ),
).drop(["WEEK_NUM", "MONTH"])

data.head().collect()

case_id,date_decision,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,…,firstnonzeroinstldate_307D_mean,inittransactioncode_279L_mode,isbidproduct_390L_median,isdebitcard_527L_median,mainoccupationinc_437A_median,mainoccupationinc_437A_mean,maxdpdtolerance_577P_median,maxdpdtolerance_577P_mean,outstandingdebt_522A_median,outstandingdebt_522A_mean,pmtnum_8L_median,pmtnum_8L_mean,postype_4733339M_mode,profession_152M_mode,rejectreason_755M_mode,rejectreasonclient_4145042M_mode,revolvingaccount_394A_median,status_219L_mode,tenor_203L_median,tenor_203L_mean,addres_district_368M_mode,addres_role_871L_median,addres_zip_823M_mode,conts_role_79M_mode,empls_economicalst_849M_mode,empls_employedfrom_796D_median,empls_employedfrom_796D_mean,empls_employer_name_740M_mode,relatedpersons_role_762T_median,cacccardblochreas_147M_mode,conts_type_509L_mode,credacc_cards_status_52L_median,age,year,month,day_of_year,decision_is_on_weekend
i8,date,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,bool,f32,f32,f32,cat,bool,f32,f32,f32,f32,f32,f32,f32,bool,f32,f32,f32,f32,f32,…,f32,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,cat,cat,cat,cat,f32,cat,f32,f32,cat,f32,cat,cat,cat,f32,f32,cat,f32,cat,cat,f32,f32,i32,i8,i16,i32
0,2019-01-03,0,,,1917.5999755859373,0,0,0,0,0,0,0,,,,,,False,,,,,False,0,0,0,0,0,0,0,False,0,0,0,0,0,…,,,,0,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,32.5315055847168,2019,1,3,0
1,2019-01-03,0,,,3134.0,0,0,0,0,0,0,0,,,,,,False,,,,,False,0,0,0,0,0,0,0,True,3,0,0,0,0,…,,,,0,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,61.46575164794922,2019,1,3,0
2,2019-01-04,0,,,4937.0,0,0,0,0,0,0,0,,,,,,False,,,,,False,0,0,0,0,0,0,0,False,0,0,0,0,0,…,2071.0,,0.0,0,8200.0,8200.0,,,,,18.0,18.0,,,,,0,,18.0,18.0,,,,,,,,,,,,,44.12328720092773,2019,1,4,0
3,2019-01-03,0,,,4643.60009765625,0,0,1,0,2,0,1,,,,,,False,,,,,False,0,0,0,0,0,0,1,False,0,0,0,0,0,…,-35.0,,0.0,0,11000.0,11000.0,,,,,12.0,12.0,,,,,0,,12.0,12.0,,,,,,,,,,,,,25.44109535217285,2019,1,3,0
4,2019-01-04,1,,,3390.199951171875,0,0,1,0,0,0,1,,,,,,False,,,,,False,0,0,0,0,0,0,0,False,0,0,0,0,0,…,-35.0,,0.0,0,16000.0,16000.0,,,,,24.0,24.0,,,,,0,,24.0,24.0,,,,,,,,,,,,,25.024658203125,2019,1,4,0


In [61]:
# cols_to_drop = get_columns_with_more_x_perc_nulls(data, threshold=0.85)
# data = data.drop(cols_to_drop)
# 
# cols_to_binarize = get_columns_with_more_x_perc_nulls(data, threshold=0.5)
# data = data.with_columns(*binarize_columns_with_more_x_perc_nulls(cols_to_binarize))
# 
# data.describe()

## Split train test

In [62]:
# Drop every column whose name contains one of these substrings.
cols_to_drop = [
    col
    for col in data.columns
    if any(token in col for token in ("clientscnt", "district", "age_days"))
]

data = data.drop(cols_to_drop)

data.head().collect()

case_id,date_decision,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,datefirstoffer_1144D,datelastinstal40dpd_247D,datelastunpaid_3546854D,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,…,firstnonzeroinstldate_307D_median,firstnonzeroinstldate_307D_mean,inittransactioncode_279L_mode,isbidproduct_390L_median,isdebitcard_527L_median,mainoccupationinc_437A_median,mainoccupationinc_437A_mean,maxdpdtolerance_577P_median,maxdpdtolerance_577P_mean,outstandingdebt_522A_median,outstandingdebt_522A_mean,pmtnum_8L_median,pmtnum_8L_mean,postype_4733339M_mode,profession_152M_mode,rejectreason_755M_mode,rejectreasonclient_4145042M_mode,revolvingaccount_394A_median,status_219L_mode,tenor_203L_median,tenor_203L_mean,addres_role_871L_median,addres_zip_823M_mode,conts_role_79M_mode,empls_economicalst_849M_mode,empls_employedfrom_796D_median,empls_employedfrom_796D_mean,empls_employer_name_740M_mode,relatedpersons_role_762T_median,cacccardblochreas_147M_mode,conts_type_509L_mode,credacc_cards_status_52L_median,age,year,month,day_of_year,decision_is_on_weekend
i8,date,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,bool,f32,f32,f32,cat,bool,f32,f32,f32,f32,cat,f32,f32,i8,i16,i8,f32,f32,f32,…,f32,f32,cat,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,cat,cat,cat,cat,f32,cat,f32,f32,f32,cat,cat,cat,f32,f32,cat,f32,cat,cat,f32,f32,i32,i8,i16,i32
0,2019-01-03,0,,,1917.5999755859373,0,0,0,0,0,0,0,,,,,,False,,,,,False,,,0.0,30000.0,"""CAL""",0,0,,17899,,,0,30000.0,…,,,,,0,,,,,,,,,,,,,0,,,,,,,,,,,,,,,32.5315055847168,2019,1,3,0
1,2019-01-03,0,,,3134.0,0,0,0,0,0,0,0,,,,,,False,,,,,False,,,0.0,19999.80078125,"""CAL""",0,0,,17899,,,0,19999.80078125,…,,,,,0,,,,,,,,,,,,,0,,,,,,,,,,,,,,,61.46575164794922,2019,1,3,0
2,2019-01-04,0,,,4937.0,0,0,0,0,0,0,0,,,,,,False,,,,,False,,,,78000.0,"""CAL""",0,0,,17900,,,0,78000.0,…,2071.0,2071.0,,0.0,0,8200.0,8200.0,,,,,18.0,18.0,,,,,0,,18.0,18.0,,,,,,,,,,,,44.12328720092773,2019,1,4,0
3,2019-01-03,0,,,4643.60009765625,0,0,1,0,2,0,1,,,,,,False,,,,,False,,,0.0,40000.0,"""CAL""",0,0,,17899,,,0,40000.0,…,-35.0,-35.0,,0.0,0,11000.0,11000.0,,,,,12.0,12.0,,,,,0,,12.0,12.0,,,,,,,,,,,,25.44109535217285,2019,1,3,0
4,2019-01-04,1,,,3390.199951171875,0,0,1,0,0,0,1,,,,,,False,,,,,False,,,0.0,44000.0,"""CAL""",0,0,,17900,,,0,44000.0,…,-35.0,-35.0,,0.0,0,16000.0,16000.0,,,,,24.0,24.0,,,,,0,,24.0,24.0,,,,,,,,,,,,25.024658203125,2019,1,4,0


In [63]:
# data = data.drop([col for col in data.columns if col.split("_")[-1] in ["first", "last", "mode"]])

In [64]:
## Drop categorical columns with just one category other than null
categorical_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pl.Categorical)]

cols_to_drop = []
if categorical_cols:
    # Count the distinct non-null values of all categorical columns in ONE query.
    # Collecting once per column would re-evaluate the whole lazy pipeline each time.
    category_counts = (
        data.select(pl.col(categorical_cols).drop_nulls().n_unique())
        .collect()
        .row(0, named=True)
    )
    # Exactly one category: constant wherever it is present. Columns with zero
    # non-null values are deliberately kept, as before.
    cols_to_drop = [col for col, n_categories in category_counts.items() if n_categories == 1]

data = data.drop(cols_to_drop)

In [65]:
# Exploratory lookup: list the columns whose name contains this feature identifier.
[col for col in data.columns if "dtlastpmtallstes_3545839D" in col]

['dtlastpmtallstes_3545839D_median', 'dtlastpmtallstes_3545839D_mean']

## Fill nulls for each column

In [66]:
# Column-specific null handling, applied in a single `with_columns` call. All expressions
# read the frame as it was BEFORE this call, so an `_is_null` flag still sees the original
# nulls even when the same column is also filled below.
# Every output name must be unique: polars rejects duplicate output names in `with_columns`.
data = data.with_columns(
    # --- 1/0 indicator columns: 1 where the source column is null -----------------------
    pl.when(pl.col("posfpd30lastmonth_3976960P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("posfpd30lastmonth_3976960P_is_null"),
    pl.when(pl.col("empls_employedfrom_796D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("empls_employedfrom_796D_is_null"),
    pl.when(pl.col("posfpd10lastmonth_333P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("posfpd10lastmonth_333P_is_null"), # or -1.
    pl.when(pl.col("lastdelinqdate_224D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("lastdelinqdate_224D_is_null"),
    pl.when(pl.col("dtlastpmtallstes_4499206D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("dtlastpmtallstes_4499206D_is_null"),
    pl.when(pl.col("avgdbdtollast24m_4525197P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("avgdbdtollast24m_4525197P_is_null"),
    pl.when(pl.col("mindbdtollast24m_4525191P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("mindbdtollast24m_4525191P_is_null"),
    pl.when(pl.col("maxdbddpdlast1m_3658939P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("maxdbddpdlast1m_3658939P_is_null"),
    pl.when(pl.col("empl_employedfrom_271D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("empl_employedfrom_271D_is_null"),
    pl.when(pl.col("employedfrom_700D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("employedfrom_700D_is_null"),
    pl.when(pl.col("avgdbddpdlast3m_4187120P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("avgdbddpdlast3m_4187120P_is_null"),
    pl.when(pl.col("datelastunpaid_3546854D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("datelastunpaid_3546854D_is_null"),
    pl.when(pl.col("maxdbddpdtollast6m_4187119P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("maxdbddpdtollast6m_4187119P_is_null"),
    pl.when(pl.col("responsedate_4527233D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("responsedate_4527233D_is_null"),
    pl.when(pl.col("lastrejectdate_50D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("lastrejectdate_50D_is_null"),
    pl.when(pl.col("datefirstoffer_1144D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("datefirstoffer_1144D_is_null"),
    pl.when(pl.col("maxdpdinstldate_3546855D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("maxdpdinstldate_3546855D_is_null"),
    pl.when(pl.col("responsedate_1012D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("responsedate_1012D_is_null"),
    pl.when(pl.col("dtlastpmtallstes_3545839D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("dtlastpmtallstes_3545839D_is_null"),
    pl.when(pl.col("dateactivated_425D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("dateactivated_425D_is_null"),
    pl.when(pl.col("maxdbddpdtollast12m_3658940P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("maxdbddpdtollast12m_3658940P_is_null"),
    pl.when(pl.col("lastapplicationdate_877D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("lastapplicationdate_877D_is_null"),
    pl.when(pl.col("posfstqpd30lastmonth_3976962P").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("posfstqpd30lastmonth_3976962P_is_null"),
    pl.when(pl.col("creationdate_885D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("creationdate_885D_is_null"),
    pl.when(pl.col("firstdatedue_489D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("firstdatedue_489D_is_null"),
    pl.when(pl.col("firstnonzeroinstldate_307D_median").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("firstnonzeroinstldate_307D_is_null"),
    pl.when(pl.col("lastapprdate_640D").is_null()).then(pl.lit(1)).otherwise(pl.lit(0)).alias("lastapprdate_640D_is_null"),

    # --- nulls replaced by -1 -------------------------------------------------------------
    pl.col("eir_270L").fill_null(value=-1.),
    pl.col("interestrate_311L").fill_null(value=-1.),
    pl.col("totinstallast1m_4525188A").fill_null(value=-1.),
    pl.col("maxlnamtstart6m_4525199A").fill_null(value=-1.),
    pl.col("avgpmtlast12m_4525200A").fill_null(value=-1.),
    pl.col("amtinstpaidbefduel24m_4187115A").fill_null(value=-1.),
    # pl.col("responsedate_4527233D").fill_null(value=1.),

    # --- nulls replaced by 0 --------------------------------------------------------------
    pl.col("pmtscount_423L").fill_null(value=0.),
    pl.col("pmtssum_45A").fill_null(value=0.),
    pl.col("maxpmtlast3m_4525190A").fill_null(value=0.),
    pl.col("numinstmatpaidtearly2d_4499204L").fill_null(value=0.),
    pl.col("numinstpaid_4499208L").fill_null(value=0.),
    pl.col("numinstpaidearly5dobd_4499205L").fill_null(value=0.),
    pl.col("numinstpaidearly3dest_4493216L").fill_null(value=0.),
    pl.col("numinstpaidearly5dest_4493211L").fill_null(value=0.),
    pl.col("numinstpaidearlyest_4493214L").fill_null(value=0.),
    pl.col("numinstregularpaidest_4493210L").fill_null(value=0.),
    pl.col("numinsttopaygrest_4493213L").fill_null(value=0.),
    pl.col("numinstunpaidmaxest_4493212L").fill_null(value=0.),
    pl.col("sumoutstandtotalest_4493215A").fill_null(value=0.),
    pl.col("maxdpdinstlnum_3546846P").fill_null(value=0.),
    pl.col("lastrejectcredamount_222A").fill_null(value=0.),
    pl.col("avgmaxdpdlast9m_3716943P").fill_null(value=0.),
    pl.col("maxdpdtolerance_577P_median").fill_null(value=0.),
    pl.col("maxdpdtolerance_577P_mean").fill_null(value=0.),
    pl.col("maxdbddpdtollast12m_3658940P").fill_null(value=0.),
    pl.col("approvaldate_319D_median").fill_null(value=0.),
    pl.col("approvaldate_319D_mean").fill_null(value=0.),
    pl.col("outstandingdebt_522A_median").fill_null(value=0.),
    pl.col("outstandingdebt_522A_mean").fill_null(value=0.),
    pl.col("currdebt_94A_median").fill_null(value=0.),
    pl.col("currdebt_94A_mean").fill_null(value=0.),
    pl.col("numinstpaidlastcontr_4325080L").fill_null(value=0.),
    pl.col("avginstallast24m_3658937A").fill_null(value=0.),
    pl.col("maxoutstandbalancel12m_4187113A").fill_null(value=0.),
    pl.col("maxinstallast24m_3658928A").fill_null(value=0.),
    pl.col("avgdbddpdlast24m_3658932P").fill_null(value=0.),
    pl.col("mindbddpdlast24m_3658935P").fill_null(value=0.),
    pl.col("numinstlswithdpd5_4187116L").fill_null(value=0.),
    pl.col("maininc_215A").fill_null(value=0.),
    pl.col("avgdpdtolclosure24_3658938P").fill_null(value=0.),
    pl.col("pctinstlsallpaidlat10d_839L").fill_null(value=0.),
    pl.col("cntpmts24_3658933L").fill_null(value=0.),
    pl.col("pctinstlsallpaidlate6d_3546844L").fill_null(value=0.),
    pl.col("pctinstlsallpaidlate4d_3546849L").fill_null(value=0.),
    pl.col("pctinstlsallpaidearl3d_427L").fill_null(value=0.),
    pl.col("pctinstlsallpaidlate1d_3546856L").fill_null(value=0.),
    pl.col("numinstlswithdpd10_728L").fill_null(value=0.),
    pl.col("numinstlswithoutdpd_562L").fill_null(value=0.),
    pl.col("numinstregularpaid_973L").fill_null(value=0.),
    pl.col("cntincpaycont9m_3716944L").fill_null(value=0.),
    pl.col("numincomingpmts_3546848L").fill_null(value=0.),
    pl.col("lastactivateddate_801D").fill_null(value=0.),
    pl.col("daysoverduetolerancedd_3976961L").fill_null(value=0.),
    pl.col("numinsttopaygr_769L").fill_null(value=0.),
    pl.col("numinstunpaidmax_3546851L").fill_null(value=0.),
    pl.col("monthsannuity_845L").fill_null(value=0.),
    pl.col("numinstpaidearly5d_1087L").fill_null(value=0.),
    pl.col("numinstpaidearly_338L").fill_null(value=0.),
    pl.col("numinstpaidlate1d_3546852L").fill_null(value=0.),
    pl.col("numinstpaidearly3d_3546850L").fill_null(value=0.),
    pl.col("numinstlallpaidearly3d_817L").fill_null(value=0.),
    pl.col("numinstlsallpaid_934L").fill_null(value=0.),
    pl.col("lastapprcredamount_781A").fill_null(value=0.),
    pl.col("sumoutstandtotal_3546847A").fill_null(value=0.),
    pl.col("actualdpdtolerance_344P").fill_null(value=0.),
    pl.col("pmtnum_8L_median").fill_null(value=0.),
    pl.col("pmtnum_8L_mean").fill_null(value=0.),
    pl.col("tenor_203L_median").fill_null(value=0.),
    pl.col("tenor_203L_mean").fill_null(value=0.),
    pl.col("commnoinclast6m_3546845L").fill_null(value=0.),
    # Listed once only: a second expression with the same output name is a duplicate error.
    pl.col("maxdpdfrom6mto36m_3546853P").fill_null(value=0.),
    pl.col("mainoccupationinc_437A_median").fill_null(value=0.),
    pl.col("mainoccupationinc_437A_mean").fill_null(value=0.),
    pl.col("maxannuity_159A").fill_null(value=0.),
    pl.col("maxdebt4_972A").fill_null(value=0.),
    pl.col("maxdpdlast12m_727P").fill_null(value=0.),
    pl.col("maxdpdlast24m_143P").fill_null(value=0.),
    pl.col("maxdpdlast3m_392P").fill_null(value=0.),
    pl.col("maxdpdlast6m_474P").fill_null(value=0.),
    pl.col("maxdpdlast9m_1059P").fill_null(value=0.),
    pl.col("maxdpdtolerance_374P").fill_null(value=0.),
    pl.col("opencred_647L").fill_null(value=0.),
    pl.col("price_1097A").fill_null(value=0.),
    pl.col("days120_123L").fill_null(value=0.),
    pl.col("days180_256L").fill_null(value=0.),
    pl.col("days30_165L").fill_null(value=0.),
    pl.col("days360_512L").fill_null(value=0.),
    pl.col("days90_310L").fill_null(value=0.),
    pl.col("firstquarter_103L").fill_null(value=0.),
    pl.col("fourthquarter_440L").fill_null(value=0.),
    pl.col("numberofqueries_373L").fill_null(value=0.),
    pl.col("secondquarter_766L").fill_null(value=0.),
    pl.col("thirdquarter_1082L").fill_null(value=0.),
    pl.col("pmtnum_254L").fill_null(value=0.),
    # Full column name including its "_57A" suffix, as it appears in the frame's schema.
    pl.col("annuitynextmonth_57A").fill_null(value=0.),
    pl.col("currdebt_22A").fill_null(value=0.),
    pl.col("currdebtcredtyperange_828A").fill_null(value=0.),
    pl.col("numinstls_657L").fill_null(value=0.),
    pl.col("totalsettled_863A").fill_null(value=0.),
    pl.col("totaldebt_9A").fill_null(value=0.),

    # --- nulls replaced by 1 --------------------------------------------------------------
    pl.col("firstclxcampaign_1125D").fill_null(value=1.),

    # --- nulls replaced by the "NO_DEDUCTION" category ------------------------------------
    pl.col("requesttype_4525192L").fill_null(value="NO_DEDUCTION"),

    # --- nulls replaced by the "a55475b1" category ----------------------------------------
    pl.col("postype_4733339M_mode").fill_null(value="a55475b1"),
    pl.col("profession_152M_mode").fill_null(value="a55475b1"),
    pl.col("conts_role_79M_mode").fill_null(value="a55475b1"),
    pl.col("empls_economicalst_849M_mode").fill_null(value="a55475b1"),
    pl.col("cacccardblochreas_147M_mode").fill_null(value="a55475b1"),
    pl.col("description_5085714M").fill_null(value="a55475b1"),
    pl.col("education_1103M").fill_null(value="a55475b1"),
    pl.col("education_88M").fill_null(value="a55475b1"),
    pl.col("maritalst_385M").fill_null(value="a55475b1"),
    pl.col("maritalst_893M").fill_null(value="a55475b1"),

    # --- nulls replaced by an explicit "UNKNOWN" category ---------------------------------
    pl.col("conts_type_509L_mode").fill_null(value="UNKNOWN"),
    pl.col("incometype_1044T_mode").fill_null(value="UNKNOWN"),
    pl.col("empl_industry_691L_mode").fill_null(value="UNKNOWN"),
    pl.col("credtype_587L_mode").fill_null(value="UNKNOWN"),
    pl.col("inittransactioncode_279L_mode").fill_null(value="UNKNOWN"),
    pl.col("status_219L_mode").fill_null(value="UNKNOWN"),
    pl.col("rejectreason_755M_mode").fill_null(value="UNKNOWN"),
    pl.col("cancelreason_3545846M_mode").fill_null(value="UNKNOWN"),
    pl.col("rejectreasonclient_4145042M_mode").fill_null(value="UNKNOWN"),
    pl.col("lastst_736L").fill_null(value="UNKNOWN"),
    pl.col("role_1084L_mode").fill_null(value="UNKNOWN"),
    pl.col("disbursementtype_67L").fill_null(value="UNKNOWN"),
    pl.col("twobodfilling_608L").fill_null(value="UNKNOWN"),
    pl.col("credtype_322L").fill_null(value="UNKNOWN"),
    pl.col("inittransactioncode_186L").fill_null(value="UNKNOWN"),

    # pl.col("responsedate_1012D").fill_null(value=1.),
    # KNN impute -----> avgoutstandbalancel6m_4187114A, maxoutstandbalancel12m_4187113A
    # TOCHECK    -----> mastercontrelectronic_519L, Mastercontrexist_109L, isbidproduct_390L, relatedpersons_role_762T, credacc_cards_status_52L_median

# NOTE(review): the frame displayed earlier lists "addres_role_871L_median", not
# "addres_role_871L" — confirm which column is meant to be dropped here.
).drop(["responsedate_4917613D", "addres_zip_823M_mode", "addres_role_871L"])

In [115]:
# Categorical columns whose nulls are filled with the "UNKNOWN" category.
unknown_columns: list[str] = [
    "conts_type_509L_mode",
    "incometype_1044T_mode",
    "empl_industry_691L_mode",
    "credtype_587L_mode",
    "inittransactioncode_279L_mode",
    "status_219L_mode",
    "rejectreason_755M_mode",
    "cancelreason_3545846M_mode",
    "rejectreasonclient_4145042M_mode",
    "lastst_736L",
    "role_1084L_mode",
    "disbursementtype_67L",
    "twobodfilling_608L",
    "credtype_322L",
    "inittransactioncode_186L",
]

# Column whose nulls are filled with the "NO_DEDUCTION" category.
no_deduction_columns: list[str] = ["requesttype_4525192L"]

# Column whose nulls are filled with 1.
fill_one_columns: list[str] = ["firstclxcampaign_1125D"]

# Categorical columns whose nulls are filled with the "a55475b1" category.
a55475b1_columns: list[str] = [
    "postype_4733339M_mode",
    "profession_152M_mode",
    "conts_role_79M_mode",
    "empls_economicalst_849M_mode",
    "cacccardblochreas_147M_mode",
    "description_5085714M",
    "education_1103M",
    "education_88M",
    "maritalst_385M",
    "maritalst_893M",
]

# Columns whose nulls are filled with 0.
# Every name must appear exactly once: using the list to build one fill expression per
# column would otherwise produce two expressions with the same output name.
fill_zero_columns: list[str] = [
    "pmtscount_423L",
    "pmtssum_45A",
    "maxpmtlast3m_4525190A",
    "numinstmatpaidtearly2d_4499204L",
    "numinstpaid_4499208L",
    "numinstpaidearly5dobd_4499205L",
    "numinstpaidearly3dest_4493216L",
    "numinstpaidearly5dest_4493211L",
    "numinstpaidearlyest_4493214L",
    "numinstregularpaidest_4493210L",
    "numinsttopaygrest_4493213L",
    "numinstunpaidmaxest_4493212L",
    "sumoutstandtotalest_4493215A",
    "maxdpdinstlnum_3546846P",
    "lastrejectcredamount_222A",
    "avgmaxdpdlast9m_3716943P",
    "maxdpdtolerance_577P_median",
    "maxdpdtolerance_577P_mean",
    "maxdbddpdtollast12m_3658940P",
    "approvaldate_319D_median",
    "approvaldate_319D_mean",
    "outstandingdebt_522A_median",
    "outstandingdebt_522A_mean",
    "currdebt_94A_median",
    "currdebt_94A_mean",
    "numinstpaidlastcontr_4325080L",
    "avginstallast24m_3658937A",
    "maxoutstandbalancel12m_4187113A",
    "maxinstallast24m_3658928A",
    "avgdbddpdlast24m_3658932P",
    "mindbddpdlast24m_3658935P",
    "numinstlswithdpd5_4187116L",
    "maininc_215A",
    "avgdpdtolclosure24_3658938P",
    "pctinstlsallpaidlat10d_839L",
    "cntpmts24_3658933L",
    "pctinstlsallpaidlate6d_3546844L",
    "pctinstlsallpaidlate4d_3546849L",
    "pctinstlsallpaidearl3d_427L",
    "pctinstlsallpaidlate1d_3546856L",
    "numinstlswithdpd10_728L",
    "numinstlswithoutdpd_562L",
    "numinstregularpaid_973L",
    "cntincpaycont9m_3716944L",
    "numincomingpmts_3546848L",
    "lastactivateddate_801D",
    "daysoverduetolerancedd_3976961L",
    "numinsttopaygr_769L",
    "numinstunpaidmax_3546851L",
    "monthsannuity_845L",
    "numinstpaidearly5d_1087L",
    "numinstpaidearly_338L",
    "numinstpaidlate1d_3546852L",
    "numinstpaidearly3d_3546850L",
    "numinstlallpaidearly3d_817L",
    "numinstlsallpaid_934L",
    "lastapprcredamount_781A",
    "sumoutstandtotal_3546847A",
    "actualdpdtolerance_344P",
    "pmtnum_8L_median",
    "pmtnum_8L_mean",
    "tenor_203L_median",
    "tenor_203L_mean",
    "commnoinclast6m_3546845L",
    "maxdpdfrom6mto36m_3546853P",
    "mainoccupationinc_437A_median",
    "mainoccupationinc_437A_mean",
    "maxannuity_159A",
    "maxdebt4_972A",
    "maxdpdlast12m_727P",
    "maxdpdlast24m_143P",
    "maxdpdlast3m_392P",
    "maxdpdlast6m_474P",
    "maxdpdlast9m_1059P",
    "maxdpdtolerance_374P",
    "opencred_647L",
    "price_1097A",
    "days120_123L",
    "days180_256L",
    "days30_165L",
    "days360_512L",
    "days90_310L",
    "firstquarter_103L",
    "fourthquarter_440L",
    "numberofqueries_373L",
    "secondquarter_766L",
    "thirdquarter_1082L",
    "pmtnum_254L",
    # Full column name including its "_57A" suffix, as it appears in the frame's schema.
    "annuitynextmonth_57A",
    "currdebt_22A",
    "currdebtcredtyperange_828A",
    "numinstls_657L",
    "totalsettled_863A",
    "totaldebt_9A",
]

# Columns whose nulls are filled with -1.
fill_minus_one_columns: list[str] = [
    "eir_270L",
    "interestrate_311L",
    "totinstallast1m_4525188A",
    "maxlnamtstart6m_4525199A",
    "avgpmtlast12m_4525200A",
    "amtinstpaidbefduel24m_4187115A",
]

# Columns for which a null-indicator feature is derived.
indicator_columns: list[str] = [
    "posfpd30lastmonth_3976960P",
    "empls_employedfrom_796D",
    "posfpd10lastmonth_333P",
    "lastdelinqdate_224D",
    "dtlastpmtallstes_4499206D",
    "avgdbdtollast24m_4525197P",
    "mindbdtollast24m_4525191P",
    "maxdbddpdlast1m_3658939P",
    "empl_employedfrom_271D_median",
    "employedfrom_700D_median",
    "avgdbddpdlast3m_4187120P",
    "datelastunpaid_3546854D",
    "maxdbddpdtollast6m_4187119P",
    "responsedate_4527233D",
    "lastrejectdate_50D",
    "datefirstoffer_1144D",
    "maxdpdinstldate_3546855D",
    "responsedate_1012D",
    "dtlastpmtallstes_3545839D_median",
    "dateactivated_425D_median",
    "maxdbddpdtollast12m_3658940P",
    "lastapplicationdate_877D",
    "posfstqpd30lastmonth_3976962P",
    "creationdate_885D_median",
    "firstdatedue_489D",
    "firstnonzeroinstldate_307D_median",
    "lastapprdate_640D",
]

In [116]:
# Sanity check: indicator source columns that do not exist in the frame (should be empty).
# Compared against `data.columns` directly, so the cell does not depend on a `columns`
# variable that no cell of the notebook defines.
set(indicator_columns).difference(data.columns)

{'dateactivated_425D',
 'dtlastpmtallstes_3545839D',
 'empl_employedfrom_271D',
 'employedfrom_700D',
 'firstnonzeroinstldate_307D'}

In [105]:
# Exploratory check: summary statistics (including count and null_count) for one column.
data.select("education_1103M").describe(percentiles=0.5)

statistic,education_1103M
str,str
"""count""","""1500475"""
"""null_count""","""26183"""
"""mean""",
"""std""",
"""min""",
"""50%""",
"""max""",


In [113]:
# Exploratory check: value frequencies of numinstls_657L, most frequent first.
data.select("numinstls_657L").collect().to_series().value_counts().sort("count", descending=True)

numinstls_657L,count
f32,u32
0,1039513
12,111662
24,78337
36,37140
16,34836
6,33501
18,28060
48,24241
30,19731
3,9917


In [70]:
# Exploratory check: list the categories of requesttype_4525192L.
data.select(pl.col("requesttype_4525192L").cat.get_categories()).collect()

  data.select(pl.col("requesttype_4525192L").cat.get_categories()).collect()


requesttype_4525192L
str
"""DEDUCTION_6"""
"""PENSION_6"""
"""SOCIAL_6"""
"""NO_DEDUCTION"""


In [71]:
# NOTE(review): this expression always raises ZeroDivisionError, so a top-to-bottom
# "Run All" stops here and the cells below are not executed. Presumably a deliberate
# stop marker while the null handling above is being reworked — confirm, and remove
# it before the notebook is shared.
1/0

ZeroDivisionError: division by zero

In [None]:
# Materialise the lazy pipeline: drop the identifier and remove duplicated rows.
# `maintain_order=True` keeps the row order deterministic, so that the seeded noise below
# is attached to the same rows on every run.
data: pl.DataFrame = data.drop(["case_id"]).unique(maintain_order=True).collect()
# Append a standard-normal "noise" column drawn from a generator seeded with SEED, so the
# notebook is reproducible (the global NumPy RNG used previously was never seeded).
# NOTE(review): presumably a baseline for judging feature importance — confirm.
data: pl.DataFrame = data.with_columns(
    pl.Series("noise", np.random.default_rng(SEED).normal(size=data.height))
)

In [None]:
def split_data(data: pl.DataFrame):
    """Split `data` into train/validation parts and return pandas model inputs.

    The split itself is delegated to `split_train_validation` with
    ``test_months=0.35``. The `target` column becomes the label series; `target`
    and `date_decision` are removed from the feature frames, which are converted
    to pandas and passed through `convert_object_to_categorical`.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid)``.
    """
    non_feature_columns = ["target", "date_decision"]
    train_part, valid_part = split_train_validation(data, test_months=0.35)

    # Labels as pandas Series.
    y_train = train_part.select("target").to_series().to_pandas()
    y_valid = valid_part.select("target").to_series().to_pandas()

    # Features: strip label/date columns, move to pandas, then fix dtypes.
    train_features = train_part.drop(non_feature_columns)
    valid_features = valid_part.drop(non_feature_columns)
    x_train = convert_object_to_categorical(train_features.to_pandas())
    x_valid = convert_object_to_categorical(valid_features.to_pandas())

    return x_train, y_train, x_valid, y_valid

## Model training

In [None]:
# import numpy as np
# from scipy import optimize
# from scipy import special
# 
# class FocalLoss:
# 
#     def __init__(self, gamma, alpha=None):
#         self.alpha = alpha
#         self.gamma = gamma
# 
#     def at(self, y):
#         if self.alpha is None:
#             return np.ones_like(y)
#         return np.where(y, self.alpha, 1 - self.alpha)
# 
#     def pt(self, y, p):
#         p = np.clip(p, 1e-15, 1 - 1e-15)
#         return np.where(y, p, 1 - p)
# 
#     def __call__(self, y_true, y_pred):
#         at = self.at(y_true)
#         pt = self.pt(y_true, y_pred)
#         return -at * (1 - pt) ** self.gamma * np.log(pt)
# 
#     def grad(self, y_true, y_pred):
#         y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
#         at = self.at(y_true)
#         pt = self.pt(y_true, y_pred)
#         g = self.gamma
#         return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)
# 
#     def hess(self, y_true, y_pred):
#         y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
#         at = self.at(y_true)
#         pt = self.pt(y_true, y_pred)
#         g = self.gamma
# 
#         u = at * y * (1 - pt) ** g
#         du = -at * y * g * (1 - pt) ** (g - 1)
#         v = g * pt * np.log(pt) + pt - 1
#         dv = g * np.log(pt) + g + 1
# 
#         return (du * v + u * dv) * y * (pt * (1 - pt))
# 
#     def init_score(self, y_true):
#         res = optimize.minimize_scalar(
#             lambda p: self(y_true, p).sum(),
#             bounds=(0, 1),
#             method='bounded'
#         )
#         p = res.x
#         log_odds = np.log(p / (1 - p))
#         return log_odds
# 
#     def lgb_obj(self, preds, train_data):
#         y = train_data.get_label()
#         p = special.expit(preds)
#         return self.grad(y, p), self.hess(y, p)
# 
#     def lgb_eval(self, preds, train_data):
#         y = train_data.get_label()
#         p = special.expit(preds)
#         is_higher_better = False
#         return 'focal_loss', self(y, p).mean(), is_higher_better
# 
# loss = FocalLoss(alpha=None, gamma=0)
# loss.init_score(y_train)

In [None]:
import logging

# Silence every logger that is registered at this point: each one is disabled and
# stops forwarding records to its parent.
loggers = list(map(logging.getLogger, logging.root.manager.loggerDict))
for logger in loggers:
    logger.propagate = False
    logger.disabled = True


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, roc_auc_score

def store_metrics(model: LogisticRegression | lgb.LGBMModel, x_valid: pd.DataFrame, y_valid: pd.Series, valid_sample_weight: np.ndarray) -> None:
    """Log validation plots and confusion-matrix metrics to the active MLflow run.

    Parameters
    ----------
    model
        Fitted classifier exposing `predict`, `predict_proba` and `classes_`.
    x_valid, y_valid
        Validation features and labels.
    valid_sample_weight
        Per-row weights applied to every plot and to the confusion matrix.

    Notes
    -----
    The confusion matrix is unpacked into exactly four counts, so a two-class
    problem is assumed. Every metric is rounded to 3 decimals before logging.
    """
    hard_labels = model.predict(x_valid)
    positive_scores = model.predict_proba(x_valid)[:, 1]

    # Diagnostic plots, logged in the order: confusion matrix, PR curve, ROC curve.
    confusion_display = ConfusionMatrixDisplay.from_predictions(
        y_true=y_valid,
        y_pred=hard_labels,
        normalize="all",
        sample_weight=valid_sample_weight,
    )
    mlflow.log_figure(confusion_display.figure_, artifact_file="confusion_matrix.png")

    curve_artifacts = (
        (PrecisionRecallDisplay, "precision_recall_curve.png"),
        (RocCurveDisplay, "roc_curve.png"),
    )
    for display_cls, artifact_name in curve_artifacts:
        curve_display = display_cls.from_predictions(
            y_true=y_valid,
            y_pred=positive_scores,
            sample_weight=valid_sample_weight,
        )
        mlflow.log_figure(curve_display.figure_, artifact_file=artifact_name)

    # Weighted confusion counts, flattened row by row.
    tn, fp, fn, tp = confusion_matrix(
        y_true=y_valid,
        y_pred=hard_labels,
        labels=model.classes_,
        sample_weight=valid_sample_weight,
    ).ravel()

    eps = 1e-15  # added to every denominator so none of them can be zero

    # Base rates that the derived metrics below reuse.
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    specificity = tn / (tn + fp + eps)
    false_positive_rate = fp / (fp + tn + eps)
    false_negative_rate = fn / (fn + tp + eps)
    negative_predictive_value = tn / (tn + fn + eps)

    raw_metrics = {
        "accuracy": (tp + tn) / (tp + tn + fp + fn + eps),
        "balanced_accuracy": (recall + specificity) / 2,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "f1": 2 * precision * recall / (precision + recall + eps),
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
        "false_omission_rate": fn / (tn + fn + eps),
        "false_discovery_rate": fp / (tp + fp + eps),
        "positive_likelihood_ratio": recall / (false_positive_rate + eps),
        "negative_likelihood_ratio": false_negative_rate / (specificity + eps),
        "negative_predictive_value": negative_predictive_value,
        "markedness": precision + negative_predictive_value - 1,
        "mcc": (tp * tn - fp * fn) / (((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5 + eps),
        "fowlkes_mallows_index": tp / (((tp + fp) * (tp + fn)) ** 0.5 + eps),
    }
    # ROC AUC is intentionally not part of this dictionary.

    mlflow.log_metrics(metrics={name: round(value, 3) for name, value in raw_metrics.items()})

def lightgbm_model_train(
    x_train: pd.DataFrame,
    y_train: pd.Series,
    x_valid: pd.DataFrame,
    y_valid: pd.Series,
    train_weights: np.ndarray | None = None,
    valid_weights: np.ndarray | None = None,
) -> tuple[lgb.LGBMModel, pd.Series]:
    """Fit a LightGBM classifier inside a nested MLflow run and log its diagnostics.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    x_valid, y_valid
        Validation features and labels; also used for early stopping.
    train_weights, valid_weights
        Optional per-row sample weights. When omitted, class-balanced weights are
        computed from the corresponding labels, so the function no longer depends
        on variables that happen to exist in the notebook namespace.

    Returns
    -------
    tuple
        The fitted model and a Series (indexed by feature name, sorted ascending)
        holding the weighted mean of the per-row feature contributions on the
        validation set.
    """
    # Resolve the weights locally instead of reading notebook-level globals.
    if train_weights is None:
        train_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    if valid_weights is None:
        valid_weights = compute_sample_weight(class_weight="balanced", y=y_valid)

    # Filled by the record_evaluation callback; the learning curves below are read
    # from `model.evals_result_` instead.
    eval_results = {}

    # val_counts = y_train.value_counts(normalize = True)
    # scale_pos_weight = val_counts.loc[0] / val_counts.loc[1]
    # 
    # loss = FocalLoss(alpha=None, gamma=0)
    # 
    # train_init_score = np.full_like(y_train, loss.init_score(y_train), dtype=float)
    # valid_init_score = np.full_like(y_valid, loss.init_score(y_valid), dtype=float)

    # train_init_score = np.full_like(y_train, 0., dtype=float)
    # valid_init_score = np.full_like(y_valid, 0., dtype=float)

    with mlflow.start_run(nested=True):
        mlflow.log_param(key="num_columns", value=len(x_train.columns))

        model = lgb.LGBMClassifier(
            random_state=SEED,
            n_jobs=-1,
            boosting_type='gbdt',
            num_leaves=128,
            max_depth=8,
            learning_rate=0.1,
            n_estimators=1_000,
            subsample_for_bin=200_000,
            objective="binary",
            class_weight=None,
            min_split_gain=0.0,
            min_child_weight=0.001,
            min_child_samples=10_000, # min_data_in_leaf
            subsample=0.5,
            subsample_freq=1,
            colsample_bytree=0.5,
            # reg_alpha=10.0,
            # reg_lambda=10.0,
            importance_type='gain',
            device="cpu",
            deterministic=True,
            verbose=-1,
            # extra_trees=True,
            # extra_seed=SEED
        )

        model.fit(
            X=x_train,
            y=y_train,
            sample_weight=train_weights,
            # init_score=train_init_score,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            # With first_metric_only=True, early stopping watches the first metric listed.
            eval_metric=["binary_logloss", "average_precision", "auc"],
            eval_sample_weight=[train_weights, valid_weights],
            # eval_init_score=[train_init_score, valid_init_score],
            callbacks=[
                # lgb.log_evaluation(),
                lgb.record_evaluation(eval_results),
                lgb.early_stopping(stopping_rounds=100, first_metric_only=True),
            ],
        )

        store_metrics(model, valid_sample_weight=valid_weights, x_valid=x_valid, y_valid=y_valid)

        # One learning-curve figure per evaluation metric.
        fig = get_learning_curves(model, metric="binary_logloss")
        mlflow.log_figure(fig, artifact_file="learning_curve_logloss.png")

        fig = get_learning_curves(model, metric="average_precision")
        mlflow.log_figure(fig, artifact_file="learning_curve_average_precision.png")

        fig = get_learning_curves(model, metric="auc")
        mlflow.log_figure(fig, artifact_file="learning_curve_auc.png")

        print("\ncomputing shap importances")
        # Per-row feature contributions with the last column dropped (presumably the
        # base/expected value — confirm against the LightGBM docs).
        # NOTE(review): this is a weighted mean of SIGNED contributions, not of their
        # absolute values, so positive and negative effects can cancel out.
        shap_importances = np.average(model.predict(x_valid, pred_contrib=True)[:, :-1], axis=0, weights=valid_weights)
        shap_importances = pd.Series(shap_importances, index=x_valid.columns, name="shap_values").sort_values(ascending=True)
        print("finished computing shap importances")

        mlflow.log_dict(shap_importances.to_dict(), artifact_file="shap_importances.json")
        mlflow.log_figure(shap_importances.plot(kind="barh"), artifact_file="shap_importances.png")

    return model, shap_importances

In [None]:
# Remove every column whose name ends with "mean" plus `mobilephncnt_593L`,
# then hand the reduced frame to `fill_nulls`.
data = fill_nulls(
    data
    .drop([name for name in data.columns if name.endswith("mean")])
    .drop(["mobilephncnt_593L"])
)

In [None]:
mlflow.set_experiment("simple_model_full_data")
# True: drop features whose SHAP value is <= 0.
# False: drop features whose model importance does not exceed that of the `noise` column.
SPLIT_BY_SHAP = False

mlflow.lightgbm.autolog(
    log_input_examples=False,
    log_model_signatures=True,
    log_models=True,
    log_datasets=False,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    registered_model_name=None,
    extra_tags=None
)

# Iterative feature elimination: retrain, drop the weak features, and repeat until a
# round finds nothing left to drop. Each round is logged as a nested MLflow run.
with mlflow.start_run():
    i = 0
    cols_to_drop = []       # cumulative list of every column removed so far
    new_cols_to_drop = []   # columns selected for removal by the previous round
    while True:
        print(f"\nTraining iteration {i}")
        # The remaining-column count must use the cumulative list, not only the
        # columns selected in the last round.
        print(f"num columns to drop: {len(new_cols_to_drop)} - new columns: {len(data.columns)-len(cols_to_drop)}")

        data_iteration = data.drop(cols_to_drop)
        x_train, y_train, x_valid, y_valid = split_data(data_iteration)
        # Class-balanced sample weights for both splits.
        train_weights = compute_sample_weight(y=y_train, class_weight="balanced")
        valid_weights = compute_sample_weight(y=y_valid, class_weight="balanced")

        lgb_model, shap_importances = lightgbm_model_train(x_train=x_train, y_train=y_train, x_valid=x_valid, y_valid=y_valid)

        if SPLIT_BY_SHAP:
            new_cols_to_drop = list(shap_importances.index[(shap_importances <= 0)])
        else:
            feature_importances: pd.DataFrame = pd.DataFrame({"column": lgb_model.feature_name_, "importance": lgb_model.feature_importances_})
            noise_importance: float = feature_importances.loc[feature_importances["column"] == "noise", "importance"].item()
            # Compare against the numeric threshold itself. The previous query string
            # compared `importance` with the literal text 'noise_importance'.
            is_weak = (feature_importances["importance"] <= noise_importance) & (feature_importances["column"] != "noise")
            new_cols_to_drop = feature_importances.loc[is_weak, "column"].tolist()

        # The noise column is the reference feature and must never be dropped.
        new_cols_to_drop = [col for col in new_cols_to_drop if col != "noise"]

        if not new_cols_to_drop:
            print("No more features to drop")
            break
        else:
            cols_to_drop.extend(new_cols_to_drop)
            i += 1