In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os
import gc
dataPath = '/kaggle/input/home-credit-credit-risk-model-stability/'
# for dirname, _, filenames in os.walk(dataPath + 'csv_files/'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [2]:
# train_basetable = pd.read_csv(dataPath + "csv_files/train/train_base.csv")
# train_basetable.info()

In [3]:
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv"),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv"),
    ],
    how="vertical_relaxed", # coerces column into common supertype (e.g. int32->int64)
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv")
train_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")

In [4]:
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv"),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv"),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv"),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv")
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")

In [5]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or 
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[:4] in ("clie", "days"):
        print(col)
        continue
    else:
        selected_static_cols.append(col)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[:4] in ("clie", "days"):
        print(col)
        continue
    else:
        selected_static_cb_cols.append(col)

# Join all tables together.
data = train_basetable.join(
    train_static.select(selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)

del train_basetable
del train_static
del train_static_cb
del train_person_1_feats_1
del train_person_1_feats_2
del train_credit_bureau_b_2_feats
gc.collect()

clientscnt12m_3712952L
clientscnt3m_3712950L
clientscnt6m_3712949L
clientscnt_100L
clientscnt_1022L
clientscnt_1071L
clientscnt_1130L
clientscnt_136L
clientscnt_157L
clientscnt_257L
clientscnt_304L
clientscnt_360L
clientscnt_493L
clientscnt_533L
clientscnt_887L
clientscnt_946L
daysoverduetolerancedd_3976961L
days120_123L
days180_256L
days30_165L
days360_512L
days90_310L


0

In [6]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

data_submission = test_basetable.join(
    test_static.select(selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

del test_basetable
del test_static
del test_static_cb
del test_person_1_feats_1
del test_person_1_feats_2
del test_credit_bureau_b_2_feats
gc.collect()

0

In [7]:
# frac_keep = 0.01 # working with a small percentage of data for easier handling

# data = data.sample(fraction=frac_keep)

In [8]:
# Less relevant columns
irrelevant_cols = ['pmtnum_254L', 'birthdate_574D', 'max_relationshiptoclient_642T', 'max_overdueamountmax2date_1142D', 'mean_numberofoverdueinstlmaxdat_641D', 'mean_pmts_overdue_1140A', 'mean_residualamount_856A', 'last_processingdate_168D', 'last_pmtamount_36A', 'posfstqpd30lastmonth_3976962P', 'mean_pmts_dpd_1073P', 'max_numberofoverdueinstlmaxdat_148D', 'max_nominalrate_281L', 'last_annuity_853A', 'max_pmtnum_8L', 'max_pmtamount_36A', 'mean_overdueamountmax2_398A', 'dateofbirth_337D', 'mean_overdueamountmax2_14A', 'mean_credlmt_935A', 'mean_pmts_dpd_303P', 'last_employedfrom_700D', 'max_periodicityofpmts_837L', 'lastrejectcredamount_222A', 'mean_totalamount_996A', 'max_processingdate_168D', 'mean_instlamount_768A', 'lastrejectdate_50D', 'mean_credlmt_230A', 'last_refreshdate_3813885D', 'responsedate_1012D', 'max_overdueamountmaxdatemonth_365T', 'mean_pmts_overdue_1152A', 'max_annuity_853A', 'mean_outstandingamount_362A', 'max_periodicityofpmts_1102L', 'max_employedfrom_700D', 'posfpd10lastmonth_333P', 'last_pmtnum_8L', 'mean_overdueamountmax2date_1002D', 'max_incometype_1044T', 'mean_overdueamountmax_35A', 'twobodfilling_608L', 'posfpd30lastmonth_3976960P', 'max_dpdmaxdatemonth_89T', 'max_mainoccupationinc_384A', 'mean_dpdmax_757P', 'max_numberofoverdueinstlmax_1151L', 'max_firstnonzeroinstldate_307D', 'days180_256L', 'max_empl_industry_691L', 'max_num_group2_13', 'last_personindex_1023L', 'credamount_770A', 'max_totalamount_6A', 'days30_165L', 'last_firstnonzeroinstldate_307D', 'mean_totalamount_6A', 'responsedate_4527233D', 'mean_dateofcredend_289D', 'max_dpdmaxdateyear_596T', 'mean_monthlyinstlamount_674A', 'max_byoccupationinc_3656910L', 'mean_annuity_853A', 'max_num_group1_15', 'max_overdueamountmaxdateyear_2T', 'last_credamount_590A', 'mean_totaloutstanddebtvalue_39A', 'max_numberofinstls_320L', 'disbursedcredamount_1113A', 'max_nominalrate_498L', 'max_pmts_dpd_303P', 'mean_monthlyinstlamount_332A', 'max_numberofoutstandinstls_59L', 'mean_downpmt_134A', 'mean_lastupdate_388D', 'max_numberofoverdueinstlmax_1039L', 'mean_mainoccupationinc_437A', 'max_num_group1_5', 'mean_dateofrealrepmt_138D', 'max_numberofinstls_229L', 'max_dateofcredstart_181D', 'maxannuity_159A', 'mean_dateofcredend_353D', 'max_monthlyinstlamount_674A', 'mean_firstnonzeroinstldate_307D', 'last_creationdate_885D', 'max_empl_employedtotal_800L', 'max_debtoutstand_525A', 'mean_credamount_590A', 'max_numberofcontrsvalue_358L', 'last_pmts_year_507T', 'max_rejectreason_755M', 'max_num_group1', 'mean_creationdate_885D', 'last_mainoccupationinc_437A', 'pmtscount_423L', 'max_lastupdate_388D', 'last_pmts_year_1139T', 'max_collater_valueofguarantee_1124L', 'max_birth_259D', 'max_dpdmaxdatemonth_442T', 'max_childnum_21L', 'max_mainoccupationinc_437A', 'max_downpmt_134A', 'mean_totaldebtoverduevalue_718A', 'description_5085714M', 'lastapplicationdate_877D', 'mean_credacc_credlmt_575A', 'max_credamount_590A', 'max_num_group2_15', 'last_currdebt_94A', 'lastrejectreason_759M', 'education_1103M', 'inittransactioncode_186L', 'last_credacc_credlmt_575A', 'last_downpmt_134A', 'days360_512L', 'last_num_group1', 'max_lastupdate_1112D', 'max_dateofrealrepmt_138D', 'max_overdueamountmaxdatemonth_284T', 'max_education_1138M', 'mean_overdueamount_659A', 'commnoinclast6m_3546845L', 'sumoutstandtotal_3546847A', 'mean_refreshdate_3813885D', 'price_1097A', 'last_rejectreason_755M', 'mastercontrelectronic_519L', 'month_decision', 'opencred_647L', 'max_debtoverdue_47A', 'mean_lastupdate_1112D', 'max_credacc_credlmt_575A', 'lastapprdate_640D', 'max_outstandingdebt_522A', 'credtype_322L', 'maritalst_385M', 'mean_totaloutstanddebtvalue_668A', 'last_education_1138M', 'max_residualamount_488A', 'max_num_group2_14', 'max_familystate_726L', 'last_num_group2_13', 'last_num_group2_14', 'mean_outstandingdebt_522A', 'last_postype_4733339M', 'max_credtype_587L', 'max_numberofoverdueinstls_725L', 'mobilephncnt_593L', 'last_outstandingdebt_522A', 'fourthquarter_440L', 'max_dpdmaxdateyear_896T', 'max_pmts_year_507T', 'pctinstlsallpaidlate1d_3546856L', 'lastcancelreason_561M', 'max_dateofcredend_353D', 'max_description_351M', 'max_approvaldate_319D', 'max_empls_employer_name_740M', 'max_overdueamountmaxdateyear_994T', 'numinsttopaygr_769L', 'lastapprcredamount_781A', 'last_pmts_month_158T', 'maininc_215A', 'max_refreshdate_3813885D', 'actualdpdtolerance_344P', 'weekday_decision', 'amtinstpaidbefduel24m_4187115A', 'last_cancelreason_3545846M', 'last_credtype_587L', 'last_familystate_726L', 'max_num_group1_13', 'mean_outstandingamount_354A', 'lastrejectcommoditycat_161M', 'last_inittransactioncode_279L', 'last_conts_type_509L', 'mean_approvaldate_319D', 'max_dtlastpmtallstes_3545839D', 'maxdpdfrom6mto36m_3546853P', 'max_remitter_829L', 'thirdquarter_1082L', 'clientscnt_533L', 'applicationscnt_867L', 'mean_currdebt_94A', 'mean_dtlastpmt_581D', 'max_collater_valueofguarantee_876L', 'max_inittransactioncode_279L', 'last_num_group1_6', 'dtlastpmtallstes_4499206D', 'mean_dtlastpmtallstes_3545839D', 'avgoutstandbalancel6m_4187114A', 'max_num_group1_9', 'last_status_219L', 'maxoutstandbalancel12m_4187113A', 'max_numberofcontrsvalue_258L', 'max_status_219L', 'max_currdebt_94A', 'disbursementtype_67L', 'numinstlsallpaid_934L', 'clientscnt_887L', 'maxdpdlast3m_392P', 'max_pmts_year_1139T', 'maxdebt4_972A', 'max_dateactivated_425D', 'applicationscnt_464L', 'clientscnt_1071L', 'max_language1_981M', 'avgpmtlast12m_4525200A', 'cntpmts24_3658933L', 'last_pmts_month_706T', 'max_purposeofcred_874M', 'secondquarter_766L', 'mean_dateactivated_425D', 'max_postype_4733339M', 'requesttype_4525192L', 'numinstpaidlastcontr_4325080L', 'sumoutstandtotalest_4493215A', 'last_approvaldate_319D', 'max_rejectreasonclient_4145042M', 'cntincpaycont9m_3716944L', 'maxdpdlast24m_143P', 'firstquarter_103L', 'last_maxdpdtolerance_577P', 'mean_maxdpdtolerance_577P', 'mean_actualdpd_943P', 'numinstpaidearly3d_3546850L', 'max_role_1084L', 'avginstallast24m_3658937A', 'last_mainoccupationinc_384A', 'mastercontrexist_109L', 'last_rejectreasonclient_4145042M', 'homephncnt_628L', 'sellerplacecnt_915L', 'firstdatedue_489D', 'numinstpaidearly5d_1087L', 'max_subjectroles_name_541M', 'last_birth_259D', 'last_empls_employer_name_740M', 'max_subjectrole_93M', 'last_dateactivated_425D', 'avgdbddpdlast3m_4187120P', 'last_financialinstitution_591M', 'max_contaddr_smempladdr_334L', 'last_dtlastpmtallstes_3545839D', 'mindbddpdlast24m_3658935P', 'last_persontype_1072L', 'max_subjectroles_name_838M', 'clientscnt_100L', 'max_classificationofcontr_400M', 'maxlnamtstart6m_4525199A', 'numinstpaidearly_338L', 'numinstpaidlate1d_3546852L', 'numincomingpmts_3546848L', 'numinstlswithoutdpd_562L', 'max_financialinstitution_591M', 'numrejects9m_859L', 'maxdpdinstldate_3546855D', 'max_conts_role_79M', 'last_contaddr_matchlist_1032L', 'max_cancelreason_3545846M', 'numcontrs3months_479L', 'pctinstlsallpaidearl3d_427L', 'lastrejectreasonclient_4145040M', 'pctinstlsallpaidlate4d_3546849L', 'max_numberofoutstandinstls_520L', 'mindbdtollast24m_4525191P', 'maxinstallast24m_3658928A', 'mean_overdueamount_31A', 'max_pmts_month_158T', 'maxdpdtolerance_374P', 'maxdbddpdtollast6m_4187119P', 'max_financialinstitution_382M', 'max_dtlastpmt_581D', 'clientscnt_946L', 'last_role_1084L', 'max_collaterals_typeofguarante_359M', 'education_88M', 'daysoverduetolerancedd_3976961L', 'max_collaterals_typeofguarante_669M', 'applications30d_658L', 'currdebtcredtyperange_828A', 'numinstls_657L', 'totalsettled_863A', 'max_purposeofcred_426M', 'max_subjectrole_182M', 'last_empladdr_zipcode_114M', 'last_language1_981M', 'max_contaddr_matchlist_1032L', 'last_subjectrole_93M', 'max_education_927M', 'max_empladdr_district_926M', 'applicationcnt_361L', 'last_empls_economicalst_849M', 'last_collater_typofvalofguarant_298M', 'last_collater_typofvalofguarant_407M', 'last_collaterals_typeofguarante_359M', 'last_collaterals_typeofguarante_669M', 'last_subjectroles_name_541M', 'firstclxcampaign_1125D', 'lastactivateddate_801D', 'lastdelinqdate_224D', 'max_pmts_month_706T', 'last_incometype_1044T', 'avgmaxdpdlast9m_3716943P', 'datefirstoffer_1144D', 'datelastunpaid_3546854D', 'last_safeguarantyflag_411L', 'last_sex_738L', 'last_type_25L', 'max_collater_typofvalofguarant_298M', 'max_collater_typofvalofguarant_407M', 'max_numberofoverdueinstls_834L', 'avgdpdtolclosure24_3658938P', 'maritalst_893M', 'lastapprcommoditycat_1041M', 'lastrejectcommodtypec_5251769M', 'lastst_736L', 'paytype1st_925L', 'paytype_783L', 'numinstregularpaidest_4493210L', 'numinstpaidearly5dest_4493211L', 'numinstregularpaid_973L', 'pctinstlsallpaidlat10d_839L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'max_safeguarantyflag_411L', 'max_type_25L', 'numinstlswithdpd10_728L', 'numinstpaid_4499208L', 'last_contaddr_smempladdr_334L', 'max_isbidproduct_390L', 'last_isbidproduct_390L', 'max_classificationofcontr_13M', 'deferredmnthsnum_166L', 'downpmt_116A', 'isbidproduct_1095L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'max_contractst_545M', 'maxdpdinstlnum_3546846P', 'last_subjectroles_name_838M', 'max_cacccardblochreas_147M', 'last_cacccardblochreas_147M', 'max_conts_type_509L', 'max_empls_economicalst_849M', 'max_contractst_964M', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'clientscnt_360L', 'clientscnt_493L', 'numinstlswithdpd5_4187116L', 'annuitynextmonth_57A', 'currdebt_22A', 'totaldebt_9A', 'last_actualdpd_943P', 'last_conts_role_79M', 'pctinstlsallpaidlate6d_3546844L', 'last_classificationofcontr_13M', 'last_classificationofcontr_400M', 'last_contractst_545M', 'last_contractst_964M', 'last_description_351M', 'last_financialinstitution_382M', 'applicationscnt_1086L', 'clientscnt_1022L', 'clientscnt_1130L', 'max_empladdr_zipcode_114M', 'last_education_927M', 'last_empladdr_district_926M', 'last_purposeofcred_426M', 'last_purposeofcred_874M', 'last_subjectrole_182M']
data = data.drop(columns=[col for col in irrelevant_cols if col in data.columns])
data_submission = data_submission.drop(columns=[col for col in irrelevant_cols if col in data_submission.columns])

  data = data.drop(columns=[col for col in irrelevant_cols if col in data.columns])
  data_submission = data_submission.drop(columns=[col for col in irrelevant_cols if col in data_submission.columns])


In [9]:
pd.options.mode.chained_assignment = None  # None|'warn'|'raise'

seed = 42
from sklearn.model_selection import train_test_split
case_ids = data['case_id'].unique().shuffle(seed=seed)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.85, random_state=seed)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)
        
def from_polars_to_pandas(case_ids):
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

# Make it into a type that XGBClassifier can handle
def convert_strings(df, train=True, columns=None):
    if train:
        threshold = 0.7
        original_columns = set(df.columns)
        # Drop columns with more than x% missing values
        df = df.copy().dropna(thresh=(1 - threshold) * len(df), axis=1)
        remaining_columns = df.columns
    else:
        df = df[columns]
        remaining_columns = columns
    for col in df.columns:
        # Base date
        base_date = pd.Timestamp('2020-10-19 00:00:00')
        if df[col].dtype.name in ['object', 'string']:
            if 'date' in col:
                df[col] = pd.to_datetime(df[col])
                df[col] = (df[col] - base_date).dt.days  
                df[col] = RobustScaler().fit_transform(df.copy()[[col]])
            elif col == 'previouscontdistrict_112M':
                df = df.drop(col, axis=1)
                continue
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype).cat.codes
            if df[col].nunique() > 30:  # More than 30 categories
                df[col] = df[col].astype('category').cat.codes
                df[col] = RobustScaler().fit_transform(df[[col]])
        elif df[col].dtype == 'float':
            df[col] = RobustScaler().fit_transform(df[[col]])
    return df, remaining_columns

base_train, X, y = from_polars_to_pandas(case_ids_train)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

del data
del case_ids_train
del case_ids_test
gc.collect()

X, train_columns = convert_strings(X, train=True)
X_test, _ = convert_strings(X_test, train=False, columns=train_columns)

In [10]:
print(f"Train: {X.shape}")
print(f"Test: {X_test.shape}")

Train: (1297660, 22)
Test: (228999, 22)


In [11]:
import xgboost as xgb
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score 
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
import json

def custom_imputer(df):
    for column in df.columns:
        if df[column].dtype == 'float':
            df[column] = df[column].fillna(df[column].mean())
        elif pd.api.types.is_integer_dtype(df[column].dtype) or df[column].dtype == 'bool':
            if pd.api.types.is_integer_dtype(df[column].dtype):
                df[column] = df[column].fillna(df[column].median())
            elif df[column].dtype == 'bool':
                df[column] = df[column].fillna(df[column].mode()[0])
    return df

# def xgb_objective(trial):
#     params = {
#         'tree_method':'hist',
#         'device':'cuda',
#         'enable_categorical':'True',
#         'eval_metric':'auc',
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
#         'n_estimators': trial.suggest_int('n_estimators', 50, 100), # 100, 1000)
#         'max_depth': trial.suggest_int('max_depth', 3, 5), # 3, 9)
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 9),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 0.8),
#         'reg_alpha' : trial.suggest_float('reg_alpha', 0.5, 1.0),
#         'reg_lambda' : trial.suggest_float('reg_lambda', 0.5, 1.0),
#         'random_state': 42
#     }
#     xgb_classifier = xgb.XGBClassifier(**params)
#     skfold = KFold(n_splits=3, shuffle=True)
#     cv_results = []
#     for fold, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
#         X_train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
#         X_val = xgb.DMatrix(X_val, enable_categorical=True)

#         best_xgb = xgb.train(params, X_train, num_boost_round=902)
#         y_val_pred = best_xgb.predict(X_val)
#         val_loss = roc_auc_score(y_val, y_val_pred)
#         cv_results.append(val_loss)
#     return np.mean(cv_results)

# def gbm_objective(trial):
#     params = {
#         'objective': 'binary',
#         'metric': 'auc',
#         'device_type': 'gpu',
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 40),
#         'max_depth': trial.suggest_int('max_depth', 3, 8),
#         'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
#     }
#     cv_results = []
#     skfold = KFold(n_splits=3, shuffle=True)
#     for train_idx, val_idx in skfold.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         dtrain = lgb.Dataset(X_train, label=y_train)
#         dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

#         model = lgb.train(params, dtrain, num_boost_round=100, valid_sets=[dval],)
#         preds = model.predict(X_val)
#         auc = roc_auc_score(y_val, preds)
#         cv_results.append(auc)
#     return np.mean(cv_results)

# n_trials = 300

# tuning_study = optuna.create_study(direction='maximize')
# tuning_study.optimize(xgb_objective, n_trials=n_trials)
# print("XGB Best trial:")
# trial = tuning_study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# best_params_xgb = tuning_study.best_params

# tuning_study = optuna.create_study(direction='maximize')
# tuning_study.optimize(gbm_objective, n_trials=n_trials)
# print("GBM Best trial:")
# trial = tuning_study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# best_params_gbm = tuning_study.best_params

# params_json_xgb = json.dumps(best_params_xgb)
# params_json_gbm = json.dumps(best_params_gbm)
# with open('best_params_xgb.json', 'w') as f:
#     f.write(params_json_xgb)
# with open('best_params_gbm.json', 'w') as f:
#     f.write(params_json_gbm)





In [12]:
pd.options.mode.chained_assignment = None  # None|'warn'|'raise'

# XGB
# best_params_xgb_v1 = {'learning_rate': 0.04685416258390824, 'n_estimators': 78, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.8054062256552962, 'colsample_bytree': 0.23701687375018693, 'reg_alpha': 0.8310191036671074, 'reg_lambda': 0.8000025963932935}
best_params_xgb = {'learning_rate': 0.03724075891357543, 'n_estimators': 92, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9993454153985859, 'colsample_bytree': 0.667933819896793, 'reg_alpha': 0.874083448363055, 'reg_lambda': 0.8998681581406556}
# best_xgb = xgb.XGBClassifier(**best_params)
# best_xgb = best_xgb.fit(X, y)
# y_pred = best_xgb.predict(X)
# y_test_pred = best_xgb.predict(X_test)

# WITH GPU
best_params_xgb['tree_method'] = 'hist'
best_params_xgb['device'] = 'cuda'

X_dmat = xgb.DMatrix(X, label=y, enable_categorical=True)
X_test_dmat = xgb.DMatrix(X_test, enable_categorical=True)

best_xgb = xgb.train(best_params_xgb, X_dmat, num_boost_round=902)

y_pred_xgb = best_xgb.predict(X_dmat)
y_test_pred_xgb = best_xgb.predict(X_test_dmat)
#

# # ADABOOST
# # Impute values for adaboost
# X_ada = custom_imputer(X)
# X_test_ada = custom_imputer(X_test)

# adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
# adaboost.fit(X_ada, y)
# y_pred_ada = adaboost.predict(X_ada)
# y_test_pred_ada = adaboost.predict(X_test_ada)

# GBM

X_dmat = lgb.Dataset(X, label=y)

# lgb_params = {'objective': 'binary','boosting_type': 'gbdt','metric': 'auc','learning_rate': 0.1,'num_leaves': 31,'max_depth': -1,'min_child_samples': 20,'subsample': 1.0,'colsample_bytree': 1.0,'reg_alpha': 0.0,'reg_lambda': 0.0,}
# best_params_gbm_v1 = {'learning_rate': 0.12629903580473947, 'num_leaves': 33, 'max_depth': 8, 'min_child_samples': 40, 'subsample': 0.8414187308054227, 'colsample_bytree': 0.46976873543554787, 'reg_alpha': 0.9060523209262823, 'reg_lambda': 0.97840319882865}
best_params_gbm = {'learning_rate': 0.14646560927560817, 'num_leaves': 40, 'max_depth': 8, 'min_child_samples': 76, 'subsample': 0.9176445028303771, 'colsample_bytree': 0.5049679815127405, 'reg_alpha': 0.9993513880549101, 'reg_lambda': 0.97192418018783}
lgb_model = lgb.train(best_params_gbm, X_dmat, num_boost_round=100)
y_pred_gbm = lgb_model.predict(X)
y_test_pred_gbm = lgb_model.predict(X_test)

base_train["score"] = (y_pred_xgb + y_pred_gbm) / 2
base_test["score"] = (y_test_pred_xgb + y_test_pred_gbm) / 2

Parameters: { "n_estimators" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.274352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3604
[LightGBM] [Info] Number of data points in the train set: 1297660, number of used features: 22
[LightGBM] [Info] Start training from score 0.031501


In [13]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    
    print(base['target'].value_counts())
    base = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]
    # Prepare to store Gini coefficients for each group
    gini_in_time = []
    
    # Compute Gini coefficients for each group, checking class counts
    for _, group in base:
        if len(group['target'].unique()) > 1:
            gini_score = 2 * roc_auc_score(group["target"], group["score"]) - 1
            gini_in_time.append(gini_score)
        else:
            # Handle the case where only one class is present
            gini_in_time.append(0)  # Or another default value as appropriate
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the test set is: {stability_score_test}')

target
0    1256783
1      40877
Name: count, dtype: int64
target
0    221882
1      7117
Name: count, dtype: int64
The stability score on the train set is: 0.5079862136461668
The stability score on the test set is: 0.4671435843083652


In [14]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission, _ = convert_strings(X_submission, train=False, columns=train_columns)
categorical_cols = X.select_dtypes(include=['category']).columns

# categorical_cols_submission = X_submission.select_dtypes(include=['category']).columns
# not_in_list1 = [item for item in categorical_cols_submission if item not in categorical_cols]
# X_submission = X_submission.drop(columns=not_in_list1)
   
# for col in categorical_cols:    
#     X_submission[col] = X_submission[col].astype('category')
#     train_categories = set(X[col].cat.categories)
#     submission_categories = set(X_submission[col].cat.categories)  # Ensure this is defined before use
#     all_categories = train_categories.union(submission_categories).union({'Unknown'})
    
#     # Update the data types with the new categories list, including 'Unknown'
#     new_dtype = pd.CategoricalDtype(categories=all_categories, ordered=True)
#     X[col] = X[col].astype(new_dtype)
#     X_submission[col] = X_submission[col].astype(new_dtype)
    
#     # Replace categories in X_submission not found in X_train with "Unknown"
#     X_submission.loc[X_submission[col].isin(submission_categories - train_categories), col] = "Unknown"
#     # Ensure X_submission uses the same categories as X_train
#     X_submission[col] = X_submission[col].cat.set_categories(X[col].cat.categories)

# XGB
X_submission_dmat = xgb.DMatrix(X_submission, enable_categorical=True)
y_submission_pred_xgb = best_xgb.predict(X_submission_dmat)

# # ADA
# X_submission_ada = custom_imputer(X_submission)
# y_submission_pred_ada = adaboost.predict(X_submission_ada)

# GBM
X_submission_dmat = lgb.Dataset(X_submission)
y_submission_pred_gbm = lgb_model.predict(X_submission)

# Ensembling
y_submission_pred = (y_submission_pred_xgb + y_submission_pred_gbm) / 2

submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")