In [49]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [50]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df


def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype.name in ["object", "string"]:
            df[col] = df[col].astype("string").astype("category")
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [51]:
train_basetable = pl.read_csv("../Data/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv("../Data/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("../Data/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv("../Data/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv("../Data/train_person_1.csv").pipe(set_table_dtypes)
train_credit_bureau_b_2 = pl.read_csv("../Data/train_credit_bureau_b_2.csv").pipe(
    set_table_dtypes
)

In [9]:
test_basetable = pl.read_csv("../Data/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv("../Data/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("../Data/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv("../Data/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv("../Data/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv("../Data/test_person_1.csv").pipe(set_table_dtypes)
test_credit_bureau_b_2 = pl.read_csv("../Data/test_credit_bureau_b_2.csv").pipe(
    set_table_dtypes
)

In [52]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED")
    .max()
    .alias("mainoccupationinc_384A_any_selfemployed"),
)


# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = (
    train_person_1.select(["case_id", "num_group1", "housetype_905L"])
    .filter(pl.col("num_group1") == 0)
    .drop("num_group1")
    .rename({"housetype_905L": "person_housetype"})
)

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Join all tables together.
data = (
    train_basetable.join(
        train_static.select(["case_id"] + selected_static_cols),
        how="left",
        on="case_id",
    )
    .join(
        train_static_cb.select(["case_id"] + selected_static_cb_cols),
        how="left",
        on="case_id",
    )
    .join(train_person_1_feats_1, how="left", on="case_id")
    .join(train_person_1_feats_2, how="left", on="case_id")
    .join(train_credit_bureau_b_2_feats, how="left", on="case_id")
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [53]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(
    case_ids, train_size=0.6, random_state=1
)
case_ids_valid, case_ids_test = train_test_split(
    case_ids_test, train_size=0.5, random_state=1
)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)


def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[
            ["case_id", "WEEK_NUM", "target"]
        ].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas(),
    )


base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [54]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)],
)



Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.706346	valid_1's auc: 0.705963
[100]	training's auc: 0.726077	valid_1's auc: 0.724362
[150]	training's auc: 0.734315	valid_1's auc: 0.731423
[200]	training's auc: 0.740133	valid_1's auc: 0.735874
[250]	training's auc: 0.744217	valid_1's auc: 0.739009
[300]	training's auc: 0.747163	valid_1's auc: 0.740965
[350]	training's auc: 0.750072	valid_1's auc: 0.742924
[400]	training's auc: 0.752661	valid_1's auc: 0.744582
[450]	training's auc: 0.754851	valid_1's auc: 0.745977
[500]	training's auc: 0.756586	valid_1's auc: 0.747033
[550]	training's auc: 0.758271	valid_1's auc: 0.747877
[600]	training's auc: 0.760103	valid_1's auc: 0.749039
[650]	training's auc: 0.761793	valid_1's auc: 0.750087
[700]	training's auc: 0.763201	valid_1's auc: 0.750863
Early stopping, best iteration is:
[739]	training's auc: 0.764123	valid_1's auc: 0.751216


In [55]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

lgb_roc_train = roc_auc_score(base_train["target"], base_train["score"])
lgb_roc_valid = roc_auc_score(base_valid["target"], base_valid["score"])
lgb_roc_test = roc_auc_score(base_test["target"], base_test["score"])

print(f"The AUC score on the train set is: {lgb_roc_train}")
print(f"The AUC score on the valid set is: {lgb_roc_valid}")
print(f"The AUC score on the test set is: {lgb_roc_test}")

print("------")

print(
    f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)
print(
    f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)
print(
    f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

The AUC score on the train set is: 0.764122917660593
The AUC score on the valid set is: 0.7512157223309048
The AUC score on the test set is: 0.7483072129459662
------
The AUC score on the train set is: 0.764122917660593
The AUC score on the valid set is: 0.7512157223309048
The AUC score on the test set is: 0.7483072129459662


In [56]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = (
        base.loc[:, ["WEEK_NUM", "target", "score"]]
        .sort_values("WEEK_NUM")
        .groupby("WEEK_NUM")[["target", "score"]]
        .apply(lambda x: 2 * roc_auc_score(x["target"], x["score"]) - 1)
        .tolist()
    )

    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a * x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std


lgb_stability_score_train = gini_stability(base_train)
lgb_stability_score_valid = gini_stability(base_valid)
lgb_stability_score_test = gini_stability(base_test)

print(f"The stability score on the train set is: {lgb_stability_score_train}")
print(f"The stability score on the valid set is: {lgb_stability_score_valid}")
print(f"The stability score on the test set is: {lgb_stability_score_test}")

The stability score on the train set is: 0.4976648127691175
The stability score on the valid set is: 0.4726726686264489
The stability score on the test set is: 0.4583643686935092


XGBoost

In [78]:
# do the same as above for XGBoost

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_estimators": 1000,
    "verbosity": 0,
}

watchlist = [(dtrain, "train"), (dvalid, "valid")]

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=15,
    verbose_eval=50,
)

[0]	train-auc:0.69535	valid-auc:0.55697
[50]	train-auc:0.81832	valid-auc:0.68261
[100]	train-auc:0.85976	valid-auc:0.70523
[104]	train-auc:0.86140	valid-auc:0.70336


In [79]:
dtest = xgb.DMatrix(X_test, enable_categorical=True)

for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    d = xgb.DMatrix(X, enable_categorical=True)
    y_pred = xgb_model.predict(d)
    base["score"] = y_pred

xgb_roc_train = roc_auc_score(base_train["target"], base_train["score"])
xgb_roc_valid = roc_auc_score(base_valid["target"], base_valid["score"])
xgb_roc_test = roc_auc_score(base_test["target"], base_test["score"])

print("------")

print(
    f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)

print(
    f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)

print(
    f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

------
The AUC score on the train set is: 0.8614036122643383
The AUC score on the valid set is: 0.7033600451870209
The AUC score on the test set is: 0.7085760103695176


In [80]:
xgb_stability_score_train = gini_stability(base_train)

xgb_stability_score_valid = gini_stability(base_valid)

xgb_stability_score_test = gini_stability(base_test)

print(f"The stability score on the train set is: {xgb_stability_score_train}")

print(f"The stability score on the valid set is: {xgb_stability_score_valid}")

print(f"The stability score on the test set is: {xgb_stability_score_test}")

The stability score on the train set is: 0.7093855596344508
The stability score on the valid set is: 0.3802748356115785
The stability score on the test set is: 0.3728889288284678


In [60]:
# do the same as above for CatBoost

import catboost as cb

# fill nan with mode for categorical columns

# Fill NaN with mode for each column in your dataframe
for column in X_train.columns:
    X_train[column].fillna(X_train[column].mode()[0], inplace=True)

for column in X_valid.columns:
    X_valid[column].fillna(X_valid[column].mode()[0], inplace=True)

cat_features = X_train.select_dtypes(include=["category"]).columns.tolist()

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features)
valid_pool = cb.Pool(X_valid, y_valid, cat_features=cat_features)

params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "depth": 3,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3,
    "iterations": 1000,
    "verbose": 50,
}

cb_model = cb.CatBoost(params)
cb_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=10, verbose=50)

0:	test: 0.5427875	best: 0.5427875 (0)	total: 615ms	remaining: 10m 14s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.6805735299
bestIteration = 30

Shrink model to first 31 iterations.


<catboost.core.CatBoost at 0x2bedb42d0>

In [61]:
for col in X_test.columns:
    X_test[col].fillna(X_test[col].mode()[0], inplace=True)


for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:

    y_pred = cb_model.predict(X)

    base["score"] = y_pred

cb_roc_train = roc_auc_score(base_train["target"], base_train["score"])
cb_roc_valid = roc_auc_score(base_valid["target"], base_valid["score"])
cb_roc_test = roc_auc_score(base_test["target"], base_test["score"])

print("------")

print(
    f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)

print(
    f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)

print(
    f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

cb_stability_score_train = gini_stability(base_train)

cb_stability_score_valid = gini_stability(base_valid)

cb_stability_score_test = gini_stability(base_test)

print(f"The stability score on the train set is: {cb_stability_score_train}")

print(f"The stability score on the valid set is: {cb_stability_score_valid}")

print(f"The stability score on the test set is: {cb_stability_score_test}")

------
The AUC score on the train set is: 0.6783064729924821
The AUC score on the valid set is: 0.6805735299222451
The AUC score on the test set is: 0.6750788941275945
The stability score on the train set is: 0.3171432810595581
The stability score on the valid set is: 0.3206886314866385
The stability score on the test set is: 0.30648998373635555


In [33]:
# #random forest
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=100, random_state=1)

# X_train_encoded = pd.get_dummies(X_train)
# rf.fit(X_train_encoded, y_train)

In [62]:
# remove categorical variables from X_train, X_valid and X_test

X_train_rf = X_train.select_dtypes(exclude=["category"])

X_valid_rf = X_valid.select_dtypes(exclude=["category"])

X_test_rf = X_test.select_dtypes(exclude=["category"])

In [63]:
from sklearn.ensemble import RandomForestClassifier

rf2 = RandomForestClassifier(n_estimators=100, random_state=1)

rf2.fit(X_train_rf, y_train)

In [36]:
# X_valid_encoded = pd.get_dummies(X_valid)

# X_test_encoded = pd.get_dummies(X_test)

In [64]:
for base, X in [
    (base_train, X_train_rf),
    (base_valid, X_valid_rf),
    (base_test, X_test_rf),
]:

    y_pred = rf2.predict_proba(X)[:, 1]

    base["score"] = y_pred

rf_roc_train = roc_auc_score(base_train["target"], base_train["score"])
rf_roc_valid = roc_auc_score(base_valid["target"], base_valid["score"])
rf_roc_test = roc_auc_score(base_test["target"], base_test["score"])


print(
    f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)

print(
    f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)

print(
    f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

The AUC score on the train set is: 0.9996762922710531
The AUC score on the valid set is: 0.650666122453193
The AUC score on the test set is: 0.6628267506859472


In [65]:
rf_stability_score_train = gini_stability(base_train)

rf_stability_score_valid = gini_stability(base_valid)

rf_stability_score_test = gini_stability(base_test)

print(f"The stability score on the train set is: {rf_stability_score_train}")

print(f"The stability score on the valid set is: {rf_stability_score_valid}")

print(f"The stability score on the test set is: {rf_stability_score_test}")

The stability score on the train set is: 0.999011755828809
The stability score on the valid set is: 0.2674629806830987
The stability score on the test set is: 0.2885238959093119


In [81]:
# now create a table with the results

results = pd.DataFrame(
    {
        "Model": ["LightGBM", "XGBoost", "CatBoost", "Random Forest"],
        "AUC Train": [lgb_roc_train, xgb_roc_train, cb_roc_train, rf_roc_train],
        "AUC Valid": [lgb_roc_valid, xgb_roc_valid, cb_roc_valid, rf_roc_valid],
        "AUC Test": [lgb_roc_test, xgb_roc_test, cb_roc_test, rf_roc_test],
        "Stability Train": [
            lgb_stability_score_train,
            xgb_stability_score_train,
            cb_stability_score_train,
            rf_stability_score_train,
        ],
        "Stability Valid": [
            lgb_stability_score_valid,
            xgb_stability_score_valid,
            cb_stability_score_valid,
            rf_stability_score_valid,
        ],
        "Stability Test": [
            lgb_stability_score_test,
            xgb_stability_score_test,
            cb_stability_score_test,
            rf_stability_score_test,
        ],
    }
)

In [82]:
results

Unnamed: 0,Model,AUC Train,AUC Valid,AUC Test,Stability Train,Stability Valid,Stability Test
0,LightGBM,0.764123,0.751216,0.748307,0.497665,0.472673,0.458364
1,XGBoost,0.861404,0.70336,0.708576,0.709386,0.380275,0.372889
2,CatBoost,0.678306,0.680574,0.675079,0.317143,0.320689,0.30649
3,Random Forest,0.999676,0.650666,0.662827,0.999012,0.267463,0.288524
