# Base one note book

Portion of Data processing inherited from original Starters' Notebook

In this notebook you will see how to:
* Load the data.(from Origianl starters' notebook)
* Join tables with Polars.  (from original starters' notebook)
* Featture Engineering: one hot encoding, missing value handlement
* Train a XGboost
* Train a Random Forest
* Ensemble Iforest + Random Forest
* Ensemble Iforest + XGboost
## Load the data

In [1]:
import polars as pl
import numpy as np
import pandas as pd

# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))

    return df


def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype.name in ["object", "string"]:
            df[col] = df[col].astype("string").astype("category")
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [3]:
train_basetable = pl.read_csv("train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv("train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv("train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv("train_person_1.csv").pipe(set_table_dtypes)
train_credit_bureau_b_2 = pl.read_csv("train_credit_bureau_b_2.csv").pipe(
    set_table_dtypes
)

In [4]:
test_basetable = pl.read_csv("test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv("test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv("test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv("test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv("test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv("test_person_1.csv").pipe(set_table_dtypes)
test_credit_bureau_b_2 = pl.read_csv("test_credit_bureau_b_2.csv").pipe(
    set_table_dtypes
)

## Feature engineering

In this part, we can see a simple example of joining tables via `case_id`. Here the loading and joining is done with polars library. Polars library is blazingly fast and has much smaller memory footprint than pandas. 

In [5]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED")
    .max()
    .alias("mainoccupationinc_384A_any_selfemployed"),
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = (
    train_person_1.select(["case_id", "num_group1", "housetype_905L"])
    .filter(pl.col("num_group1") == 0)
    .drop("num_group1")
    .rename({"housetype_905L": "person_housetype"})
)

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Join all tables together.
data = (
    train_basetable.join(
        train_static.select(["case_id"] + selected_static_cols),
        how="left",
        on="case_id",
    )
    .join(
        train_static_cb.select(["case_id"] + selected_static_cb_cols),
        how="left",
        on="case_id",
    )
    .join(train_person_1_feats_1, how="left", on="case_id")
    .join(train_person_1_feats_2, how="left", on="case_id")
    .join(train_credit_bureau_b_2_feats, how="left", on="case_id")
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [6]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED")
    .max()
    .alias("mainoccupationinc_384A_any_selfemployed"),
)

test_person_1_feats_2 = (
    test_person_1.select(["case_id", "num_group1", "housetype_905L"])
    .filter(pl.col("num_group1") == 0)
    .drop("num_group1")
    .rename({"housetype_905L": "person_housetype"})
)

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

data_submission = (
    test_basetable.join(
        test_static.select(["case_id"] + selected_static_cols), how="left", on="case_id"
    )
    .join(
        test_static_cb.select(["case_id"] + selected_static_cb_cols),
        how="left",
        on="case_id",
    )
    .join(test_person_1_feats_1, how="left", on="case_id")
    .join(test_person_1_feats_2, how="left", on="case_id")
    .join(test_credit_bureau_b_2_feats, how="left", on="case_id")
)

In [7]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(
    case_ids, train_size=0.6, random_state=1
)
case_ids_valid, case_ids_test = train_test_split(
    case_ids_test, train_size=0.5, random_state=1
)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)


def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[
            ["case_id", "WEEK_NUM", "target"]
        ].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas(),
    )


base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [8]:
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

One Hot Encoding

In [9]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
category_colums = X_test.select_dtypes(include=["category"]).columns

# encode test data
encoded_data = encoder.fit_transform(X_test[category_colums])
encoded_df = pd.DataFrame(
    encoded_data.toarray(), columns=encoder.get_feature_names_out(category_colums)
)
X_test.drop(columns=category_colums, inplace=True)
X_test = pd.concat([X_test, encoded_df], axis=1)

# one hot encoding for X_train
encoded_data = encoder.fit_transform(X_train[category_colums])
encoded_df = pd.DataFrame(
    encoded_data.toarray(), columns=encoder.get_feature_names_out(category_colums)
)
X_train.drop(columns=category_colums, inplace=True)
X_train = pd.concat([X_train, encoded_df], axis=1)

# one hot encoding for X_valid
encoded_data = encoder.fit_transform(X_valid[category_colums])
encoded_df = pd.DataFrame(
    encoded_data.toarray(), columns=encoder.get_feature_names_out(category_colums)
)
X_valid.drop(columns=category_colums, inplace=True)
X_valid = pd.concat([X_valid, encoded_df], axis=1)

print(X_test.shape)
print(X_train.shape)
print(X_valid.shape)

(305332, 726)
(915995, 826)
(305332, 728)


In [10]:
# match columns with X_Test
test_columns = X_test.columns
missing_columns = [
    col
    for col in test_columns
    if col not in X_train.columns or col not in X_valid.columns
]
X_test = X_test.drop(columns=missing_columns)
test_columns = X_test.columns
X_train = X_train[test_columns]
X_valid = X_valid[test_columns]
print(X_test.shape)
print(X_train.shape)
print(X_valid.shape)

(305332, 657)
(915995, 657)
(305332, 657)


Handle Missing Data

In [11]:
# missing data
for df in [X_train, X_valid, X_test]:
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    columns_to_drop = missing_percentage[missing_percentage > 95].index.tolist()
    df.drop(columns=columns_to_drop, inplace=True)
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 653)
Valid: (305332, 653)
Test: (305332, 653)


## Training XGBOOST

In [12]:
# Split the dataset into two equal parts
# base_train1, base_train2, X_train1, X_train2, y_train1, y_train2 = train_test_split(
#     base_train, X_train, y_train, test_size=0.999, random_state=42
# )

# base_train3, base_train4, X_train3, X_train4, y_train3, y_train4 = train_test_split(
#     base_train2, X_train2, y_train2, test_size=0.2, random_state=42
# )

In [21]:
# searching for range with subset of training data
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import xgboost as xgb

pipeline_xgb = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", xgb.XGBClassifier(objective="binary:logistic", random_state=42)),
    ]
)

param_grid_xgb = {
    "classifier__learning_rate": [0.01],
    "classifier__max_depth": [50],
    "classifier__n_estimators": [500],
    "classifier__subsample": [0.15],
    "classifier__colsample_bytree": [0.65],
    "classifier__reg_alpha": [0.5],
    "classifier__reg_lambda": [5],
}

grid_search_xgb = GridSearchCV(
    pipeline_xgb,
    param_grid_xgb,
    cv=5,
    scoring="roc_auc",  # n_jobs=-1
)

grid_search_xgb.fit(X_train, base_train["target"])

best_params_xgb = grid_search_xgb.best_params_
print("Best hyperparameters:", best_params_xgb)

best_model_xgb = grid_search_xgb.best_estimator_
# print result


acc_xgb_train = best_model_xgb.score(X_train, base_train["target"])
acc_xgb_val = best_model_xgb.score(X_valid, base_valid["target"])
acc_xgb_test = best_model_xgb.score(X_test, base_test["target"])
print("train_acc:", acc_xgb_train)
print("vali_acc:", acc_xgb_val)
print("test_acc:", acc_xgb_test)

Best hyperparameters: {'classifier__colsample_bytree': 0.65, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 50, 'classifier__n_estimators': 500, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 5, 'classifier__subsample': 0.15}
train_acc: 0.9684845441296077
vali_acc: 0.9690762841759134
test_acc: 0.9682968047895406


In [22]:
# y_train_pred_proba_knn = best_model_knn.predict_proba(X_train3)[:, 1]
# auc_knn_train = roc_auc_score(base_train3["target"], y_train_pred_proba_knn)

y_xgb_train = best_model_xgb.predict_proba(
    X_train,
)[:, 1]
auc_xgb_train = roc_auc_score(base_train["target"], y_xgb_train)
y_xgb_val = best_model_xgb.predict_proba(
    X_valid,
)[:, 1]
auc_xgb_val = roc_auc_score(base_valid["target"], y_xgb_val)
y_xgb_test = best_model_xgb.predict_proba(
    X_test,
)[:, 1]
auc_xgb_test = roc_auc_score(base_test["target"], y_xgb_test)

print("train_auc:", auc_xgb_train)
print("vali_auc:", auc_xgb_val)
print("test_auc:", auc_xgb_test)

train_auc: 0.8491031407520588
vali_auc: 0.7466128037356864
test_auc: 0.7445891835727575


In [None]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = best_model_xgb.predict_proba(X)[:, 1]
    base["score"] = y_pred

In [None]:
print(
    f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)
print(
    f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)
print(
    f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

The AUC score on the train set is: 0.84910314077158
The AUC score on the valid set is: 0.7466128053463996
The AUC score on the test set is: 0.7445891833980491


In [19]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = (
        base.loc[:, ["WEEK_NUM", "target", "score"]]
        .sort_values("WEEK_NUM")
        .groupby("WEEK_NUM")[["target", "score"]]
        .apply(lambda x: 2 * roc_auc_score(x["target"], x["score"]) - 1)
        .tolist()
    )

    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a * x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std


stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print("train stability", stability_score_train)
print("valid stability", stability_score_valid)
print("test stability", stability_score_test)

## Training Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline_rf = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

param_grid_rf = {
    "classifier__n_estimators": [500],  # , 500
    "classifier__max_features": ["sqrt"],  # "auto"
    "classifier__max_depth": [20],  # 10,30, None
    "classifier__min_samples_split": [2],  # 10
    "classifier__min_samples_leaf": [4],  # 4
}

grid_search_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    cv=5,
    scoring="roc_auc",
)  # n_jobs=-1

grid_search_rf.fit(X_train, base_train["target"])

best_params_rf = grid_search_rf.best_params_
print("Best hyperparameters:", best_params_rf)

best_model_rf = grid_search_rf.best_estimator_

acc_rf_train = best_model_rf.score(X_train, base_train["target"])
acc_rf_valid = best_model_rf.score(X_valid, base_valid["target"])
acc_rf_test = best_model_rf.score(X_test, base_test["target"])

print("rf train_auc:", acc_rf_train)
print("rf valid_auc:", acc_rf_valid)
print("rf test_auc:", acc_rf_test)

Best hyperparameters: {'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500}
rf train_auc: 0.9684801772935442
rf valid_auc: 0.9690762841759134
rf test_auc: 0.9682968047895406


In [15]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = best_model_rf.predict_proba(X)[:, 1]
    base["score"] = y_pred

In [16]:
y_rf_train = best_model_rf.predict_proba(X_train)[:, 1]
y_rf_valid = best_model_rf.predict_proba(X_valid)[:, 1]
y_rf_test = best_model_rf.predict_proba(X_test)[:, 1]

In [17]:
print(
    f'The RF AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)
print(
    f'The RF AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)
print(
    f'The RF AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

The RF AUC score on the train set is: 0.844398017427238
The RF AUC score on the valid set is: 0.7217115239925151
The RF AUC score on the test set is: 0.7196116736124211


In [20]:
stability_rf_train = gini_stability(base_train)
stability_rf_valid = gini_stability(base_valid)
stability_rf_test = gini_stability(base_test)

print("rf train stability", stability_rf_train)
print("rf valid stability", stability_rf_valid)
print("rf test stability", stability_rf_test)

rf train stability 0.6789859434229357
rf valid stability 0.4142947760001885
rf test stability 0.3986476891377608


## KNN

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# # Define KNN pipeline
# pipeline_knn = Pipeline(
#     [
#         ("imputer", SimpleImputer(strategy="median")),
#         ("classifier", KNeighborsClassifier()),
#     ]
# )

# param_grid_knn = {
#     "classifier__n_neighbors": [
#         10,
#     ],
#     "classifier__weights": ["distance"],
# }

# grid_search_knn = GridSearchCV(
#     pipeline_knn,
#     param_grid_knn,
#     cv=5,
#     scoring="roc_auc",  # n_jobs=-1
# )

# grid_search_knn.fit(X_train, base_train["target"])

# best_params_knn = grid_search_knn.best_params_
# print("Best hyperparameters:", best_params_knn)

# best_model_knn = grid_search_knn.best_estimator_

# y_train_pred_proba_knn = best_model_knn.predict_proba(X_train)[:, 1]
# auc_knn_train = roc_auc_score(base_train["target"], y_train_pred_proba_knn)

# y_val_pred_proba_knn = best_model_knn.predict_proba(X_valid)[:, 1]
# auc_knn_val = roc_auc_score(base_valid["target"], y_val_pred_proba_knn)


# y_test_pred_proba_knn = best_model_knn.predict_proba(X_test)[:, 1]
# auc_knn_test = roc_auc_score(base_test["target"], y_test_pred_proba_knn)

# # Print results
# print("train_auc:", auc_knn_train)
# print("vali_auc:", auc_knn_val)
# print("test_auc:", auc_knn_test)

KeyboardInterrupt: 

In [None]:
# for base, X in [
#     (base_test, X_test)
# ]:  # (base_train3, X_train3), (base_train4, X_train4),
#     y_pred = best_model_knn.predict_proba(X)[:, 1]
#     base["score"] = y_pred

In [None]:
# # stability_rf_train = gini_stability(base_train3)
# # stability_rf_valid = gini_stability(base_train4)
# stability_knn_test = gini_stability(base_test)

# # print("rf train stability", stability_rf_train)
# # print("rf valid stability", stability_rf_valid)
# print("knn test stability", stability_knn_test)

## Model Stacking

IFOREST-TRIAL

In [23]:
# missing data processing

imp_median = SimpleImputer(missing_values=np.nan, strategy="median")

imp_median.fit(X_train)
X_imputed = imp_median.transform(X_train)

imp_median.fit(X_valid)
X_imputed_val = imp_median.transform(X_valid)

imp_median.fit(X_test)
X_imputed_test = imp_median.transform(X_test)

In [24]:
X_imputed.shape

(915995, 653)

In [25]:
# Iforest
from sklearn.ensemble import IsolationForest

isolation_forest = IsolationForest(contamination=0.03245766, random_state=42)
isolation_forest.fit(X_imputed)
train_anomaly_scores = isolation_forest.decision_function(X_imputed)
val_anomaly_scores = isolation_forest.decision_function(X_imputed_val)
test_anomaly_scores = isolation_forest.decision_function(X_imputed_test)

In [26]:
# impute iforest
X_imputed = np.column_stack((X_imputed, train_anomaly_scores))
X_imputed_val = np.column_stack((X_imputed_val, val_anomaly_scores))
X_imputed_test = np.column_stack((X_imputed_test, test_anomaly_scores))

In [None]:
# # impute knn ----pending!!!!
# X_imputed = np.column_stack((X_imputed, y_train_pred_proba_knn))
# X_imputed_val = np.column_stack((X_imputed_val, y_val_pred_proba_knn))
# X_imputed_test = np.column_stack((X_imputed_test, y_test_pred_proba_knn))

In [27]:
# impute rf
X_imputed = np.column_stack((X_imputed, y_rf_train))
X_imputed_val = np.column_stack((X_imputed_val, y_rf_valid))
X_imputed_test = np.column_stack((X_imputed_test, y_rf_test))

In [28]:
X_imputed.shape

(915995, 655)

In [29]:
X_train_i = pd.DataFrame(X_imputed)
X_valid_i = pd.DataFrame(X_imputed_val)
X_test_i = pd.DataFrame(X_imputed_test)

In [31]:
# result for rf+ifrost -->xgboost
pipeline_xgb_i = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", xgb.XGBClassifier(objective="binary:logistic", random_state=42)),
    ]
)

param_grid_xgb_i = {
    "classifier__learning_rate": [0.01],
    "classifier__max_depth": [10],
    "classifier__n_estimators": [70],
    "classifier__subsample": [0.15],
    "classifier__colsample_bytree": [0.65],
    "classifier__reg_alpha": [0.3],
    "classifier__reg_lambda": [5],
}

grid_search_xgb_i = GridSearchCV(
    pipeline_xgb_i,
    param_grid_xgb_i,
    cv=5,
    scoring="roc_auc",  # n_jobs=-1
)

grid_search_xgb_i.fit(X_train_i, base_train["target"])

best_params_xgb_i = grid_search_xgb_i.best_params_
print("Best hyperparameters:", best_params_xgb_i)

best_model_xgb_i = grid_search_xgb_i.best_estimator_
# print result

Best hyperparameters: {'classifier__colsample_bytree': 0.65, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 10, 'classifier__n_estimators': 70, 'classifier__reg_alpha': 0.3, 'classifier__reg_lambda': 5, 'classifier__subsample': 0.15}


In [32]:
for base, X in [
    (base_train, X_train_i),
    (base_valid, X_valid_i),
    (base_test, X_test_i),
]:
    y_pred = best_model_xgb_i.predict_proba(X)[:, 1]
    base["score"] = y_pred
print(
    f'The meta_xgb AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
)
print(
    f'The meta_xgb AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
)
print(
    f'The meta_xgb AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
)

The meta_xgb AUC score on the train set is: 0.8879768422100222
The meta_xgb AUC score on the valid set is: 0.7190190222657052
The meta_xgb AUC score on the test set is: 0.718337101991866


In [33]:
stability_rf_train = gini_stability(base_train)
tability_rf_valid = gini_stability(base_valid)
stability_meta_test = gini_stability(base_test)

print("rf train stability", stability_rf_train)
print("rf valid stability", stability_rf_valid)
print("meta_xgb test stability", stability_meta_test)

rf train stability 0.764702539216158
rf valid stability 0.4142947760001885
meta_xgb test stability 0.3993853635796833


Submission

In [None]:
# # Beyes SearchCV
# from skopt import BayesSearchCV

# pipeline_xgb = Pipeline(
#     [
#         ("imputer", SimpleImputer(strategy="median")),
#         ("classifier", xgb.XGBClassifier(objective="binary:logistic", random_state=42)),
#     ]
# )

# param_grid_xgb = {
#     "classifier__learning_rate": (0.01, 0.1, "log-uniform"),
#     "classifier__max_depth": (50, 100),
#     "classifier__n_estimators": (500, 1000),
#     "classifier__subsample": (0.1, 0.5, "uniform"),
#     "classifier__colsample_bytree": (0.5, 1.0, "uniform"),
# }

# bayesian_search_xgb = BayesSearchCV(
#     pipeline_xgb, param_grid_xgb, cv=5, scoring="accuracy", n_jobs=-1
# )

# bayesian_search_xgb.fit(X_train, y_train)

# best_params_xgb = bayesian_search_xgb.best_params_
# print("Best hyperparameters:", best_params_xgb)

# best_model_xgb = bayesian_search_xgb.best_estimator_

# accuracy_xgb = best_model_xgb.score(X_valid, y_valid)

# print("Accuracy:", accuracy_xgb)

Best hyperparameters: OrderedDict([('classifier__colsample_bytree', 0.5), ('classifier__learning_rate', 0.010710388739295373), ('classifier__max_depth', 50), ('classifier__n_estimators', 508), ('classifier__subsample', 0.1)])
Accuracy: 0.9690795592993856


## Original notebook evaluation

Evaluation with AUC and then comparison with the stability metric is shown below.

In [None]:
# for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
#     y_pred = best_model_xgb.predict(X)
#     base["score"] = y_pred

# print(
#     f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}'
# )
# print(
#     f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}'
# )
# print(
#     f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}'
# )

The AUC score on the train set is: 0.5000502625906061
The AUC score on the valid set is: 0.4999983101828382
The AUC score on the test set is: 0.5


In [None]:
# def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
#     gini_in_time = (
#         base.loc[:, ["WEEK_NUM", "target", "score"]]
#         .sort_values("WEEK_NUM")
#         .groupby("WEEK_NUM")[["target", "score"]]
#         .apply(lambda x: 2 * roc_auc_score(x["target"], x["score"]) - 1)
#         .tolist()
#     )

#     x = np.arange(len(gini_in_time))
#     y = gini_in_time
#     a, b = np.polyfit(x, y, 1)
#     y_hat = a * x + b
#     residuals = y - y_hat
#     res_std = np.std(residuals)
#     avg_gini = np.mean(gini_in_time)
#     return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std


# stability_score_train = gini_stability(base_train)
# stability_score_valid = gini_stability(base_valid)
# stability_score_test = gini_stability(base_test)

# print(f"The stability score on the train set is: {stability_score_train}")
# print(f"The stability score on the valid set is: {stability_score_valid}")
# print(f"The stability score on the test set is: {stability_score_test}")

The stability score on the train set is: 0.002716678063434127
The stability score on the valid set is: -0.0004015572928544619
The stability score on the test set is: -0.0003913620321528701


## Submission

Scoring the submission dataset is below, we need to take care of new categories. Then we save the score as a last step. 

In [None]:
# X_submission = data_submission[cols_pred].to_pandas()
# X_submission = convert_strings(X_submission)
# categorical_cols = X_train.select_dtypes(include=["category"]).columns

# for col in categorical_cols:
#     train_categories = set(X_train[col].cat.categories)
#     submission_categories = set(X_submission[col].cat.categories)
#     new_categories = submission_categories - train_categories
#     X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
#     new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
#     X_train[col] = X_train[col].astype(new_dtype)
#     X_submission[col] = X_submission[col].astype(new_dtype)

# y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [None]:
# submission = pd.DataFrame(
#     {"case_id": data_submission["case_id"].to_numpy(), "score": y_submission_pred}
# ).set_index("case_id")
# submission.to_csv("./submission.csv")

In [None]:
# data_submission

case_id,date_decision,MONTH,WEEK_NUM,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,maxannuity_4075009A,maxdebt4_972A,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,bool,str,f64,bool
57543,"""2021-05-14""",202201,100,191767.36,3674.6,1218.2001,16049.4,17054.4,14554.4,24482.0,20000.0,12154.4,0.0,20000.0,0.0,,"""a55475b1""","""a55475b1""",14000.0,"""a55475b1""",,,"""P109_133_183""","""P49_111_165""",24000.0,"""a55475b1""","""a55475b1""",34000.0,280983.56,,231440.03,131700.8,16672.6,157731.78,16641.4,"""a55475b1""",0.0,12154.4,12154.4,12154.4,456031.1,17859.6,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""38c061ee""","""a55475b1""",,,,,34000.0,False,,,
57549,"""2022-01-17""",202201,100,129704.4,5742.6,3546.6,32426.201,118964.805,13681.714,32426.201,75000.0,10638.2,10638.2,75000.0,0.0,,"""a55475b1""","""a55475b1""",94000.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",160000.0,"""a55475b1""","""P30_86_84""",44000.0,337659.8,,34066.0,122511.4,31820.6,21278.0,122511.4,"""a55475b1""",,10638.2,10638.2,10638.2,373720.84,126058.0,"""2fc785b2""","""39a0853f""","""a55475b1""","""a7fcb6e5""","""a55475b1""",,,26815.6,,49800.0,False,,,
57551,"""2020-11-27""",202201,100,71036.4,2844.6,0.0,8357.2,,0.0,9551.0,27095.201,0.0,0.0,27095.201,0.0,,"""a55475b1""","""a55475b1""",200000.0,"""P85_114_140""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",70000.0,83400.0,,54000.0,41783.402,54000.0,62619.0,,"""P11_36_178""",27095.201,0.0,0.0,0.0,75219.0,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,59600.0,False,,,
57552,"""2020-11-27""",202201,100,183992.0,6298.8003,12155.4,7440.4,,199322.4,9148.4,100000.0,191269.61,191269.61,100000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",150000.0,"""a55475b1""","""P94_109_143""",,110500.0,,188126.14,12155.4,104473.6,288642.6,12155.4,"""P21_87_50""",,191269.61,191269.61,191269.61,284213.0,18889.0,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,23402.8,,112000.0,False,"""OWNED""",,
57569,"""2021-12-20""",202201,100,0.0,4682.6,0.0,,,,10796.4,60000.0,0.0,0.0,60000.0,0.0,,"""a55475b1""","""a55475b1""",20000.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",40000.0,"""a55475b1""","""P94_109_143""",6000.0,37704.0,,64555.668,,,0.0,,"""a55475b1""",,0.0,0.0,0.0,95348.42,,"""2fc785b2""","""717ddd49""","""a55475b1""","""3439d993""","""a55475b1""",,,17333.6,,58000.0,False,"""OWNED""",,
57630,"""2021-03-16""",202201,100,0.0,8905.0,0.0,,,,,96174.0,0.0,0.0,96174.0,0.0,,"""P148_110_5""","""P161_88_182""",8876.0,"""P198_89_166""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",12000.0,1382.8,,0.0,,,,,"""a55475b1""",96174.0,0.0,0.0,0.0,9677.601,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,60000.0,False,"""OWNED""",,
57631,"""2022-06-04""",202201,100,,2540.6,0.0,,,,,24920.0,0.0,0.0,24920.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""P100_96_175""","""P165_57_169""",46279.8,"""P45_84_106""","""P94_109_143""",,0.0,,0.0,,,,,"""a55475b1""",24920.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,
57632,"""2022-02-05""",202201,100,63647.402,4732.0,0.0,3536.0,,10581.714,3536.0,25998.0,0.0,0.0,25998.0,0.0,,"""P53_45_92""","""P200_75_140""",50116.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",56000.0,7000.0,,63647.402,3536.0,63647.402,42412.0,3536.0,"""P159_143_123""",25998.0,0.0,0.0,0.0,63652.0,7071.4,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,15841.2,,,,,,
57633,"""2022-01-25""",202201,100,,8273.0,0.0,,,,,200000.0,0.0,0.0,200000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P85_114_140""",,,"""P159_130_59""","""P75_90_70""",64996.0,"""P45_84_106""","""P94_109_143""",,0.0,,0.0,,,,,"""a55475b1""",0.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""3439d993""","""a55475b1""",,,,,,,,,
57634,"""2021-01-27""",202201,100,39948.8,1165.8,0.0,3994.8,,1675.4,3358.4001,12108.2,0.0,0.0,12108.2,0.0,,"""P159_130_59""","""P174_113_42""",16494.201,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",50000.0,5000.0,,19798.0,4949.6,20887.201,20150.8,,"""a55475b1""",13998.0,0.0,0.0,0.0,39950.8,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,


In [None]:
auc_xgb = 0.7445891833980491
auc_rf = 0.7196116736124211
auc_meta = 0.718337101991866
auc_lightgbt = 

stability_xgb =  0.4531885133887121
stability_rf = 0.3986476891377608
stability_meta = 0.27299659685452415

