In [63]:
import os
import warnings
from pathlib import Path

import catboost
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from dotenv import load_dotenv
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

import src.features.basic as ftr_basic
import src.models.training as training
import src.utils.io as io_utils
import src.visualization.plotting as visual

In [29]:
load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2


ROOT = Path(os.getenv("ROOT"))
CONFIG_DIR = ROOT / Path("src/config")
EXP_PATH = ROOT / "experiments/stacking"
DATA_CFG = io_utils.load_yaml(CONFIG_DIR / Path("data.yaml"))
MODELS_CFG = io_utils.load_yaml(CONFIG_DIR / Path("models.yml"))
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Config sub-sections describing where the feature-engineered datasets live.
DATA_ORIGIN_PATH, DATA_SYN_PATH = (
    DATA_CFG[section]
    for section in ("new_features_data_full", "new_features_train_data")
)

# Config sub-sections of the three tuned boosters.
# NOTE(review): the "searh" spelling presumably mirrors the keys in models.yml;
# verify against the config before correcting it here.
CB_MODEL_PATH, LGBM_MODEL_PATH, XGB_MODEL_PATH = (
    MODELS_CFG[section]
    for section in ("catboost_searh", "lgbm_searh", "xgboost_searh")
)

# Raw test-set location taken from the data config.
DATA_TEST_PATH = DATA_CFG["data_raw"]["test_path"]

In [14]:
# Original dataset: engineered features plus target.
data_origin = io_utils.load_df_parquet(
    ROOT / DATA_ORIGIN_PATH["data_new_features_path"]
)
target_origin = io_utils.load_df_parquet(ROOT / DATA_ORIGIN_PATH["target_path"])

# Synthetic dataset: stored as separate train / validation splits.
df_train = io_utils.load_df_parquet(ROOT / DATA_SYN_PATH["train_new_features_path"])
df_val = io_utils.load_df_parquet(ROOT / DATA_SYN_PATH["val_new_features_path"])
y_train = io_utils.load_df_parquet(ROOT / DATA_SYN_PATH["train_target_path"])
y_val = io_utils.load_df_parquet(ROOT / DATA_SYN_PATH["val_target_path"])

# Merge both splits into one synthetic set. ignore_index=True gives features
# and target a clean 0..n-1 index, so they line up row by row with the
# prediction frames that are built later from plain NumPy arrays (which get a
# default RangeIndex). Without it the split-level labels are carried over.
data_syntetic = pd.concat([df_train, df_val], ignore_index=True)
target_syntetic = pd.concat([y_train, y_val], ignore_index=True)

# Features and target must describe the same rows.
assert len(data_syntetic) == len(target_syntetic), (
    f"feature/target row mismatch: {len(data_syntetic)} vs {len(target_syntetic)}"
)

print(
    "Syntetic data size: ",
    data_syntetic.shape,
    "\nOrigin data size: ",
    data_origin.shape,
)
data_syntetic.head()

Syntetic data size:  (749999, 23) 
Origin data size:  (45209, 23)


Unnamed: 0,age,job,marital,education,contact,poutcome,was_contact,credit_score,job_marital,job_education,...,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day,jb_mean
0,30,blue-collar,married,primary,unknown,unknown,0,1,blue-collar_married,blue-collar_primary,...,0,5.697093,7.300473,41.591476,0,0.866025,-0.5,0.299363,-0.954139,972.5004
1,33,technician,divorced,secondary,cellular,unknown,0,0,technician_divorced,technician_secondary,...,0,6.137727,3.871201,23.760375,0,-0.5,-0.866025,0.937752,0.347305,1073.106634
2,28,blue-collar,single,primary,telephone,unknown,0,1,blue-collar_single,blue-collar_primary,...,0,6.276643,6.318968,39.66191,0,0.866025,-0.5,0.651372,-0.758758,972.5004
3,29,unemployed,single,primary,cellular,unknown,0,0,unemployed_single,unemployed_primary,...,0,5.46806,7.635787,41.752942,0,0.5,0.866025,0.394356,0.918958,1429.215033
4,55,technician,married,secondary,cellular,success,1,1,technician_married,technician_secondary,...,<5,6.419995,6.763885,43.424107,0,0.5,-0.866025,0.988468,0.151428,1073.106634


In [32]:
# Load the feature-engineered test set and the matching id column.
# NOTE(review): both paths are hard-coded relative to ROOT rather than read
# from DATA_CFG; DATA_TEST_PATH defined earlier is not used here.
test = io_utils.load_df_parquet(ROOT / "data/processed/test_new_features.parquet")
# presumably row-aligned with `test` (one id per test row) — TODO confirm
test_ids = io_utils.load_df_parquet(ROOT / "data/processed/test_ids.parquet")
test_ids

Unnamed: 0,id
0,750000
1,750001
2,750002
3,750003
4,750004
...,...
249995,999995
249996,999996
249997,999997
249998,999998


In [34]:
# Column groups are derived from the synthetic frame and reused for all frames.
ftr_names = ftr_basic.get_features_names(data_syntetic)
cat_features = ftr_names["categorical"]
num_features = ftr_names["numeric"]

# Apply the same categorical conversion to the three frames.
data_syntetic, data_origin, test = (
    ftr_basic.cat_features_to_category(frame)
    for frame in (data_syntetic, data_origin, test)
)

# Align category levels of the origin frame with the synthetic frame, then
# with the test frame.
# NOTE(review): if align_categorical_levels changes the levels of its first
# argument, the second call may leave data_origin and data_syntetic out of
# sync again — confirm against the helper's implementation.
data_origin, data_syntetic = ftr_basic.align_categorical_levels(
    data_origin, data_syntetic, cat_features
)
data_origin, test = ftr_basic.align_categorical_levels(
    data_origin, test, cat_features
)

In [25]:
# Read the stored search metadata of each tuned booster.
cb_meta, lgbm_meta, xgb_meta = (
    io_utils.load_yaml(ROOT / model_cfg["meta_path"])
    for model_cfg in (CB_MODEL_PATH, LGBM_MODEL_PATH, XGB_MODEL_PATH)
)

# Per model: the tuned hyper-parameters and the stored best iteration.
cb_params, cb_iter = cb_meta["best_search_params"], cb_meta["best_iteration"]
lgbm_params, lgbm_iter = (
    lgbm_meta["best_search_params"],
    lgbm_meta["best_iteration"],
)
xgb_params, xgb_iter = xgb_meta["best_search_params"], xgb_meta["best_iteration"]

In [26]:
# Drop CatBoost's overfitting-detector settings and pin the tree count to the
# stored best iteration (presumably because the final fit has no eval set —
# confirm). pop(..., None) keeps the cell re-runnable: a second execution no
# longer raises KeyError on keys that were already removed.
for od_key in ("od_wait", "od_type"):
    cb_params.pop(od_key, None)
cb_params["iterations"] = cb_iter
cb_params

{'rsm': 0.9,
 'random_strength': 0.5,
 'allow_writing_files': False,
 'eval_metric': 'AUC',
 'verbose': 0,
 'iterations': 5979,
 'bagging_temperature': 0.5,
 'auto_class_weights': 'Balanced',
 'loss_function': 'Logloss',
 'l2_leaf_reg': 3,
 'depth': 6,
 'min_data_in_leaf': 20,
 'learning_rate': 0.05,
 'random_seed': 42}

In [27]:
# Fit CatBoost on the original dataset, passing the categorical columns
# through a Pool.
cb_train_pool = Pool(
    data=data_origin,
    label=target_origin,
    cat_features=cat_features,
)
cb_origin = CatBoostClassifier(**cb_params)
cb_origin.fit(cb_train_pool)

<catboost.core.CatBoostClassifier at 0x16795acc0>

In [30]:
# Persist the origin-trained CatBoost model inside the experiment folder.
CB_PATH = EXP_PATH / "cb_origin_model.cbm"

# Create the experiment folder on first use so the save cannot fail on a
# fresh checkout where the directory does not exist yet.
EXP_PATH.mkdir(parents=True, exist_ok=True)
cb_origin.save_model(CB_PATH)

In [38]:
# Score the synthetic rows with the origin-trained CatBoost model and keep the
# second probability column (assumes binary 0/1 labels, so column 1 is the
# positive class — TODO confirm).
cb_origin_pred = cb_origin.predict_proba(data_syntetic)[:, 1]

# Number of predictions produced.
len(cb_origin_pred)

749999

In [40]:
# Start the frame of meta-features for the synthetic rows; one column per
# base model is added to it in later cells.
df_pred_syn = pd.DataFrame({"cb_origin": cb_origin_pred})

df_pred_syn.head()

Unnamed: 0,cb_origin
0,0.00082
1,0.786712
2,0.779987
3,0.340208
4,0.980317


In [42]:
# Keys removed from the tuned LightGBM parameters before the final fit. The
# displayed dict still carries LightGBM's native aliases for most of them
# (feature_fraction, bagging_fraction, bagging_freq, min_data_in_leaf).
pop_param = [
    "metric",
    "colsample_bytree",
    "subsample",
    "subsample_freq",
    "min_child_samples",
]
# pop(..., None) keeps the cell re-runnable: a second execution no longer
# raises KeyError on keys that were already removed.
for pp in pop_param:
    lgbm_params.pop(pp, None)
# Train for exactly the stored best number of boosting rounds.
lgbm_params["n_estimators"] = lgbm_iter

lgbm_params

{'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'importance_type': 'split',
 'learning_rate': 0.02,
 'max_depth': -1,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 2764,
 'n_jobs': -1,
 'num_leaves': 106,
 'objective': 'binary',
 'random_state': 42,
 'reg_alpha': 0.5214297905300546,
 'reg_lambda': 1.236243687952708,
 'subsample_for_bin': 200000,
 'min_data_in_leaf': 137,
 'feature_fraction': 0.7282532043662948,
 'bagging_fraction': 0.9926867054910268,
 'bagging_freq': 2,
 'max_cat_to_onehot': 6,
 'cat_l2': 19.205212018125557,
 'cat_smooth': 228.1355236625612}

In [44]:
# Fit LightGBM on the original dataset with the categorical columns declared
# explicitly.
lgbm_origin = LGBMClassifier(**lgbm_params)
lgbm_origin.fit(
    X=data_origin,
    y=target_origin,
    categorical_feature=cat_features,
)

[LightGBM] [Info] Number of positive: 5289, number of negative: 39920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 45209, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [45]:
# Score the synthetic rows with the origin-trained LightGBM model and add the
# second probability column as a new meta-feature.
lgbm_origin_pred = lgbm_origin.predict_proba(data_syntetic)[:, 1]
df_pred_syn = df_pred_syn.assign(lgbm_origin=lgbm_origin_pred)

df_pred_syn.head()



Unnamed: 0,cb_origin,lgbm_origin
0,0.00082,0.000456
1,0.786712,0.521691
2,0.779987,0.639053
3,0.340208,0.145469
4,0.980317,0.923032


In [47]:
# Pin the number of trees to the stored best iteration.
# NOTE(review): XGBoost's own best_iteration attribute is zero-based; confirm
# whether the stored value already accounts for that.
xgb_params.update(n_estimators=xgb_iter)

xgb_params

{'objective': 'binary:logistic',
 'eval_metric': 'auc',
 'tree_method': 'hist',
 'enable_categorical': True,
 'random_state': 42,
 'scale_pos_weight': 7.2883770960422005,
 'learning_rate': 0.024897474486594344,
 'max_depth': 8,
 'min_child_weight': 7.792508169512386,
 'subsample': 0.8994432077827796,
 'colsample_bytree': 0.95631699635141,
 'reg_alpha': 0.4790048592166046,
 'reg_lambda': 0.11896889876122446,
 'gamma': 0.008867213299758848,
 'max_cat_to_onehot': 15,
 'n_estimators': 1037}

In [48]:
# Fit XGBoost on the original dataset; the parameter dict displayed above
# sets enable_categorical, so no explicit categorical column list is passed.
xgb_origin = XGBClassifier(**xgb_params)
xgb_origin.fit(X=data_origin, y=target_origin)

In [49]:
# Score the synthetic rows with the origin-trained XGBoost model and add the
# second probability column as a new meta-feature.
xgb_origin_pred = xgb_origin.predict_proba(data_syntetic)[:, 1]
df_pred_syn = df_pred_syn.assign(xgb_origin=xgb_origin_pred)

df_pred_syn.head()

Unnamed: 0,cb_origin,lgbm_origin,xgb_origin
0,0.00082,0.000456,0.002163
1,0.786712,0.521691,0.575666
2,0.779987,0.639053,0.707706
3,0.340208,0.145469,0.208254
4,0.980317,0.923032,0.873314


In [57]:
# Second probability column on the test set from each origin-trained model.
cb_test_pred = cb_origin.predict_proba(test)[:, 1]
lgbm_test_pred = lgbm_origin.predict_proba(test)[:, 1]
xgb_test_pred = xgb_origin.predict_proba(test)[:, 1]

# Collect them under the same column names used for the synthetic rows.
df_pred_test = pd.DataFrame(
    {
        "cb_origin": cb_test_pred,
        "lgbm_origin": lgbm_test_pred,
        "xgb_origin": xgb_test_pred,
    }
)

df_pred_test.head()



Unnamed: 0,cb_origin,lgbm_origin,xgb_origin
0,0.004368,0.000704,0.010442
1,0.262193,0.102707,0.518493
2,0.00169,0.000247,0.00155
3,0.000153,0.000464,0.000866
4,0.049807,0.00237,0.030053


In [None]:
# Parameter sets keyed by the short model name passed to the training helpers.
params = {"cb": cb_params, "lgbm": lgbm_params, "xgb": xgb_params}

# Out-of-fold predictions on the synthetic data, one new column per model.
# NOTE(review): if oof_pred returns a pandas Series, the column assignment
# aligns on index labels rather than on position — confirm the returned type.
for name, model_params in params.items():
    pred, auc_mean = training.oof_pred(
        data_syntetic,
        target_syntetic,
        name,
        params=model_params,
        cv=5,
        random_state=RANDOM_STATE,
    )
    print(f"mean AUC on syntetic {name}: {auc_mean:.4f}")
    df_pred_syn[f"{name}_syn"] = pred

df_pred_syn.head()

OOF cb:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Test-set predictions from models trained on the synthetic data, stored under
# the same "<name>_syn" column names as the out-of-fold columns.
for name, model_params in params.items():
    df_pred_test[f"{name}_syn"] = training.test_pred_syn_train(
        data_syntetic,
        target_syntetic,
        test,
        model_name=name,
        params=model_params,
    )

df_pred_test.head()

In [None]:
# Write the meta-model inputs (synthetic predictions, test predictions and the
# synthetic target) to the locations configured under "preds_for_meta".
PATH_PREDS = DATA_CFG["preds_for_meta"]
for frame, path_key in (
    (df_pred_syn, "preds_on_syn"),
    (df_pred_test, "preds_on_test"),
    (target_syntetic, "target_on_syn"),
):
    io_utils.save_df_parquet(frame, ROOT / PATH_PREDS[path_key])