In [1]:
import os
import warnings
from pathlib import Path

import catboost
import lightgbm as lgbm
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from dotenv import load_dotenv
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

import src.features.basic as ftr_basic
import src.utils.io as io_utils
import src.visualization.plotting as visual

In [29]:
load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2


ROOT = Path(os.getenv("ROOT"))
CONFIG_DIR = ROOT / Path("src/config")
EXP_PATH = ROOT / "experiments/stacking"
DATA_CFG = io_utils.load_yaml(CONFIG_DIR / Path("data.yaml"))
MODELS_CFG = io_utils.load_yaml(CONFIG_DIR / Path("models.yml"))
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Sections of the data config used by this notebook.
DATA_ORIGIN_PATH = DATA_CFG["new_features_data_full"]
DATA_SYN_PATH = DATA_CFG["new_features_train_data"]
DATA_TEST_PATH = DATA_CFG["data_raw"]["test_path"]

# Sections of the models config, one per boosted-tree model.
# The "searh" spelling is kept exactly as written; presumably it matches the
# keys in the models config file -- verify before renaming.
CB_MODEL_PATH, LGBM_MODEL_PATH, XGB_MODEL_PATH = (
    MODELS_CFG[section]
    for section in ("catboost_searh", "lgbm_searh", "xgboost_searh")
)

In [14]:
# Origin dataset: engineered features and the matching target.
data_origin = io_utils.load_df_parquet(ROOT / DATA_ORIGIN_PATH["data_new_features_path"])
target_origin = io_utils.load_df_parquet(ROOT / DATA_ORIGIN_PATH["target_path"])

# Synthetic dataset: train/validation features and targets, loaded in the
# same order as the names on the left-hand side.
df_train, df_val, y_train, y_val = (
    io_utils.load_df_parquet(ROOT / DATA_SYN_PATH[path_key])
    for path_key in (
        "train_new_features_path",
        "val_new_features_path",
        "train_target_path",
        "val_target_path",
    )
)

# Stack train on top of validation; features and targets are concatenated in
# the same order so their rows stay paired.
data_syntetic = pd.concat([df_train, df_val])
target_syntetic = pd.concat([y_train, y_val])

# Report both dataset shapes, then preview the synthetic features.
print("Syntetic data size: ", data_syntetic.shape, "\nOrigin data size: ", data_origin.shape)
data_syntetic.head()

Syntetic data size:  (749999, 23) 
Origin data size:  (45209, 23)


Unnamed: 0,age,job,marital,education,contact,poutcome,was_contact,credit_score,job_marital,job_education,...,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day,jb_mean
0,30,blue-collar,married,primary,unknown,unknown,0,1,blue-collar_married,blue-collar_primary,...,0,5.697093,7.300473,41.591476,0,0.866025,-0.5,0.299363,-0.954139,972.5004
1,33,technician,divorced,secondary,cellular,unknown,0,0,technician_divorced,technician_secondary,...,0,6.137727,3.871201,23.760375,0,-0.5,-0.866025,0.937752,0.347305,1073.106634
2,28,blue-collar,single,primary,telephone,unknown,0,1,blue-collar_single,blue-collar_primary,...,0,6.276643,6.318968,39.66191,0,0.866025,-0.5,0.651372,-0.758758,972.5004
3,29,unemployed,single,primary,cellular,unknown,0,0,unemployed_single,unemployed_primary,...,0,5.46806,7.635787,41.752942,0,0.5,0.866025,0.394356,0.918958,1429.215033
4,55,technician,married,secondary,cellular,success,1,1,technician_married,technician_secondary,...,<5,6.419995,6.763885,43.424107,0,0.5,-0.866025,0.988468,0.151428,1073.106634


In [32]:
# Test-set features and ids (going by the file names), read from the
# processed data folder under the project root.
# NOTE(review): these paths are hardcoded in this cell rather than taken from
# the data config -- confirm whether that is intentional.
test = io_utils.load_df_parquet(
    ROOT / "data/processed/test_new_features.parquet"
)
test_ids = io_utils.load_df_parquet(
    ROOT / "data/processed/test_ids.parquet"
)
test_ids

Unnamed: 0,id
0,750000
1,750001
2,750002
3,750003
4,750004
...,...
249995,999995
249996,999996
249997,999997
249998,999998


In [17]:
# Feature-name groups are derived from the synthetic frame before any dtype
# conversion below.
ftr_names = ftr_basic.get_features_names(data_syntetic)
cat_features = ftr_names["categorical"]
num_features = ftr_names["numeric"]

# Apply the same categorical conversion to both frames (synthetic first,
# then origin).
data_syntetic, data_origin = (
    ftr_basic.cat_features_to_category(frame)
    for frame in (data_syntetic, data_origin)
)

# Align the categorical levels of the two frames for the listed columns.
data_origin, data_syntetic = ftr_basic.align_categorical_levels(
    data_origin,
    data_syntetic,
    cat_features,
)

In [25]:
# For each model: read its metadata YAML, then pull out the searched
# hyper-parameters and the best iteration count recorded there.

# CatBoost
cb_meta = io_utils.load_yaml(ROOT / CB_MODEL_PATH["meta_path"])
cb_params, cb_iter = cb_meta["best_search_params"], cb_meta["best_iteration"]

# LightGBM
lgbm_meta = io_utils.load_yaml(ROOT / LGBM_MODEL_PATH["meta_path"])
lgbm_params, lgbm_iter = lgbm_meta["best_search_params"], lgbm_meta["best_iteration"]

# XGBoost
xgb_meta = io_utils.load_yaml(ROOT / XGB_MODEL_PATH["meta_path"])
xgb_params, xgb_iter = xgb_meta["best_search_params"], xgb_meta["best_iteration"]

In [26]:
# Drop CatBoost's overfitting-detector settings and train for the stored best
# iteration count instead (presumably because the final fit has no eval set
# -- confirm).
# pop(..., None) keeps this cell idempotent: running it a second time no
# longer raises KeyError once the keys are already gone.
cb_params.pop("od_wait", None)
cb_params.pop("od_type", None)
cb_params["iterations"] = cb_iter
cb_params

{'rsm': 0.9,
 'random_strength': 0.5,
 'allow_writing_files': False,
 'eval_metric': 'AUC',
 'verbose': 0,
 'iterations': 5979,
 'bagging_temperature': 0.5,
 'auto_class_weights': 'Balanced',
 'loss_function': 'Logloss',
 'l2_leaf_reg': 3,
 'depth': 6,
 'min_data_in_leaf': 20,
 'learning_rate': 0.05,
 'random_seed': 42}

In [27]:
# Wrap the origin features and target in a CatBoost Pool, declaring which
# columns are categorical.
cb_train_pool = Pool(
    data=data_origin,
    label=target_origin,
    cat_features=cat_features,
)

# Train a CatBoost classifier on the origin data with the prepared parameters.
cb_origin = CatBoostClassifier(**cb_params)
cb_origin.fit(cb_train_pool)

<catboost.core.CatBoostClassifier at 0x16795acc0>

In [30]:
# Destination of the CatBoost model fitted on the origin data.
CB_PATH = EXP_PATH / "cb_origin_model.cbm"

# Create the experiment directory if needed so saving works on a fresh
# checkout where it does not exist yet.
CB_PATH.parent.mkdir(parents=True, exist_ok=True)
# save_model documents a string file name; pass str() rather than a Path.
cb_origin.save_model(str(CB_PATH))

In [None]:
# predict_proba returns one row per sample and one column per class. Keep
# column 1 (the second class) for every row; the previous `[:1]` slice kept
# only the first row of that matrix.
# NOTE(review): column 1 is the positive class only if the target is encoded
# as 0/1 -- confirm against the target values.
cb_origin_pred = cb_origin.predict_proba(data_syntetic)[:, 1]