In [4]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import warnings, io, sys, contextlib
from tqdm.auto import tqdm
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool, CatBoostError

# ---------- global silence for Python warnings ----------
warnings.filterwarnings("ignore")

# ---------- context manager to silence stdout/stderr from libs ----------
@contextlib.contextmanager
def suppress_output():
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
        yield
    finally:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr

# ---------- Optuna: silence its own logging; we’ll use its built-in progress bar ----------
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [6]:
# original_df = pd.read_csv("data/bank-full.csv", sep = ";")
data_train = pd.read_csv("/teamspace/studios/this_studio/Binary-Classification-with-a-Bank-Dataset/data/train.csv")
data_train = data_train.drop(["id"], axis =1)
# Convert 'y' column in original_df from 'yes'/'no' to 1/0
# original_df['y'] = original_df['y'].map({'yes': 1, 'no': 0})

# Concatenate original_df and df (ignore index to avoid duplicate indices)
# data_train = pd.concat([original_df, df], ignore_index=True)

data_test = pd.read_csv("/teamspace/studios/this_studio/Binary-Classification-with-a-Bank-Dataset/data/test.csv")

In [7]:

# ================================
# Data from Step 2
# ================================
train_f = data_train.copy()
test_f = data_test.copy()

TARGET_COL = "y"
id_col = "id"
train_f[TARGET_COL] = train_f[TARGET_COL].astype(int)

# 2) Cast ALL object columns to 'category' and align categories across train/test
all_obj_cols = [c for c in train_f.columns if train_f[c].dtype == "O"]
for c in all_obj_cols:
    train_f[c] = train_f[c].astype("category")
    if c in test_f.columns:
        test_f[c] = test_f[c].astype("category")

# Also align categories for any pre-existing 'category' dtype columns (e.g., campaign_bins)
all_cat_cols = [c for c in train_f.columns if str(train_f[c].dtype).startswith("category")]
for c in all_cat_cols:
    if c in test_f.columns:
        cats = sorted(list(set(train_f[c].cat.categories.tolist()) |
                           set(test_f[c].cat.categories.tolist())))
        train_f[c] = train_f[c].cat.set_categories(cats)
        test_f[c]  = test_f[c].cat.set_categories(cats)

# 3) Build feature matrices
TARGET_COL = "y"
features = [c for c in train_f.columns if c not in {TARGET_COL, id_col}]
X = train_f[features].copy()
y = train_f[TARGET_COL].copy()
X_test = test_f[features].copy()

# For LightGBM / CatBoost: list of categorical feature names + indices
cat_cols = [c for c in X.columns if str(X[c].dtype).startswith("category")]
cat_idx  = [X.columns.get_loc(c) for c in cat_cols]

# For XGBoost: ONLY numeric/boolean columns
xgb_features = [c for c in X.columns if not str(X[c].dtype).startswith("category")]
# (LightGBM error stemmed from object dtypes; now fixed by casting to category)

# ================================
# CV setup
# ================================
N_SPLITS = 5
RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# ================================
# Model tuning functions (silent; no prints; no logs)
# ================================
def tune_lgb(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "verbose": -1,
        "is_unbalance": True,
        "seed": RANDOM_STATE,
    }
    oof = np.zeros(len(X))
    for tr_idx, va_idx in skf.split(X, y):
        dtr = lgb.Dataset(X.iloc[tr_idx], y.iloc[tr_idx], categorical_feature=cat_cols, free_raw_data=False)
        dva = lgb.Dataset(X.iloc[va_idx], y.iloc[va_idx], categorical_feature=cat_cols, reference=dtr, free_raw_data=False)
        with suppress_output():
            model = lgb.train(
                params,
                dtr,
                valid_sets=[dva],
                callbacks=[
                    lgb.early_stopping(100, verbose=False),
                    lgb.log_evaluation(0)  # no per-iter logs
                ],
            )
            oof[va_idx] = model.predict(X.iloc[va_idx], num_iteration=model.best_iteration)
    return roc_auc_score(y, oof)

def tune_xgb(trial):
    # Try GPU; if unavailable, silently fall back to CPU
    base = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        "verbosity": 0,  # silence XGBoost logs
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 0.0, 5.0),
        "alpha": trial.suggest_float("alpha", 0.0, 5.0),
        "scale_pos_weight": 1.0,
    }
    oof = np.zeros(len(X))
    for tr_idx, va_idx in skf.split(X, y):
        dtr = xgb.DMatrix(X.iloc[tr_idx][xgb_features], label=y.iloc[tr_idx])
        dva = xgb.DMatrix(X.iloc[va_idx][xgb_features], label=y.iloc[va_idx])
        params = dict(base)
        params["device"] = "cuda"
        with suppress_output():
            try:
                model = xgb.train(params, dtr, num_boost_round=5000, evals=[(dva, "valid")],
                                  early_stopping_rounds=100, verbose_eval=False)
            except Exception:
                params["device"] = "cpu"
                model = xgb.train(params, dtr, num_boost_round=5000, evals=[(dva, "valid")],
                                  early_stopping_rounds=100, verbose_eval=False)
            oof[va_idx] = model.predict(dva, iteration_range=(0, model.best_iteration))
    return roc_auc_score(y, oof)

def tune_cat(trial):
    params = {
        "iterations": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "bootstrap_type": "Bernoulli",
        "eval_metric": "AUC",
        "task_type": "GPU",   # CatBoost will auto-fallback in our try/except
        "random_seed": RANDOM_STATE,
        "verbose": False,
        "logging_level": "Silent",
        "allow_writing_files": False,
        "use_best_model": True,
    }
    oof = np.zeros(len(X))
    for tr_idx, va_idx in skf.split(X, y):
        train_pool = Pool(X.iloc[tr_idx], y.iloc[tr_idx], cat_features=cat_cols)
        valid_pool = Pool(X.iloc[va_idx], y.iloc[va_idx], cat_features=cat_cols)
        with suppress_output():
            try:
                model = CatBoostClassifier(**params)
                model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100, verbose=False)
            except CatBoostError:
                params_cpu = dict(params)
                params_cpu["task_type"] = "CPU"
                model = CatBoostClassifier(**params_cpu)
                model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100, verbose=False)
            oof[va_idx] = model.predict_proba(valid_pool)[:, 1]
    return roc_auc_score(y, oof)

# ================================
# Run tuning with progress bars only
# ================================
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(tune_lgb, n_trials=20, show_progress_bar=True)

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(tune_xgb, n_trials=20, show_progress_bar=True)

study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(tune_cat, n_trials=20, show_progress_bar=True)

# ================================
# Train final models with best params and blend (single tqdm bar)
# ================================
def train_oof_preds():
    preds_test_all, oof_all = [], []
    model_specs = [
        ("lgb", study_lgb.best_params, "lightgbm"),
        ("xgb", study_xgb.best_params, "xgboost"),
        ("cat", study_cat.best_params, "catboost"),
    ]
    total_steps = len(model_specs) * N_SPLITS
    with tqdm(total=total_steps, desc="Training blended models", leave=True) as pbar:
        for name, params, which in model_specs:
            oof = np.zeros(len(X))
            preds_test = np.zeros(len(X_test))
            for tr_idx, va_idx in skf.split(X, y):
                if which == "lightgbm":
                    dtr = lgb.Dataset(X.iloc[tr_idx], y.iloc[tr_idx], categorical_feature=cat_cols, free_raw_data=False)
                    dva = lgb.Dataset(X.iloc[va_idx], y.iloc[va_idx], categorical_feature=cat_cols, reference=dtr, free_raw_data=False)
                    with suppress_output():
                        model = lgb.train(
                            {**params, "objective": "binary", "metric": "auc", "verbose": -1},
                            dtr,
                            valid_sets=[dva],
                            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)],
                        )
                        oof[va_idx] = model.predict(X.iloc[va_idx], num_iteration=model.best_iteration)
                        preds_test += model.predict(X_test, num_iteration=model.best_iteration) / N_SPLITS

                elif which == "xgboost":
                    dtr = xgb.DMatrix(X.iloc[tr_idx][xgb_features], label=y.iloc[tr_idx])
                    dva = xgb.DMatrix(X.iloc[va_idx][xgb_features], label=y.iloc[va_idx])
                    params_full = {**params, "objective": "binary:logistic", "eval_metric": "auc", "verbosity": 0}
                    if "device" not in params_full:
                        params_full["device"] = "cuda"
                    with suppress_output():
                        try:
                            model = xgb.train(params_full, dtr, num_boost_round=5000,
                                              evals=[(dva, "valid")], early_stopping_rounds=100, verbose_eval=False)
                        except Exception:
                            params_full["device"] = "cpu"
                            model = xgb.train(params_full, dtr, num_boost_round=5000,
                                              evals=[(dva, "valid")], early_stopping_rounds=100, verbose_eval=False)
                        oof[va_idx] = model.predict(dva, iteration_range=(0, model.best_iteration))
                        preds_test += model.predict(xgb.DMatrix(X_test[xgb_features]), iteration_range=(0, model.best_iteration)) / N_SPLITS

                elif which == "catboost":
                    train_pool = Pool(X.iloc[tr_idx], y.iloc[tr_idx], cat_features=cat_cols)
                    valid_pool = Pool(X.iloc[va_idx], y.iloc[va_idx], cat_features=cat_cols)
                    with suppress_output():
                        try:
                            model = CatBoostClassifier(**params, iterations=5000, eval_metric="AUC",
                                                       verbose=False, logging_level="Silent", allow_writing_files=False,
                                                       use_best_model=True)
                            model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100)
                        except CatBoostError:
                            model = CatBoostClassifier(**{**params, "task_type": "CPU"}, iterations=5000,
                                                       eval_metric="AUC", verbose=False, logging_level="Silent",
                                                       allow_writing_files=False, use_best_model=True)
                            model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100)
                        oof[va_idx] = model.predict_proba(valid_pool)[:, 1]
                        preds_test += model.predict_proba(Pool(X_test, cat_features=cat_cols))[:, 1] / N_SPLITS
                pbar.update(1)
            preds_test_all.append(preds_test)
            oof_all.append(oof)
    return np.array(oof_all), np.array(preds_test_all)

oof_models, preds_models = train_oof_preds()

# ================================
# Blend weight tuning (only Optuna progress bar)
# ================================
def tune_blend(trial):
    w_lgb = trial.suggest_float("w_lgb", 0, 1)
    w_xgb = trial.suggest_float("w_xgb", 0, 1)
    w_cat = trial.suggest_float("w_cat", 0, 1)
    weights = np.array([w_lgb, w_xgb, w_cat], dtype=float)
    if weights.sum() == 0:
        weights = np.array([1.0, 1.0, 1.0])
    weights /= weights.sum()
    blended = np.average(oof_models, axis=0, weights=weights)
    return roc_auc_score(y, blended)

study_blend = optuna.create_study(direction="maximize")
study_blend.optimize(tune_blend, n_trials=50, show_progress_bar=True)

best_w = np.array([study_blend.best_params["w_lgb"],
                   study_blend.best_params["w_xgb"],
                   study_blend.best_params["w_cat"]], dtype=float)
if best_w.sum() == 0:
    best_w = np.array([1.0, 1.0, 1.0])
best_w /= best_w.sum()

final_preds = np.average(preds_models, axis=0, weights=best_w)

Best trial: 10. Best value: 0.968223: 100%|██████████| 20/20 [18:16<00:00, 54.83s/it]
Best trial: 15. Best value: 0.949196: 100%|██████████| 20/20 [26:37<00:00, 79.89s/it] 
  0%|          | 0/20 [00:00<?, ?it/s]


[W 2025-08-19 01:44:55,347] Trial 0 failed with parameters: {'learning_rate': 0.03653747311121259, 'depth': 9, 'l2_leaf_reg': 8.22421335806522, 'subsample': 0.9114694136271042} because of the following error: CatBoostError("Only one of parameters ['verbose', 'logging_level', 'verbose_eval', 'silent'] should be set").
Traceback (most recent call last):
  File "/tmp/ipykernel_7344/3861333143.py", line 142, in tune_cat
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100, verbose=False)
    ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.13/site-packages/catboost/core.py", line 5241, in fit
    _process_synonyms(params)
    ~~~~~~~~~~~~~~~~~^^^^^^^^
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.13/site-packages/catboost/core.py", line 1645, in _process_synonyms
    metric_period, verbose, logging_level = _process_verbose(
                                          

CatBoostError: Only one of parameters ['verbose', 'logging_level', 'verbose_eval', 'silent'] should be set

In [None]:

# ================================
# Submission
# ================================
sub = pd.DataFrame({id_col: test_f[id_col], TARGET_COL: final_preds})
sub.to_csv("submission_step3_blend.csv", index=False)
print("Saved blended submission with weights:", best_w)
