In [1]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool


In [2]:
# original_df = pd.read_csv("data/bank-full.csv", sep = ";")
data_train = pd.read_csv("data/train.csv")
data_train = data_train.drop(["id"], axis =1)
# Convert 'y' column in original_df from 'yes'/'no' to 1/0
# original_df['y'] = original_df['y'].map({'yes': 1, 'no': 0})

# Concatenate original_df and df (ignore index to avoid duplicate indices)
# data_train = pd.concat([original_df, df], ignore_index=True)

data_test = pd.read_csv("data/test.csv")

In [4]:

# ================================
# Data from Step 2
# ================================
train_f = data_train.copy()
test_f = data_test.copy()

TARGET_COL = "y"
id_col = "id"
train_f[TARGET_COL] = train_f[TARGET_COL].astype(int)

# 2) Cast ALL object columns to 'category' and align categories across train/test
all_obj_cols = [c for c in train_f.columns if train_f[c].dtype == "O"]
for c in all_obj_cols:
    train_f[c] = train_f[c].astype("category")
    if c in test_f.columns:
        test_f[c] = test_f[c].astype("category")

# Also align categories for any pre-existing 'category' dtype columns (e.g., campaign_bins)
all_cat_cols = [c for c in train_f.columns if str(train_f[c].dtype).startswith("category")]
for c in all_cat_cols:
    if c in test_f.columns:
        cats = sorted(list(set(train_f[c].cat.categories.tolist()) |
                           set(test_f[c].cat.categories.tolist())))
        train_f[c] = train_f[c].cat.set_categories(cats)
        test_f[c]  = test_f[c].cat.set_categories(cats)

# 3) Build feature matrices
TARGET_COL = "y"
features = [c for c in train_f.columns if c not in {TARGET_COL, id_col}]
X = train_f[features].copy()
y = train_f[TARGET_COL].copy()
X_test = test_f[features].copy()

# For LightGBM / CatBoost: list of categorical feature names + indices
cat_cols = [c for c in X.columns if str(X[c].dtype).startswith("category")]
cat_idx  = [X.columns.get_loc(c) for c in cat_cols]

# For XGBoost: ONLY numeric/boolean columns
xgb_features = [c for c in X.columns if not str(X[c].dtype).startswith("category")]
# (LightGBM error stemmed from object dtypes; now fixed by casting to category)

# ================================
# CV setup
# ================================
N_SPLITS = 5
RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# ================================
# Model tuning functions
# ================================
def tune_lgb(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "verbose": -1,
        "is_unbalance": True,
        "seed": RANDOM_STATE
    }
    oof = np.zeros(len(X))
    for train_idx, valid_idx in skf.split(X, y):
        dtrain = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx], categorical_feature=cat_cols)
        dvalid = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx], categorical_feature=cat_cols)
        model = lgb.train(params, dtrain, valid_sets=[dvalid],
                          callbacks=[lgb.early_stopping(100, verbose=False)])
        oof[valid_idx] = model.predict(X.iloc[valid_idx], num_iteration=model.best_iteration)
    return roc_auc_score(y, oof)

def tune_xgb(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        "device": "cuda",  # GPU if available
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 0.0, 5.0),
        "alpha": trial.suggest_float("alpha", 0.0, 5.0),
        "scale_pos_weight": 1.0
    }
    oof = np.zeros(len(X))
    for train_idx, valid_idx in skf.split(X, y):
        dtrain = xgb.DMatrix(X.iloc[train_idx][xgb_features], label=y.iloc[train_idx])
        dvalid = xgb.DMatrix(X.iloc[valid_idx][xgb_features], label=y.iloc[valid_idx])
        model = xgb.train(params, dtrain, num_boost_round=5000,
                          evals=[(dvalid, "valid")], early_stopping_rounds=100,
                          verbose_eval=False)
        oof[valid_idx] = model.predict(dvalid, iteration_range=(0, model.best_iteration))
    return roc_auc_score(y, oof)

def tune_cat(trial):
    params = {
        "iterations": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "bootstrap_type": "Bernoulli",
        "eval_metric": "AUC",
        "task_type": "GPU",  # or "CPU"
        "random_seed": RANDOM_STATE,
        "verbose": False
    }
    oof = np.zeros(len(X))
    for train_idx, valid_idx in skf.split(X, y):
        train_pool = Pool(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_cols)
        valid_pool = Pool(X.iloc[valid_idx], y.iloc[valid_idx], cat_features=cat_cols)
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100, verbose=False)
        oof[valid_idx] = model.predict_proba(valid_pool)[:, 1]
    return roc_auc_score(y, oof)

# ================================
# Run tuning (small trials for demo)
# ================================
print("Tuning LightGBM...")
study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(tune_lgb, n_trials=20)

print("Tuning XGBoost...")
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(tune_xgb, n_trials=20)

print("Tuning CatBoost...")
study_cat = optuna.create_study(direction="maximize")
study_cat.optimize(tune_cat, n_trials=20)

# ================================
# Train final models with best params and blend
# ================================
def train_oof_preds():
    preds_test_all = []
    oof_all = []
    for name, params, train_fn in [
        ("lgb", study_lgb.best_params, "lightgbm"),
        ("xgb", study_xgb.best_params, "xgboost"),
        ("cat", study_cat.best_params, "catboost")
    ]:
        oof = np.zeros(len(X))
        preds_test = np.zeros(len(X_test))
        for train_idx, valid_idx in skf.split(X, y):
            if train_fn == "lightgbm":
                dtrain = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx], categorical_feature=cat_cols)
                dvalid = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx], categorical_feature=cat_cols)
                model = lgb.train({**params, "objective": "binary", "metric": "auc", "verbose": -1},
                                  dtrain, valid_sets=[dvalid],
                                  callbacks=[lgb.early_stopping(100, verbose=False)])
                oof[valid_idx] = model.predict(X.iloc[valid_idx])
                preds_test += model.predict(X_test) / N_SPLITS

            elif train_fn == "xgboost":
                dtrain = xgb.DMatrix(X.iloc[train_idx][xgb_features], label=y.iloc[train_idx])
                dvalid = xgb.DMatrix(X.iloc[valid_idx][xgb_features], label=y.iloc[valid_idx])
                model = xgb.train({**params, "objective": "binary:logistic", "eval_metric": "auc"},
                                  dtrain, num_boost_round=5000,
                                  evals=[(dvalid, "valid")], early_stopping_rounds=100,
                                  verbose_eval=False)
                oof[valid_idx] = model.predict(dvalid, iteration_range=(0, model.best_iteration))
                preds_test += model.predict(xgb.DMatrix(X_test[xgb_features])) / N_SPLITS

            elif train_fn == "catboost":
                train_pool = Pool(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_cols)
                valid_pool = Pool(X.iloc[valid_idx], y.iloc[valid_idx], cat_features=cat_cols)
                model = CatBoostClassifier(**params, iterations=5000, eval_metric="AUC", verbose=False)
                model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=100)
                oof[valid_idx] = model.predict_proba(valid_pool)[:, 1]
                preds_test += model.predict_proba(Pool(X_test, cat_features=cat_cols))[:, 1] / N_SPLITS

        preds_test_all.append(preds_test)
        oof_all.append(oof)
    return np.array(oof_all), np.array(preds_test_all)

oof_models, preds_models = train_oof_preds()

# ================================
# Blend weight tuning
# ================================
def tune_blend(trial):
    w_lgb = trial.suggest_float("w_lgb", 0, 1)
    w_xgb = trial.suggest_float("w_xgb", 0, 1)
    w_cat = trial.suggest_float("w_cat", 0, 1)
    weights = np.array([w_lgb, w_xgb, w_cat])
    weights /= weights.sum()
    blended = np.average(oof_models, axis=0, weights=weights)
    return roc_auc_score(y, blended)

study_blend = optuna.create_study(direction="maximize")
study_blend.optimize(tune_blend, n_trials=50)

# Final blended prediction
best_w = np.array([study_blend.best_params["w_lgb"],
                   study_blend.best_params["w_xgb"],
                   study_blend.best_params["w_cat"]])
best_w /= best_w.sum()

final_preds = np.average(preds_models, axis=0, weights=best_w)


[I 2025-08-15 18:34:41,269] A new study created in memory with name: no-name-125e9e65-c79f-409b-b5fd-dcf9fa2b4449


Tuning LightGBM...


[I 2025-08-15 18:35:43,009] Trial 0 finished with value: 0.9669207500679496 and parameters: {'learning_rate': 0.09912739347765478, 'num_leaves': 83, 'feature_fraction': 0.7111258957046671, 'bagging_fraction': 0.9100507208319891, 'bagging_freq': 9, 'min_data_in_leaf': 60, 'lambda_l1': 0.7753613270478349, 'lambda_l2': 4.887088407557567}. Best is trial 0 with value: 0.9669207500679496.
[I 2025-08-15 18:36:39,250] Trial 1 finished with value: 0.9660154898591515 and parameters: {'learning_rate': 0.052676881221803445, 'num_leaves': 220, 'feature_fraction': 0.8400690302051211, 'bagging_fraction': 0.6414923243486647, 'bagging_freq': 1, 'min_data_in_leaf': 176, 'lambda_l1': 0.06928376912448297, 'lambda_l2': 2.646643961113116}. Best is trial 0 with value: 0.9669207500679496.
[I 2025-08-15 18:37:27,109] Trial 2 finished with value: 0.9662978143181139 and parameters: {'learning_rate': 0.13152399972430873, 'num_leaves': 43, 'feature_fraction': 0.7079593369456566, 'bagging_fraction': 0.7541972392329

Tuning XGBoost...


  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
[I 2025-08-15 18:55:50,226] Trial 0 finished with value: 0.9483000061070357 and parameters: {'learning_rate': 0.19200683258044454, 'max_depth': 10, 'min_child_weight': 8, 'subsample': 0.8001167705888174, 'colsample_bytree': 0.6465925992112143, 'lambda': 3.487939475083908, 'alpha': 0.7223907192122914}. Best is trial 0 with value: 0.9483000061070357.
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boo

KeyboardInterrupt: 

In [None]:

# ================================
# Submission
# ================================
sub = pd.DataFrame({id_col: test_f[id_col], TARGET_COL: final_preds})
sub.to_csv("submission_step3_blend.csv", index=False)
print("Saved blended submission with weights:", best_w)
