In [None]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
import catboost as cb

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [None]:
# Read the competition files: training rows, test rows and the submission template.
train = pd.read_csv("/kaggle/input/iisc-umc-301-kaggle-competition-1/train.csv")
test = pd.read_csv("/kaggle/input/iisc-umc-301-kaggle-competition-1/test.csv")
submission = pd.read_csv("/kaggle/input/iisc-umc-301-kaggle-competition-1/sample_submission.csv")

# Apply the same short column names to both frames.
column_renames = {'audio_valence': 'valence', 'audio_mode': 'mode'}
train = train.rename(columns=column_renames).drop(columns='id')
test = test.rename(columns=column_renames)

# Set the test ids aside and remove the id column from the test frame.
test_ids = test['id']
test = test.drop(columns='id')

# Predictors / target (raw: no scaling or imputation is applied here).
y = train['song_popularity']
X = train.drop(columns='song_popularity')
X_test = test.copy()

In [None]:
# Cross-validation setup
# Negative-to-positive label ratio (count of y == 0 over count of y == 1).
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()
scale_pos_weight = n_negative / n_positive

# Shuffled, seeded 5-fold stratified splitter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:

# Optuna: XGBoost
def objective_xgb(trial):
    """Optuna objective: cross-validated ROC AUC of an XGBoost classifier.

    Samples learning_rate, max_depth, subsample and colsample_bytree from the
    trial, fits one model per fold of the module-level ``skf`` splitter on the
    module-level ``X`` / ``y``, and collects the positive-class probability
    predicted for each held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the out-of-fold predictions against ``y``.

    Notes
    -----
    Each held-out fold is used both as the early-stopping ``eval_set`` and as
    the data that is scored, so the returned AUC can be somewhat optimistic.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "n_estimators": 800,
        "scale_pos_weight": scale_pos_weight,
        # NOTE(review): "gpu_hist" is reported as deprecated by newer XGBoost
        # releases - confirm against the installed version before changing it.
        "tree_method": "gpu_hist",
        "random_state": 42,
        "n_jobs": -1,
        # Early stopping is configured on the estimator: passing it to fit()
        # was deprecated in XGBoost 1.6 and later removed from fit().
        "early_stopping_rounds": 50,
    }
    oof = np.zeros(len(X))
    for train_idx, valid_idx in skf.split(X, y):
        X_valid = X.iloc[valid_idx]
        model = xgb.XGBClassifier(**params)
        model.fit(X.iloc[train_idx], y.iloc[train_idx],
                  eval_set=[(X_valid, y.iloc[valid_idx])],
                  verbose=False)
        # Column 1 of predict_proba is the positive-class probability.
        oof[valid_idx] = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y, oof)

# Run the XGBoost search. The sampler is seeded explicitly so that a fresh
# "Restart & Run All" proposes the same sequence of trials; TPESampler is the
# sampler Optuna uses by default for single-objective studies, so only the
# seeding changes.
# NOTE(review): GPU training may still add some run-to-run variation - confirm.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=30)

# Final XGBoost configuration: the tuned values from the best trial plus the
# same fixed settings that objective_xgb used during the search.
xgb_params = {**study_xgb.best_params,
              "objective": "binary:logistic",
              "eval_metric": "auc",
              "n_estimators": 800,
              "scale_pos_weight": scale_pos_weight,
              "tree_method": "gpu_hist",
              "random_state": 42,
              "n_jobs": -1}

# Optuna: CatBoost
def objective_cat(trial):
    """Optuna objective: cross-validated ROC AUC of a CatBoost classifier.

    Samples learning_rate, depth and l2_leaf_reg from ``trial``, trains one
    model per fold of the module-level ``skf`` splitter on ``X`` / ``y`` with
    the held-out fold as ``eval_set``, and returns the ROC AUC of the
    positive-class probabilities collected over all held-out folds.
    """
    # Sampled in the order learning_rate -> depth -> l2_leaf_reg.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.05)
    depth = trial.suggest_int("depth", 4, 8)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1, 6)

    cat_kwargs = dict(
        loss_function="Logloss",
        eval_metric="AUC",
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        iterations=800,
        task_type="GPU",
        random_seed=42,
        verbose=False,
    )

    holdout_scores = np.zeros(len(X))
    for fit_rows, holdout_rows in skf.split(X, y):
        X_holdout = X.iloc[holdout_rows]
        y_holdout = y.iloc[holdout_rows]

        booster = cb.CatBoostClassifier(**cat_kwargs)
        booster.fit(
            X.iloc[fit_rows],
            y.iloc[fit_rows],
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=50,
            verbose=False,
        )
        # Column 1 of predict_proba is the positive-class probability.
        holdout_scores[holdout_rows] = booster.predict_proba(X_holdout)[:, 1]

    return roc_auc_score(y, holdout_scores)

# Run the CatBoost search. The sampler is seeded explicitly so that a fresh
# "Restart & Run All" proposes the same sequence of trials; TPESampler is the
# sampler Optuna uses by default for single-objective studies, so only the
# seeding changes.
# NOTE(review): GPU training may still add some run-to-run variation - confirm.
study_cat = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_cat, n_trials=30)

# Final CatBoost configuration: the tuned values from the best trial plus the
# same fixed settings that objective_cat used during the search.
cat_params = {**study_cat.best_params,
              "loss_function": "Logloss",
              "eval_metric": "AUC",
              "iterations": 800,
              "task_type": "GPU",
              "random_seed": 42,
              "verbose": False}

print("Best XGB params:", xgb_params)
print("Best CAT params:", cat_params)


In [None]:

# Bagging: 40 XGB + 40 CatBoost, each fit on the full training set with a
# different seed; the final prediction is the plain average of all models.
N_XGB = 40
N_CAT = 40
N_MODELS = N_XGB + N_CAT
test_preds = np.zeros(len(X_test))
# In-sample predictions: every model below is fit on all of X and then scores
# those same rows, so this is NOT an out-of-fold estimate.
train_preds = np.zeros(len(X))

# Clean param dicts to avoid duplicate keys (the seed / verbosity are passed
# explicitly per model below).
xgb_params_clean = {k: v for k, v in xgb_params.items()
                    if k not in ["random_state"]}
cat_params_clean = {k: v for k, v in cat_params.items()
                    if k not in ["random_seed", "verbose"]}

# NOTE(review): the tuning objectives used early stopping (50 rounds), while
# these final fits always train the full 800 rounds with no eval_set - confirm
# that this is intended.

# Train 40 XGB
for seed in range(N_XGB):
    model = xgb.XGBClassifier(**xgb_params_clean, random_state=seed)
    model.fit(X, y, verbose=False)
    train_preds += model.predict_proba(X)[:, 1] / N_MODELS
    test_preds += model.predict_proba(X_test)[:, 1] / N_MODELS

# Train 40 CatBoost
for seed in range(N_CAT):
    model = cb.CatBoostClassifier(**cat_params_clean,
                                  random_seed=seed,
                                  verbose=False)
    model.fit(X, y)
    train_preds += model.predict_proba(X)[:, 1] / N_MODELS
    test_preds += model.predict_proba(X_test)[:, 1] / N_MODELS

# Backward-compatible alias for the previous variable name. Despite the name,
# these are in-sample predictions, not out-of-fold ones.
oof_preds = train_preds

# This score is computed on the rows the models were trained on, so it is an
# optimistic training metric; the cross-validated AUCs from the Optuna studies
# are the ones to use as a generalisation estimate.
print(f"In-sample train AUC (bagged {N_MODELS} models, optimistic - not out-of-fold):",
      roc_auc_score(y, train_preds))

# Predictions are written positionally, so the sample submission must list the
# same rows, in the same order, as test.csv.
assert len(submission) == len(test_preds), \
    "sample_submission.csv and test.csv have different numbers of rows"
if "id" in submission.columns:
    assert (submission["id"].to_numpy() == test_ids.to_numpy()).all(), \
        "sample_submission.csv ids are not aligned with test.csv ids"

submission["song_popularity"] = test_preds
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv written")