In [9]:
import pandas as pd
from sklearn.preprocessing import RobustScaler  # or StandardScaler, MinMaxScaler

In [10]:
df = pd.read_csv("data/bank-full.csv", sep = ";")
# df = df.drop(["id"], axis =1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [14]:
# Separate target
y = df["y"]
X = df.drop(columns=["y"])

# One-hot encode
cat_cols = X.select_dtypes(include=["object", "category"]).columns
X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Identify continuous numeric columns from the pre-encoded data
# (more reliable if you pick them before encoding; otherwise use a heuristic)
cont_num_cols = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c]) and X[c].nunique() > 2
]

# Scale only those continuous columns
scaler = RobustScaler()  # alt: StandardScaler() or MinMaxScaler(feature_range=(-1, 1))
X_scaled = X_enc.copy()
X_scaled[cont_num_cols] = scaler.fit_transform(X_scaled[cont_num_cols])
X_scaled.head()

y_fixed = (
    pd.Series(y)
      .map({"yes": 1, "no": 0, "1": 1, "0": 0})
      .astype("float32")
      .fillna(pd.Series(y).astype("float32") if pd.api.types.is_numeric_dtype(y) else np.nan)
)


In [15]:
import numpy as np
import xgboost as xgb
from tqdm.notebook import tqdm

# X = X_scaled  # Assuming X_scaled is your full processed DataFrame
# y = target column

# Convert to DMatrix
dtrain = xgb.DMatrix(X_scaled, label=y_fixed)

# XGBoost parameters
use_gpu = False
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "gpu_hist" if use_gpu else "hist",
    "nthread": -1,
}

num_boost_round = 1000  # you can increase since no early stopping

# --- Clean tqdm callback ---
class TQDMCallback(xgb.callback.TrainingCallback):
    def __init__(self, total):
        self.total = total
        self.pbar = None

    def before_training(self, model):
        self.pbar = tqdm(total=self.total, desc="XGBoost training", position=0, leave=True)
        return model

    def after_iteration(self, model, epoch, evals_log):
        self.pbar.update(1)
        try:
            last_log = {m: vals[-1] for m, vals in evals_log["train"].items()}
            self.pbar.set_postfix({
                "logloss": f"{last_log.get('logloss', np.nan):.4f}",
                "auc":     f"{last_log.get('auc', np.nan):.4f}",
            }, refresh=True)
        except Exception:
            pass
        return False

    def after_training(self, model):
        if self.pbar is not None:
            self.pbar.close()
        return model

# Train the model on full dataset
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtrain, "train")],
    callbacks=[TQDMCallback(num_boost_round)]
)

# Save model if needed
bst.save_model("xgb_model_full.json")


XGBoost training:   0%|          | 0/1000 [00:00<?, ?it/s]

[0]	train-logloss:0.34599	train-auc:0.86893
[1]	train-logloss:0.33371	train-auc:0.88369
[2]	train-logloss:0.32349	train-auc:0.88917
[3]	train-logloss:0.31468	train-auc:0.89054
[4]	train-logloss:0.30688	train-auc:0.90107
[5]	train-logloss:0.30321	train-auc:0.90958
[6]	train-logloss:0.29653	train-auc:0.91023
[7]	train-logloss:0.29337	train-auc:0.91341
[8]	train-logloss:0.28740	train-auc:0.91532
[9]	train-logloss:0.28215	train-auc:0.91526
[10]	train-logloss:0.27720	train-auc:0.91791
[11]	train-logloss:0.27269	train-auc:0.91818
[12]	train-logloss:0.26859	train-auc:0.91858
[13]	train-logloss:0.26468	train-auc:0.91899
[14]	train-logloss:0.26123	train-auc:0.91933
[15]	train-logloss:0.25958	train-auc:0.92051
[16]	train-logloss:0.25634	train-auc:0.92051
[17]	train-logloss:0.25319	train-auc:0.92152
[18]	train-logloss:0.25042	train-auc:0.92226
[19]	train-logloss:0.24772	train-auc:0.92233
[20]	train-logloss:0.24654	train-auc:0.92286
[21]	train-logloss:0.24404	train-auc:0.92307
[22]	train-logloss:0

In [16]:
import xgboost as xgb

# --- 1) Load test data ---
test_df = pd.read_csv("data/test.csv")
if "id" in test_df.columns:
    ids = test_df["id"].copy()
    test_df = test_df.drop(columns=["id"])
else:
    ids = pd.Series(range(len(test_df)), name="id")

# --- 2) One-hot encode categorical columns with the SAME setup as train ---
# IMPORTANT: use the same `cat_cols` and drop_first=True choice you used on train
test_enc = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)

# Align to training matrix columns (adds any missing OHE columns with zeros and orders columns identically)
test_enc = test_enc.reindex(columns=X_scaled.columns, fill_value=0)

# --- 3) Scale ONLY the continuous numeric columns with the FITTED RobustScaler ---
# These columns exist in test_enc unchanged (since only categorical were OHE'ed)
test_enc[cont_num_cols] = scaler.transform(test_enc[cont_num_cols])

# --- 4) Predict with the best iteration ---
dtest = xgb.DMatrix(test_enc)
best_iter = getattr(bst, "best_iteration", None)
print(f"Best Iteration: {best_iter}")

if best_iter is None:
    best_iter = bst.num_boosted_rounds() - 1

y_prob = bst.predict(dtest, iteration_range=(0, best_iter + 1))
y_pred = (y_prob >= 0.5).astype(int)

# --- 5) Package results (and save if you like) ---
out = pd.DataFrame({
    "id": ids,
    "y": y_pred
})
print(out.head())

# Optional: save submission/predictions
out.to_csv("xgb_predictions.csv", index=False)
print("Saved predictions to xgb_predictions.csv")


Best Iteration: None
       id  y
0  750000  0
1  750001  0
2  750002  0
3  750003  0
4  750004  0
Saved predictions to xgb_predictions.csv
