In [None]:
import pandas as pd
from sklearn.preprocessing import RobustScaler  # or StandardScaler, MinMaxScaler

In [None]:
# Read the training table, discard the "id" column, and preview the first rows.
df = pd.read_csv("data/train.csv").drop(columns=["id"])
df.head()

In [None]:


# Split the frame into the feature matrix and the target column "y".
X = df.drop(columns=["y"])
y = df["y"]

# Expand every object/category column into indicator columns, dropping the
# first level of each.
cat_cols = X.select_dtypes(include=["object", "category"]).columns
X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Heuristic for "continuous" features, evaluated on the frame *before* encoding:
# a column qualifies when its dtype is numeric and it has more than two
# distinct values.
cont_num_cols = list(
    filter(
        lambda name: pd.api.types.is_numeric_dtype(X[name]) and X[name].nunique() > 2,
        X.columns,
    )
)

# Fit the scaler on the continuous columns only and store the scaled values in
# a copy, so X_enc itself stays unscaled.
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/validation split performed in a later cell, so validation rows
# influence the scaling statistics — confirm this is acceptable.
scaler = RobustScaler()  # alt: StandardScaler() or MinMaxScaler(feature_range=(-1, 1))
X_scaled = X_enc.copy()
X_scaled[cont_num_cols] = scaler.fit_transform(X_enc[cont_num_cols])
X_scaled.head()


In [None]:
import numpy as np
from tqdm.auto import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# --- Inputs expected from earlier cells ---
#   y         -> target (0/1)
#   X_scaled  -> one-hot encoded + scaled feature matrix
# If your features live under another name (e.g., X_encoded or X_processed),
# bind that name to X instead.
X = X_scaled

# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Wrap both splits in DMatrix, the data container consumed by xgb.train.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# Flip to True only with a CUDA-capable GPU and a CUDA-enabled xgboost build.
use_gpu = False

# Booster parameters ("eta" is the learning rate).
params = dict(
    objective="binary:logistic",
    eval_metric=["logloss", "auc"],
    eta=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="gpu_hist" if use_gpu else "hist",
    nthread=-1,
)

# Upper bound on boosting rounds and the early-stopping patience.
num_boost_round = 1000
early_stopping_rounds = 50

# Datasets evaluated after every round, with the names used in the eval log.
evals = [(dtrain, "train"), (dvalid, "valid")]

# --- tqdm callback for pretty progress ---
class TQDMCallback(xgb.callback.TrainingCallback):
    """Training callback that drives a tqdm progress bar, one tick per boosting round.

    After every round the bar's postfix is refreshed with the latest train
    log-loss, validation log-loss and validation AUC taken from the evaluation
    log. A dataset or metric that is absent from the log is shown as ``nan``
    instead of suppressing the whole postfix.

    Parameters
    ----------
    total : int
        Number of boosting rounds the progress bar is sized for.
    """

    # (postfix label, dataset name as passed in `evals`, metric name)
    _POSTFIX_FIELDS = (
        ("train_logloss", "train", "logloss"),
        ("valid_logloss", "valid", "logloss"),
        ("valid_auc", "valid", "auc"),
    )

    def __init__(self, total):
        super().__init__()
        self.total = total
        self.pbar = None  # created lazily in before_training

    def before_training(self, model):
        """Open the progress bar and return the model unchanged."""
        self.pbar = tqdm(total=self.total, desc="XGBoost training", leave=True)
        return model

    def after_iteration(self, model, epoch, evals_log):
        """Advance the bar by one round and refresh the metric postfix.

        ``evals_log`` is expected to look like
        ``{'train': {'logloss': [...], 'auc': [...]}, 'valid': {...}}``.

        Returns
        -------
        bool
            Always ``False``, so this callback never requests a stop.
        """
        if self.pbar is None:
            # before_training was never called; nothing to update.
            return False

        self.pbar.update(1)
        self.pbar.set_postfix({
            label: self._format_last(evals_log, data_name, metric_name)
            for label, data_name, metric_name in self._POSTFIX_FIELDS
        })
        return False

    def after_training(self, model):
        """Close the progress bar (if one was opened) and return the model unchanged."""
        if self.pbar is not None:
            self.pbar.close()
        return model

    @staticmethod
    def _format_last(evals_log, data_name, metric_name):
        """Return the newest value of one metric as a 4-decimal string, or ``"nan"``."""
        history = evals_log.get(data_name, {}).get(metric_name)
        if not history:
            return "nan"
        latest = history[-1]
        try:
            return f"{latest:.4f}"
        except (TypeError, ValueError):
            # Value is not a plain number; show it verbatim instead of hiding it.
            return str(latest)

# Early stopping watches the "auc" metric on the "valid" dataset and gives up
# after `early_stopping_rounds` rounds without improvement.
es_cb = xgb.callback.EarlyStopping(
    rounds=early_stopping_rounds,
    metric_name="auc",
    data_name="valid",
    maximize=True,   # a larger AUC is better
    save_best=True,  # request that the best iteration be kept
)

# Run boosting: the progress-bar callback goes first, early stopping second.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    callbacks=[
        TQDMCallback(num_boost_round),
        es_cb,
    ],
)



In [11]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Choose the boosting round to predict with: the booster's recorded best
# iteration when it has one, otherwise the last boosted round (zero-based).
best_iter = getattr(bst, "best_iteration", None)
best_iter = bst.num_boosted_rounds() - 1 if best_iter is None else best_iter

# Predict with iteration_range (rather than ntree_limit); the upper bound is
# best_iter + 1 so that round `best_iter` itself is used.
y_prob = bst.predict(dvalid, iteration_range=(0, best_iter + 1))

# Hard labels from a 0.5 probability cut-off.
y_pred = (y_prob >= 0.5).astype(int)

# Score the hold-out split and report.
auc = roc_auc_score(y_valid, y_prob)
acc = accuracy_score(y_valid, y_pred)
print(
    f"\nBest iteration: {best_iter}  (used trees: {best_iter + 1})",
    f"Valid AUC: {auc:.4f} | Valid Acc: {acc:.4f}",
    sep="\n",
)



Best iteration: 996  (used trees: 997)
Valid AUC: 0.9673 | Valid Acc: 0.9347


In [12]:
import xgboost as xgb

# --- 1) Load test data ---
# Keep the ids for the output file; fall back to a 0..n-1 range when the file
# has no "id" column.
test_df = pd.read_csv("data/test.csv")
if "id" in test_df.columns:
    ids = test_df["id"].copy()
    test_df = test_df.drop(columns=["id"])
else:
    ids = pd.Series(range(len(test_df)), name="id")

# The fitted scaler needs every continuous column it was trained on. Without
# this check a missing column would be silently zero-filled by the reindex below.
missing_cont = [c for c in cont_num_cols if c not in test_df.columns]
if missing_cont:
    raise ValueError(f"test data is missing continuous feature columns: {missing_cont}")

# --- 2) One-hot encode categorical columns so they line up with train ---
# Encode with drop_first=False on purpose. drop_first=True drops the first level
# *present in the test data*, which is not necessarily the level that was dropped
# on train; when they differ, a real level would be encoded as all zeros (i.e. as
# the baseline). Emitting every level and then reindexing to the training columns
# removes exactly the baseline column that train removed.
test_enc = pd.get_dummies(test_df, columns=cat_cols, drop_first=False)

# Align to the training matrix: same columns, same order. Indicator columns for
# levels absent from the test data are added as zeros; columns the model never
# saw (train's baseline level, unseen levels) are discarded.
test_enc = test_enc.reindex(columns=X_scaled.columns, fill_value=0)

# --- 3) Scale ONLY the continuous numeric columns with the FITTED scaler ---
# transform (not fit_transform): reuse the statistics learned on the training data.
test_enc[cont_num_cols] = scaler.transform(test_enc[cont_num_cols])

# --- 4) Predict with the best iteration ---
dtest = xgb.DMatrix(test_enc)
best_iter = getattr(bst, "best_iteration", None)
if best_iter is None:
    # No best iteration recorded: use the last boosted round (zero-based).
    best_iter = bst.num_boosted_rounds() - 1

y_prob = bst.predict(dtest, iteration_range=(0, best_iter + 1))
y_pred = (y_prob >= 0.5).astype(int)  # hard labels at a 0.5 cut-off

# --- 5) Package results (and save if you like) ---
out = pd.DataFrame({
    "id": ids,
    "y_prob": y_prob,
    "y_pred": y_pred
})
print(out.head())

# Optional: save submission/predictions
out.to_csv("xgb_predictions.csv", index=False)
print("Saved predictions to xgb_predictions.csv")


       id    y_prob  y_pred
0  750000  0.001079       0
1  750001  0.121239       0
2  750002  0.000277       0
3  750003  0.000141       0
4  750004  0.019914       0
Saved predictions to xgb_predictions.csv
