In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler  # or StandardScaler, MinMaxScaler

In [2]:
df = pd.read_csv("data/train.csv")
df = df.drop(["id"], axis =1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [3]:


# Separate target
y = df["y"]
X = df.drop(columns=["y"])

# One-hot encode
cat_cols = X.select_dtypes(include=["object", "category"]).columns
X_enc = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Identify continuous numeric columns from the pre-encoded data
# (more reliable if you pick them before encoding; otherwise use a heuristic)
cont_num_cols = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c]) and X[c].nunique() > 2
]

# Scale only those continuous columns
scaler = RobustScaler()  # alt: StandardScaler() or MinMaxScaler(feature_range=(-1, 1))
X_scaled = X_enc.copy()
X_scaled[cont_num_cols] = scaler.fit_transform(X_scaled[cont_num_cols])
X_scaled.head()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,0.2,-0.451079,0.666667,-0.059259,0.5,0.0,0.0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,-0.066667,-0.086331,0.083333,0.192593,-0.5,0.0,0.0,True,False,False,...,False,True,False,False,False,False,False,False,False,True
2,-0.2,-0.023022,-0.25,-0.081481,0.0,0.0,0.0,True,False,False,...,False,False,False,True,False,False,False,False,False,True
3,-0.8,-0.431655,0.916667,-0.455556,0.0,0.0,0.0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
4,-0.866667,0.183453,-1.166667,2.848148,-0.5,0.0,0.0,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [6]:
import numpy as np
import xgboost as xgb
from tqdm.notebook import tqdm

# X = X_scaled  # Assuming X_scaled is your full processed DataFrame
# y = target column

# Convert to DMatrix
dtrain = xgb.DMatrix(X_scaled, label=y)

# XGBoost parameters
use_gpu = False
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "gpu_hist" if use_gpu else "hist",
    "nthread": -1,
}

num_boost_round = 1000  # you can increase since no early stopping

# --- Clean tqdm callback ---
class TQDMCallback(xgb.callback.TrainingCallback):
    def __init__(self, total):
        self.total = total
        self.pbar = None

    def before_training(self, model):
        self.pbar = tqdm(total=self.total, desc="XGBoost training", position=0, leave=True)
        return model

    def after_iteration(self, model, epoch, evals_log):
        self.pbar.update(1)
        try:
            last_log = {m: vals[-1] for m, vals in evals_log["train"].items()}
            self.pbar.set_postfix({
                "logloss": f"{last_log.get('logloss', np.nan):.4f}",
                "auc":     f"{last_log.get('auc', np.nan):.4f}",
            }, refresh=True)
        except Exception:
            pass
        return False

    def after_training(self, model):
        if self.pbar is not None:
            self.pbar.close()
        return model

# Train the model on full dataset
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtrain, "train")],
    callbacks=[TQDMCallback(num_boost_round)]
)

# Save model if needed
bst.save_model("xgb_model_full.json")


XGBoost training:   0%|          | 0/1000 [00:00<?, ?it/s]

[0]	train-logloss:0.34744	train-auc:0.93116
[1]	train-logloss:0.33047	train-auc:0.93906
[2]	train-logloss:0.31646	train-auc:0.94006
[3]	train-logloss:0.30458	train-auc:0.94041
[4]	train-logloss:0.29420	train-auc:0.94259
[5]	train-logloss:0.29072	train-auc:0.94617
[6]	train-logloss:0.28189	train-auc:0.94710
[7]	train-logloss:0.27894	train-auc:0.94854
[8]	train-logloss:0.27137	train-auc:0.94917
[9]	train-logloss:0.26444	train-auc:0.94927
[10]	train-logloss:0.25816	train-auc:0.94959
[11]	train-logloss:0.25240	train-auc:0.94958
[12]	train-logloss:0.24714	train-auc:0.94986
[13]	train-logloss:0.24230	train-auc:0.94993
[14]	train-logloss:0.23781	train-auc:0.95020
[15]	train-logloss:0.23580	train-auc:0.95097
[16]	train-logloss:0.23181	train-auc:0.95105
[17]	train-logloss:0.22799	train-auc:0.95148
[18]	train-logloss:0.22441	train-auc:0.95185
[19]	train-logloss:0.22113	train-auc:0.95203
[20]	train-logloss:0.22002	train-auc:0.95236
[21]	train-logloss:0.21704	train-auc:0.95249
[22]	train-logloss:0

In [9]:
import xgboost as xgb

# --- 1) Load test data ---
test_df = pd.read_csv("data/test.csv")
if "id" in test_df.columns:
    ids = test_df["id"].copy()
    test_df = test_df.drop(columns=["id"])
else:
    ids = pd.Series(range(len(test_df)), name="id")

# --- 2) One-hot encode categorical columns with the SAME setup as train ---
# IMPORTANT: use the same `cat_cols` and drop_first=True choice you used on train
test_enc = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)

# Align to training matrix columns (adds any missing OHE columns with zeros and orders columns identically)
test_enc = test_enc.reindex(columns=X_scaled.columns, fill_value=0)

# --- 3) Scale ONLY the continuous numeric columns with the FITTED RobustScaler ---
# These columns exist in test_enc unchanged (since only categorical were OHE'ed)
test_enc[cont_num_cols] = scaler.transform(test_enc[cont_num_cols])

# --- 4) Predict with the best iteration ---
dtest = xgb.DMatrix(test_enc)
best_iter = getattr(bst, "best_iteration", None)
print(f"Best Iteration: {best_iter}")

if best_iter is None:
    best_iter = bst.num_boosted_rounds() - 1

y_prob = bst.predict(dtest, iteration_range=(0, best_iter + 1))
y_pred = (y_prob >= 0.5).astype(int)

# --- 5) Package results (and save if you like) ---
out = pd.DataFrame({
    "id": ids,
    "y": y_pred
})
print(out.head())

# Optional: save submission/predictions
out.to_csv("xgb_predictions.csv", index=False)
print("Saved predictions to xgb_predictions.csv")


Best Iteration: None
       id  y
0  750000  0
1  750001  0
2  750002  0
3  750003  0
4  750004  0
Saved predictions to xgb_predictions.csv
