In [23]:

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import joblib
from pathlib import Path
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# One fixed seed: applied to NumPy's global RNG here and handed to XGBoost
# as `random_state` in the training cell.
SEED = 42
np.random.seed(SEED)

# Paths (relative to the notebook's working directory)
DATA_DIR = Path("../data/processed")      # train.csv / test.csv are read from and written to here
MODEL_DIR = Path("../experiments/models")  # model artifact is stored here

print("XGB proba persistence notebook initialized.")
for _label, _directory in (("Data dir:", DATA_DIR), ("Model dir:", MODEL_DIR)):
    print(_label, _directory)

XGB proba persistence notebook initialized.
Data dir: ..\data\processed
Model dir: ..\experiments\models


In [24]:

# Load both processed splits from DATA_DIR.
train_csv, test_csv = DATA_DIR / "train.csv", DATA_DIR / "test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# The label column is required for fitting; stop here if it is absent.
if "Class" not in train.columns:
    raise AssertionError("Target column missing")

Train shape: (227845, 61)
Test shape: (56962, 61)


In [25]:
# Display every column name of the loaded training frame as a plain list.
list(train.columns)

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class',
 'timestamp',
 'hour',
 'dayofweek',
 'amount_log',
 'amount_scaled',
 'merchant_id',
 'device_type',
 'geo_bucket',
 'account_id',
 'account_age_days',
 'merchant_freq',
 'account_txn_count',
 'device_freq',
 'last_5_mean_amount',
 'last_5_count',
 'merchant_id_fe',
 'device_type_fe',
 'geo_bucket_fe',
 'account_id_fe',
 'amount_times_age',
 'is_new_merchant',
 'merchant_id_missing',
 'device_type_missing',
 'geo_bucket_missing',
 'account_age_days_missing',
 'pca_x',
 'pca_y',
 'anomaly_score',
 'is_anomaly',
 'xgb_proba']

In [26]:

# Columns excluded from the XGBoost feature matrix: the target, the raw
# timestamp, and columns that look like outputs of other models (presumably
# produced by sibling notebooks -- verify), plus `xgb_proba` itself, which is
# already present in train.csv once this notebook has been run before.
DROP_COLS = [
    "Class",
    "timestamp",
    "pca_x",
    "pca_y",
    "anomaly_score",
    "is_anomaly",
    "cluster_id",
    "mlp_proba",
    "ae_recon_error",
    # ae_latent_1 .. ae_latent_8, generated so no entry is duplicated or skipped
    *[f"ae_latent_{i}" for i in range(1, 9)],
    "xgb_proba",
]

# Raw identifier / categorical columns that are also kept out of the model.
ID_COLS = [
    "merchant_id",
    "account_id",
    "device_type",
    "geo_bucket",
]

# Only the names that actually exist in `train`; several DROP_COLS entries
# may not have been created yet.
drop_cols = [c for c in DROP_COLS + ID_COLS if c in train.columns]

# Keep numeric only
X_train = train.drop(columns=drop_cols).select_dtypes(include=["number"])
y_train = train["Class"].astype("int32")

# `drop_cols` was filtered against train, so tolerate names missing from test.
X_test = test.drop(columns=drop_cols, errors="ignore").select_dtypes(include=["number"])

# Fail early (before training and saving a model) if test cannot provide every
# training feature, then force identical column set and order on both splits.
missing_in_test = X_train.columns.difference(X_test.columns)
if len(missing_in_test) > 0:
    raise ValueError(
        f"Test set is missing numeric feature columns used for training: {missing_in_test.tolist()}"
    )
X_test = X_test[X_train.columns]

print("XGB train features:", X_train.shape)
print("XGB test features:", X_test.shape)

XGB train features: (227845, 50)
XGB test features: (56962, 50)


In [27]:

# Hyper-parameters for the final classifier (presumably selected in an earlier
# tuning step that is not part of this notebook -- confirm).
best_params = dict(
    max_depth=7,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    n_estimators=300,
    scale_pos_weight=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=SEED,
    n_jobs=-1,
)

print("Training final XGBoost on full data...")
# Fit on the complete training matrix; `fit` returns the estimator itself.
xgb_final = XGBClassifier(**best_params).fit(X_train, y_train)

print("Training complete.")

Training final XGBoost on full data...
Training complete.


In [28]:
# Persist the fitted classifier with joblib under MODEL_DIR.
xgb_path = MODEL_DIR / "xgb.joblib"

# joblib.dump does not create missing parent directories; make sure the
# target folder exists so a fresh checkout does not fail after training.
xgb_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(xgb_final, xgb_path)

print("XGBoost model saved to:", xgb_path)

XGBoost model saved to: ..\experiments\models\xgb.joblib


In [29]:
# Attach the positive-class probability (column 1 of predict_proba) to each split.
# NOTE(review): the train-side values are in-sample, since xgb_final was fit on
# this same X_train. If a downstream model consumes `xgb_proba` as a feature,
# out-of-fold predictions may be more appropriate -- confirm intent.
for _frame, _matrix in ((train, X_train), (test, X_test)):
    _frame["xgb_proba"] = xgb_final.predict_proba(_matrix)[:, 1]

print("xgb_proba added.")
print(train["xgb_proba"].describe())

xgb_proba added.
count    2.278450e+05
mean     1.728146e-03
std      4.082348e-02
min      9.587777e-10
25%      4.997229e-07
50%      1.433954e-06
75%      4.781527e-06
max      9.999958e-01
Name: xgb_proba, dtype: float64


In [30]:

# Write both frames back over the CSVs they were loaded from, now including
# the `xgb_proba` column (the row index is not written).
for _frame, _filename in ((train, "train.csv"), (test, "test.csv")):
    _frame.to_csv(DATA_DIR / _filename, index=False)

print("Processed datasets updated with xgb_proba.")

Processed datasets updated with xgb_proba.


In [31]:
"xgb_proba" in train.columns  # must be True


True

In [32]:
import json
from pathlib import Path

# Manifest of the exact feature columns (and their order) the model was fit on.
FEATURE_PATH = Path("../experiments/models/xgb_features.json")

xgb_features = X_train.columns.tolist()

# Store the list under a "features" key. The following cell writes this very
# file using that keyed layout, so emitting a bare list here would make the
# on-disk schema depend on which of the two cells happened to run last.
with open(FEATURE_PATH, "w") as f:
    json.dump({"features": xgb_features}, f, indent=2)

print(f"Saved {len(xgb_features)} XGB features to {FEATURE_PATH}")


Saved 50 XGB features to ..\experiments\models\xgb_features.json


In [33]:
import json
from pathlib import Path

XGB_FEATURES_PATH = Path("../experiments/models/xgb_features.json")

# The column list has to be taken from the very frame the model was fit on,
# so names and order match what the classifier expects.
features = X_train.columns.tolist()

# Serialise as {"features": [...]} and write it out in one step.
manifest_text = json.dumps({"features": features}, indent=2)
XGB_FEATURES_PATH.write_text(manifest_text)

print(f"Saved {len(features)} XGB features correctly")


Saved 50 XGB features correctly
