In [4]:
import xgboost
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
import matplotlib.pyplot as plt

import shap

In [5]:
# Load the preprocessed parquet file and mirror its index into an explicit
# 'timestamp' column (presumably a datetime-like index; it is parsed with
# pd.to_datetime further down).
DATASET_PATH = './preprocessed_datasets/dataset_alpha_0.4_full_binarytarget_win-24_thresh-19_both.parquet'
dataset = pd.read_parquet(DATASET_PATH).assign(timestamp=lambda frame: frame.index)

In [7]:
# Column roles used throughout the notebook.
TIME_COL = "timestamp"
TARGET_COL = "target"

# Build a time-ordered working copy with a fresh 0..n-1 index; `dataset`
# itself is left untouched.
df = (
    dataset
    .assign(**{TIME_COL: pd.to_datetime(dataset[TIME_COL])})
    .sort_values(TIME_COL)
    .reset_index(drop=True)
)

# Every column that is neither the time key nor the label is a model input.
non_feature_cols = (TIME_COL, TARGET_COL)
FEATURES = [col for col in df.columns if col not in non_feature_cols]

X = df[FEATURES]
y = df[TARGET_COL].astype(int)

# Chronological split sizes: the last 20% of rows are the test set, and the
# last 20% of the remaining rows are the validation set.
n = len(df)
test_size = int(0.20 * n)
val_size = int(0.20 * (n - test_size))

# Row boundaries (exclusive upper bounds) between train | val | test.
val_end = train_end = n - test_size
train_end2 = train_end - val_size

train_rows = slice(None, train_end2)
val_rows = slice(train_end2, val_end)
test_rows = slice(val_end, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# NOTE: no per-sample weights are built here; class imbalance is handled
# through `scale_pos_weight` when the model is constructed.



In [None]:
# Class-imbalance weight = negatives per positive, measured on the TRAINING
# split only so that validation/test label frequencies never influence the
# fitted model. Uses the vectorised Series.sum() instead of the builtin sum().
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos
bal = n_neg / max(n_pos, 1)  # max(..., 1) guards against a split with no positives

# Gradient-boosted tree classifier. Early stopping is configured on the
# estimator itself: passing `early_stopping_rounds` to fit() was deprecated in
# XGBoost 1.6 and removed in 2.0, where it raises a TypeError.
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    min_child_weight=1.0,
    objective="binary:logistic",
    eval_metric="aucpr",
    early_stopping_rounds=100,
    scale_pos_weight=bal,
    random_state=1233,
)

# The validation split drives early stopping via the `aucpr` eval metric.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
)

print("Best iteration:", model.best_iteration)

In [None]:


# Positive-class probabilities for the held-out test split.
proba_test = model.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics.
auc = roc_auc_score(y_test, proba_test)
auprc = average_precision_score(y_test, proba_test)

print("\n=== Test Metrics ===")
print(f"ROC AUC : {auc:.4f}")
print(f"AUPRC   : {auprc:.4f}")

# Hard labels at a fixed cut-off; 0.5 is only a default and may be tuned
# for imbalanced data.
threshold = 0.50
yhat_test = (proba_test >= threshold).astype(int)

cm = confusion_matrix(y_test, yhat_test)
print("\nConfusion matrix (threshold = %.2f):\n%s" % (threshold, cm))
report = classification_report(y_test, yhat_test, digits=4)
print("\nClassification report:\n", report)

# Curve coordinates for the two diagnostic plots.
fpr, tpr, _ = roc_curve(y_test, proba_test)
prec, rec, _ = precision_recall_curve(y_test, proba_test)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
roc_ax, pr_ax = ax

# Left panel: ROC curve with the chance diagonal for reference.
roc_ax.plot(fpr, tpr, label=f"AUC={auc:.3f}")
roc_ax.plot([0, 1], [0, 1], "--", color="gray")
roc_ax.set(
    title="ROC Curve",
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
)
roc_ax.legend()

# Right panel: precision-recall curve.
pr_ax.plot(rec, prec, label=f"AUPRC={auprc:.3f}")
pr_ax.set(
    title="Precision-Recall Curve",
    xlabel="Recall",
    ylabel="Precision",
)
pr_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Explain on the test split, capped at a fixed-size, seeded random sample
# once it exceeds MAX_EXPLAIN_ROWS rows.
MAX_EXPLAIN_ROWS = 5000
if len(X_test) > MAX_EXPLAIN_ROWS:
    X_explain = X_test.sample(MAX_EXPLAIN_ROWS, random_state=42)
else:
    X_explain = X_test.copy()

explainer = shap.TreeExplainer(model)
shap_values = explainer(X_explain)  # shap.Explanation

TOP_FEATURES = 25

# Global importance: beeswarm of per-row SHAP values.
shap.plots.beeswarm(shap_values, max_display=TOP_FEATURES)

# Global importance: bar plot of mean(|SHAP|).
shap.plots.bar(shap_values, max_display=TOP_FEATURES)