In [None]:
import numpy as np
import pandas as pd
import glob

import xgboost as xgb

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import log_loss, brier_score_loss

import matplotlib.pyplot as plt

In [None]:
# Load every parquet shard under data/silver_features/ into one DataFrame.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so the list
# is sorted to make the concatenated row order -- and the fresh RangeIndex that
# ignore_index=True assigns -- the same on every run and every machine.
parquet_files = sorted(glob.glob("data/silver_features/*.parquet"))
if not parquet_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (nothing matched the glob pattern).
    raise FileNotFoundError("No parquet files found matching data/silver_features/*.parquet")

df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)

print("Total rows:", len(df))
print("Total markets:", df["condition_id"].nunique())

In [None]:
# Cast the label and the trade flag to plain integers.
df["y_final"] = df["y_final"].astype(int)
df["had_trade"] = df["had_trade"].astype(int)

# Replace missing values in the return / volatility columns with 0.0.
df["ret_1"] = df["ret_1"].fillna(0.0)
df["vol_7"] = df["vol_7"].fillna(0.0)

# log(1 + x) transforms of the size and time columns.
df["log_volume"] = np.log1p(df["volume"].astype(float))
df["log_open_interest"] = np.log1p(df["open_interest"].astype(float))
# Negative time-to-close values are clipped to 0 first so log1p stays defined.
df["log_time_to_close"] = np.log1p(df["time_to_close_sec"].clip(lower=0).astype(float))

In [None]:
# Forecast horizon: only rows at least this far from close are eligible.
DAYS_BEFORE_CLOSE = 7
SECONDS_BEFORE_CLOSE = DAYS_BEFORE_CLOSE * 24 * 60 * 60

# Keep rows whose remaining time is >= the horizon (rows with NaN
# time_to_close_sec fail the comparison and are dropped here as well).
df_horizon = df[df["time_to_close_sec"] >= SECONDS_BEFORE_CLOSE].copy()

# One row per market: among the eligible rows of each condition_id, take the
# one with the smallest time_to_close_sec, i.e. the row closest to the horizon.
snapshot_row_labels = df_horizon.groupby("condition_id")["time_to_close_sec"].idxmin()
df_snapshot = df_horizon.loc[snapshot_row_labels].copy()

print("Markets after snapshot filtering:", df_snapshot["condition_id"].nunique())
print("Time-to-close (days):")
print((df_snapshot["time_to_close_sec"] / (24*60*60)).describe())

In [None]:
# Columns fed to the model; the order listed here is the column order of X.
feature_cols = [
    "p_mkt", "spread",
    "log_volume", "log_open_interest", "log_time_to_close",
    "ret_1", "vol_7",
    "had_trade",
]

# Label vector, market identifiers and float design matrix, all taken from
# the snapshot frame so the three stay row-aligned.
groups = df_snapshot["condition_id"].values
y = df_snapshot["y_final"].values
X = df_snapshot.loc[:, feature_cols].astype(float)

In [None]:
# Single seeded 80/20 split that keeps all rows of a group (condition_id)
# on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
[(train_idx, test_idx)] = gss.split(X, y, groups=groups)  # n_splits=1 -> exactly one pair

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

print("Train markets:", np.unique(groups[train_idx]).size)
print("Test markets:", np.unique(groups[test_idx]).size)

In [None]:
# Baseline forecast: the `p_mkt` column of the test rows, clipped into
# [1e-6, 1 - 1e-6] so the log-loss never evaluates log(0).
p_mkt_test = X_test["p_mkt"].values
p_base = np.clip(p_mkt_test, 1e-6, 1-1e-6)

baseline_logloss = log_loss(y_test, p_base)
baseline_brier = brier_score_loss(y_test, p_base)

print("Baseline logloss:", baseline_logloss)
print("Baseline brier:", baseline_brier)

In [None]:
# Monotonic-constraint direction per feature name:
#   +1 -> prediction must be non-decreasing in the feature
#   -1 -> prediction must be non-increasing in the feature
# Any feature not listed is left unconstrained (0).
MONOTONE_DIRECTIONS = {
    "p_mkt": 1,
    "log_time_to_close": -1,
}

# Fail loudly if a constrained feature is not actually a training column;
# otherwise the constraint would be silently dropped.
unknown_constrained = sorted(set(MONOTONE_DIRECTIONS) - set(X_train.columns))
if unknown_constrained:
    raise KeyError(f"Monotone constraints reference unknown features: {unknown_constrained}")

# Build the positional tuple from the training columns themselves so it always
# lines up with the design matrix, even if the feature list is later reordered.
monotone_constraints = tuple(MONOTONE_DIRECTIONS.get(col, 0) for col in X_train.columns)

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    monotone_constraints=monotone_constraints,
    random_state=42
)

xgb_model.fit(X_train, y_train)

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba),
# clipped into [1e-6, 1 - 1e-6] like the baseline.
proba_positive = xgb_model.predict_proba(X_test)[:, 1]
p_model = np.clip(proba_positive, 1e-6, 1-1e-6)

# Evaluate each metric once for the model and once for the baseline, then reuse.
model_logloss = log_loss(y_test, p_model)
model_brier = brier_score_loss(y_test, p_model)
base_logloss = log_loss(y_test, p_base)
base_brier = brier_score_loss(y_test, p_base)

print("XGBoost logloss:", model_logloss)
print("XGBoost brier:", model_brier)

# Positive deltas mean the model scores better (lower loss) than the baseline.
print("\nImprovements vs baseline:")
print("Δ logloss:", base_logloss - model_logloss)
print("Δ brier:", base_brier - model_brier)

In [None]:
# Per-row adjustment the model applies on top of the baseline probability.
delta = p_model - p_base

# Histogram of the adjustments, with a dashed red marker at zero (no change).
fig_delta, ax_delta = plt.subplots(figsize=(6, 4))
ax_delta.hist(delta, bins=40)
ax_delta.axvline(0, color="red", linestyle="--")
ax_delta.set_title("XGBoost adjustment: p_model - p_mkt")
ax_delta.set_xlabel("delta")
ax_delta.set_ylabel("count")
plt.show()

In [None]:
# Fitted feature importances labelled by feature name, largest first.
importances = pd.Series(
    xgb_model.feature_importances_,
    index=feature_cols
).sort_values(ascending=False)

print(importances)

# Horizontal bar chart; `kind` and `figsize` are separate keyword arguments.
importances.plot(kind="barh", figsize=(6, 4))
plt.title("XGBoost feature importance (gain)")
plt.show()

In [None]:
from sklearn.calibration import calibration_curve

# Bin the test predictions into 10 bins; per bin, compare the mean predicted
# probability against the observed positive rate.
prob_true, prob_pred = calibration_curve(y_test, p_model, n_bins=10)

# Reliability diagram: model curve against the dashed diagonal from (0,0) to (1,1).
fig_calib, ax_calib = plt.subplots(figsize=(5, 5))
ax_calib.plot(prob_pred, prob_true, marker="o")
ax_calib.plot([0,1], [0,1], linestyle="--")
ax_calib.set_xlabel("Predicted probability")
ax_calib.set_ylabel("Empirical frequency")
ax_calib.set_title("XGBoost calibration")
ax_calib.grid(alpha=0.3)
plt.show()