In [None]:
# 1) Imports & feature view
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, brier_score_loss
from xgboost import XGBClassifier
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
# Read connection settings from .env / the environment and fail fast with a
# readable message when the URL is missing; passing None to create_engine
# would otherwise fail with an error that does not mention POSTGRES_URL.
load_dotenv()
postgres_url = os.getenv("POSTGRES_URL")
if not postgres_url:
    raise RuntimeError("POSTGRES_URL is not set; define it in the environment or in a .env file")
engine = create_engine(postgres_url)

# Feature view: one row per result with a non-null finishing position, carrying
# grid/qualifying/form features and the boolean target is_top10.
# NOTE(review): staging.team_pace and raw.weather are joined on race_id only.
# If team_pace holds one row per team per race, this join multiplies every
# result row -- confirm the table grain or add the team key to the join.
SQL = """
select ra.season, ra.round, r.race_id, r.driver_id,
       r.grid, df.form_points_5, tp.team_quali_pos5, q.position as quali_pos,
       (r.grid - q.position) as grid_vs_quali, coalesce(w.is_wet,false) as is_wet,
       (r.position between 1 and 10) as is_top10
from raw.results r
join raw.races ra using (race_id)
left join staging.driver_form df using (race_id, driver_id)
left join staging.team_pace tp using (race_id)
left join raw.qualifying q using (race_id, driver_id)
left join raw.weather w using (race_id)
where r.position is not null
"""
df = pd.read_sql(text(SQL), engine)

# Surface join fan-out instead of silently training on repeated rows.
# Non-fatal on purpose: the notebook keeps running, but the problem is visible.
n_dup = int(df.duplicated(subset=["race_id", "driver_id"]).sum())
if n_dup:
    print(f"WARNING: {n_dup} duplicate (race_id, driver_id) rows in the feature view; check the joins")

features = ["grid","form_points_5","team_quali_pos5","quali_pos","grid_vs_quali","is_wet"]
# Missing feature values (left joins with no match) are encoded as 0.
# NOTE(review): 0 is not a real qualifying/grid position, so the model sees
# "missing" as "better than pole" -- confirm this encoding is intended.
X = df[features].fillna(0)
y = df["is_top10"].astype(int)

In [None]:
# 2) Season-grouped cross-validation (baseline)
# GroupKFold keeps each season entirely on one side of a fold, so rows from the
# same season never appear in both training and validation. It does NOT order
# folds chronologically: a fold may train on later seasons than it validates on.
cv = GroupKFold(n_splits=5)
xgb_params = dict(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_lambda=2.0,
)

# Out-of-fold probabilities and their labels, accumulated in fold order.
preds, trues = [], []
for train_idx, valid_idx in cv.split(X, y, groups=df["season"]):
    model = XGBClassifier(**xgb_params)
    model.fit(X.iloc[train_idx], y.iloc[train_idx])
    fold_proba = model.predict_proba(X.iloc[valid_idx])[:, 1]  # P(is_top10 == 1)
    preds.extend(fold_proba)
    trues.extend(y.iloc[valid_idx].tolist())

# Pooled out-of-fold metrics over all folds.
auc = roc_auc_score(trues, preds)
brier = brier_score_loss(trues, preds)
print({"cv_auc": round(auc,4), "cv_brier": round(brier,4)})

In [None]:
# 3) Reliability (calibration) curve
# Bucket the out-of-fold probabilities into 10 equal-width bins on [0, 1] and
# compare the mean predicted probability with the observed top-10 rate per bin.
bins = np.linspace(0,1,11)
# np.digitize returns len(bins) for values equal to the last edge, which would
# put p == 1.0 into an 11th bucket; clip so it lands in the last real bin.
inds = np.clip(np.digitize(preds, bins) - 1, 0, len(bins) - 2)
df_cal = (
    pd.DataFrame({"bin": inds, "pred": preds, "y": trues})
    .groupby("bin")
    .agg(pred_mean=("pred","mean"), actual_rate=("y","mean"))
)
# `ax` is reused by the figure-saving cell at the end of the notebook.
ax = df_cal.plot(y=["pred_mean","actual_rate"], marker='o')
ax.set_title("Calibration: predicted vs actual")
ax.set_xlabel("Predicted-probability bin (width 0.1)")
ax.set_ylabel("Probability / observed rate");

In [None]:
# 4) Feature importance (gain)
# Refit on every row (no hold-out) purely to inspect which features the trees
# rely on; this model is not used for scoring.
model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.05)
model.fit(X, y)

# get_score returns a {feature_name: gain} mapping from the fitted booster.
gain_by_feature = model.get_booster().get_score(importance_type='gain')
imp = pd.Series(gain_by_feature)
imp.sort_values(ascending=False).plot.bar(figsize=(8,3));

In [None]:
# 5) Add a new feature experiment: driver consistency (finish pos std over last 5)
# For each (driver, race): population std-dev of the driver's finishing position
# over the up-to-5 immediately preceding results, ordered by season and round.
# The window is rn-5 .. rn-1, so the current race itself is excluded.
# The CTE has no position filter, so results with a NULL position still occupy
# a slot in the 5-row window.
SQL_new = """
with res as (
  select driver_id, race_id, season, round, position,
         row_number() over (partition by driver_id order by season, round) as rn
  from raw.results r join raw.races ra using (race_id)
)
select r1.driver_id, r1.race_id, stddev_pop(r2.position) as driver_consistency5
from res r1 join res r2 on r1.driver_id=r2.driver_id and r2.rn between r1.rn-5 and r1.rn-1
group by 1,2
"""
cons = pd.read_sql(text(SQL_new), engine)

# Reuse the frame that X and y were built from instead of re-running SQL: the
# query has no ORDER BY, so a second read is not guaranteed to return rows in
# the same order, and X2 / y are paired purely by position (iloc) downstream.
base = df

# many_to_one: `cons` must have at most one row per (race_id, driver_id), so the
# left merge can never add rows to `base`.
merged = base.merge(cons, on=["race_id","driver_id"], how="left", validate="many_to_one")

# Drivers with no prior results have no window; impute with the overall median.
# NOTE(review): the median is computed over all rows, including rows that later
# serve as validation folds -- a mild leak; move inside the CV loop if it matters.
consistency = merged["driver_consistency5"]

# Baseline features get the same fillna(0) as X so the only difference between
# X and X2 is the added column.
X2 = merged[features].fillna(0).assign(
    driver_consistency5=consistency.fillna(consistency.median())
)
assert len(X2) == len(y), "X2 and y must stay row-aligned"

In [None]:
# 6) Re-evaluate with the new feature
# Same splitter and the same hyper-parameters as the baseline CV in step 2, so
# that any change in AUC is attributable to the added feature rather than to a
# different model configuration.
# Note: this overwrites `preds` / `trues` from the baseline run.
cv = GroupKFold(n_splits=5)
preds, trues = [], []
for tr, va in cv.split(X2, y, groups=base["season"]):
    model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.05,
                          subsample=0.8, colsample_bytree=0.9, reg_lambda=2.0)
    model.fit(X2.iloc[tr], y.iloc[tr])
    p = model.predict_proba(X2.iloc[va])[:,1]  # P(is_top10 == 1) on the held-out seasons
    preds.extend(p); trues.extend(y.iloc[va].tolist())

In [None]:
# 7) Log the experiment
from datetime import datetime, timezone

# AUC of the run with the new feature (preds / trues come from step 6).
new_auc = float(roc_auc_score(trues, preds))
# `auc` is the baseline CV AUC computed in step 2; the keep/drop rule in
# "notes" is a threshold on the difference between the two.
baseline_auc = float(auc)

log = {
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and returns
    # a naive value that cannot be told apart from local time.
    "ts": datetime.now(timezone.utc).isoformat(),
    "added_features": ["driver_consistency5"],
    "cv_auc": new_auc,
    "baseline_cv_auc": baseline_auc,
    "auc_delta": new_auc - baseline_auc,
    "notes": "Stddev of recent finishes captures volatility; keep if > +0.003 AUC"
}
log

In [None]:
# 8) Save figures for BI/README
out = "../bi/screenshots"
os.makedirs(out, exist_ok=True)  # create the target folder on first run

# `ax` is the axes object created by the calibration cell (step 3), so this
# writes the calibration curve that was drawn there.
calibration_fig = ax.figure
calibration_fig.savefig(f"{out}/calibration_curve.png", bbox_inches='tight')

# preds / trues hold the most recent CV run (step 6, with the new feature).
final_auc = roc_auc_score(trues, preds)
print("AUC with new feature:", final_auc)