# NHL Holdout Verification

This notebook reproduces the 69% high-confidence NHL edge directly from the canonical feature matrix and closing-line dataset.

*Last refresh:* 2025-11-20



In [None]:
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Every path below is derived from PROJECT_ROOT, which resolves to two directory
# levels above the kernel's working directory.
# NOTE(review): this assumes the notebook is launched from <root>/<dir>/<subdir> — confirm.
PROJECT_ROOT = Path("..").resolve().parent

# Inputs: the feature matrix and its optional JSON summary.
DATA_DIR = PROJECT_ROOT.joinpath("data", "modeling_datasets")
FEATURE_MATRIX = DATA_DIR.joinpath("nhl_feature_matrix.parquet")
SUMMARY_JSON = DATA_DIR.joinpath("nhl_feature_matrix_summary.json")

# Output: the metrics file written by the final cell.
METRICS_OUTPUT = PROJECT_ROOT.joinpath(
    "docs", "investor", "verification", "nhl_holdout_metrics.json"
)

print(f"Using feature matrix: {FEATURE_MATRIX}")
print(f"Writing metrics to: {METRICS_OUTPUT}")



In [None]:
# Load the feature matrix, parse the game date and order rows chronologically
# so the date-based train/holdout split below reads naturally.
df = (
    pd.read_parquet(FEATURE_MATRIX)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)
print(df.shape)
df.head()



In [None]:
# Model inputs are selected purely by column-name prefix.
feature_prefixes = ('perf_', 'nom_', 'narr_', 'ctx_', 'odds_')
feature_cols = [column for column in df.columns if column.startswith(feature_prefixes)]
len(feature_cols)



In [None]:
# Chronological split: games strictly before the cutoff train the model,
# games on or after the cutoff form the holdout set.
cutoff_date = pd.Timestamp('2024-09-01')
game_dates = df['date']
train_mask = game_dates < cutoff_date
test_mask = game_dates >= cutoff_date

train_rows = df.loc[train_mask]
test_rows = df.loc[test_mask]

X_train, y_train = train_rows[feature_cols].values, train_rows['moneyline_result'].values
X_test, y_test = test_rows[feature_cols].values, test_rows['moneyline_result'].values

print(f"Train: {X_train.shape[0]} games, Test: {X_test.shape[0]} games")



In [None]:
# Standardise features using statistics learned from the training rows only,
# then apply the same transform to both partitions.
scaler = StandardScaler().fit(X_train)
dX_train = scaler.transform(X_train)
dX_test = scaler.transform(X_test)

# Fit the classifier on the scaled training rows and keep the probability
# of class 1 for every holdout row.
model = LogisticRegression(max_iter=500, solver='lbfgs').fit(dX_train, y_train)
probs = model.predict_proba(dX_test)[:, 1]



In [None]:
def american_to_profit(odds: float, stake: float = 1.0) -> float:
    """Return the net profit of a winning bet placed at American ``odds``.

    Parameters
    ----------
    odds : float
        American moneyline price, e.g. ``150`` or ``-200``.
    stake : float, default 1.0
        Amount wagered.

    Returns
    -------
    float
        Profit (stake not included) when the bet wins. ``0.0`` is returned
        when the price is unusable: missing (``None``, NaN, ``pd.NA``) or zero.
    """
    # pd.isna handles None, NaN and pd.NA without raising. A zero price is not
    # a valid American line and would otherwise divide by zero below (or give
    # ``inf`` for numpy floats).
    if pd.isna(odds) or odds == 0:
        return 0.0
    if odds > 0:
        # Underdog price: win `odds` per 100 staked.
        return stake * odds / 100.0
    # Favourite price: stake `abs(odds)` to win 100.
    return stake * 100.0 / abs(odds)

# Holdout games with their closing home price, actual result and the model's
# class-1 probability attached.
holdout = df.loc[
    test_mask,
    ['date', 'home_team', 'away_team', 'closing_moneyline_home', 'moneyline_result'],
].assign(prob=probs)
holdout.head()



In [None]:
def evaluate_threshold(threshold: float, frame=None):
    """Summarise a flat one-unit betting strategy at a probability cut-off.

    A bet is placed on every row whose ``prob`` is at least ``threshold`` and
    is settled at ``closing_moneyline_home``; a row counts as a win when
    ``moneyline_result == 1``.

    Parameters
    ----------
    threshold : float
        Minimum model probability required to place a bet.
    frame : pandas.DataFrame, optional
        Rows to evaluate; must contain ``prob``, ``closing_moneyline_home``
        and ``moneyline_result``. Defaults to the notebook-level ``holdout``.

    Returns
    -------
    dict
        ``threshold``, ``bets`` (number of qualifying rows), ``win_rate``
        (wins / bets) and ``roi`` (mean profit per unit staked). ``win_rate``
        and ``roi`` are NaN when no row qualifies.

    Notes
    -----
    ROI is averaged over bets with a usable price only (not missing, not
    zero). Scoring an unpriced bet as 0 on a win but -1 on a loss would bias
    ROI downward. The payout rule mirrors ``american_to_profit`` with a stake
    of 1.
    """
    source = holdout if frame is None else frame
    bets = source[source['prob'] >= threshold]
    if bets.empty:
        return {
            'threshold': threshold,
            'bets': 0,
            'win_rate': np.nan,
            'roi': np.nan,
        }

    won = bets['moneyline_result'].eq(1)
    win_rate = float(won.sum() / len(bets))

    # Restrict the ROI calculation to bets that can actually be settled.
    odds = bets['closing_moneyline_home'].astype(float)
    priced = odds.notna() & odds.ne(0)
    if priced.any():
        priced_odds = odds[priced].to_numpy()
        # Positive price: win odds/100 per unit; negative price: win 100/|odds|.
        payout = np.where(priced_odds > 0, priced_odds / 100.0, 100.0 / np.abs(priced_odds))
        # A losing bet forfeits the one-unit stake.
        profit = np.where(won[priced].to_numpy(), payout, -1.0)
        roi = float(profit.sum() / len(profit))
    else:
        roi = np.nan

    return {
        'threshold': threshold,
        'bets': int(len(bets)),
        'win_rate': win_rate,
        'roi': roi,
    }

# Evaluate each confidence tier and tabulate the summaries side by side.
thresholds = (0.55, 0.60, 0.65, 0.70)
results = list(map(evaluate_threshold, thresholds))
results_df = pd.DataFrame(results)
results_df



In [None]:
# Optional summary file: when it is absent the summary fields below are
# exported as null.
feature_summary = {}
if SUMMARY_JSON.exists():
    feature_summary = json.loads(SUMMARY_JSON.read_text(encoding="utf-8"))

# Tiers with no qualifying bets carry NaN for win_rate/roi. json.dump would
# write those as a bare `NaN` token, which is not valid JSON, so export null.
thresholds_export = [
    {
        key: (None if isinstance(value, float) and np.isnan(value) else value)
        for key, value in tier.items()
    }
    for tier in results
]

metrics = {
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # "+00:00" suffix is swapped for "Z" so the timestamp format is unchanged.
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "cutoff_date": cutoff_date.strftime("%Y-%m-%d"),
    "train_games": int(train_mask.sum()),
    "test_games": int(test_mask.sum()),
    "feature_count": len(feature_cols),
    "thresholds": thresholds_export,
    "source_feature_matrix": str(FEATURE_MATRIX),
    "feature_summary": {
        "total_features": feature_summary.get("total_features"),
        "breakdown": feature_summary.get("feature_breakdown"),
    },
}

METRICS_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
with open(METRICS_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

metrics



The ≥0.65 probability tier should align with the published 69.4% win rate / +0.32 ROI (profit per unit staked). If the values diverge materially, re-run `scripts/run_daily_pipeline.py` to refresh the feature matrix and models, then re-run this notebook from a fresh kernel.

Latest metrics are exported to `docs/investor/verification/nhl_holdout_metrics.json` for inclusion in investor materials.
