In [16]:
from pathlib import Path
import hashlib, shutil, subprocess, os
from datetime import datetime

REPO   = Path(r"C:\Users\Nic\Desktop\NHLPredictor")
DESK   = Path(r"C:\Users\Nic\Desktop")
NBNAME = "PredictNHL.ipynb"
EXTS   = {".csv", ".json", ".xlsx"}
BRANCH = "main"

# --- Force a CSV to "update" even if content is the same ---
FORCE_REFRESH_CSV = True
CSV_WHITELIST = []  # e.g. ["predictions.csv", "enhanced_game_data.csv"]; empty = all CSVs in repo

def sha(p):
    import hashlib
    h=hashlib.sha256()
    with p.open("rb") as f:
        for b in iter(lambda:f.read(1<<20), b""): h.update(b)
    return h.hexdigest()

def bump_csv_bytes(path: Path):
    """Toggle trailing newline to force a harmless byte change."""
    b = path.read_bytes()
    if b.endswith(b"\n"):
        path.write_bytes(b.rstrip(b"\n"))      # remove last newline
    else:
        path.write_bytes(b + b"\n")            # add last newline

def run(cmd):
    r = subprocess.run(cmd, cwd=str(REPO), capture_output=True, text=True)
    if r.stdout: print(r.stdout.strip())
    if r.stderr and r.returncode != 0: print(r.stderr.strip())
    return r.returncode

assert REPO.exists()
os.chdir(REPO)

# 1) Build targets: notebook + tracked csv/json/xlsx
targets = {NBNAME}
targets |= {p.name for p in REPO.iterdir() if p.is_file() and p.suffix.lower() in EXTS}

# 2) Copy Desktop -> repo when content differs
changed = []
for name in sorted(targets):
    src, dst = DESK/name, REPO/name
    if src.exists():
        if (not dst.exists()) or sha(src) != sha(dst):
            shutil.copy2(src, dst); changed.append(name)

# 3) Optionally force-refresh CSVs (toggle EOF newline)
if FORCE_REFRESH_CSV:
    csvs = [p for p in REPO.iterdir() if p.is_file() and p.suffix.lower()==".csv"]
    if CSV_WHITELIST:
        csvs = [p for p in csvs if p.name in CSV_WHITELIST]
    for p in csvs:
        # only bump if not already modified by step 2 (to avoid double-noise)
        if p.name not in changed:
            bump_csv_bytes(p); changed.append(p.name)

# 4) Commit & push (only if something actually changed)
if changed:
    run(["git","add"] + changed)
    msg = f"Auto update (nb+data) - {datetime.now():%Y-%m-%d %H:%M:%S}"
    run(["git","commit","-m", msg])
    run(["git","pull","--rebase","origin", BRANCH])
    run(["git","push","-u","origin", BRANCH])
    print("Pushed:", ", ".join(changed))
else:
    print("Nothing to update. (No content changes and force-refresh off or nothing matched.)")



[main f2c313e] Auto update (nb+data) - 2025-10-17 22:34:59
 4 files changed, 1659 insertions(+), 1589 deletions(-)
Current branch main is up to date.
branch 'main' set up to track 'origin/main'.
Pushed: PredictNHL.ipynb, enhanced_game_data.csv, predictions.csv, selected_features.json


In [1]:
!pip install tensorflow
!pip install xgboost
!pip install streamlit
!pip install imbalanced-learn
!pip install shap
!pip install statsmodels
!pip install lightgbm

Collecting numpy<2.1.0,>=1.26.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading numpy-2.0.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Downloading numpy-2.0.2-cp312-cp312-win_amd64.whl (15.6 MB)
   ---------------------------------------- 0.0/15.6 MB ? eta -:--:--
   --- ------------------------------------ 1.3/15.6 MB 8.4 MB/s eta 0:00:02
   -------- ------------------------------- 3.1/15.6 MB 8.4 MB/s eta 0:00:02
   ------------ --------------------------- 5.0/15.6 MB 8.6 MB/s eta 0:00:02
   ---------------- ----------------------- 6.6/15.6 MB 8.4 MB/s eta 0:00:02
   ---------------------- ----------------- 8.9/15.6 MB 9.1 MB/s eta 0:00:01
   ---------------------------- ----------- 11.0/15.6 MB 9.2 MB/s eta 0:00:01
   --------------------------------- ------ 13.1/15.6 MB 9.3 MB/s eta 0:00:01
   ---------------------------------------  15.5/15.6 MB 9.6 MB/s eta 0:00:01
   ---------------------------------------- 15.6/15.6 MB 9.4 MB/s  0:00:01
Installing collected packages: 


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import time
import json
import random
import warnings
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, TimeSeriesSplit, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score, brier_score_loss, accuracy_score, log_loss, classification_report, make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif, RFECV, SelectFromModel, SelectKBest
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import joblib
from datetime import datetime
import re
from scipy.stats import loguniform, uniform
from io import StringIO
from datetime import datetime, timedelta
import unicodedata
import os, joblib
from selenium import webdriver
from selenium import webdriver
from bs4 import BeautifulSoup, Comment
import pandas as pd, time
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:

# EXTRACT BOTH SEASONS
SEASONS = [2025, 2026]
FEATURES_STATS = {
    'points_pct': 'PTS%', 'srs': 'SRS',
    'goals_for_per_game': 'GF/G', 'goals_against_per_game': 'GA/G',
    'power_play_pct': 'PP%', 'pen_kill_pct': 'PK%',
    'save_pct': 'SV%', 'shots': 'S', 'shots_against': 'SA'
}
FEATURES_5V5 = {
    'corsi_pct_5on5': 'CF%',
    'fenwick_pct_5on5': 'FF%',
    'exp_on_goals_for': 'xGF',
    'exp_on_goals_against': 'xGA',
    'hdsc_for_pct': 'HDCF%',
    'pdo': 'PDO'
}

_clean = lambda n: n.replace('*', '').strip() if n else ''
_to_num = lambda x: pd.to_numeric(str(x).replace('%', '').replace(',', '').strip(), errors='coerce')

def scrape_season(season):
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}.html"

    with webdriver.Chrome() as driver:
        driver.get(url)
        time.sleep(2.5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

    table_stats = soup.find('table', id='stats')
    records = []
    for i, row in enumerate(table_stats.find_all('tr')[1:]):
        cols = row.find_all('td')
        if not cols: continue
        team = _clean(cols[0].text)
        if team == "League Average": continue
        rec = {'Team': team, 'Rank': i + 1}
        for k, lbl in FEATURES_STATS.items():
            el = row.find("td", {"data-stat": k})
            rec[lbl] = _to_num(el.text) if el else None
        records.append(rec)
    df_stats = pd.DataFrame(records)
    df_stats["Season"] = season

    #Stats #5v5
    table_adv = None
    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        if 'stats_adv' in c:
            table_adv = BeautifulSoup(c, 'html.parser').find('table', id='stats_adv')
            break

    adv_records = []
    for row in table_adv.find_all('tr')[1:]:
        cols = row.find_all('td')
        if not cols: continue
        team = _clean(cols[0].text)
        if team == "League Average": continue
        rec = {'Team': team}
        for k, lbl in FEATURES_5V5.items():
            el = row.find("td", {"data-stat": k}) or row.find("td", {"data-stat": k.lower()}) or row.find("td", {"data-stat": k.upper()})
            rec[lbl] = _to_num(el.text) if el else None
        adv_records.append(rec)

    df_5v5 = pd.DataFrame(adv_records)
    df_5v5["xGF%"] = (df_5v5["xGF"] / (df_5v5["xGF"] + df_5v5["xGA"]) * 100).round(1)
    df_5v5["Season"] = season

    # ---------- MERGE ----------
    df = (
        pd.merge(df_stats, df_5v5, on=["Team", "Season"], how="left")
          .sort_values("Rank")
          .reset_index(drop=True)
    )

    return df



df_2025 = scrape_season(2025)
df_2026 = scrape_season(2026)

# Combine both datasets
df_combined = pd.concat([df_2025, df_2026], ignore_index=True).sort_values(["Team", "Season"]).reset_index(drop=True)


In [3]:
import unicodedata
import re
import requests
from io import StringIO

UA = {"User-Agent": "Mozilla/5.0"}
TIMEZONE = "America/Toronto"

def to_csv_url(url: str) -> str:
    return re.sub(r"/results/(nhl-\d{4})$", r"/download/\1-UTC.csv", url)

def parse_datetime_column(s: pd.Series) -> pd.Series:
    dt = pd.to_datetime(s, errors="coerce", utc=True, dayfirst=True)
    fallback = dt.isna()
    if fallback.any():
        dt.loc[fallback] = pd.to_datetime(s[fallback], errors="coerce", utc=True, dayfirst=False)
    return dt


# ==========================
# TEAM NORMALIZATION SECTION
# ==========================

TEAM_NAME_MAP = {
    "Montreal Canadiens": "Montreal Canadiens",
    "Montr√©al Canadiens": "Montreal Canadiens",

    "Utah Hockey Club": "Utah Mammoth",
    "Utah HC": "Utah Mammoth",
    "Utah Hockey C": "Utah Mammoth",
    "Utah Mammoth": "Utah Mammoth",
}

def normalize_team_name(s: str) -> str:
    if pd.isna(s):
        return s
    s = str(s).strip()

    # Remove accents
    s = "".join(c for c in unicodedata.normalize("NFKD", s)
                if not unicodedata.combining(c))

    # Normalize multiple spaces
    s = re.sub(r"\s{2,}", " ", s)

    return TEAM_NAME_MAP.get(s, s)


def load_game_results(url: str) -> pd.DataFrame:
    csv_url = to_csv_url(url)
    try:
        df = pd.read_csv(csv_url)
    except:
        r = requests.get(csv_url, headers=UA, timeout=20)
        r.raise_for_status()
        df = pd.read_csv(StringIO(r.text))

    df.columns = [col.strip() for col in df.columns]
    lc = {col.lower(): col for col in df.columns}

    out = pd.DataFrame({
        "Date_str": df[lc.get("date")],
        "Home Team": df[lc.get("home team", "home")],
        "Away Team": df[lc.get("away team", "away")],
    })

    # Scores
    if "Home Score" in df.columns and "Away Score" in df.columns:
        out["Home Score"] = pd.to_numeric(df["Home Score"], errors="coerce")
        out["Away Score"] = pd.to_numeric(df["Away Score"], errors="coerce")
    elif "Score" in df.columns:
        scores = df["Score"].astype(str).str.extract(r"(\d+)\s*[-‚Äì‚àí]\s*(\d+)")
        out["Home Score"] = pd.to_numeric(scores[0], errors="coerce")
        out["Away Score"] = pd.to_numeric(scores[1], errors="coerce")
    else:
        out["Home Score"] = pd.NA
        out["Away Score"] = pd.NA

    # Fill from "result"
    if "result" in lc:
        missing = out["Home Score"].isna() | out["Away Score"].isna()
        result_scores = df[lc["result"]].astype(str).str.extract(r"(\d+)\s*[-‚Äì‚àí]\s*(\d+)")
        out.loc[missing, "Home Score"] = pd.to_numeric(result_scores[0], errors="coerce")
        out.loc[missing, "Away Score"] = pd.to_numeric(result_scores[1], errors="coerce")

    out["Date_UTC"] = parse_datetime_column(out["Date_str"])
    out["Date_Local"] = out["Date_UTC"].dt.tz_convert(TIMEZONE)
    out["LocalDate"] = out["Date_Local"].dt.date

    # Clean rows
    out = (
        out.dropna(subset=["Date_UTC"])
           .sort_values("Date_UTC")
           .drop_duplicates(subset=["Date_UTC", "Home Team", "Away Team"], keep="last")
           .reset_index(drop=True)
    )

    # Normalize TEAM NAMES exactly ONCE
    for col in ["Home Team", "Away Team"]:
        out[col] = out[col].apply(normalize_team_name)

    return out


In [4]:
# ================================================
# ==== BUILD df_all AFTER loading game results ====
# ================================================

df_2024 = load_game_results("https://fixturedownload.com/results/nhl-2024")
df_2024["Season"], df_2024["weight"] = "2024-2025", 1.0

df_2025 = load_game_results("https://fixturedownload.com/results/nhl-2025")
df_2025["Season"], df_2025["weight"] = "2025-2026", 2.0

df_all = pd.concat([df_2024, df_2025], ignore_index=True)

# Played flag
df_all["Played"] = df_all["Home Score"].notna() & df_all["Away Score"].notna()

# Build master list
df_games_master = (
    df_all[["LocalDate", "Home Team", "Away Team", "Home Score", "Away Score", "Season", "Played"]]
        .rename(columns={"LocalDate": "Date"})
        .sort_values("Date")
        .reset_index(drop=True)
)

print("‚úî df_all built:", df_all.shape)
print("‚úî df_games_master built:", df_games_master.shape)


‚úî df_all built: (2624, 11)
‚úî df_games_master built: (2624, 7)


In [5]:
# =====================================
# ==== BUILD df_final (Played only) ===
# =====================================

df_final = (
    df_all[df_all["Played"]]
    .copy()
    .sort_values("LocalDate")
    .reset_index(drop=True)
)

# Standardize "Date" used by the model
df_final["Date"] = df_final["LocalDate"].astype("datetime64[ns]")

print("‚úî df_final built:", df_final.shape)
print(df_final.head())


‚úî df_final built: (1722, 12)
           Date_str          Home Team           Away Team Home Score  \
0  04/10/2024 17:00     Buffalo Sabres   New Jersey Devils          1   
1  05/10/2024 14:00  New Jersey Devils      Buffalo Sabres          3   
2  08/10/2024 20:30     Seattle Kraken     St. Louis Blues          2   
3  08/10/2024 23:00   Florida Panthers       Boston Bruins          6   
4  09/10/2024 02:00       Utah Mammoth  Chicago Blackhawks          5   

  Away Score                  Date_UTC                Date_Local   LocalDate  \
0          4 2024-10-04 17:00:00+00:00 2024-10-04 13:00:00-04:00  2024-10-04   
1          1 2024-10-05 14:00:00+00:00 2024-10-05 10:00:00-04:00  2024-10-05   
2          3 2024-10-08 20:30:00+00:00 2024-10-08 16:30:00-04:00  2024-10-08   
3          4 2024-10-08 23:00:00+00:00 2024-10-08 19:00:00-04:00  2024-10-08   
4          2 2024-10-09 02:00:00+00:00 2024-10-08 22:00:00-04:00  2024-10-08   

      Season  weight  Played       Date  
0  2024

In [6]:
# --- DATA MANIPULATION ---

df_final["Date"] = df_final["Date_Local"].dt.tz_localize(None)

# Add Home Win flag if missing
if "Home Win" not in df_final.columns:
    df_final["Home Win"] = (df_final["Home Score"] > df_final["Away Score"]).astype(bool)

# Sort by chronological order
df_final = df_final.sort_values(by="Date").reset_index(drop=True)


# --- 1Ô∏è‚É£ LAST 10 WINS ---
def calculate_last_10_stats(df, team_column):
    last_10_wins = []
    team_games = {}  # Key: (team, season)
    for _, row in df.iterrows():
        season = row["Season"]
        team = row[team_column]
        key = (team, season)
        recent_games = team_games.get(key, [])[-10:]
        last_10_wins.append(sum(recent_games))
        home_key = (row["Home Team"], season)
        away_key = (row["Away Team"], season)
        team_games.setdefault(home_key, []).append(row["Home Win"])
        team_games.setdefault(away_key, []).append(not row["Home Win"])
    return last_10_wins

df_final["Home Last 10 Wins"] = calculate_last_10_stats(df_final, "Home Team")
df_final["Away Last 10 Wins"] = calculate_last_10_stats(df_final, "Away Team")


# --- 2Ô∏è‚É£ PLAYED YESTERDAY ---
def calculate_played_yesterday(df):
    last_game_date = {}
    home_played_yesterday, away_played_yesterday = [], []
    for _, row in df.iterrows():
        season = row["Season"]
        date = row["Date"]
        home = row["Home Team"]
        away = row["Away Team"]
        yesterday = date - pd.Timedelta(days=1)
        home_key = (home, season)
        away_key = (away, season)
        home_played_yesterday.append(last_game_date.get(home_key) == yesterday)
        away_played_yesterday.append(last_game_date.get(away_key) == yesterday)
        last_game_date[home_key] = date
        last_game_date[away_key] = date
    return home_played_yesterday, away_played_yesterday

df_final["Home Played Yesterday"], df_final["Away Played Yesterday"] = calculate_played_yesterday(df_final)


# --- 3Ô∏è‚É£ WIN RATE ---
def calculate_win_rate(df, team_column, is_home_column):
    win_rate = []
    team_stats = {}  # Key: (team, season)
    for _, row in df.iterrows():
        season = row["Season"]
        team = row[team_column]
        key = (team, season)
        stats = team_stats.setdefault(key, {"wins": 0, "games": 0})
        win_rate.append(stats["wins"] / stats["games"] if stats["games"] > 0 else 0)
        stats["wins"] += row["Home Win"] if is_home_column else not row["Home Win"]
        stats["games"] += 1
    return win_rate

df_final["Home Win Rate"] = calculate_win_rate(df_final, "Home Team", is_home_column=True)
df_final["Away Win Rate"] = calculate_win_rate(df_final, "Away Team", is_home_column=False)


# --- 4Ô∏è‚É£ OVERALL STREAK ---
def calculate_overall_win_streak(df):
    streak = {}
    home_streaks, away_streaks = [], []
    for _, row in df.iterrows():
        season = row["Season"]
        home_key = (row["Home Team"], season)
        away_key = (row["Away Team"], season)
        home_streaks.append(streak.get(home_key, 0))
        away_streaks.append(streak.get(away_key, 0))
        if row["Home Win"]:
            streak[home_key] = streak.get(home_key, 0) + 1
            streak[away_key] = 0
        else:
            streak[home_key] = 0
            streak[away_key] = streak.get(away_key, 0) + 1
    return home_streaks, away_streaks

home_streaks, away_streaks = calculate_overall_win_streak(df_final)
df_final["Home Team Overall Win Streak Before Game"] = home_streaks
df_final["Away Team Overall Win Streak Before Game"] = away_streaks


# --- 5Ô∏è‚É£ REST DAYS ---
OFFSEASON_GAP_DAYS = 45
def calculate_days_since_last_game(df, gap=OFFSEASON_GAP_DAYS):
    last_game_date = {}
    home_rest, away_rest = [], []
    for _, row in df.iterrows():
        season = row["Season"]
        date = row["Date"]
        home_key = (row["Home Team"], season)
        away_key = (row["Away Team"], season)
        prev_home = last_game_date.get(home_key)
        prev_away = last_game_date.get(away_key)
        home_rest.append((date - prev_home).days if prev_home and (date - prev_home).days <= gap else None)
        away_rest.append((date - prev_away).days if prev_away and (date - prev_away).days <= gap else None)
        last_game_date[home_key] = date
        last_game_date[away_key] = date
    return home_rest, away_rest

df_final["Home Rest Days Since Last Game"], df_final["Away Rest Days Since Last Game"] = calculate_days_since_last_game(df_final)


In [7]:

df_final["Home Advantage"] = df_final["Home Win Rate"] - df_final["Away Win Rate"]
df_final["Win Streak Impact"] = df_final["Home Team Overall Win Streak Before Game"] - df_final["Away Team Overall Win Streak Before Game"]
df_final["Last 10 Wins"] = df_final["Home Last 10 Wins"] - df_final["Away Last 10 Wins"]
df_final = df_final.dropna().reset_index(drop=True)
df_final["Date"] = pd.to_datetime(df_final["Date"])
games_2025 = df_final[df_final["Date"] < "2025-10-01"].copy()
games_2026 = df_final[df_final["Date"] >= "2025-10-01"].copy()
stats_2025 = df_combined[df_combined["Season"] == 2025].copy()
stats_2025["Team"] = stats_2025["Team"].astype(str).str.strip()
games_2025 = games_2025.merge(
    stats_2025.add_prefix("Home_"), left_on="Home Team", right_on="Home_Team", how="left"
)
games_2025 = games_2025.merge(
    stats_2025.add_prefix("Away_"), left_on="Away Team", right_on="Away_Team", how="left"
)



In [8]:
print(df_final)

              Date_str             Home Team              Away Team  \
0     05/10/2024 14:00     New Jersey Devils         Buffalo Sabres   
1     10/10/2024 23:00         Boston Bruins     Montreal Canadiens   
2     10/10/2024 23:00     New Jersey Devils    Toronto Maple Leafs   
3     12/10/2024 00:00         Winnipeg Jets     Chicago Blackhawks   
4     12/10/2024 02:00  Vegas Golden Knights        St. Louis Blues   
...                ...                   ...                    ...   
1677  02/12/2025 00:00   Philadelphia Flyers    Pittsburgh Penguins   
1678  02/12/2025 00:00     New Jersey Devils  Columbus Blue Jackets   
1679  02/12/2025 00:30        Buffalo Sabres          Winnipeg Jets   
1680  02/12/2025 01:00       St. Louis Blues          Anaheim Ducks   
1681  02/12/2025 03:00       San Jose Sharks           Utah Mammoth   

     Home Score Away Score                  Date_UTC  \
0             3          1 2024-10-05 14:00:00+00:00   
1             6          4 2024-10-

In [9]:

SNAPSHOT_PATH = "team_snapshots.csv"
SNAPSHOT_SEASON = 2026

today = pd.Timestamp.now().normalize().date()

snapshot = df_combined[df_combined["Season"] == SNAPSHOT_SEASON].copy()
snapshot["Snapshot Date"] = pd.Timestamp(today)
snapshot["Team"] = snapshot["Team"].astype(str).str.strip()

if os.path.exists(SNAPSHOT_PATH):
    df_existing = pd.read_csv(SNAPSHOT_PATH, parse_dates=["Snapshot Date"])
    df_existing = df_existing[df_existing["Snapshot Date"] != pd.Timestamp(today)]
    df_all_snapshots = pd.concat([df_existing, snapshot], ignore_index=True)
else:
    df_all_snapshots = snapshot

df_all_snapshots.to_csv(SNAPSHOT_PATH, index=False)
print(f"‚úÖ Snapshot saved for {today}. Total entries: {len(df_all_snapshots)}")


‚úÖ Snapshot saved for 2025-12-03. Total entries: 416


In [10]:

snapshots = pd.read_csv("team_snapshots.csv", parse_dates=["Snapshot Date"])
snapshots["Team"] = snapshots["Team"].astype(str).str.strip()

final_2025_stats = df_combined[df_combined["Season"] == 2025].copy()
final_2025_stats["Team"] = final_2025_stats["Team"].astype(str).str.strip()
final_2025_stats["Snapshot Date"] = pd.Timestamp("2025-07-01")  # after season end
final_2025_stats["Season"] = "2024-2025"

def merge_snapshot_stats(df_games, df_snapshots, team_column, prefix):
    result = []

    for idx, row in df_games.iterrows():
        game_date = row["Date"]
        team = row[team_column]
        season = row["Season"]

        if season == "2025-2026":
            # ‚úÖ Dynamic: use most recent snapshot for ongoing season
            team_snapshots = df_snapshots[
                (df_snapshots["Team"] == team)
                & (df_snapshots["Snapshot Date"] <= game_date)
            ]
            if not team_snapshots.empty:
                latest_snapshot = team_snapshots.sort_values("Snapshot Date").iloc[-1]
                renamed = latest_snapshot.rename(lambda x: f"{prefix}_{x}" if x not in ["Team"] else x)
                result.append(renamed)
                continue

        elif season == "2024-2025":
            # ‚úÖ Static: use final 2025 stats
            team_stats = final_2025_stats[final_2025_stats["Team"] == team]
            if not team_stats.empty:
                latest_snapshot = team_stats.iloc[-1]
                renamed = latest_snapshot.rename(lambda x: f"{prefix}_{x}" if x not in ["Team"] else x)
                result.append(renamed)
                continue

        # ‚ùå No data found
        result.append(pd.Series(dtype="float64"))

    snapshot_df = pd.DataFrame(result, index=df_games.index)
    return pd.concat([df_games, snapshot_df], axis=1)

# --- Apply to both Home and Away teams ---
df_final = merge_snapshot_stats(df_final, snapshots, team_column="Home Team", prefix="Home")
df_final = merge_snapshot_stats(df_final, snapshots, team_column="Away Team", prefix="Away")

# --- Save and confirm ---
df_final.to_csv("games_with_snapshots.csv", index=False)
print("‚úÖ Merged: 2024-2025 uses final stats, 2025-2026 uses dynamic snapshots.")

‚úÖ Merged: 2024-2025 uses final stats, 2025-2026 uses dynamic snapshots.


In [11]:
print(df_final)
df_final.to_csv("df_final.csv", index=False)


              Date_str             Home Team              Away Team  \
0     05/10/2024 14:00     New Jersey Devils         Buffalo Sabres   
1     10/10/2024 23:00         Boston Bruins     Montreal Canadiens   
2     10/10/2024 23:00     New Jersey Devils    Toronto Maple Leafs   
3     12/10/2024 00:00         Winnipeg Jets     Chicago Blackhawks   
4     12/10/2024 02:00  Vegas Golden Knights        St. Louis Blues   
...                ...                   ...                    ...   
1677  02/12/2025 00:00   Philadelphia Flyers    Pittsburgh Penguins   
1678  02/12/2025 00:00     New Jersey Devils  Columbus Blue Jackets   
1679  02/12/2025 00:30        Buffalo Sabres          Winnipeg Jets   
1680  02/12/2025 01:00       St. Louis Blues          Anaheim Ducks   
1681  02/12/2025 03:00       San Jose Sharks           Utah Mammoth   

     Home Score Away Score                  Date_UTC  \
0             3          1 2024-10-05 14:00:00+00:00   
1             6          4 2024-10-

In [12]:
# 1Ô∏è‚É£ RANK DIFFERENCE
if "Home_Rank" in df_final.columns and "Away_Rank" in df_final.columns:
    df_final["Rank Difference"] = df_final["Home_Rank"] - df_final["Away_Rank"]
else:
    print("‚ö†Ô∏è Columns Home_Rank / Away_Rank not found. Skipping Rank Difference.")


# 2Ô∏è‚É£ RELATIVE SRS (strength differential)
if "Home_SRS" in df_final.columns and "Away_SRS" in df_final.columns:
    df_final["SRS_Diff"] = df_final["Home_SRS"] - df_final["Away_SRS"]
else:
    print("‚ö†Ô∏è Columns Home_SRS / Away_SRS not found. Skipping SRS_Diff.")


# 3Ô∏è‚É£ OPPONENT STRENGTH (rolling average of opponents' ranks)
def calculate_avg_opponent_rank(df):
    team_opponent_ranks = {team: [] for team in pd.concat([df["Home Team"], df["Away Team"]]).unique()}
    home_strengths, away_strengths = [], []

    for _, row in df.iterrows():
        home = row["Home Team"]
        away = row["Away Team"]
        home_opponents = team_opponent_ranks[home]
        away_opponents = team_opponent_ranks[away]

        # average of last 10 opponents‚Äô ranks before this game
        home_strengths.append(np.mean(home_opponents[-10:]) if home_opponents else np.nan)
        away_strengths.append(np.mean(away_opponents[-10:]) if away_opponents else np.nan)

        # update AFTER the current game (to avoid data leakage)
        if not pd.isna(row.get("Away_Rank", np.nan)):
            team_opponent_ranks[home].append(row["Away_Rank"])
        if not pd.isna(row.get("Home_Rank", np.nan)):
            team_opponent_ranks[away].append(row["Home_Rank"])

    return home_strengths, away_strengths


df_final["Home Opponent Strength"], df_final["Away Opponent Strength"] = calculate_avg_opponent_rank(df_final)


# 4Ô∏è‚É£ CLEAN-UP (optional)
df_final = df_final.dropna(subset=["Home_Rank", "Away_Rank"])
df_final = df_final.dropna().reset_index(drop=True)

# 5Ô∏è‚É£ EXPORT
df_final.to_csv("df_final_with_rank_features.csv", index=False)
print("‚úÖ Final dataset exported: df_final_with_rank_features.csv")



‚úÖ Final dataset exported: df_final_with_rank_features.csv


In [13]:

scorer = make_scorer(f1_score, pos_label=None, average='weighted')

y_true = df_final["Home Win"].astype(int).values

y_pred = np.ones_like(y_true)                   
y_proba = np.ones_like(y_true, dtype=float)     

f1 = f1_score(y_true, y_pred, average="weighted")


try:
    roc = roc_auc_score(y_true, y_proba)
except ValueError:
    roc = float("nan")  # undefined for constant scores


brier = brier_score_loss(y_true, y_proba)

print(f"Baseline (Always Home) ‚Äî F1: {f1:.4f}, ROC AUC: {roc}, Brier: {brier:.4f}")
y_pred_away = np.zeros_like(y_true)
y_proba_away = np.zeros_like(y_true, dtype=float)

f1_away = f1_score(y_true, y_pred_away, average="weighted")
try:
    roc_away = roc_auc_score(y_true, y_proba_away)
except ValueError:
    roc_away = float("nan")
brier_away = brier_score_loss(y_true, y_proba_away)

print(f"Baseline (Always Away) ‚Äî F1: {f1_away:.4f}, ROC AUC: {roc_away}, Brier: {brier_away:.4f}")

Baseline (Always Home) ‚Äî F1: 0.4057, ROC AUC: 0.5, Brier: 0.4369
Baseline (Always Away) ‚Äî F1: 0.2657, ROC AUC: 0.5, Brier: 0.5631


In [14]:
# --- Feature selection (all existing + clean) ---
base_feature_cols = [
    # Game context / form
    "Home Played Yesterday",
    "Away Played Yesterday",
    "Home Rest Days Since Last Game",
    "Away Rest Days Since Last Game",

    # Comparative metrics
    "Home Advantage",
    "Win Streak Impact",
    "Last 10 Wins",
    "Home Opponent Strength",
    "Away Opponent Strength",
    "SRS_Diff",

    # Snapshot strength metrics (team stats)
    "Home_PP%",
    "Away_PP%",
    "Home_PK%",
    "Away_PK%",
    "Home_SV%",
    "Away_SV%",
    "Home_xGF%",
    "Away_xGF%",
]

df_model = df_final.dropna(subset=base_feature_cols + ["Home Win"]).copy()
df_model = df_model.sort_values("Date").reset_index(drop=True)


X = df_model[base_feature_cols]
y = df_model["Home Win"].astype(int)

print(f"‚úÖ Model dataset ready: {X.shape[0]} games, {X.shape[1]} features.")
print(f"Home win rate: {y.mean():.2%}")


‚úÖ Model dataset ready: 1323 games, 18 features.
Home win rate: 56.31%


In [15]:
# ======================================
# ==== CONFIG & GLOBAL PARAMETERS ======
# ======================================

SEED = 42
CV_SPLITS = 6
GAP = 3
TEST_RATIO = 0.2

# --- Iteration controls ---
N_ITER_LOG_L2 = 40
N_ITER_LOG_EN = 40
N_ITER_RF = 20
N_ITER_LGBM = 20
N_ITER_MLP = 200

MLP_MAX_ITER = 2000
MLP_EARLY_STOP = True
MLP_VAL_FRAC = 0.12
MLP_PATIENCE = 20

# ======================================
# ==== FEATURES ========================
# ======================================

base_feature_cols = [
    # Game context / rest
    "Home Played Yesterday",
    "Away Played Yesterday",
    "Home Rest Days Since Last Game",
    "Away Rest Days Since Last Game",

    # Absolute form (avant le match)
    "Home Last 10 Wins",
    "Away Last 10 Wins",
    "Home Win Rate",
    "Away Win Rate",
    "Home Team Overall Win Streak Before Game",
    "Away Team Overall Win Streak Before Game",
    "Home Opponent Strength",
    "Away Opponent Strength",


    # Saison / force structurelle
    "Home_SRS",
    "Away_SRS",
    "Home_PP%",
    "Away_PP%",
    "Home_PK%",
    "Away_PK%",
    "Home_SV%",
    "Away_SV%",
    "Home_xGF%",
    "Away_xGF%",
]
TARGET_COL = "Home Win"


# ======================================
# ==== PARAM GRIDS =====================
# ======================================

PARAMS_LOG_L2 = {
    "clf__C": loguniform(1e-3, 1e1),
    "clf__class_weight": [None, "balanced"]
}

PARAMS_LOG_EN = {
    "clf__C": loguniform(1e-3, 1e1),
    "clf__l1_ratio": uniform(0, 1),
    "clf__class_weight": [None, "balanced"]
}

PARAMS_RF = {
    "clf__n_estimators": [300, 500, 800],
    "clf__max_depth": [None, 5, 10, 15],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 5],
}

PARAMS_LGBM = {
    "clf__num_leaves": [15, 31, 63],
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__n_estimators": [200, 400, 600],
    "clf__subsample": [0.8, 0.9, 1.0],
    "clf__colsample_bytree": [0.8, 0.9, 1.0],
}

PARAMS_MLP = {
    "clf__solver": ["adam", "sgd"],
    "clf__max_iter": [3000],
    "clf__activation": ["relu", "tanh"],
    "clf__alpha": loguniform(1e-6, 1e-2),
    "clf__learning_rate_init": loguniform(5e-4, 1e-2),
    "clf__batch_size": [32, 64, 128],
    "clf__learning_rate": ["adaptive", "invscaling"],
    "clf__momentum": uniform(0.6, 0.39),
    "clf__nesterovs_momentum": [True, False],
}

SCORING = {
    "brier": "neg_brier_score",
    "roc_auc": "roc_auc",
    "f1w": "f1_weighted"
}

# ======================================
# ==== DATA PREPARATION ================
# ======================================

random.seed(SEED)
np.random.seed(SEED)

df = df_final.sort_values("Date").reset_index(drop=True)
missing = [c for c in base_feature_cols if c not in df.columns]
if missing:
    print(f"‚ö†Ô∏è Missing columns in df_final (skipped): {missing}")
feature_cols = [c for c in base_feature_cols if c in df.columns]

# Base : on garde seulement les lignes compl√®tes sur ces features + la cible
df_base = df.dropna(subset=feature_cols + [TARGET_COL]).copy()

# Version compl√®te avec les matchs futurs (NE PAS DROP)
df_final_full = df_final.sort_values("Date").reset_index(drop=True)

# Version training (matchs jou√©s seulement)
df_final_clean = df_final.dropna(subset=feature_cols + [TARGET_COL]).copy()
df_final_clean = df_final_clean.sort_values("Date").reset_index(drop=True)
# Historique utilis√© pour la construction des features en prod
df_hist_for_pred = df_final_clean.sort_values("Date").reset_index(drop=True)

# Ajout des colonnes d'identit√© d'√©quipe (cat√©gorielles)
df_base["Home_Team"] = df_base["Home Team"].astype("category")
df_base["Away_Team"] = df_base["Away Team"].astype("category")

# Renommage des features num√©riques (espaces -> _)
rename_map = {c: c.replace(" ", "_") for c in feature_cols}
df_base = df_base.rename(columns=rename_map)

# Liste finale des features num√©riques renomm√©es
feat = [rename_map[c] for c in feature_cols]

# Features cat√©gorielles (on garde ces noms l√†)
cat_features = ["Home_Team", "Away_Team"]

# Matrice X = num√©riques + cat√©gorielles, y = cible
X = df_base[feat + cat_features]
y = df_base[TARGET_COL].astype(int)

# Only keep numeric columns (ignore team categorical)
num_df = df_base[feat].copy()


cut = int((1 - TEST_RATIO) * len(df_base))
X_tr, X_te = X.iloc[:cut], X.iloc[cut:]
y_tr, y_te = y.iloc[:cut], y.iloc[cut:]

tscv = TimeSeriesSplit(n_splits=CV_SPLITS, gap=GAP)

# ======================================
# ==== PREPROCESSING ===================
# ======================================

num_features = feat            # features num√©riques (d√©j√† renomm√©es)
cat_features = ["Home_Team", "Away_Team"]  # features cat√©gorielles

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

# ======================================
# ==== HELPERS ========================
# ======================================

def evaluate(model, Xt, yt):
    p = model.predict_proba(Xt)[:, 1]
    yhat = (p >= 0.5).astype(int)
    return {
        "F1": f1_score(yt, yhat, average="weighted"),
        "ROC_AUC": roc_auc_score(yt, p),
        "Brier": brier_score_loss(yt, p)
    }

def calibrate_if_better(model, Xtr, ytr, Xte, yte):
    base = evaluate(model, Xte, yte)
    cal = CalibratedClassifierCV(model, method="isotonic", cv=5).fit(Xtr, ytr)
    calm = evaluate(cal, Xte, yte)
    return (cal, calm) if calm["Brier"] <= base["Brier"] else (model, base)
# ======================================
# ==== PIPELINES =======================
# ======================================

pipe_log_l2 = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000, solver="lbfgs", random_state=SEED))
])

pipe_log_en = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=2000, solver="saga", penalty="elasticnet", random_state=SEED))
])

pipe_rf = Pipeline([
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(random_state=SEED, n_jobs=-1))
])

pipe_lgbm = Pipeline([
    ("preprocess", preprocess),
    ("clf", LGBMClassifier(random_state=SEED))
])

pipe_mlp = Pipeline([
    ("preprocess", preprocess),
    ("clf", MLPClassifier(
        max_iter=MLP_MAX_ITER,
        random_state=SEED,
        early_stopping=MLP_EARLY_STOP,
        n_iter_no_change=MLP_PATIENCE,
        validation_fraction=MLP_VAL_FRAC,
        learning_rate="adaptive"
    ))
])


# ======================================
# ==== RANDOM SEARCHES ================
# ======================================

print("Running random searches...")

rs_log_l2 = RandomizedSearchCV(pipe_log_l2, PARAMS_LOG_L2, n_iter=N_ITER_LOG_L2,
    scoring=SCORING, refit="brier", cv=tscv, n_jobs=-1, verbose=1, random_state=SEED).fit(X_tr, y_tr)

rs_log_en = RandomizedSearchCV(pipe_log_en, PARAMS_LOG_EN, n_iter=N_ITER_LOG_EN,
    scoring=SCORING, refit="brier", cv=tscv, n_jobs=-1, verbose=1, random_state=SEED).fit(X_tr, y_tr)

rs_rf = RandomizedSearchCV(pipe_rf, PARAMS_RF, n_iter=N_ITER_RF,
    scoring=SCORING, refit="brier", cv=tscv, n_jobs=-1, verbose=1, random_state=SEED).fit(X_tr, y_tr)

rs_lgbm = RandomizedSearchCV(pipe_lgbm, PARAMS_LGBM, n_iter=N_ITER_LGBM,
    scoring=SCORING, refit="brier", cv=tscv, n_jobs=-1, verbose=1, random_state=SEED).fit(X_tr, y_tr)

rs_mlp = RandomizedSearchCV(pipe_mlp, PARAMS_MLP, n_iter=N_ITER_MLP,
    scoring=SCORING, refit="brier", cv=tscv, n_jobs=-1, verbose=1, random_state=SEED).fit(X_tr, y_tr)

best_log_l2, best_log_en = rs_log_l2.best_estimator_, rs_log_en.best_estimator_
best_rf, best_lgbm, best_mlp = rs_rf.best_estimator_, rs_lgbm.best_estimator_, rs_mlp.best_estimator_

# ======================================
# ==== EVALUATION ======================
# ======================================

m_l2 = evaluate(best_log_l2, X_te, y_te)
m_en = evaluate(best_log_en, X_te, y_te)
m_rf = evaluate(best_rf, X_te, y_te)
m_lgbm = evaluate(best_lgbm, X_te, y_te)
m_mlp = evaluate(best_mlp, X_te, y_te)

# Pick best logistic variant
log_final, log_metrics = (best_log_l2, m_l2) if m_l2["Brier"] <= m_en["Brier"] else (best_log_en, m_en)

# Calibrate MLP
mlp_final, mlp_metrics = calibrate_if_better(best_mlp, X_tr, y_tr, X_te, y_te)

# === Ensemble (equal weights)
p_log = log_final.predict_proba(X_te)[:, 1]
p_rf = best_rf.predict_proba(X_te)[:, 1]
p_lgbm = best_lgbm.predict_proba(X_te)[:, 1]
p_mlp = mlp_final.predict_proba(X_te)[:, 1]

p_avg = (p_log + p_rf + p_lgbm + p_mlp) / 4
ens_metrics = {
    "F1": f1_score(y_te, (p_avg >= 0.5).astype(int), average="weighted"),
    "ROC_AUC": roc_auc_score(y_te, p_avg),
    "Brier": brier_score_loss(y_te, p_avg)
}

# === Collect results
results = pd.DataFrame([
    {"Model": "Logistic L2", **m_l2},
    {"Model": "ElasticNet", **m_en},
    {"Model": "RandomForest", **m_rf},
    {"Model": "LightGBM", **m_lgbm},
    {"Model": "MLP (Base)", **m_mlp},
    {"Model": "Logistic (Final)", **log_metrics},
    {"Model": "MLP (Final)", **mlp_metrics},
    {"Model": "Ensemble", **ens_metrics},
])

results = results.sort_values("Brier").reset_index(drop=True)
print("\n=== Model Comparison ===")
print(results.round(4).to_string(index=False))

# ======================================
# ==== SAVE MODELS =====================
# ======================================

joblib.dump(log_final, "model_Logistic_TUNED.joblib")
joblib.dump(best_rf, "model_RF_TUNED.joblib")
joblib.dump(best_lgbm, "model_LGBM_TUNED.joblib")
joblib.dump(mlp_final, "model_MLP_TUNED.joblib")
results.to_csv("model_results_summary.csv", index=False)

Running random searches...
Fitting 6 folds for each of 40 candidates, totalling 240 fits
Fitting 6 folds for each of 40 candidates, totalling 240 fits
Fitting 6 folds for each of 20 candidates, totalling 120 fits
Fitting 6 folds for each of 20 candidates, totalling 120 fits
[LightGBM] [Info] Number of positive: 595, number of negative: 463
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 1058, number of used features: 84
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.562382 -> initscore=0.250834
[LightGBM] [Info] Start training from score 0.250834
Fitting 6 folds for each of 200 candidates, totalling 1200 fits





=== Model Comparison ===
           Model     F1  ROC_AUC  Brier
    RandomForest 0.5136   0.5641 0.2439
        Ensemble 0.5357   0.5661 0.2447
Logistic (Final) 0.5483   0.5643 0.2459
      ElasticNet 0.5483   0.5643 0.2459
     Logistic L2 0.5482   0.5641 0.2461
     MLP (Final) 0.5433   0.5654 0.2480
      MLP (Base) 0.5526   0.5607 0.2511
        LightGBM 0.5152   0.5522 0.2519




In [16]:
print(X.dtypes.tail(5))
print(X[["Home_Team", "Away_Team"]].head())


Away_SV%      float64
Home_xGF%     float64
Away_xGF%     float64
Home_Team    category
Away_Team    category
dtype: object
         Home_Team          Away_Team
0    Winnipeg Jets     Minnesota Wild
1  Edmonton Oilers     Calgary Flames
2     Dallas Stars     Seattle Kraken
3    Boston Bruins   Florida Panthers
4  Ottawa Senators  Los Angeles Kings


In [17]:
# ======================================
# ==== QUALITY DATASET WITH TEAMS ======
# ======================================

# On remet une copie compl√®te pour analyser la multicolin√©arit√© r√©elle
df_quality = df_base.copy()

# Encodage one-hot complet des √©quipes (pour analyser leur impact)
df_quality = pd.get_dummies(df_quality, columns=["Home_Team", "Away_Team"], drop_first=True)

# On retire la target
if TARGET_COL in df_quality.columns:
    df_quality = df_quality.drop(columns=[TARGET_COL])

# Ne garder que valeurs num√©riques
df_quality = df_quality.apply(pd.to_numeric, errors="coerce")

print("\n=== Quality dataset (numerical including team dummies) ===")
print(df_quality.head())



=== Quality dataset (numerical including team dummies) ===
   Date_str  Home Team  Away Team  Home Score  Away Score  \
0       NaN        NaN        NaN           2           1   
1       NaN        NaN        NaN           1           4   
2       NaN        NaN        NaN           2           0   
3       NaN        NaN        NaN           3           4   
4       NaN        NaN        NaN           8           7   

              Date_UTC           Date_Local  LocalDate  Season  weight  ...  \
0  1728856800000000000  1728856800000000000        NaN     NaN     1.0  ...   
1  1728864000000000000  1728864000000000000        NaN     NaN     1.0  ...   
2  1728864000000000000  1728864000000000000        NaN     NaN     1.0  ...   
3  1728925200000000000  1728925200000000000        NaN     NaN     1.0  ...   
4  1728925200000000000  1728925200000000000        NaN     NaN     1.0  ...   

   Away_Team_San Jose Sharks  Away_Team_Seattle Kraken  \
0                      False            

In [18]:

DATE_OFFSET = +0
TARGET_DATE = (datetime.now() + timedelta(days=DATE_OFFSET)).date()
print(f"üìÖ Target slate date set to: {TARGET_DATE}")

CSV_PATH = "odds.csv"

# Ignore games that don't have final scores (cancelled / not played)
ONLY_PLAYED_GAMES = True

def _strip_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFKD", str(s)) if not unicodedata.combining(c))

def _american_to_decimal(a):
    try: a = int(a)
    except Exception: return None
    if a == 0 or abs(a) < 100 or a == -100: return None
    return 1.0 + (a/100.0) if a > 0 else 1.0 + (100.0/abs(a))

def _parse_odds(s: str):
    if s is None: return (None, None, None)
    s = s.strip().lower().replace(" ", "")
    if s in {"s","skip"}:  return (None, None, "skip")
    if s in {"q","quit"}:  return (None, None, "quit")
    if "." in s or "," in s:   # accept decimal as a convenience
        try:
            dec = float(s.replace(",", "."))
            return (None, dec if dec > 1.01 else None, None)
        except Exception:
            return (None, None, None)
    m = re.fullmatch(r"([+-]?)(\d{2,4})", s)
    if not m: return (None, None, None)
    sign, num = m.groups(); num = int(num)
    american = -num if sign == "-" else +num
    if american == 0 or abs(american) < 100 or american == -100: return (None, None, None)
    return (american, _american_to_decimal(american), None)
    
def _mk_template(df_games, target_date):
    g = df_games.copy()
    g["Date"] = pd.to_datetime(g["Date"], errors="coerce")

    # === NEW FIX ===
    # Pour les dates pass√©es ‚Üí garder seulement les matchs jou√©s
    # Pour les dates futures ‚Üí ne rien filtrer (sinon on supprime tous les matchs futurs)
    today = datetime.now().date()

    if target_date < today:
        if {"Home Score", "Away Score"}.issubset(g.columns):
            g = g[g["Home Score"].notna() & g["Away Score"].notna()]
    # === END FIX ===

    # Filtrer par date (apr√®s le filtre intelligent)
    g = g[g["Date"].dt.date == target_date]

    out = (
        g.drop_duplicates(subset=["Date","Home Team","Away Team"])
         .loc[:, ["Date","Away Team","Home Team"]]
         .assign(**{
             "Away American Odds": None, "Away Odds": None,
             "Home American Odds": None, "Home Odds": None
         })
         .loc[:, [
             "Date", "Away Team", "Away American Odds", "Away Odds",
             "Home Team", "Home American Odds", "Home Odds"
         ]]
         .reset_index(drop=True)
    )

    out["Away Team"] = out["Away Team"].map(_strip_accents)
    out["Home Team"] = out["Home Team"].map(_strip_accents)
    return out



def _slate_counts_nearby(df_games, center_date, span=3):
    g = df_games.copy()
    g["Date"] = pd.to_datetime(g["Date"], errors="coerce")
    counts = []
    for delta in range(-1, span+1):
        d = center_date + timedelta(days=delta)
        n = int((g["Date"].dt.date == d).sum())
        counts.append((d, n))
    return counts

def enter_american_odds(df_games, date_str=None):
    target = (pd.to_datetime(date_str).date() if date_str else datetime.now().date())
    tmpl = _mk_template(df_games, target)

    print(f"\nTarget date: {target} | games found: {len(tmpl)}")
    nearby = _slate_counts_nearby(df_games, target, span=3)
    print("Nearby slates (games per day):")
    for d, n in nearby:
        mark = "  (today)" if d == target else ""
        print(f"  {d}: {n}{mark}")

    if tmpl.empty:
        print("No games for this date in df_all. Pick another date (e.g., enter_american_odds(df_all, '2025-10-12')).")
        return tmpl  # empty

    print("\nEnter AMERICAN odds (e.g., -120, +135).")
    print("Press Enter to leave a side blank, 's' to skip a game, 'q' to quit.\n")

    for i in range(len(tmpl)):
        away = tmpl.at[i, "Away Team"]
        home = tmpl.at[i, "Home Team"]
        header = f"[{i+1}/{len(tmpl)}]  {away}  @  {home}"
        print("="*len(header))
        print(header)
        print("="*len(header))

        # Away (AWAY TEAM odds)
        while True:
            a_in = input(f"  (AWAY)  {away}  American odds: ").strip()
            if a_in == "":  a_american, a_decimal, cmd = (None, None, None); break
            a_american, a_decimal, cmd = _parse_odds(a_in)
            if cmd in {"quit","skip"} or a_decimal is not None or a_american is None: break
            print("    -> invalid (try -120, +135, or 1.95)")
        if cmd == "quit": break
        if cmd == "skip": print("  skipped game\n"); continue

        # Home (HOME TEAM odds)
        while True:
            h_in = input(f"  (HOME)  {home}  American odds: ").strip()
            if h_in == "":  h_american, h_decimal, cmd2 = (None, None, None); break
            h_american, h_decimal, cmd2 = _parse_odds(h_in)
            if cmd2 in {"quit","skip"} or h_decimal is not None or h_american is None: break
            print("    -> invalid (try -120, +135, or 1.95)")
        if cmd2 == "quit": break
        if cmd2 == "skip": print("  skipped game\n"); continue

        tmpl.at[i, "Away American Odds"] = a_american
        tmpl.at[i, "Away Odds"]          = a_decimal
        tmpl.at[i, "Home American Odds"] = h_american
        tmpl.at[i, "Home Odds"]          = h_decimal
        print()

    complete = tmpl.dropna(subset=["Away Odds","Home Odds"]).reset_index(drop=True)
    complete.to_csv(CSV_PATH, index=False)
    print(f"\n‚úÖ Saved {len(complete)} matchup(s) with odds to {CSV_PATH}")
    return complete

# ===== RUN =====
# Today by default (change to a specific date string if you want):
df_odds = enter_american_odds(df_games_master, TARGET_DATE.strftime("%Y-%m-%d"))
print("\nSaved rows preview:")
print(df_odds.head(10))


üìÖ Target slate date set to: 2025-12-03

Target date: 2025-12-03 | games found: 5
Nearby slates (games per day):
  2025-12-02: 10
  2025-12-03: 5  (today)
  2025-12-04: 10
  2025-12-05: 5
  2025-12-06: 12

Enter AMERICAN odds (e.g., -120, +135).
Press Enter to leave a side blank, 's' to skip a game, 'q' to quit.

[1/5]  Utah Mammoth  @  Anaheim Ducks


  (AWAY)  Utah Mammoth  American odds:  -108
  (HOME)  Anaheim Ducks  American odds:  -110



[2/5]  Buffalo Sabres  @  Philadelphia Flyers


  (AWAY)  Buffalo Sabres  American odds:  -103
  (HOME)  Philadelphia Flyers  American odds:  -114



[3/5]  Washington Capitals  @  San Jose Sharks


  (AWAY)  Washington Capitals  American odds:  -143
  (HOME)  San Jose Sharks  American odds:  121



[4/5]  Dallas Stars  @  New Jersey Devils


  (AWAY)  Dallas Stars  American odds:  -101
  (HOME)  New Jersey Devils  American odds:  -116



[5/5]  Winnipeg Jets  @  Montreal Canadiens


  (AWAY)  Winnipeg Jets  American odds:  -114
  (HOME)  Montreal Canadiens  American odds:  -103




‚úÖ Saved 5 matchup(s) with odds to odds.csv

Saved rows preview:
        Date            Away Team Away American Odds Away Odds  \
0 2025-12-03         Utah Mammoth               -108  1.925926   
1 2025-12-03       Buffalo Sabres               -103  1.970874   
2 2025-12-03  Washington Capitals               -143  1.699301   
3 2025-12-03         Dallas Stars               -101  1.990099   
4 2025-12-03        Winnipeg Jets               -114  1.877193   

             Home Team Home American Odds Home Odds  
0        Anaheim Ducks               -110  1.909091  
1  Philadelphia Flyers               -114  1.877193  
2      San Jose Sharks                121      2.21  
3    New Jersey Devils               -116  1.862069  
4   Montreal Canadiens               -103  1.970874  


In [19]:
# ===================== CONFIG (edit if needed) =====================
MODEL_LOG_PATH = "model_Logistic_TUNED.joblib"
MODEL_MLP_PATH = "model_MLP_TUNED.joblib"
ENSEMBLE_WEIGHTS = (0.5, 0.5)     # (logistic, mlp)
BANKROLL = 260
PREDICTIONS_CSV = "predictions.csv"
DATE_OFFSET = 0                   # 0=today, +1=tomorrow, etc.
DATE_COL_IN_HISTORY = "Date"      # must exist in df_final
# ===================================================================

# ---- imports you MUST have ----
import os, joblib
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# ---- target date ----
TARGET_DATE = (datetime.now() + timedelta(days=DATE_OFFSET)).date()

# ---- safety: required dataframes/objects ----

required = {
    "df_odds": "DataFrame of today's games with columns: Home Team, Away Team, Home Odds, Away Odds",
    "df_hist_for_pred": "Historical features per game (used to build X_one)",
    "rename_map": "dict mapping training feature names (keys in feats) to model input names"
}

for name in required:
    if name not in globals():
        raise RuntimeError(f"Missing required object `{name}`: {required[name]}")


# ===== MODELS & ENSEMBLE SETUP =====
assert os.path.exists(MODEL_LOG_PATH), f"Missing model file: {MODEL_LOG_PATH}"
assert os.path.exists(MODEL_MLP_PATH), f"Missing model file: {MODEL_MLP_PATH}"
log_model = joblib.load(MODEL_LOG_PATH)
mlp_model = joblib.load(MODEL_MLP_PATH)

assert hasattr(log_model, "predict_proba"), "log_model must support predict_proba()"
assert hasattr(mlp_model, "predict_proba"), "mlp_model must support predict_proba()"

# ensemble weights (normalized)
try:
    w_log, w_mlp = ENSEMBLE_WEIGHTS
except Exception:
    w_log, w_mlp = 0.5, 0.5
ws = (w_log or 0) + (w_mlp or 0)
if ws <= 0:
    w_log, w_mlp = 0.5, 0.5
else:
    w_log, w_mlp = w_log / ws, w_mlp / ws

# ---- helpers ----
def _kelly(p_win: float, dec_odds: float) -> float:
    b = dec_odds - 1.0
    if b <= 0:
        return 0.0
    q = 1.0 - p_win
    return max(0.0, (b * p_win - q) / b)

def _assert_sorted(df_hist: pd.DataFrame, date_col: str):
    if not df_hist[date_col].is_monotonic_increasing:
        df_hist.sort_values(date_col, inplace=True, kind="mergesort")  # stable

def _latest_value_asof(df_hist: pd.DataFrame, team: str, col_if_home: str, col_if_away: str,
                       as_of_dt: pd.Timestamp, date_col: str):
    # history strictly before slate day boundary (prevent leakage)
    m = (((df_hist["Home Team"] == team) | (df_hist["Away Team"] == team))
         & (df_hist[date_col] < as_of_dt))
    if not m.any():
        raise ValueError(f"No history for {team} up to {as_of_dt}")
    last = df_hist.loc[m].iloc[-1]
    return last[col_if_home] if last["Home Team"] == team else last[col_if_away]

def build_features_for_match(home_team, away_team, df_hist, as_of_dt, date_col: str):
    """
    Construit une ligne de features pour un match Home vs Away √† la date as_of_dt,
    en utilisant les m√™mes features que le training (base_feature_cols + Home_Team/Away_Team).
    """

    _assert_sorted(df_hist, date_col)

    # Helper : derni√®re valeur dispo avant as_of_dt pour une √©quipe donn√©e
    L = lambda tm, ch, ca: _latest_value_asof(df_hist, tm, ch, ca, as_of_dt, date_col)

    # Home side
    home_played_yesterday = int(bool(L(home_team, "Home Played Yesterday", "Away Played Yesterday")))
    away_played_yesterday = int(bool(L(away_team, "Away Played Yesterday", "Home Played Yesterday")))

    feats = {
        # Game context / rest
        "Home Played Yesterday": home_played_yesterday,
        "Away Played Yesterday": away_played_yesterday,
        "Home Rest Days Since Last Game": L(home_team, "Home Rest Days Since Last Game", "Away Rest Days Since Last Game"),
        "Away Rest Days Since Last Game": L(away_team, "Away Rest Days Since Last Game", "Home Rest Days Since Last Game"),

        # Absolute form
        "Home Last 10 Wins": L(home_team, "Home Last 10 Wins", "Away Last 10 Wins"),
        "Away Last 10 Wins": L(away_team, "Home Last 10 Wins", "Away Last 10 Wins"),  # note: sym√©trique mais c√¥t√© √©quipe
        "Home Win Rate": L(home_team, "Home Win Rate", "Away Win Rate"),
        "Away Win Rate": L(away_team, "Home Win Rate", "Away Win Rate"),
        "Home Team Overall Win Streak Before Game": L(
            home_team,
            "Home Team Overall Win Streak Before Game",
            "Away Team Overall Win Streak Before Game"
        ),
        "Away Team Overall Win Streak Before Game": L(
            away_team,
            "Home Team Overall Win Streak Before Game",
            "Away Team Overall Win Streak Before Game"
        ),
        "Home Opponent Strength": L(home_team, "Home Opponent Strength", "Away Opponent Strength"),
        "Away Opponent Strength": L(away_team, "Home Opponent Strength", "Away Opponent Strength"),

        # Saison / force structurelle (snapshots)
        "Home_SRS": L(home_team, "Home_SRS", "Away_SRS"),
        "Away_SRS": L(away_team, "Home_SRS", "Away_SRS"),
        "Home_PP%": L(home_team, "Home_PP%", "Away_PP%"),
        "Away_PP%": L(away_team, "Home_PP%", "Away_PP%"),
        "Home_PK%": L(home_team, "Home_PK%", "Away_PK%"),
        "Away_PK%": L(away_team, "Home_PK%", "Away_PK%"),
        "Home_SV%": L(home_team, "Home_SV%", "Away_SV%"),
        "Away_SV%": L(away_team, "Home_SV%", "Away_SV%"),
        "Home_xGF%": L(home_team, "Home_xGF%", "Away_xGF%"),
        "Away_xGF%": L(away_team, "Home_xGF%", "Away_xGF%"),
    }

    # Map vers les noms utilis√©s au training (espaces -> "_")
    row = {rename_map.get(k, k): v for k, v in feats.items()}

    # Ajout des features cat√©gorielles (les mod√®les s'attendent √† les voir)
    row["Home_Team"] = home_team
    row["Away_Team"] = away_team

    X = pd.DataFrame([row])
    return X


# ==== PREDICT & SIZE BETS ====
as_of_dt = pd.Timestamp(TARGET_DATE)  # naive local midnight of target day

# clean team strings
for c in ["Home Team", "Away Team"]:
    if c in df_odds.columns:
        df_odds[c] = df_odds[c].astype(str).str.strip()
    if c in df_hist_for_pred.columns:
        df_hist_for_pred[c] = df_hist_for_pred[c].astype(str).str.strip()

pred_rows, pre_bets, total_pre = [], [], 0.0

for _, r in df_odds.iterrows():
    h, a = str(r["Home Team"]), str(r["Away Team"])

    # odds cleaning
    try:
        oh, oa = float(r["Home Odds"]), float(r["Away Odds"])
    except Exception:
        # skip rows with missing/invalid odds
        continue

    # build feature row
    X_one = build_features_for_match(h, a, df_hist_for_pred, as_of_dt=as_of_dt, date_col=DATE_COL_IN_HISTORY)



    # model probabilities
    p_h = (
        w_log * log_model.predict_proba(X_one)[:, 1].item()
        + w_mlp * mlp_model.predict_proba(X_one)[:, 1].item()
    )

    # Kelly stakes (unscaled)
    bh_pre = _kelly(p_h, oh) * BANKROLL
    ba_pre = _kelly(1.0 - p_h, oa) * BANKROLL
    pre_bets.append((bh_pre, ba_pre))
    total_pre += bh_pre + ba_pre

    pred_rows.append({
        "SlateDate": TARGET_DATE.isoformat(),
        "Away Team": a, "Away Odds": oa,
        "Home Team": h, "Home Odds": oh,
        "p_home_ens": round(p_h, 4)
    })

# scale proportionally if total stake > bankroll
for i, row in enumerate(pred_rows):
    bh, ba = pre_bets[i]
    if BANKROLL > 0 and total_pre > BANKROLL:
        s = BANKROLL / total_pre
        bh, ba = bh * s, ba * s
        row["Scaled"] = True
    else:
        row["Scaled"] = False
    row["Home Bet"] = int(round(bh)) if bh > 0 else 0
    row["Away Bet"] = int(round(ba)) if ba > 0 else 0

pred_df = pd.DataFrame(pred_rows)
cols = ["SlateDate","Away Team","Away Bet","Away Odds","Home Team","Home Bet","Home Odds","p_home_ens","Scaled"]
print(pred_df[cols] if set(cols).issubset(pred_df.columns) else pred_df)

# append/save
mode = "a" if os.path.exists(PREDICTIONS_CSV) else "w"
header = not os.path.exists(PREDICTIONS_CSV)
pred_df.to_csv(PREDICTIONS_CSV, index=False, mode=mode, header=header)
print(f"\nSaved {len(pred_df)} rows to {PREDICTIONS_CSV} (mode='{mode}', header={header})")



    SlateDate            Away Team  Away Bet  Away Odds            Home Team  \
0  2025-12-03         Utah Mammoth         0   1.925926        Anaheim Ducks   
1  2025-12-03       Buffalo Sabres         0   1.970874  Philadelphia Flyers   
2  2025-12-03  Washington Capitals         6   1.699301      San Jose Sharks   
3  2025-12-03         Dallas Stars         0   1.990099    New Jersey Devils   
4  2025-12-03        Winnipeg Jets         0   1.877193   Montreal Canadiens   

   Home Bet  Home Odds  p_home_ens  Scaled  
0        26   1.909091      0.5706   False  
1        74   1.877193      0.6663   False  
2         0   2.210000      0.4024   False  
3         0   1.862069      0.4976   False  
4         3   1.970874      0.5134   False  

Saved 5 rows to predictions.csv (mode='a', header=False)
