In [16]:
from pathlib import Path
import hashlib, shutil, subprocess, os
from datetime import datetime

REPO   = Path(r"C:\Users\Nic\Desktop\NHLPredictor")
DESK   = Path(r"C:\Users\Nic\Desktop")
NBNAME = "PredictNHL.ipynb"
EXTS   = {".csv", ".json", ".xlsx"}
BRANCH = "main"

# --- Force a CSV to "update" even if content is the same ---
FORCE_REFRESH_CSV = True
CSV_WHITELIST = []  # e.g. ["predictions.csv", "enhanced_game_data.csv"]; empty = all CSVs in repo

def sha(p):
    import hashlib
    h=hashlib.sha256()
    with p.open("rb") as f:
        for b in iter(lambda:f.read(1<<20), b""): h.update(b)
    return h.hexdigest()

def bump_csv_bytes(path: Path):
    """Toggle trailing newline to force a harmless byte change."""
    b = path.read_bytes()
    if b.endswith(b"\n"):
        path.write_bytes(b.rstrip(b"\n"))      # remove last newline
    else:
        path.write_bytes(b + b"\n")            # add last newline

def run(cmd):
    r = subprocess.run(cmd, cwd=str(REPO), capture_output=True, text=True)
    if r.stdout: print(r.stdout.strip())
    if r.stderr and r.returncode != 0: print(r.stderr.strip())
    return r.returncode

assert REPO.exists()
os.chdir(REPO)

# 1) Build targets: notebook + tracked csv/json/xlsx
targets = {NBNAME}
targets |= {p.name for p in REPO.iterdir() if p.is_file() and p.suffix.lower() in EXTS}

# 2) Copy Desktop -> repo when content differs
changed = []
for name in sorted(targets):
    src, dst = DESK/name, REPO/name
    if src.exists():
        if (not dst.exists()) or sha(src) != sha(dst):
            shutil.copy2(src, dst); changed.append(name)

# 3) Optionally force-refresh CSVs (toggle EOF newline)
if FORCE_REFRESH_CSV:
    csvs = [p for p in REPO.iterdir() if p.is_file() and p.suffix.lower()==".csv"]
    if CSV_WHITELIST:
        csvs = [p for p in csvs if p.name in CSV_WHITELIST]
    for p in csvs:
        # only bump if not already modified by step 2 (to avoid double-noise)
        if p.name not in changed:
            bump_csv_bytes(p); changed.append(p.name)

# 4) Commit & push (only if something actually changed)
if changed:
    run(["git","add"] + changed)
    msg = f"Auto update (nb+data) - {datetime.now():%Y-%m-%d %H:%M:%S}"
    run(["git","commit","-m", msg])
    run(["git","pull","--rebase","origin", BRANCH])
    run(["git","push","-u","origin", BRANCH])
    print("Pushed:", ", ".join(changed))
else:
    print("Nothing to update. (No content changes and force-refresh off or nothing matched.)")



[main 7c94e57] Auto update (nb+data) - 2025-10-16 06:47:26
 3 files changed, 1660 insertions(+), 1564 deletions(-)
Current branch main is up to date.
branch 'main' set up to track 'origin/main'.
Pushed: PredictNHL.ipynb, enhanced_game_data.csv, predictions.csv


In [1]:
!pip install tensorflow
!pip install xgboost
!pip install streamlit
!pip install imbalanced-learn
!pip install shap
!pip install statsmodels
!pip install lightgbm




[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import time
import json
import random
import warnings
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, TimeSeriesSplit, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, roc_auc_score, brier_score_loss, accuracy_score, log_loss, classification_report, make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif, RFECV, SelectFromModel, SelectKBest
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import joblib
from datetime import datetime
import re
from scipy.stats import loguniform, uniform
from io import StringIO
from datetime import datetime, timedelta
import unicodedata
import os, joblib

In [2]:
#Retrieve the rank of each teams

url = 'https://www.hockey-reference.com/leagues/NHL_2025.html#all_stats'

with webdriver.Chrome() as driver:
    driver.get(url)
    time.sleep(2) 
    soup = BeautifulSoup(driver.page_source, 'html.parser')

driver.quit()

table = soup.find('table', id='stats')

if table is None:
    print("Team Statistics table not found")
else:
    
    rows = table.find_all('tr')[1:] 
    teams = [
        row.find_all('td')[0].text.strip()
        for row in rows
        if row.find_all('td')
    ]
    ranks = list(range(1, len(teams) + 1)) 

    
    df_rank = pd.DataFrame({
        'Team': teams,
        'Rank': ranks,
        'Team Rank': [f"{team} {rank}" for team, rank in zip(teams, ranks)]
    })
    df_rank = df_rank[df_rank['Team'] != 'League Average'].reset_index(drop=True)
    df_rank['Team'] = df_rank['Team'].str.replace('*', '', regex=False).str.strip()

print(df_rank)

    

                     Team  Rank                 Team Rank
0           Winnipeg Jets     1          Winnipeg Jets* 1
1     Washington Capitals     2    Washington Capitals* 2
2    Vegas Golden Knights     3   Vegas Golden Knights* 3
3     Toronto Maple Leafs     4    Toronto Maple Leafs* 4
4            Dallas Stars     5           Dallas Stars* 5
5       Los Angeles Kings     6      Los Angeles Kings* 6
6     Tampa Bay Lightning     7    Tampa Bay Lightning* 7
7      Colorado Avalanche     8     Colorado Avalanche* 8
8         Edmonton Oilers     9        Edmonton Oilers* 9
9     Carolina Hurricanes    10   Carolina Hurricanes* 10
10       Florida Panthers    11      Florida Panthers* 11
11         Minnesota Wild    12        Minnesota Wild* 12
12        Ottawa Senators    13       Ottawa Senators* 13
13         Calgary Flames    14         Calgary Flames 14
14        St. Louis Blues    15       St. Louis Blues* 15
15     Montreal Canadiens    16    Montreal Canadiens* 16
16      New Je

In [3]:

UA = {"User-Agent": "Mozilla/5.0"}
to_csv = lambda u: re.sub(r"/results/(nhl-\d{4})$", r"/download/\1-UTC.csv", u)
TIMEZONE = "America/Toronto" 

def _parse_utc_datetime(series: pd.Series) -> pd.Series:
    """
    Parse the CSV 'Date' column (which is UTC text) into tz-aware UTC timestamps.
    Tries multiple formats and both day-first and month-first just in case.
    """
    s = series.astype(str)

    # 1) Try day-first (site is typically DD/MM/YYYY HH:MM[:SS]) 
    dt = pd.to_datetime(s, errors="coerce", utc=True, dayfirst=True)

    # 2) For anything still NaT, try month-first (fallback for odd rows)
    m = dt.isna()
    if m.any():
        dt.loc[m] = pd.to_datetime(s[m], errors="coerce", utc=True, dayfirst=False)

    return dt

def load(u: str) -> pd.DataFrame:
    url = to_csv(u)  
    try:
        df = pd.read_csv(url)
    except Exception:
        r = requests.get(url, headers=UA, timeout=20); r.raise_for_status()
        df = pd.read_csv(StringIO(r.text))

    df.columns = [c.strip() for c in df.columns]
    lc = {c.lower(): c for c in df.columns}

    date_col = lc.get("date") or next(c for c in df.columns if "date" in c.lower())
    home_col = lc.get("home team") or lc.get("home") or next(c for c in df.columns if "home" in c.lower())
    away_col = lc.get("away team") or lc.get("away") or next(c for c in df.columns if "away" in c.lower())

    out = pd.DataFrame({
        "Date_str": df[date_col].astype(str),
        "Home Team": df[home_col].astype(str),
        "Away Team": df[away_col].astype(str),
    })

    # --- scores/result mapping (unchanged idea) ---
    if {"Home Score","Away Score"}.issubset(df.columns):
        out["Home Score"] = pd.to_numeric(df["Home Score"], errors="coerce")
        out["Away Score"] = pd.to_numeric(df["Away Score"], errors="coerce")
    elif "Score" in df.columns:
        s = (df["Score"].astype(str)
             .str.replace("\u2013","-",regex=False)
             .str.replace("\u2212","-",regex=False))
        g = s.str.extract(r"(\d+)\s*-\s*(\d+)")
        out["Home Score"] = pd.to_numeric(g[0], errors="coerce")
        out["Away Score"] = pd.to_numeric(g[1], errors="coerce")
    else:
        out["Home Score"] = pd.NA; out["Away Score"] = pd.NA

    out["Result"] = df[lc["result"]] if "result" in lc else ""

    need = out["Home Score"].isna() | out["Away Score"].isna()
    if need.any():
        rnum = out.loc[need, "Result"].astype(str).str.extract(r"(\d+)\s*-\s*(\d+)")
        out.loc[need, "Home Score"] = pd.to_numeric(rnum[0], errors="coerce")
        out.loc[need, "Away Score"] = pd.to_numeric(rnum[1], errors="coerce")

    # --- team name cleanup (keep accents; remove weird chars/spaces) ---
    for c in ("Home Team","Away Team"):
        out[c] = (out[c].str.replace(r"[^A-Za-zÀ-ÿ .'\-]", "", regex=True)
                        .str.replace(r"\s{2,}", " ", regex=True)
                        .str.strip())

    # === CRITICAL: keep UTC and derive Local ===
    out["Date_UTC"]   = _parse_utc_datetime(out["Date_str"])
    out["Date_Local"] = out["Date_UTC"].dt.tz_convert(TIMEZONE)
    out["LocalDate"]  = out["Date_Local"].dt.date

    # sort/dedupe on UTC (stable total order)
    out = (out.dropna(subset=["Date_UTC"])
              .sort_values("Date_UTC")
              .drop_duplicates(["Date_UTC","Home Team","Away Team"], keep="last")
              .reset_index(drop=True))

    return out[[
        "Date_UTC","Date_Local","LocalDate",
        "Home Team","Away Team","Home Score","Away Score","Result"
    ]]

# --- load seasons, merge, save ---
df_2024 = load("https://fixturedownload.com/results/nhl-2024"); df_2024["Season"], df_2024["weight"] = "2024-2025", 1.0
df_2025 = load("https://fixturedownload.com/results/nhl-2025"); df_2025["Season"], df_2025["weight"] = "2025-2026", 2.0

df_all = pd.concat([df_2024, df_2025], ignore_index=True)
df_all["Played"] = df_all["Home Score"].notna() & df_all["Away Score"].notna()
print(f"Total rows: {len(df_all)} | Played: {int(df_all['Played'].sum())}")

df_all.to_csv("nhl_results_2024_2026_weighted.csv", index=False)


Total rows: 2624 | Played: 1383


In [4]:
# Normalize both data sources
team_name_mapping = {
    "Montréal Canadiens": "Montreal Canadiens"
}

df_all["Home Team"] = df_all["Home Team"].replace(team_name_mapping)
df_all["Away Team"] = df_all["Away Team"].replace(team_name_mapping)


In [5]:
# --- Normalize team names to avoid silent mismatches ---
for c in ["Home Team", "Away Team"]:
    df_all[c] = df_all[c].astype(str).str.strip()

df_rank = df_rank.copy()
df_rank["Team"] = df_rank["Team"].astype(str).str.strip()
df_rank["Rank"] = pd.to_numeric(df_rank["Rank"], errors="coerce")

# --- Build minimal rank frames (only needed columns!) ---
rank_home = (df_rank.loc[:, ["Team", "Rank"]]
                     .rename(columns={"Team": "Home Team", "Rank": "Home Team Rank"}))

rank_away = (df_rank.loc[:, ["Team", "Rank"]]
                     .rename(columns={"Team": "Away Team", "Rank": "Away Team Rank"}))

# --- Drop any stale columns from previous merges to avoid collisions ---
for col in ["Home Team Rank", "Away Team Rank", "Rank", "Team", "Team Rank_x", "Team Rank_y"]:
    if col in df_all.columns:
        df_all = df_all.drop(columns=col)

# --- Merge one side at a time (no suffixes needed) ---
df_all = df_all.merge(rank_home, on="Home Team", how="left")
df_all = df_all.merge(rank_away, on="Away Team", how="left")

# --- Home Win indicator (safe when scores are present) ---
df_all["Home Win"] = (
    df_all["Home Score"].notna() &
    df_all["Away Score"].notna() &
    (df_all["Home Score"] > df_all["Away Score"])
).astype(int)

# --- If you want a naive local datetime "Date" for downstream code ---
if "Date_Local" in df_all.columns:
    df_all["Date"] = df_all["Date_Local"].dt.tz_localize(None)

# --- Pick your final view ---
final_columns = [
    "LocalDate", "Date_Local", "Date",  # keep all three if useful
    "Home Team", "Home Score", "Away Team", "Away Score",
    "Home Team Rank", "Away Team Rank", "Home Win"
]
df_final = df_all.loc[:, [c for c in final_columns if c in df_all.columns]].copy()

# --- Sanity checks ---
for col in ["Home Team Rank", "Away Team Rank"]:
    miss = df_final[col].isna().sum()
    if miss:
        print(f"⚠️ Missing {miss} ranks in {col} (check team name aliases).")


⚠️ Missing 41 ranks in Home Team Rank (check team name aliases).
⚠️ Missing 41 ranks in Away Team Rank (check team name aliases).


In [6]:
#DATA MANIPULATION


# Filter out the Unplayed Games
df_final = df_final.dropna(subset=["Home Score", "Away Score"])

# Convert Scores and Dates to Appropriate Data Types
df_final["Date"] = pd.to_datetime(df_final["Date"], format="%d/%m/%Y")
df_final[["Home Score", "Away Score"]] = df_final[["Home Score", "Away Score"]].astype(int)
df_final["Home Win"] = df_final["Home Win"].astype(bool)

# Sort Data by Date for Chronological Calculations
df_final = df_final.sort_values(by="Date").reset_index(drop=True)

# Function: Calculate Last 10 Games Stats
def calculate_last_10_stats(df, team_column):
    last_10_wins = []
    team_games = {team: [] for team in pd.concat([df["Home Team"], df["Away Team"]]).unique()}

    for _, row in df.iterrows():
        team = row[team_column]
        recent_games = team_games[team][-10:]  # Last 10 games
        last_10_wins.append(sum(recent_games))
        team_games[row["Home Team"]].append(row["Home Win"])
        team_games[row["Away Team"]].append(not row["Home Win"])
    return last_10_wins

# Add Last 10 Wins for Home and Away Teams
df_final["Home Last 10 Wins"] = calculate_last_10_stats(df_final, "Home Team")
df_final["Away Last 10 Wins"] = calculate_last_10_stats(df_final, "Away Team")

# Add Whether Teams Played Yesterday
def calculate_played_yesterday(df, team_column):
    played_yesterday = []
    last_game_date = {}

    for _, row in df.iterrows():
        team = row[team_column]
        played_yesterday.append(last_game_date.get(team) == row["Date"] - pd.Timedelta(days=1))
        last_game_date[team] = row["Date"]
    return played_yesterday

df_final["Home Played Yesterday"] = calculate_played_yesterday(df_final, "Home Team")
df_final["Away Played Yesterday"] = calculate_played_yesterday(df_final, "Away Team")

# Add Win Rate for Home and Away Teams
def calculate_win_rate(df, team_column, is_home_column):
    win_rate = []
    team_stats = {team: {"wins": 0, "games": 0} for team in pd.concat([df["Home Team"], df["Away Team"]]).unique()}

    for _, row in df.iterrows():
        team = row[team_column]
        stats = team_stats[team]
        win_rate.append(stats["wins"] / stats["games"] if stats["games"] > 0 else 0)
        
        
        if is_home_column:
            stats["wins"] += row["Home Win"]
        else:
            stats["wins"] += not row["Home Win"]
        stats["games"] += 1
    return win_rate

df_final["Home Win Rate"] = calculate_win_rate(df_final, "Home Team", is_home_column=True)
df_final["Away Win Rate"] = calculate_win_rate(df_final, "Away Team", is_home_column=False)

# Add Overall Win Streak for Home and Away Teams
def calculate_overall_win_streak(df):
    streak = {}
    home_streaks = []
    away_streaks = []

    for _, row in df.iterrows():
        home_team = row["Home Team"]
        away_team = row["Away Team"]

        
        home_streaks.append(streak.get(home_team, 0))
        away_streaks.append(streak.get(away_team, 0))

        
        if row["Home Win"]:
            streak[home_team] = streak.get(home_team, 0) + 1
            streak[away_team] = 0
        else:
            streak[home_team] = 0
            streak[away_team] = streak.get(away_team, 0) + 1
    return home_streaks, away_streaks

home_streaks, away_streaks = calculate_overall_win_streak(df_final)
df_final["Home Team Overall Win Streak Before Game"] = home_streaks
df_final["Away Team Overall Win Streak Before Game"] = away_streaks

# Add Average Opponent Strength for Home and Away Teams
def calculate_avg_opponent_rank(df, team_column, opponent_rank_column):
    avg_opponent_rank = []
    opponent_stats = {team: [] for team in pd.concat([df["Home Team"], df["Away Team"]]).unique()}

    for _, row in df.iterrows():
        team = row[team_column]
        avg_opponent_rank.append(
            np.mean(opponent_stats[team][-10:]) if opponent_stats[team] else np.nan
        )
        opponent_stats[row["Home Team"]].append(row["Away Team Rank"])
        opponent_stats[row["Away Team"]].append(row["Home Team Rank"])
    return avg_opponent_rank

df_final["Home Opponent Strength"] = calculate_avg_opponent_rank(df_final, "Home Team", "Away Team Rank")
df_final["Away Opponent Strength"] = calculate_avg_opponent_rank(df_final, "Away Team", "Home Team Rank")

OFFSEASON_GAP_DAYS = 45  

def calculate_days_since_last_game(df, gap=OFFSEASON_GAP_DAYS):
    last_game_date = {}
    home_rest, away_rest = [], []

    for _, row in df.iterrows():
        d = row["Date"]
        h = row["Home Team"]; a = row["Away Team"]

        # home
        prev = last_game_date.get(h)
        if prev is None:
            home_rest.append(None)
        else:
            delta = (d - prev).days
            home_rest.append(delta if delta <= gap else None)

        # away
        prev = last_game_date.get(a)
        if prev is None:
            away_rest.append(None)
        else:
            delta = (d - prev).days
            away_rest.append(delta if delta <= gap else None)

        # update last seen dates
        last_game_date[h] = d
        last_game_date[a] = d

    return home_rest, away_rest

# Apply
home_days, away_days = calculate_days_since_last_game(df_final)
df_final["Home Rest Days Since Last Game"] = home_days
df_final["Away Rest Days Since Last Game"] = away_days

print(df_final[["Date", "Home Team", "Away Team", "Home Opponent Strength", "Away Opponent Strength","Home Last 10 Wins","Away Last 10 Wins","Home Played Yesterday","Away Played Yesterday","Home Win Rate","Away Win Rate","Home Team Overall Win Streak Before Game","Away Team Overall Win Streak Before Game","Away Rest Days Since Last Game","Home Rest Days Since Last Game"]].tail(10))


df_final.to_csv("enhanced_game_data.csv", index=False)


                    Date             Home Team            Away Team  \
1373 2025-10-16 19:00:00   Philadelphia Flyers        Winnipeg Jets   
1374 2025-10-16 19:00:00     New Jersey Devils     Florida Panthers   
1375 2025-10-16 19:00:00   Toronto Maple Leafs     New York Rangers   
1376 2025-10-16 19:00:00    Montreal Canadiens  Nashville Predators   
1377 2025-10-16 19:00:00       Ottawa Senators       Seattle Kraken   
1378 2025-10-16 19:30:00    New York Islanders      Edmonton Oilers   
1379 2025-10-16 20:00:00          Dallas Stars    Vancouver Canucks   
1380 2025-10-16 22:00:00         Anaheim Ducks  Carolina Hurricanes   
1381 2025-10-16 22:00:00  Vegas Golden Knights        Boston Bruins   
1382 2025-10-16 22:00:00     Los Angeles Kings  Pittsburgh Penguins   

      Home Opponent Strength  Away Opponent Strength  Home Last 10 Wins  \
1373                    18.1                    14.2                  4   
1374                    19.2                    20.3                

In [7]:
df_final["Rank Difference"] = df_final["Home Team Rank"] - df_final["Away Team Rank"]
df_final["Home Advantage"] = df_final["Home Win Rate"] - df_final["Away Win Rate"]
df_final["Win Streak Impact"] = df_final["Home Team Overall Win Streak Before Game"] - df_final["Away Team Overall Win Streak Before Game"]
df_final["Opponent Strength"] = df_final["Home Opponent Strength"] - df_final["Away Opponent Strength"]
df_final["Last 10 Wins"] = df_final["Home Last 10 Wins"] - df_final["Away Last 10 Wins"]
df_final["Opponent Strength"] = df_final["Home Opponent Strength"] - df_final["Away Opponent Strength"]
df_final = df_final.dropna().reset_index(drop=True)

results1 = df_final

In [8]:

scorer = make_scorer(f1_score, pos_label=None, average='weighted')

y_true = df_final["Home Win"].astype(int).values

y_pred = np.ones_like(y_true)                   
y_proba = np.ones_like(y_true, dtype=float)     

f1 = f1_score(y_true, y_pred, average="weighted")


try:
    roc = roc_auc_score(y_true, y_proba)
except ValueError:
    roc = float("nan")  # undefined for constant scores


brier = brier_score_loss(y_true, y_proba)

print(f"Baseline (Always Home) — F1: {f1:.4f}, ROC AUC: {roc}, Brier: {brier:.4f}")
y_pred_away = np.zeros_like(y_true)
y_proba_away = np.zeros_like(y_true, dtype=float)

f1_away = f1_score(y_true, y_pred_away, average="weighted")
try:
    roc_away = roc_auc_score(y_true, y_proba_away)
except ValueError:
    roc_away = float("nan")
brier_away = brier_score_loss(y_true, y_proba_away)

print(f"Baseline (Always Away) — F1: {f1_away:.4f}, ROC AUC: {roc_away}, Brier: {brier_away:.4f}")

Baseline (Always Home) — F1: 0.4078, ROC AUC: 0.5, Brier: 0.4351
Baseline (Always Away) — F1: 0.2638, ROC AUC: 0.5, Brier: 0.5649


In [9]:
df_sorted = df_final.sort_values("Date").reset_index(drop=True)

feature_cols = [
    "Opponent Strength","Rank Difference","Last 10 Wins",
    "Home Played Yesterday","Away Played Yesterday",
    "Home Advantage","Win Streak Impact",
    "Away Rest Days Since Last Game","Home Rest Days Since Last Game"
]
target_col = "Home Win"

X_all = df_sorted[feature_cols]
y_all = df_sorted[target_col].astype(int)

split_idx = int(0.8 * len(df_sorted))
X_train, X_test = X_all.iloc[:split_idx], X_all.iloc[split_idx:]
y_train, y_test = y_all.iloc[:split_idx], y_all.iloc[split_idx:]


In [10]:

def compute_vif(df):
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    X = sm.add_constant(df)
    vif = pd.Series(
        [variance_inflation_factor(X.values, i) for i in range(1, X.shape[1])],
        index=df.columns
    )
    return vif.sort_values(ascending=False)

vif_series = compute_vif(X_train.select_dtypes(include=[np.number]))
print("VIF:\n", vif_series)



VIF:
 Home Rest Days Since Last Game    2.083885
Away Rest Days Since Last Game    2.082895
Last 10 Wins                      1.769676
Rank Difference                   1.539153
Home Advantage                    1.435472
Win Streak Impact                 1.297874
Opponent Strength                 1.034779
dtype: float64


In [11]:
# 0) Features & split
feature_cols = [
    "Opponent Strength","Rank Difference","Last 10 Wins",
    "Home Played Yesterday","Away Played Yesterday",
    "Home Advantage","Win Streak Impact",
    "Away Rest Days Since Last Game","Home Rest Days Since Last Game"
]
target_col = "Home Win"

df = df_final.sort_values("Date").reset_index(drop=True)
rename_map = {c: c.replace(" ", "_") for c in feature_cols}
df = df.rename(columns=rename_map)

feat = [rename_map[c] for c in feature_cols]
X, y = df[feat], df[target_col].astype(int)

split = int(0.8 * len(df))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

cv5 = StratifiedKFold(n_splits=5, shuffle=False)

# 1) Name-preserving selectors
class NamePreservingSelectFromModel(TransformerMixin, BaseEstimator):
    def __init__(self, estimator, threshold="median"):
        self.estimator = estimator
        self.threshold = threshold
        self.selected_cols_ = None
    def fit(self, X, y=None):
        if not hasattr(X, "columns"): raise TypeError("Need pandas DataFrame.")
        est = clone(self.estimator).fit(X, y)
        sfm = SelectFromModel(est, threshold=self.threshold, prefit=True)
        self.selected_cols_ = list(X.columns[sfm.get_support()])
        return self
    def transform(self, X): return X[self.selected_cols_]

class NamePreservingSelectKBest(TransformerMixin, BaseEstimator):
    def __init__(self, score_func=mutual_info_classif, k=5):
        self.score_func = score_func; self.k = k; self.selected_cols_ = None
    def fit(self, X, y=None):
        if not hasattr(X, "columns"): raise TypeError("Need pandas DataFrame.")
        skb = SelectKBest(self.score_func, k=self.k).fit(X, y)
        self.selected_cols_ = list(X.columns[skb.get_support()])
        return self
    def transform(self, X): return X[self.selected_cols_]

class NamePreservingRFECV(TransformerMixin, BaseEstimator):
    def __init__(self, estimator, step=1, cv=None, scoring="neg_brier_score", min_features_to_select=1):
        self.estimator = estimator; self.step = step; self.cv = cv
        self.scoring = scoring; self.min_features_to_select = min_features_to_select
        self.selected_cols_ = None
    def fit(self, X, y=None):
        if not hasattr(X, "columns"): raise TypeError("Need pandas DataFrame.")
        r = RFECV(self.estimator, step=self.step, cv=self.cv, scoring=self.scoring,
                  min_features_to_select=self.min_features_to_select).fit(X, y)
        self.selected_cols_ = list(X.columns[r.support_]); return self
    def transform(self, X): return X[self.selected_cols_]

# 2) Pipelines
pipe_logistic = Pipeline([
    ("rfe", NamePreservingRFECV(LogisticRegression(max_iter=1000), step=1, cv=cv5, scoring="neg_brier_score")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])

pipe_mlp = Pipeline([
    ("skb", NamePreservingSelectKBest(k=max(3, len(feat)//2))),
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(max_iter=500, random_state=42))
])

pipe_lgbm = Pipeline([
    ("sfm", NamePreservingSelectFromModel(LGBMClassifier(random_state=42), threshold="median")),
    ("clf", LGBMClassifier(random_state=42))
])

pipe_rf = Pipeline([
    ("sfm", NamePreservingSelectFromModel(RandomForestClassifier(n_estimators=400, random_state=42), threshold="median")),
    ("clf", RandomForestClassifier(n_estimators=400, random_state=42))
])

candidates = {
    "Logistic": pipe_logistic,
    "MLP": pipe_mlp,
    "RandomForest": pipe_rf,
    "LightGBM": pipe_lgbm,
}

# 3) Fit / evaluate / save
def evaluate(model, X_te, y_te):
    y_hat = model.predict(X_te)
    p = model.predict_proba(X_te)[:, 1]
    return {"F1": f1_score(y_te, y_hat, average="weighted"),
            "ROC_AUC": roc_auc_score(y_te, p),
            "Brier": brier_score_loss(y_te, p)}

results, selected = [], {}

for name, model in candidates.items():
    model.fit(X_train, y_train)
    results.append({"Model": name, **evaluate(model, X_test, y_test)})
    # grab selected cols from whichever selector exists
    sel_cols = None
    for key in ("rfe", "skb", "sfm"):
        if key in model.named_steps and hasattr(model.named_steps[key], "selected_cols_"):
            sel_cols = model.named_steps[key].selected_cols_
    selected[name] = sel_cols
    joblib.dump(model, f"model_{name}.joblib")

with open("selected_features.json", "w") as f:
    json.dump({"rename_map": rename_map, "selected_features": selected}, f, indent=2)

res = (pd.DataFrame(results)
         .sort_values(["Brier","ROC_AUC","F1"], ascending=[True, False, False])
         .reset_index(drop=True))
print("\n=== Models ===\n", res[["Model","F1","ROC_AUC","Brier"]])
print("\nSelected features:\n", selected)

# 4) Best by Brier; calibrate if not Logistic
best = res.iloc[0]["Model"]
pipe = joblib.load(f"model_{best}.joblib")

final_model = pipe if best == "Logistic" else CalibratedClassifierCV(pipe, method="isotonic", cv=5).fit(X_train, y_train)
joblib.dump(final_model, f"model_{best}_CALIBRATED.joblib")

print("\nBest:", best)
print("Final metrics:", evaluate(final_model, X_test, y_test))

# 5) Load & predict helper
def load_model_and_predict(model_name: str, X_new: pd.DataFrame) -> np.ndarray:
    with open("selected_features.json", "r") as f:
        meta = json.load(f)
    Xn = X_new.rename(columns={k: v for k, v in meta["rename_map"].items() if k in X_new.columns})
    path = f"model_{model_name}_CALIBRATED.joblib"
    if not os.path.exists(path): path = f"model_{model_name}.joblib"
    mdl = joblib.load(path)
    return mdl.predict_proba(Xn)[:, 1]


[LightGBM] [Info] Number of positive: 596, number of negative: 470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 1066, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.559099 -> initscore=0.237508
[LightGBM] [Info] Start training from score 0.237508
[LightGBM] [Info] Number of positive: 596, number of negative: 470
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 600
[LightGBM] [Info] Number of data points in the train set: 1066, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.559099 -> initscore=0.237508
[LightGBM] [Info] S

In [12]:
# ==== CONFIG ====
SEED=42
CV_SPLITS=6
N_ITER_LOG_L2=40
N_ITER_LOG_EN=40
N_ITER_MLP=200              
RFE_MIN_FEATS=1
MLP_MAX_ITER=2000           
MLP_EARLY_STOP=True
MLP_VAL_FRAC=0.12           # ↓ slightly smaller val set
MLP_PATIENCE=20             # ↑ give early stopping time to work
TEST_RATIO=0.2

FEATURE_COLS=[
    "Opponent Strength","Rank Difference","Last 10 Wins",
    "Home Played Yesterday","Away Played Yesterday",
    "Home Advantage","Win Streak Impact",
    "Away Rest Days Since Last Game","Home Rest Days Since Last Game"
]
TARGET_COL="Home Win"

# seeds
random.seed(SEED); np.random.seed(SEED)

# ==== DATA (time-ordered split) ====
df = df_final.sort_values("Date").reset_index(drop=True)
rename_map = {c: c.replace(" ","_") for c in FEATURE_COLS}
df = df.rename(columns=rename_map)
feat = [rename_map[c] for c in FEATURE_COLS]
X, y = df[feat], df[TARGET_COL].astype(int)

cut = int((1-TEST_RATIO)*len(df))
X_tr, X_te = X.iloc[:cut], X.iloc[cut:]
y_tr, y_te = y.iloc[:cut], y.iloc[cut:]

# --- IMPORTANT: time-series CV with a small gap to reduce leakage ---
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=CV_SPLITS, gap=3)

# ==== HELPERS ====
def evaluate(m, Xt, yt):
    p = m.predict_proba(Xt)[:,1]; yhat = (p>=0.5).astype(int)
    return {"F1": f1_score(yt,yhat,average="weighted"),
            "ROC_AUC": roc_auc_score(yt,p),
            "Brier": brier_score_loss(yt,p)}

# name-preserving selectors
class NPSFM(TransformerMixin,BaseEstimator):
    def __init__(self,estimator,threshold="median"):
        self.estimator=estimator; self.threshold=threshold; self.selected_cols_=None
    def fit(self,X,y=None):
        if not hasattr(X,"columns"): raise TypeError("Need pandas DataFrame.")
        sfm=SelectFromModel(clone(self.estimator).fit(X,y),threshold=self.threshold,prefit=True)
        self.selected_cols_=list(X.columns[sfm.get_support()]); return self
    def transform(self,X): return X[self.selected_cols_]

class NPSKB(TransformerMixin,BaseEstimator):
    def __init__(self,score_func=mutual_info_classif,k="all"):   # default: keep all
        self.score_func=score_func; self.k=k; self.selected_cols_=None
    def fit(self,X,y=None):
        if not hasattr(X,"columns"): raise TypeError("Need pandas DataFrame.")
        skb=SelectKBest(self.score_func,k=self.k).fit(X,y)
        self.selected_cols_=list(X.columns[skb.get_support()]); return self
    def transform(self,X): return X[self.selected_cols_]

class NPRFECV(TransformerMixin,BaseEstimator):
    def __init__(self,estimator,step=1,cv=None,scoring="neg_brier_score",min_features_to_select=1):
        self.estimator=estimator; self.step=step; self.cv=cv; self.scoring=scoring; self.min_features_to_select=min_features_to_select
        self.selected_cols_=None
    def fit(self,X,y=None):
        if not hasattr(X,"columns"): raise TypeError("Need pandas DataFrame.")
        r=RFECV(self.estimator,step=self.step,cv=self.cv,scoring=self.scoring,
                min_features_to_select=self.min_features_to_select).fit(X,y)
        self.selected_cols_=list(X.columns[r.support_]); return self
    def transform(self,X): return X[self.selected_cols_]

# ==== PIPELINES ====
from sklearn.preprocessing import RobustScaler  # more robust than StandardScaler for MLP
pipe_log_l2 = Pipeline([
    ("rfe",NPRFECV(LogisticRegression(max_iter=2000,solver="lbfgs"),
                   step=1,cv=tscv,scoring="neg_brier_score",min_features_to_select=RFE_MIN_FEATS)),
    ("scaler",StandardScaler()),
    ("clf",LogisticRegression(max_iter=2000,solver="lbfgs"))
])

pipe_log_en = Pipeline([
    ("rfe",NPRFECV(LogisticRegression(max_iter=2000,solver="saga",penalty="elasticnet",l1_ratio=0.5,random_state=SEED),
                   step=1,cv=tscv,scoring="neg_brier_score",min_features_to_select=RFE_MIN_FEATS)),
    ("scaler",StandardScaler()),
    ("clf",LogisticRegression(max_iter=2000,solver="saga",penalty="elasticnet",random_state=SEED))
])

pipe_mlp = Pipeline([
    ("skb",NPSKB(k="all")),                   # allow 'all' features as an option
    ("scaler",RobustScaler()),                # swap to RobustScaler for outlier resistance
    ("clf",MLPClassifier(max_iter=MLP_MAX_ITER,random_state=SEED,
                         early_stopping=MLP_EARLY_STOP,n_iter_no_change=MLP_PATIENCE,
                         validation_fraction=MLP_VAL_FRAC,learning_rate="adaptive"))
])

# ==== SEARCH SPACES ====
param_log_l2={"clf__C":loguniform(1e-3,1e1),"clf__class_weight":[None,"balanced"]}
param_log_en={"clf__C":loguniform(1e-3,1e1),"clf__l1_ratio":uniform(0,1),"clf__class_weight":[None,"balanced"]}

# broadened MLP space; SGD options are ignored automatically if solver='adam'
param_mlp={
    "skb__k":["all", max(4,len(feat)//2), max(6,len(feat)//2+2)],
    "clf__solver":["adam","sgd"],
    "clf__hidden_layer_sizes":[(64,),(128,),(256,),(64,32),(128,64)],
    "clf__activation":["relu","tanh"],
    "clf__alpha":loguniform(1e-6,1e-1),
    "clf__learning_rate_init":loguniform(5e-4,1e-2),
    "clf__batch_size":[32,64,128],
    "clf__learning_rate":["adaptive","invscaling"],  # sgd only
    "clf__momentum":uniform(0.6,0.39),               # sgd only, 0.6–0.99
    "clf__nesterovs_momentum":[True,False],          # sgd only
}

scoring={"brier":"neg_brier_score","roc_auc":"roc_auc","f1w":"f1_weighted"}

# ==== TUNE + TRAIN ====
rs_log_l2=RandomizedSearchCV(pipe_log_l2,param_log_l2,n_iter=N_ITER_LOG_L2,scoring=scoring,
                             refit="brier",cv=tscv,n_jobs=-1,verbose=1,random_state=SEED).fit(X_tr,y_tr)
rs_log_en=RandomizedSearchCV(pipe_log_en,param_log_en,n_iter=N_ITER_LOG_EN,scoring=scoring,
                             refit="brier",cv=tscv,n_jobs=-1,verbose=1,random_state=SEED).fit(X_tr,y_tr)
rs_mlp   =RandomizedSearchCV(pipe_mlp,   param_mlp,   n_iter=N_ITER_MLP,   scoring=scoring,
                             refit="brier",cv=tscv,n_jobs=-1,verbose=1,random_state=SEED).fit(X_tr,y_tr)

best_log_l2, best_log_en, best_mlp = rs_log_l2.best_estimator_, rs_log_en.best_estimator_, rs_mlp.best_estimator_

# ==== HOLDOUT ====
m_l2, m_en, m_mlp = evaluate(best_log_l2,X_te,y_te), evaluate(best_log_en,X_te,y_te), evaluate(best_mlp,X_te,y_te)
print("L2:",m_l2,"\nEN:",m_en,"\nMLP:",m_mlp)

# choose logistic winner; calibrate MLP if it helps Brier
log_final, log_metrics = (best_log_l2,m_l2) if m_l2["Brier"]<=m_en["Brier"] else (best_log_en,m_en)

def calibrate_if_better(model,Xtr,ytr,Xte,yte):
    base=evaluate(model,Xte,yte)
    cal=CalibratedClassifierCV(model,method="isotonic",cv=5).fit(Xtr,ytr)
    calm=evaluate(cal,Xte,yte)
    return (cal,calm) if calm["Brier"]<=base["Brier"] else (model,base)

mlp_final, mlp_metrics = calibrate_if_better(best_mlp,X_tr,y_tr,X_te,y_te)
print("Final Logistic:",log_metrics,"\nFinal MLP:",mlp_metrics)

# ==== SAVE MODELS ====
joblib.dump(log_final,"model_Logistic_TUNED.joblib")
joblib.dump(mlp_final,"model_MLP_TUNED.joblib")

# ==== SELECTED FEATURES (SAFE WITH CALIBRATION WRAPPER) ====
def _unwrap_pipeline(model):
    from sklearn.pipeline import Pipeline
    from sklearn.calibration import CalibratedClassifierCV
    if isinstance(model, Pipeline):
        return model
    if isinstance(model, CalibratedClassifierCV):
        be = getattr(model, "base_estimator", None)
        if isinstance(be, Pipeline):
            return be
        ccs = getattr(model, "calibrated_classifiers_", None)
        if ccs and hasattr(ccs[0], "estimator") and isinstance(ccs[0].estimator, Pipeline):
            return ccs[0].estimator
    return None

def selected_cols_from(model, default=None):
    pipe = _unwrap_pipeline(model)
    if pipe is None:
        return default
    for k in ("rfe","skb","sfm"):
        step = pipe.named_steps.get(k)
        if step is not None and hasattr(step,"selected_cols_"):
            return step.selected_cols_
    return default

with open("selected_features.json","w") as f:
    json.dump({
        "rename_map": rename_map,
        "selected_features": {
            "Logistic": selected_cols_from(log_final, default=feat),
            "MLP":      selected_cols_from(mlp_final,  default=feat)
        }
    }, f, indent=2)

# ==== SIMPLE ENSEMBLE (avg probs) ====
p_log = log_final.predict_proba(X_te)[:,1]
p_mlp = mlp_final.predict_proba(X_te)[:,1]
p_avg = 0.5*p_log + 0.5*p_mlp
ens = {"F1": f1_score(y_te,(p_avg>=0.5).astype(int),average="weighted"),
       "ROC_AUC": roc_auc_score(y_te,p_avg),
       "Brier": brier_score_loss(y_te,p_avg)}
print("Ensemble:", ens)


Fitting 6 folds for each of 40 candidates, totalling 240 fits
Fitting 6 folds for each of 40 candidates, totalling 240 fits
Fitting 6 folds for each of 200 candidates, totalling 1200 fits
L2: {'F1': 0.6241007029448048, 'ROC_AUC': np.float64(0.6679212507237985), 'Brier': np.float64(0.22432383817675844)} 
EN: {'F1': 0.6465678619030885, 'ROC_AUC': np.float64(0.6669658367110597), 'Brier': np.float64(0.22448722112109268)} 
MLP: {'F1': 0.6023792646871187, 'ROC_AUC': np.float64(0.6565141864504922), 'Brier': np.float64(0.23155605431026416)}
Final Logistic: {'F1': 0.6241007029448048, 'ROC_AUC': np.float64(0.6679212507237985), 'Brier': np.float64(0.22432383817675844)} 
Final MLP: {'F1': 0.6268117881919214, 'ROC_AUC': np.float64(0.6540532715691951), 'Brier': np.float64(0.22838572051485992)}
Ensemble: {'F1': 0.6301033653354774, 'ROC_AUC': np.float64(0.6611464968152866), 'Brier': np.float64(0.22570816025375073)}


In [13]:

DATE_OFFSET = +1
TARGET_DATE = (datetime.now() + timedelta(days=DATE_OFFSET)).date()
print(f"📅 Target slate date set to: {TARGET_DATE}")

CSV_PATH = "odds.csv"

def _strip_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFKD", str(s)) if not unicodedata.combining(c))

def _american_to_decimal(a):
    try: a = int(a)
    except Exception: return None
    if a == 0 or abs(a) < 100 or a == -100: return None
    return 1.0 + (a/100.0) if a > 0 else 1.0 + (100.0/abs(a))

def _parse_odds(s: str):
    if s is None: return (None, None, None)
    s = s.strip().lower().replace(" ", "")
    if s in {"s","skip"}:  return (None, None, "skip")
    if s in {"q","quit"}:  return (None, None, "quit")
    if "." in s or "," in s:   # accept decimal as a convenience
        try:
            dec = float(s.replace(",", "."))
            return (None, dec if dec > 1.01 else None, None)
        except Exception:
            return (None, None, None)
    m = re.fullmatch(r"([+-]?)(\d{2,4})", s)
    if not m: return (None, None, None)
    sign, num = m.groups(); num = int(num)
    american = -num if sign == "-" else +num
    if american == 0 or abs(american) < 100 or american == -100: return (None, None, None)
    return (american, _american_to_decimal(american), None)

def _mk_template(df_games, target_date):
    g = df_games.copy()
    g["Date"] = pd.to_datetime(g["Date"], errors="coerce")
    g = g[g["Date"].dt.date == target_date]
    out = (g.drop_duplicates(subset=["Date","Home Team","Away Team"])
             .loc[:, ["Date","Away Team","Home Team"]]
             .assign(**{"Away American Odds": None, "Away Odds": None,
                        "Home American Odds": None, "Home Odds": None})
             .loc[:, ["Date","Away Team","Away American Odds","Away Odds",
                      "Home Team","Home American Odds","Home Odds"]]
             .reset_index(drop=True))
    out["Away Team"] = out["Away Team"].map(_strip_accents)
    out["Home Team"] = out["Home Team"].map(_strip_accents)
    return out

def _slate_counts_nearby(df_games, center_date, span=3):
    g = df_games.copy()
    g["Date"] = pd.to_datetime(g["Date"], errors="coerce")
    counts = []
    for delta in range(-1, span+1):
        d = center_date + timedelta(days=delta)
        n = int((g["Date"].dt.date == d).sum())
        counts.append((d, n))
    return counts

def enter_american_odds(df_games, date_str=None):
    target = (pd.to_datetime(date_str).date() if date_str else datetime.now().date())
    tmpl = _mk_template(df_games, target)

    print(f"\nTarget date: {target} | games found: {len(tmpl)}")
    nearby = _slate_counts_nearby(df_games, target, span=3)
    print("Nearby slates (games per day):")
    for d, n in nearby:
        mark = "  (today)" if d == target else ""
        print(f"  {d}: {n}{mark}")

    if tmpl.empty:
        print("No games for this date in df_all. Pick another date (e.g., enter_american_odds(df_all, '2025-10-12')).")
        return tmpl  # empty

    print("\nEnter AMERICAN odds (e.g., -120, +135).")
    print("Press Enter to leave a side blank, 's' to skip a game, 'q' to quit.\n")

    for i in range(len(tmpl)):
        away = tmpl.at[i, "Away Team"]
        home = tmpl.at[i, "Home Team"]
        header = f"[{i+1}/{len(tmpl)}]  {away}  @  {home}"
        print("="*len(header))
        print(header)
        print("="*len(header))

        # Away (AWAY TEAM odds)
        while True:
            a_in = input(f"  (AWAY)  {away}  American odds: ").strip()
            if a_in == "":  a_american, a_decimal, cmd = (None, None, None); break
            a_american, a_decimal, cmd = _parse_odds(a_in)
            if cmd in {"quit","skip"} or a_decimal is not None or a_american is None: break
            print("    -> invalid (try -120, +135, or 1.95)")
        if cmd == "quit": break
        if cmd == "skip": print("  skipped game\n"); continue

        # Home (HOME TEAM odds)
        while True:
            h_in = input(f"  (HOME)  {home}  American odds: ").strip()
            if h_in == "":  h_american, h_decimal, cmd2 = (None, None, None); break
            h_american, h_decimal, cmd2 = _parse_odds(h_in)
            if cmd2 in {"quit","skip"} or h_decimal is not None or h_american is None: break
            print("    -> invalid (try -120, +135, or 1.95)")
        if cmd2 == "quit": break
        if cmd2 == "skip": print("  skipped game\n"); continue

        tmpl.at[i, "Away American Odds"] = a_american
        tmpl.at[i, "Away Odds"]          = a_decimal
        tmpl.at[i, "Home American Odds"] = h_american
        tmpl.at[i, "Home Odds"]          = h_decimal
        print()

    complete = tmpl.dropna(subset=["Away Odds","Home Odds"]).reset_index(drop=True)
    complete.to_csv(CSV_PATH, index=False)
    print(f"\n✅ Saved {len(complete)} matchup(s) with odds to {CSV_PATH}")
    return complete

# ===== RUN =====
# Today by default (change to a specific date string if you want):
df_odds = enter_american_odds(df_all, TARGET_DATE.strftime("%Y-%m-%d"))
print("\nSaved rows preview:")
print(df_odds.head(10))


📅 Target slate date set to: 2025-10-18

Target date: 2025-10-18 | games found: 13
Nearby slates (games per day):
  2025-10-17: 4
  2025-10-18: 13  (today)
  2025-10-19: 4
  2025-10-20: 5
  2025-10-21: 10

Enter AMERICAN odds (e.g., -120, +135).
Press Enter to leave a side blank, 's' to skip a game, 'q' to quit.

[1/13]  Florida Panthers  @  Buffalo Sabres


  (AWAY)  Florida Panthers  American odds:  -147
  (HOME)  Buffalo Sabres  American odds:  130



[2/13]  New York Islanders  @  Ottawa Senators


  (AWAY)  New York Islanders  American odds:  125
  (HOME)  Ottawa Senators  American odds:  -147



[3/13]  Edmonton Oilers  @  New Jersey Devils


  (AWAY)  Edmonton Oilers  American odds:  -104
  (HOME)  New Jersey Devils  American odds:  -112



[4/13]  Nashville Predators  @  Winnipeg Jets


  (AWAY)  Nashville Predators  American odds:  158
  (HOME)  Winnipeg Jets  American odds:  -189



[5/13]  Dallas Stars  @  St. Louis Blues


  (AWAY)  Dallas Stars  American odds:  -136
  (HOME)  St. Louis Blues  American odds:  118



[6/13]  Tampa Bay Lightning  @  Columbus Blue Jackets


  (AWAY)  Tampa Bay Lightning  American odds:  -114
  (HOME)  Columbus Blue Jackets  American odds:  -103



[7/13]  Minnesota Wild  @  Philadelphia Flyers


  (AWAY)  Minnesota Wild  American odds:  -110
  (HOME)  Philadelphia Flyers  American odds:  -106



[8/13]  New York Rangers  @  Montreal Canadiens


  (AWAY)  New York Rangers  American odds:  -103
  (HOME)  Montreal Canadiens  American odds:  -114



[9/13]  Seattle Kraken  @  Toronto Maple Leafs


  (AWAY)  Seattle Kraken  American odds:  175
  (HOME)  Toronto Maple Leafs  American odds:  -208



[10/13]  Boston Bruins  @  Colorado Avalanche


  (AWAY)  Boston Bruins  American odds:  205
  (HOME)  Colorado Avalanche  American odds:  -250



[11/13]  Carolina Hurricanes  @  Los Angeles Kings


  (AWAY)  Carolina Hurricanes  American odds:  -130
  (HOME)  Los Angeles Kings  American odds:  110



[12/13]  Calgary Flames  @  Vegas Golden Knights


  (AWAY)  Calgary Flames  American odds:  198
  (HOME)  Vegas Golden Knights  American odds:  -238



[13/13]  Pittsburgh Penguins  @  San Jose Sharks


  (AWAY)  Pittsburgh Penguins  American odds:  -135
  (HOME)  San Jose Sharks  American odds:  115




✅ Saved 13 matchup(s) with odds to odds.csv

Saved rows preview:
                 Date            Away Team Away American Odds Away Odds  \
0 2025-10-18 13:00:00     Florida Panthers               -147  1.680272   
1 2025-10-18 15:00:00   New York Islanders                125      2.25   
2 2025-10-18 15:30:00      Edmonton Oilers               -104  1.961538   
3 2025-10-18 19:00:00  Nashville Predators                158      2.58   
4 2025-10-18 19:00:00         Dallas Stars               -136  1.735294   
5 2025-10-18 19:00:00  Tampa Bay Lightning               -114  1.877193   
6 2025-10-18 19:00:00       Minnesota Wild               -110  1.909091   
7 2025-10-18 19:00:00     New York Rangers               -103  1.970874   
8 2025-10-18 19:00:00       Seattle Kraken                175      2.75   
9 2025-10-18 21:00:00        Boston Bruins                205      3.05   

               Home Team Home American Odds Home Odds  
0         Buffalo Sabres                130       2

In [15]:
# ===================== CONFIG (edit if needed) =====================
MODEL_LOG_PATH = "model_Logistic_TUNED.joblib"
MODEL_MLP_PATH = "model_MLP_TUNED.joblib"
ENSEMBLE_WEIGHTS = (0.5, 0.5)     # (logistic, mlp)
BANKROLL = 409
PREDICTIONS_CSV = "predictions.csv"
DATE_OFFSET = 0                   # 0=today, +1=tomorrow, etc.
DATE_COL_IN_HISTORY = "Date"      # must exist in df_final
# ===================================================================

# ---- imports you MUST have ----
import os, joblib
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# ---- target date ----
TARGET_DATE = (datetime.now() + timedelta(days=DATE_OFFSET)).date()

# ---- safety: required dataframes/objects ----
required = {
    "df_odds": "DataFrame of today's games with columns: Home Team, Away Team, Home Odds, Away Odds",
    "df_final": "Historical features per game (used to build X_one)",
    "df_rank": "Team ranking table with columns: Team, Rank",
    "rename_map": "dict mapping training feature names (keys in feats) to model input names"
}
for name in required:
    if name not in globals():
        raise RuntimeError(f"Missing required object `{name}`: {required[name]}")

# ===== MODELS & ENSEMBLE SETUP =====
assert os.path.exists(MODEL_LOG_PATH), f"Missing model file: {MODEL_LOG_PATH}"
assert os.path.exists(MODEL_MLP_PATH), f"Missing model file: {MODEL_MLP_PATH}"
log_model = joblib.load(MODEL_LOG_PATH)
mlp_model = joblib.load(MODEL_MLP_PATH)

assert hasattr(log_model, "predict_proba"), "log_model must support predict_proba()"
assert hasattr(mlp_model, "predict_proba"), "mlp_model must support predict_proba()"

# ensemble weights (normalized)
try:
    w_log, w_mlp = ENSEMBLE_WEIGHTS
except Exception:
    w_log, w_mlp = 0.5, 0.5
ws = (w_log or 0) + (w_mlp or 0)
if ws <= 0:
    w_log, w_mlp = 0.5, 0.5
else:
    w_log, w_mlp = w_log / ws, w_mlp / ws

# ---- helpers ----
def _kelly(p_win: float, dec_odds: float) -> float:
    b = dec_odds - 1.0
    if b <= 0:
        return 0.0
    q = 1.0 - p_win
    return max(0.0, (b * p_win - q) / b)

def _assert_sorted(df_hist: pd.DataFrame, date_col: str):
    if not df_hist[date_col].is_monotonic_increasing:
        df_hist.sort_values(date_col, inplace=True, kind="mergesort")  # stable

def _latest_value_asof(df_hist: pd.DataFrame, team: str, col_if_home: str, col_if_away: str,
                       as_of_dt: pd.Timestamp, date_col: str):
    # history strictly before slate day boundary (prevent leakage)
    m = (((df_hist["Home Team"] == team) | (df_hist["Away Team"] == team))
         & (df_hist[date_col] < as_of_dt))
    if not m.any():
        raise ValueError(f"No history for {team} up to {as_of_dt}")
    last = df_hist.loc[m].iloc[-1]
    return last[col_if_home] if last["Home Team"] == team else last[col_if_away]

def build_features_for_match(home_team, away_team, df_hist, df_rank, as_of_dt, date_col: str):
    # ranks with guard
    def _rank(team):
        r = df_rank.loc[df_rank["Team"] == team, "Rank"]
        if r.empty:
            raise ValueError(f"Rank missing for team: {team}")
        return int(r.iloc[0])

    r_home, r_away = _rank(home_team), _rank(away_team)
    _assert_sorted(df_hist, date_col)

    L = lambda tm, ch, ca: _latest_value_asof(df_hist, tm, ch, ca, as_of_dt, date_col)
    feats = {
        "Rank Difference": r_home - r_away,
        "Last 10 Wins": L(home_team, "Home Last 10 Wins", "Away Last 10 Wins")
                        - L(away_team, "Away Last 10 Wins", "Home Last 10 Wins"),
        "Home Played Yesterday": int(bool(L(home_team, "Home Played Yesterday", "Away Played Yesterday"))),
        "Away Played Yesterday": int(bool(L(away_team, "Away Played Yesterday", "Home Played Yesterday"))),
        "Home Advantage": L(home_team, "Home Win Rate", "Away Win Rate")
                          - L(away_team, "Away Win Rate", "Home Win Rate"),
        "Win Streak Impact": L(home_team, "Home Team Overall Win Streak Before Game", "Away Team Overall Win Streak Before Game")
                             - L(away_team, "Away Team Overall Win Streak Before Game", "Home Team Overall Win Streak Before Game"),
        "Away Rest Days Since Last Game": L(away_team, "Away Rest Days Since Last Game", "Home Rest Days Since Last Game"),
        "Home Rest Days Since Last Game": L(home_team, "Home Rest Days Since Last Game", "Away Rest Days Since Last Game"),
        "Opponent Strength": L(home_team, "Home Opponent Strength", "Away Opponent Strength")
                             - L(away_team, "Away Opponent Strength", "Home Opponent Strength"),
    }

    # map to training-time column names expected by the pipelines
    row = {rename_map.get(k, k): v for k, v in feats.items()}
    X = pd.DataFrame([row])
    return X

# ==== PREDICT & SIZE BETS ====
as_of_dt = pd.Timestamp(TARGET_DATE)  # naive local midnight of target day

# clean team strings
for c in ["Home Team", "Away Team"]:
    if c in df_odds.columns:
        df_odds[c] = df_odds[c].astype(str).str.strip()
    if c in df_final.columns:
        df_final[c] = df_final[c].astype(str).str.strip()
if "Team" in df_rank.columns:
    df_rank["Team"] = df_rank["Team"].astype(str).str.strip()

pred_rows, pre_bets, total_pre = [], [], 0.0

for _, r in df_odds.iterrows():
    h, a = str(r["Home Team"]), str(r["Away Team"])

    # odds cleaning
    try:
        oh, oa = float(r["Home Odds"]), float(r["Away Odds"])
    except Exception:
        # skip rows with missing/invalid odds
        continue

    # build feature row
    X_one = build_features_for_match(h, a, df_final, df_rank, as_of_dt=as_of_dt, date_col=DATE_COL_IN_HISTORY)

    # model probabilities
    p_h = (
        w_log * log_model.predict_proba(X_one)[:, 1].item()
        + w_mlp * mlp_model.predict_proba(X_one)[:, 1].item()
    )

    # Kelly stakes (unscaled)
    bh_pre = _kelly(p_h, oh) * BANKROLL
    ba_pre = _kelly(1.0 - p_h, oa) * BANKROLL
    pre_bets.append((bh_pre, ba_pre))
    total_pre += bh_pre + ba_pre

    pred_rows.append({
        "SlateDate": TARGET_DATE.isoformat(),
        "Away Team": a, "Away Odds": oa,
        "Home Team": h, "Home Odds": oh,
        "p_home_ens": round(p_h, 4)
    })

# scale proportionally if total stake > bankroll
for i, row in enumerate(pred_rows):
    bh, ba = pre_bets[i]
    if BANKROLL > 0 and total_pre > BANKROLL:
        s = BANKROLL / total_pre
        bh, ba = bh * s, ba * s
        row["Scaled"] = True
    else:
        row["Scaled"] = False
    row["Home Bet"] = int(round(bh)) if bh > 0 else 0
    row["Away Bet"] = int(round(ba)) if ba > 0 else 0

pred_df = pd.DataFrame(pred_rows)
cols = ["SlateDate","Away Team","Away Odds","Home Team","Home Odds","p_home_ens","Home Bet","Away Bet","Scaled"]
print(pred_df[cols] if set(cols).issubset(pred_df.columns) else pred_df)

# append/save
mode = "a" if os.path.exists(PREDICTIONS_CSV) else "w"
header = not os.path.exists(PREDICTIONS_CSV)
pred_df.to_csv(PREDICTIONS_CSV, index=False, mode=mode, header=header)
print(f"\nSaved {len(pred_df)} rows to {PREDICTIONS_CSV} (mode='{mode}', header={header})")



     SlateDate            Away Team  Away Odds              Home Team  \
0   2025-10-17     Florida Panthers   1.680272         Buffalo Sabres   
1   2025-10-17   New York Islanders   2.250000        Ottawa Senators   
2   2025-10-17      Edmonton Oilers   1.961538      New Jersey Devils   
3   2025-10-17  Nashville Predators   2.580000          Winnipeg Jets   
4   2025-10-17         Dallas Stars   1.735294        St. Louis Blues   
5   2025-10-17  Tampa Bay Lightning   1.877193  Columbus Blue Jackets   
6   2025-10-17       Minnesota Wild   1.909091    Philadelphia Flyers   
7   2025-10-17     New York Rangers   1.970874     Montreal Canadiens   
8   2025-10-17       Seattle Kraken   2.750000    Toronto Maple Leafs   
9   2025-10-17        Boston Bruins   3.050000     Colorado Avalanche   
10  2025-10-17  Carolina Hurricanes   1.769231      Los Angeles Kings   
11  2025-10-17       Calgary Flames   2.980000   Vegas Golden Knights   
12  2025-10-17  Pittsburgh Penguins   1.740741     