
# MLB Today's Winners — Robust (MLB-only, NaN-safe, 14d w/ 30d fallback)

**What this does**
- Uses only data **through yesterday** (no leakage)
- Pulls **today's schedule** & **probable pitchers** from MLB StatsAPI
- Builds **MLB-only** 14-day team & SP features; falls back to **30-day** if 14-day missing
- Fixes team-name joins using **locationName + league** (e.g., Chicago AL vs NL)
- Handles **players traded** within the window (comma-separated `Tm` exploded to each team)
- Outputs:
  - `pred_outputs/today_matchups.csv` (all features + home/away win probs)
  - `pred_outputs/READABLE_PICKS.txt` (sorted by confidence)


In [1]:

# Setup / imports
try:
    import pybaseball  # used for FanGraphs range endpoints
except ImportError:
    %pip install --quiet pybaseball

import os, math, json, shutil, time, textwrap, itertools, sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import requests
from pybaseball import batting_stats_range, pitching_stats_range

pd.set_option("display.max_columns", None)


In [2]:

# Config
TZ = "US/Eastern"  # only used for display; StatsAPI uses UTC times in JSON
TARGET_DATE = date.today()  # games to predict
YESTERDAY   = date.today() - timedelta(days=1)

# Feature windows
W14 = 14
W30 = 30

# Thresholds
MIN_BATTER_PA = 20   # for lineup/top hitters consideration
MIN_PITCHER_IP = 5.0 # for SP form

# IO
OUT_DIR = "./pred_outputs"
if os.path.exists(OUT_DIR):
    shutil.rmtree(OUT_DIR)
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Predicting games on {TARGET_DATE} using data through {YESTERDAY}.")
print("✅ pred_outputs/ directory was cleared.")


Predicting games on 2025-08-08 using data through 2025-08-07.
✅ pred_outputs/ directory was cleared.


In [3]:

def fetch_json(url, params=None, tries=3, sleep=0.4):
    for i in range(tries):
        r = requests.get(url, params=params, timeout=20)
        if r.status_code == 200:
            return r.json()
        time.sleep(sleep)
    r.raise_for_status()

def statsapi_teams():
    url = "https://statsapi.mlb.com/api/v1/teams"
    js = fetch_json(url, params={"sportId": 1})
    teams = {}
    for t in js.get("teams", []):
        tid = t["id"]
        teams[tid] = {
            "id": tid,
            "name": t.get("name"),
            "teamName": t.get("teamName"),
            "locationName": t.get("locationName"),
            "abbrev": t.get("abbreviation"),
            "league": (t.get("league") or {}).get("name"),
            "league_id": (t.get("league") or {}).get("id"),
        }
    return teams

def statsapi_schedule(day):
    url = "https://statsapi.mlb.com/api/v1/schedule"
    js = fetch_json(url, params={"sportId": 1, "date": day.strftime("%Y-%m-%d"), "hydrate": "probablePitcher"})
    games = []
    for d in js.get("dates", []):
        for g in d.get("games", []):
            games.append(g)
    return games

def explode_teams(df, team_col="Team"):
    df = df.copy()
    df[team_col] = df[team_col].astype(str)
    df["__teamlist"] = df[team_col].str.split(",")
    df = df.explode("__teamlist")
    df[team_col] = df["__teamlist"].str.strip()
    df.drop(columns=["__teamlist"], inplace=True)
    return df

def zscore(s):
    s = pd.to_numeric(s, errors="coerce")
    mu = s.mean(skipna=True)
    sd = s.std(skipna=True, ddof=0)
    if sd == 0 or pd.isna(sd):
        return (s*0).fillna(0)
    return (s - mu) / sd

def pa_weighted_mean(df, col, w="PA"):
    x = pd.to_numeric(df[col], errors="coerce")
    wv = pd.to_numeric(df[w], errors="coerce").fillna(0)
    den = wv.sum()
    if den <= 0:
        return np.nan
    return (x * wv).sum(skipna=True) / den

def ip_weighted_mean(df, col, w="IP"):
    x = pd.to_numeric(df[col], errors="coerce")
    wv = pd.to_numeric(df[w], errors="coerce").fillna(0)
    den = wv.sum()
    if den <= 0:
        return np.nan
    return (x * wv).sum(skipna=True) / den

def make_lineup_proxy(bat_df, n=9):
    # choose top N by PA; return median OPS as proxy + names
    d = bat_df.copy()
    d = d.sort_values("PA", ascending=False).head(n)
    ops_med = pd.to_numeric(d["OPS"], errors="coerce").median(skipna=True)
    names = d["Name"].tolist()
    return ops_med, names


In [4]:

def pull_window_batting(end_date, days):
    start = end_date - timedelta(days=days-1)
    bat = batting_stats_range(start.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")).copy()
    # Normalize
    bat = bat.rename(columns={"Tm": "Team", "BA": "AVG"})
    # MLB level only
    bat = bat[bat["Lev"].astype(str).str.startswith("Maj", na=False)].copy()
    # Ensure numerics
    for c in ["PA","AB","H","2B","3B","HR","BB","SO","HBP","SF","SB","CS","AVG","OBP","SLG","OPS"]:
        if c in bat.columns:
            bat[c] = pd.to_numeric(bat[c], errors="coerce")
    # Derived
    bat["HR_per_PA"] = bat["HR"] / bat["PA"].where(bat["PA"] > 0)
    bat["SB_per_PA"] = bat["SB"] / bat["PA"].where(bat["PA"] > 0)
    # Split traded players across teams
    bat = explode_teams(bat, "Team")
    return bat

def pull_window_pitching(end_date, days):
    start = end_date - timedelta(days=days-1)
    pit = pitching_stats_range(start.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")).copy()
    pit = pit.rename(columns={"Tm": "Team"})
    pit = pit[pit["Lev"].astype(str).str.startswith("Maj", na=False)].copy()
    # Ensure numerics
    for c in ["IP","G","GS","ER","H","BB","SO","HR","BF","ERA","WHIP","SO9","SO/W","mlbID"]:
        if c in pit.columns:
            pit[c] = pd.to_numeric(pit[c], errors="coerce")
    # Derived
    pit["K_pct"] = (pit["SO"] / pit["BF"].where(pit["BF"] > 0)) * 100.0
    pit["BB_pct"] = (pit["BB"] / pit["BF"].where(pit["BF"] > 0)) * 100.0
    pit["KBB_pct"] = pit["K_pct"] - pit["BB_pct"]
    pit["HR9"] = (9.0 * pit["HR"] / pit["IP"].where(pit["IP"] > 0))
    pit = explode_teams(pit, "Team")
    return pit


In [None]:

# === Patch: safer window pull helpers (schema/empty-range tolerant) ===
from datetime import timedelta
import numpy as np
import pandas as pd

def _coerce_numeric(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _normalize_team_and_level(df):
    df = df.copy()
    # Handle team column naming differences
    if "Team" not in df.columns and "Tm" in df.columns:
        df = df.rename(columns={"Tm": "Team"})
    if "Team" not in df.columns:
        df["Team"] = np.nan

    # Try to find a "level/league" column from pybaseball (schema can vary)
    lev_cand = next((c for c in ["Lev", "Level", "level", "Lg", "LG", "League"] if c in df.columns), None)
    if lev_cand and lev_cand != "Lev":
        df = df.rename(columns={lev_cand: "Lev"})
    if "Lev" in df.columns:
        # Keep MLB rows if present
        df = df[df["Lev"].astype(str).str.startswith("Maj", na=False)].copy()
    else:
        # Leave as NaN; we'll assign AL/NL later during team aggregation
        df["Lev"] = np.nan
    return df

def pull_window_batting(end_date, days):
    start = end_date - timedelta(days=days-1)
    try:
        raw = batting_stats_range(start.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))
    except Exception as e:
        # Return an empty-but-well-formed frame so downstream code doesn't crash
        raw = pd.DataFrame()

    bat = raw.copy() if raw is not None else pd.DataFrame()

    # Rename common schema differences
    if "BA" in bat.columns and "AVG" not in bat.columns:
        bat = bat.rename(columns={"BA": "AVG"})

    bat = _normalize_team_and_level(bat)

    # Ensure required columns exist
    required = ["PA","AB","H","2B","3B","HR","BB","SO","HBP","SF","SB","CS","AVG","OBP","SLG","OPS","Name","Team","Lev"]
    for c in required:
        if c not in bat.columns:
            bat[c] = np.nan

    bat = _coerce_numeric(bat, ["PA","AB","H","2B","3B","HR","BB","SO","HBP","SF","SB","CS","AVG","OBP","SLG","OPS"])
    # Derived
    bat["HR_per_PA"] = bat["HR"] / bat["PA"].where(bat["PA"] > 0)
    bat["SB_per_PA"] = bat["SB"] / bat["PA"].where(bat["PA"] > 0)

    # Split traded players across comma-separated team stints (if present)
    bat = explode_teams(bat, "Team") if "Team" in bat.columns else bat
    return bat

def pull_window_pitching(end_date, days):
    start = end_date - timedelta(days=days-1)
    try:
        raw = pitching_stats_range(start.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))
    except Exception as e:
        raw = pd.DataFrame()

    pit = raw.copy() if raw is not None else pd.DataFrame()
    pit = _normalize_team_and_level(pit)

    # Ensure required columns exist
    required = ["IP","G","GS","ER","H","BB","SO","HR","BF","ERA","WHIP","SO9","SO/W","mlbID","Team","Lev"]
    for c in required:
        if c not in pit.columns:
            pit[c] = np.nan

    pit = _coerce_numeric(pit, ["IP","G","GS","ER","H","BB","SO","HR","BF","ERA","WHIP","SO9","SO/W","mlbID"])

    # Derived
    pit["K_pct"] = (pit["SO"] / pit["BF"].where(pit["BF"] > 0)) * 100.0
    pit["BB_pct"] = (pit["BB"] / pit["BF"].where(pit["BF"] > 0)) * 100.0
    pit["KBB_pct"] = pit["K_pct"] - pit["BB_pct"]
    pit["HR9"] = (9.0 * pit["HR"] / pit["IP"].where(pit["IP"] > 0))

    pit = explode_teams(pit, "Team") if "Team" in pit.columns else pit
    return pit

print("✅ Applied safer pull_window_* patch — tolerant to empty ranges & schema drift.")


In [5]:

# Pull windows (through yesterday)
bat14 = pull_window_batting(YESTERDAY, W14)
pit14 = pull_window_pitching(YESTERDAY, W14)

bat30 = pull_window_batting(YESTERDAY, W30)
pit30 = pull_window_pitching(YESTERDAY, W30)

print(f"bat14 rows: {len(bat14)} | pit14 rows: {len(pit14)}")
print(f"bat30 rows: {len(bat30)} | pit30 rows: {len(pit30)}")


bat14 rows: 471 | pit14 rows: 519
bat30 rows: 503 | pit30 rows: 573


In [6]:

def team_agg_bat(bat_df):
    recs = []
    for (team, lev), g in bat_df.groupby(["Team","Lev"], dropna=True):
        if g.empty: continue
        recs.append({
            "Team": team,
            "Lev": lev,
            "team_PA": g["PA"].sum(skipna=True),
            "team_OPS": pa_weighted_mean(g, "OPS", "PA"),
            "team_OBP": pa_weighted_mean(g, "OBP", "PA"),
            "team_SLG": pa_weighted_mean(g, "SLG", "PA"),
            "lineup_ops_med": make_lineup_proxy(g[g["PA"]>=MIN_BATTER_PA])[0] if "PA" in g.columns else np.nan,
        })
    return pd.DataFrame.from_records(recs)

def team_agg_pit(pit_df):
    recs = []
    for (team, lev), g in pit_df.groupby(["Team","Lev"], dropna=True):
        if g.empty: continue
        recs.append({
            "Team": team,
            "Lev": lev,
            "team_IP": g["IP"].sum(skipna=True),
            "team_ERA": ip_weighted_mean(g, "ERA", "IP"),
            "team_WHIP": ip_weighted_mean(g, "WHIP", "IP"),
        })
    return pd.DataFrame.from_records(recs)

bat14_team = team_agg_bat(bat14)
pit14_team = team_agg_pit(pit14)
bat30_team = team_agg_bat(bat30)
pit30_team = team_agg_pit(pit30)

print("team aggregates built.")


team aggregates built.


In [7]:

def merge_team_windows(b14, p14, b30, p30):
    # Merge bats and pits separately then outer
    b = pd.merge(b14, b30, on=["Team","Lev"], how="outer", suffixes=("_14","_30"))
    p = pd.merge(p14, p30, on=["Team","Lev"], how="outer", suffixes=("_14","_30"))
    return b, p

bat_team_win, pit_team_win = merge_team_windows(bat14_team, pit14_team, bat30_team, pit30_team)

# Build SP lookup tables (14d primary, 30d fallback)
sp14 = pit14.copy()
sp30 = pit30.copy()
sp14 = sp14.sort_values(["mlbID","IP"], ascending=[True, False]).drop_duplicates(subset=["mlbID"], keep="first")
sp30 = sp30.sort_values(["mlbID","IP"], ascending=[True, False]).drop_duplicates(subset=["mlbID"], keep="first")

# Ensure mlbID int
for df in (sp14, sp30):
    df["mlbID"] = pd.to_numeric(df["mlbID"], errors="coerce").astype("Int64")


In [8]:

teams_catalog = statsapi_teams()
games = statsapi_schedule(TARGET_DATE)

print(f"Found {len(games)} games on schedule.")


Found 15 games on schedule.


In [9]:

def pick_team_row(team_id):
    t = teams_catalog[team_id]
    loc = t.get("locationName")
    league = t.get("league")
    return loc, league

def get_team_feats(team_loc, league_name):
    # league_name like "American League" or "National League"
    lev_prefix = "Maj-AL" if "American" in str(league_name) else "Maj-NL"
    row = {}
    # pick 14 first else 30
    bt = bat_team_win[(bat_team_win["Team"]==team_loc) & (bat_team_win["Lev"]==lev_prefix)]
    pt = pit_team_win[(pit_team_win["Team"]==team_loc) & (pit_team_win["Lev"]==lev_prefix)]
    # batting
    row["team_OPS"] = bt["team_OPS_14"].iloc[0] if (len(bt)>0 and not pd.isna(bt["team_OPS_14"].iloc[0])) else (bt["team_OPS_30"].iloc[0] if len(bt)>0 else np.nan)
    row["team_OBP"] = bt["team_OBP_14"].iloc[0] if (len(bt)>0 and not pd.isna(bt["team_OBP_14"].iloc[0])) else (bt["team_OBP_30"].iloc[0] if len(bt)>0 else np.nan)
    row["team_SLG"] = bt["team_SLG_14"].iloc[0] if (len(bt)>0 and not pd.isna(bt["team_SLG_14"].iloc[0])) else (bt["team_SLG_30"].iloc[0] if len(bt)>0 else np.nan)
    row["lineup_ops_med"] = bt["lineup_ops_med_14"].iloc[0] if (len(bt)>0 and not pd.isna(bt["lineup_ops_med_14"].iloc[0])) else (bt["lineup_ops_med_30"].iloc[0] if len(bt)>0 else np.nan)
    # pitching
    row["team_ERA"] = pt["team_ERA_14"].iloc[0] if (len(pt)>0 and not pd.isna(pt["team_ERA_14"].iloc[0])) else (pt["team_ERA_30"].iloc[0] if len(pt)>0 else np.nan)
    row["team_WHIP"] = pt["team_WHIP_14"].iloc[0] if (len(pt)>0 and not pd.isna(pt["team_WHIP_14"].iloc[0])) else (pt["team_WHIP_30"].iloc[0] if len(pt)>0 else np.nan)
    return row

def get_sp_feats(sp_id):
    sp_id = int(sp_id) if sp_id is not None else None
    row = {"sp_IP": np.nan, "sp_ERA": np.nan, "sp_WHIP": np.nan, "sp_KBB_pct": np.nan}
    if sp_id is None or pd.isna(sp_id): return row
    r14 = sp14[sp14["mlbID"]==sp_id]
    r30 = sp30[sp30["mlbID"]==sp_id]
    src = r14 if len(r14)>0 else r30
    if len(src)==0: return row
    rec = src.iloc[0]
    row.update({
        "sp_IP": float(rec.get("IP", np.nan)),
        "sp_ERA": float(rec.get("ERA", np.nan)),
        "sp_WHIP": float(rec.get("WHIP", np.nan)),
        "sp_KBB_pct": float(rec.get("KBB_pct", np.nan)),
    })
    return row

# league default means for filling NaNs (from 30d aggregate)
league_defaults = {
    "American League": {
        "team_OPS": pd.to_numeric(bat30_team[bat30_team["Lev"].str.contains("AL")]["team_OPS"], errors="coerce").mean(skipna=True),
        "lineup_ops_med": pd.to_numeric(bat30_team[bat30_team["Lev"].str.contains("AL")]["lineup_ops_med"], errors="coerce").mean(skipna=True),
        "team_ERA": pd.to_numeric(pit30_team[pit30_team["Lev"].str.contains("AL")]["team_ERA"], errors="coerce").mean(skipna=True),
        "team_WHIP": pd.to_numeric(pit30_team[pit30_team["Lev"].str.contains("AL")]["team_WHIP"], errors="coerce").mean(skipna=True),
    },
    "National League": {
        "team_OPS": pd.to_numeric(bat30_team[bat30_team["Lev"].str.contains("NL")]["team_OPS"], errors="coerce").mean(skipna=True),
        "lineup_ops_med": pd.to_numeric(bat30_team[bat30_team["Lev"].str.contains("NL")]["lineup_ops_med"], errors="coerce").mean(skipna=True),
        "team_ERA": pd.to_numeric(pit30_team[pit30_team["Lev"].str.contains("NL")]["team_ERA"], errors="coerce").mean(skipna=True),
        "team_WHIP": pd.to_numeric(pit30_team[pit30_team["Lev"].str.contains("NL")]["team_WHIP"], errors="coerce").mean(skipna=True),
    }
}

# Build matchups
rows = []
for g in games:
    home_id = g["teams"]["home"]["team"]["id"]
    away_id = g["teams"]["away"]["team"]["id"]
    home_loc, home_league = pick_team_row(home_id)
    away_loc, away_league = pick_team_row(away_id)

    # Basic info
    game_pk = g.get("gamePk")
    status  = g.get("status",{}).get("detailedState")
    home_name = teams_catalog[home_id]["name"]
    away_name = teams_catalog[away_id]["name"]

    # Probable pitchers by mlbID
    pp_home = (g["teams"]["home"].get("probablePitcher") or {}).get("id")
    pp_away = (g["teams"]["away"].get("probablePitcher") or {}).get("id")

    # Team features
    hf = get_team_feats(home_loc, home_league)
    af = get_team_feats(away_loc, away_league)

    # Fill NaNs with league means
    for k in ["team_OPS","lineup_ops_med","team_ERA","team_WHIP"]:
        if pd.isna(hf.get(k, np.nan)): hf[k] = league_defaults[home_league][k]
        if pd.isna(af.get(k, np.nan)): af[k] = league_defaults[away_league][k]

    # SP features
    hsp = get_sp_feats(pp_home)
    asp = get_sp_feats(pp_away)

    rows.append({
        "gamePk": game_pk, "status": status,
        "away_team": away_name, "home_team": home_name,
        "away_loc": away_loc, "home_loc": home_loc,
        "away_league": away_league, "home_league": home_league,
        "pp_away_id": pp_away, "pp_home_id": pp_home,
        # team features
        "away_team_OPS": af["team_OPS"], "home_team_OPS": hf["team_OPS"],
        "away_lineup_ops_med": af["lineup_ops_med"], "home_lineup_ops_med": hf["lineup_ops_med"],
        "away_team_ERA": af["team_ERA"], "home_team_ERA": hf["team_ERA"],
        "away_team_WHIP": af["team_WHIP"], "home_team_WHIP": hf["team_WHIP"],
        # SP features
        "away_SP_IP": asp["sp_IP"], "home_SP_IP": hsp["sp_IP"],
        "away_SP_ERA": asp["sp_ERA"], "home_SP_ERA": hsp["sp_ERA"],
        "away_SP_WHIP": asp["sp_WHIP"], "home_SP_WHIP": hsp["sp_WHIP"],
        "away_SP_KBB%": asp["sp_KBB_pct"], "home_SP_KBB%": hsp["sp_KBB_pct"],
    })

games_df = pd.DataFrame(rows)
print("Built games feature table:", games_df.shape)
games_df.head()


Built games feature table: (15, 26)


Unnamed: 0,gamePk,status,away_team,home_team,away_loc,home_loc,away_league,home_league,pp_away_id,pp_home_id,away_team_OPS,home_team_OPS,away_lineup_ops_med,home_lineup_ops_med,away_team_ERA,home_team_ERA,away_team_WHIP,home_team_WHIP,away_SP_IP,home_SP_IP,away_SP_ERA,home_SP_ERA,away_SP_WHIP,home_SP_WHIP,away_SP_KBB%,home_SP_KBB%
0,776829,In Progress,Cincinnati Reds,Pittsburgh Pirates,Cincinnati,Pittsburgh,National League,National League,695505,656605,0.640873,0.710991,0.601,0.678,3.775466,3.822583,1.249374,1.221161,6.2,7.0,4.05,10.29,1.05,2.286,37.931034,8.333333
1,776831,In Progress,Athletics,Baltimore Orioles,Sacramento,Baltimore,American League,American League,669372,608372,0.748396,0.772883,0.734288,0.863,5.133134,4.018226,1.369023,1.284823,10.1,11.0,3.48,3.27,1.161,1.091,2.325581,22.222222
2,776835,In Progress,Houston Astros,New York Yankees,Houston,Bronx,American League,American League,686613,693645,0.711338,0.748396,0.755,0.734288,4.699145,5.133134,1.335627,1.369023,12.0,9.1,1.5,4.82,0.917,1.821,16.666667,11.627907
3,776832,In Progress,Los Angeles Angels,Detroit Tigers,Anaheim,Detroit,American League,American League,543294,669373,0.748396,0.776643,0.734288,0.81,5.133134,3.915127,1.369023,1.281798,11.0,13.0,1.64,2.08,0.818,1.0,5.0,28.0
4,776834,In Progress,Miami Marlins,Atlanta Braves,Miami,Atlanta,National League,National League,665795,693821,0.785909,0.732185,0.735,0.673,4.207269,5.233972,1.302373,1.409656,12.0,9.1,1.5,9.64,0.833,1.714,15.217391,4.545455


In [10]:

# Scoring model (simple linear -> sigmoid). Tune weights below.
W_LINEUP_OPS = 1.0
W_TEAM_OPS   = 0.7
W_SP_KBB     = 0.08   # percent point scale (so smaller weight)
W_SP_ERA     = 0.6
W_SP_WHIP    = 0.5
W_TEAM_ERA   = 0.3

SIG_SCALE    = 0.10   # compress to avoid extreme 0/1

def sigmoid(x, s=SIG_SCALE):
    return 1.0 / (1.0 + np.exp(-s * x))

def compute_row_probs(r):
    # diffs oriented so positive favors HOME
    d_lineup_ops = (r["home_lineup_ops_med"] - r["away_lineup_ops_med"])
    d_team_ops   = (r["home_team_OPS"] - r["away_team_OPS"])
    d_sp_kbb     = (r["home_SP_KBB%"] - r["away_SP_KBB%"])
    d_sp_era     = (r["away_SP_ERA"] - r["home_SP_ERA"])   # lower ERA better for home
    d_sp_whip    = (r["away_SP_WHIP"] - r["home_SP_WHIP"]) # lower WHIP better for home
    d_team_era   = (r["away_team_ERA"] - r["home_team_ERA"])

    score = (
        W_LINEUP_OPS * d_lineup_ops +
        W_TEAM_OPS   * d_team_ops   +
        W_SP_KBB     * d_sp_kbb     +
        W_SP_ERA     * d_sp_era     +
        W_SP_WHIP    * d_sp_whip    +
        W_TEAM_ERA   * d_team_era
    )
    p_home = sigmoid(score, SIG_SCALE)
    return p_home, 1 - p_home

# Compute probs
games_df["model_home_prob"], games_df["model_away_prob"] = zip(*games_df.apply(compute_row_probs, axis=1))
games_df["pick"] = np.where(games_df["model_home_prob"]>=0.5, games_df["home_team"], games_df["away_team"])
games_df["confidence_pp"] = (games_df[["model_home_prob","model_away_prob"]].max(axis=1) - 0.5) * 100.0
games_df.sort_values("confidence_pp", ascending=False).head(5)


Unnamed: 0,gamePk,status,away_team,home_team,away_loc,home_loc,away_league,home_league,pp_away_id,pp_home_id,away_team_OPS,home_team_OPS,away_lineup_ops_med,home_lineup_ops_med,away_team_ERA,home_team_ERA,away_team_WHIP,home_team_WHIP,away_SP_IP,home_SP_IP,away_SP_ERA,home_SP_ERA,away_SP_WHIP,home_SP_WHIP,away_SP_KBB%,home_SP_KBB%,model_home_prob,model_away_prob,pick,confidence_pp
0,776829,In Progress,Cincinnati Reds,Pittsburgh Pirates,Cincinnati,Pittsburgh,National League,National League,695505,656605,0.640873,0.710991,0.601,0.678,3.775466,3.822583,1.249374,1.221161,6.2,7.0,4.05,10.29,1.05,2.286,37.931034,8.333333,0.340338,0.659662,Cincinnati Reds,15.966167
4,776834,In Progress,Miami Marlins,Atlanta Braves,Miami,Atlanta,National League,National League,665795,693821,0.785909,0.732185,0.735,0.673,4.207269,5.233972,1.302373,1.409656,12.0,9.1,1.5,9.64,0.833,1.714,15.217391,4.545455,0.341057,0.658943,Miami Marlins,15.894298
14,776822,Pre-Game,Washington Nationals,San Francisco Giants,Washington,San Francisco,National League,National League,663623,657424,0.616669,0.709273,0.594,0.766,7.183399,4.297079,1.780088,1.387111,11.0,6.1,5.73,0.0,1.364,0.789,4.255319,13.636364,0.63596,0.36404,San Francisco Giants,13.595961
8,776828,In Progress,New York Mets,Milwaukee Brewers,Flushing,Milwaukee,National League,National League,673540,605540,0.73767,0.893208,0.725045,0.829,5.243982,3.93711,1.396849,1.307709,9.0,12.0,7.0,3.0,1.889,0.75,-2.272727,22.222222,0.634924,0.365076,Milwaukee Brewers,13.492379
11,776820,Pre-Game,Boston Red Sox,San Diego Padres,Boston,San Diego,American League,National League,621111,601713,0.78374,0.766475,0.903,0.7345,3.084544,2.728333,1.310929,0.841621,9.0,13.1,6.0,2.03,2.333,0.45,-8.695652,14.583333,0.62509,0.37491,San Diego Padres,12.509


In [11]:

# Save CSV of matchups
csv_path = os.path.join(OUT_DIR, "today_matchups.csv")
games_df.to_csv(csv_path, index=False)

# Write readable picks
lines = []
header = f"MLB PICKS — {TARGET_DATE:%Y-%m-%d} (Data through {YESTERDAY:%Y-%m-%d})"
lines.append(header)
lines.append("="*len(header))

for _, r in games_df.sort_values("confidence_pp", ascending=False).iterrows():
    lines.append(f"{r['away_team']} @ {r['home_team']} — Home win prob: {r['model_home_prob']:.3f}")
    lines.append(f"  SP: away id {int(r['pp_away_id']) if not pd.isna(r['pp_away_id']) else 'NA'} vs home id {int(r['pp_home_id']) if not pd.isna(r['pp_home_id']) else 'NA'}")
    lines.append(f"  Lineup OPS med (14→30): away {r['away_lineup_ops_med']:.3f} | home {r['home_lineup_ops_med']:.3f} | Δ {r['home_lineup_ops_med']-r['away_lineup_ops_med']:+.3f}")
    lines.append(f"  Team OPS (14→30):       away {r['away_team_OPS']:.3f} | home {r['home_team_OPS']:.3f} | Δ {r['home_team_OPS']-r['away_team_OPS']:+.3f}")
    lines.append(f"  SP K-BB% (14→30):       away {r['away_SP_KBB%']:.1f} | home {r['home_SP_KBB%']:.1f} | Δ {r['home_SP_KBB%']-r['away_SP_KBB%']:+.1f}")
    lines.append(f"  SP ERA/WHIP (14→30):    away {r['away_SP_ERA']:.2f}/{r['away_SP_WHIP']:.2f} | home {r['home_SP_ERA']:.2f}/{r['home_SP_WHIP']:.2f}")
    lines.append(f"  Team ERA (14→30):       away {r['away_team_ERA']:.2f} | home {r['home_team_ERA']:.2f} | Δ {r['away_team_ERA']-r['home_team_ERA']:+.2f}")
    lines.append("-"*64)

txt_path = os.path.join(OUT_DIR, "READABLE_PICKS.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("✅ Wrote:")
print("  -", csv_path)
print("  -", txt_path)

games_df.head(3)


✅ Wrote:
  - ./pred_outputs/today_matchups.csv
  - ./pred_outputs/READABLE_PICKS.txt


Unnamed: 0,gamePk,status,away_team,home_team,away_loc,home_loc,away_league,home_league,pp_away_id,pp_home_id,away_team_OPS,home_team_OPS,away_lineup_ops_med,home_lineup_ops_med,away_team_ERA,home_team_ERA,away_team_WHIP,home_team_WHIP,away_SP_IP,home_SP_IP,away_SP_ERA,home_SP_ERA,away_SP_WHIP,home_SP_WHIP,away_SP_KBB%,home_SP_KBB%,model_home_prob,model_away_prob,pick,confidence_pp
0,776829,In Progress,Cincinnati Reds,Pittsburgh Pirates,Cincinnati,Pittsburgh,National League,National League,695505,656605,0.640873,0.710991,0.601,0.678,3.775466,3.822583,1.249374,1.221161,6.2,7.0,4.05,10.29,1.05,2.286,37.931034,8.333333,0.340338,0.659662,Cincinnati Reds,15.966167
1,776831,In Progress,Athletics,Baltimore Orioles,Sacramento,Baltimore,American League,American League,669372,608372,0.748396,0.772883,0.734288,0.863,5.133134,4.018226,1.369023,1.284823,10.1,11.0,3.48,3.27,1.161,1.091,2.325581,22.222222,0.555596,0.444404,Baltimore Orioles,5.559559
2,776835,In Progress,Houston Astros,New York Yankees,Houston,Bronx,American League,American League,686613,693645,0.711338,0.748396,0.755,0.734288,4.699145,5.133134,1.335627,1.369023,12.0,9.1,1.5,4.82,0.917,1.821,16.666667,11.627907,0.42624,0.57376,Houston Astros,7.375958
