Ensure no data leaking by replacing post game stats with previous averaging

In [67]:
import pandas as pd

path = "../raw/schedules_raw.csv"

df = pd.read_csv(path)
df.head()

Unnamed: 0,game_id,season,week,date,home_team,away_team,home_score,away_score,Winner,home_pass_cmp,...,surface,temp,wind,game_type,weekday,gametime,location,home_coach,away_coach,referee
0,2014_01_GB_SEA,2014,1,2014-09-04,SEA,GB,36.0,16.0,SEA,19.0,...,fieldturf,71.0,11.0,REG,Thursday,20:30,Home,Pete Carroll,Mike McCarthy,John Parry
1,2014_01_NO_ATL,2014,1,2014-09-07,ATL,NO,37.0,34.0,ATL,31.0,...,fieldturf,,,REG,Sunday,13:00,Home,Mike Smith,Sean Payton,Bill Leavy
2,2014_01_CIN_BAL,2014,1,2014-09-07,BAL,CIN,16.0,23.0,CIN,35.0,...,sportturf,74.0,8.0,REG,Sunday,13:00,Home,John Harbaugh,Marvin Lewis,Gene Stetatore
3,2014_01_BUF_CHI,2014,1,2014-09-07,CHI,BUF,20.0,23.0,BUF,34.0,...,grass,74.0,3.0,REG,Sunday,13:00,Home,Marc Trestman,Doug Marrone,Brad Allen
4,2014_01_WAS_HOU,2014,1,2014-09-07,HOU,WAS,17.0,6.0,HOU,14.0,...,grass,,,REG,Sunday,13:00,Home,Bill O'Brien,Jay Gruden,Jerome Boger


Calculate metrics using scheduling data
1. Have 2 rows per game, one representing each team
2. Replace game data with previous week's data - model will only have access to previous week's data

In [68]:
import pandas as pd

def to_team_games(df: pd.DataFrame) -> pd.DataFrame:
    # clean any stray whitespace in headers
    df = df.copy()
    df.columns = df.columns.str.strip()

    # columns we must NOT rename
    special_keep = {"home_team","away_team","home_score","away_score"}

    # detect prefixed columns
    home_cols_all = [c for c in df.columns if c.startswith("home_")]
    away_cols_all = [c for c in df.columns if c.startswith("away_")]

    # stats to rename (exclude team name/score)
    home_stats = [c for c in home_cols_all if c not in special_keep]
    away_stats = [c for c in away_cols_all if c not in special_keep]

    # everything else = base/meta (date, lines, stadium, roof, refs, etc.)
    base_cols = [c for c in df.columns if c not in (home_stats + away_stats)]

    # helpers
    def swap_prefix(cols, old, new):
        return {c: c.replace(old, new, 1) for c in cols}

    # HOME perspective
    home_side = (
        df[base_cols + home_stats + away_stats]
        .rename(columns={
            **swap_prefix(home_stats, "home_", "team_"),
            **swap_prefix(away_stats, "away_", "opp_"),
        })
        .assign(
            team=lambda d: d["home_team"],
            opponent=lambda d: d["away_team"],
            team_score=lambda d: d["home_score"],
            opp_score=lambda d: d["away_score"],
            is_home=1,
        )
    )

    # AWAY perspective
    away_side = (
        df[base_cols + home_stats + away_stats]
        .rename(columns={
            **swap_prefix(away_stats, "away_", "team_"),
            **swap_prefix(home_stats, "home_", "opp_"),
        })
        .assign(
            team=lambda d: d["away_team"],
            opponent=lambda d: d["home_team"],
            team_score=lambda d: d["away_score"],
            opp_score=lambda d: d["home_score"],
            is_home=0,
        )
    )

    team_games = pd.concat([home_side, away_side], ignore_index=True)

    # convenient targets
    team_games["team_win"]  = (team_games["team_score"] > team_games["opp_score"]).astype(int)
    team_games["point_diff"] = team_games["team_score"] - team_games["opp_score"]

    # order columns: core → meta → team_* → opp_*
    core = ["game_id","season","week","date","team","opponent","is_home",
            "team_score","opp_score","team_win","point_diff"]
    meta = [c for c in base_cols if c not in core]
    team_stats = sorted([c for c in team_games.columns if c.startswith("team_")])
    opp_stats  = sorted([c for c in team_games.columns if c.startswith("opp_")])

    ordered = [c for c in core if c in team_games.columns] + meta + team_stats + opp_stats
    return team_games[ordered].sort_values(["season","week","team"]).reset_index(drop=True)

# usage:
# df = pd.read_csv("../raw/schedules_raw.csv")
df = to_team_games(df)

Append injury.csv data to the df

In [69]:
import numpy as np

# capture columns to report what's new at the end
_prev_cols = set(df.columns)

# --- load & select ---
inj = pd.read_csv("../raw/injuries_raw.csv")
inj = inj[["season","week","team","position","report_status","practice_status"]].copy()

# normalize
inj["team"] = inj["team"].astype(str)
inj["position"] = inj["position"].str.upper().str.strip()

# --- severity mappings ---
report_map = {
    "OUT": 1.00,
    "DOUBTFUL": 0.75,
    "QUESTIONABLE": 0.50,
    "PROBABLE": 0.25,
}
practice_map = {
    "DID NOT PARTICIPATE IN PRACTICE": 1.00,
    "LIMITED PARTICIPATION IN PRACTICE": 0.50,
    "FULL PARTICIPATION IN PRACTICE": 0.00,
}
inj["report_sev"]   = inj["report_status"].str.upper().map(report_map).fillna(0.0)
inj["practice_sev"] = inj["practice_status"].str.upper().map(practice_map).fillna(0.0)

# --- per-position counts (inj_*) ---
counts = (
    inj.groupby(["season","week","team","position"])
       .size().unstack(fill_value=0)
       .add_prefix("inj_")
       .reset_index()
)

# --- per-position severity (mean) for report/practice (sev_* / prac_sev_*) ---
sev_rep = (
    inj.groupby(["season","week","team","position"])["report_sev"]
       .mean().unstack(fill_value=0)
       .add_prefix("sev_")
       .reset_index()
)
sev_prac = (
    inj.groupby(["season","week","team","position"])["practice_sev"]
       .mean().unstack(fill_value=0)
       .add_prefix("prac_sev_")
       .reset_index()
)

# --- merge onto df ---
df = df.merge(counts,  on=["season","week","team"], how="left")
df = df.merge(sev_rep, on=["season","week","team"], how="left")
df = df.merge(sev_prac,on=["season","week","team"], how="left")

# fill any injury-derived NaNs with 0
for c in df.columns:
    if c.startswith(("inj_","sev_","prac_sev_")):
        df[c] = df[c].fillna(0.0)

# --- grouped features (sensible football buckets) ---
def _sum_cols(cols):
    present = [c for c in cols if c in df.columns]
    return df[present].sum(axis=1) if present else 0.0

# position families we might see in the CSV
OL = ["C","G","T","OL"]                # offensive line
DL = ["DE","DT","NT","DL"]            # defensive line
SKILL = ["RB","WR","TE"]              # offensive skill (QB handled separately)
SEC = ["CB","S","DB"]                 # secondary

# counts
df["inj_qb_flag"]        = (df.get("inj_QB", 0) > 0).astype(int)
df["inj_skill"]          = _sum_cols([f"inj_{p}" for p in SKILL])
df["inj_ol"]             = _sum_cols([f"inj_{p}" for p in OL])
df["inj_dl"]             = _sum_cols([f"inj_{p}" for p in DL])
df["inj_secondary"]      = _sum_cols([f"inj_{p}" for p in SEC])
df["inj_front7"]         = df["inj_dl"] + df.get("inj_LB", 0)
df["inj_total"]          = _sum_cols([c for c in df.columns if c.startswith("inj_")])

# severity (report-based)
df["sev_qb"]             = df.get("sev_QB", 0.0)
df["sev_skill_mean"]     = _sum_cols([f"sev_{p}" for p in SKILL]) / np.maximum(1, (df[ [c for c in [f"inj_{p}" for p in SKILL] if c in df.columns] ] > 0).sum(axis=1))
df["sev_ol_mean"]        = _sum_cols([f"sev_{p}" for p in OL])   / np.maximum(1, (df[ [c for c in [f"inj_{p}" for p in OL]    if c in df.columns] ] > 0).sum(axis=1))
df["sev_dl_mean"]        = _sum_cols([f"sev_{p}" for p in DL])   / np.maximum(1, (df[ [c for c in [f"inj_{p}" for p in DL]    if c in df.columns] ] > 0).sum(axis=1))
df["sev_secondary_mean"] = _sum_cols([f"sev_{p}" for p in SEC])  / np.maximum(1, (df[ [c for c in [f"inj_{p}" for p in SEC]   if c in df.columns] ] > 0).sum(axis=1))
df["sev_total_mean"]     = _sum_cols([c for c in df.columns if c.startswith("sev_")]) / np.maximum(1, (df[ [c for c in df.columns if c.startswith("inj_")] ] > 0).sum(axis=1))

# practice severity (optional overall index)
df["prac_sev_total_mean"] = _sum_cols([c for c in df.columns if c.startswith("prac_sev_")]) / np.maximum(1, (df[ [c for c in df.columns if c.startswith("inj_")] ] > 0).sum(axis=1))

# finalize: replace any remaining inf/NaN from divisions
for c in ["sev_skill_mean","sev_ol_mean","sev_dl_mean","sev_secondary_mean","sev_total_mean","prac_sev_total_mean"]:
    df[c] = df[c].replace([np.inf, -np.inf], np.nan).fillna(0.0)

# ========== ONLY ADDITION: leak-safe shift by 1 prior game (TEAM) ==========
inj_cols_base = [c for c in df.columns if c.startswith(("inj_","sev_","prac_sev_"))]
inj_grouped   = [c for c in ["inj_qb_flag","inj_skill","inj_ol","inj_dl",
                             "inj_secondary","inj_front7","inj_total",
                             "sev_qb","sev_skill_mean","sev_ol_mean",
                             "sev_dl_mean","sev_secondary_mean",
                             "sev_total_mean","prac_sev_total_mean"]
                 if c in df.columns]
inj_all = inj_cols_base + inj_grouped

for col in inj_all:
    df[f"{col}_prior1"] = (
        df.sort_values(["team","season","week"])
          .groupby(["team","season"], sort=False)[col]
          .shift(1)
    )

# fill missing priors with 0 (meaning: no info / no injuries last week)
prior_cols = [f"{c}_prior1" for c in inj_all]

for c in prior_cols:
    if c in df.columns:
        df[c] = df[c].fillna(0.0)

# list of added variables
_added_cols = sorted(list(set(df.columns) - _prev_cols))
print(f"Added {len(_added_cols)} injury features.")

Added 130 injury features.


Roll back features to ensure model only has access to past data, adding prior_stats columns as necessary

In [70]:
def add_prior_features(
    team_games: pd.DataFrame,
    cols=None,
    group_keys=("team","season"),
    order_keys=("season","week","date"),
    lags=(1,),
    name_style="auto",
    fill=None
):
    g = team_games.copy()

    # ensure proper sort for shifting
    g["week"] = pd.to_numeric(g["week"], errors="ignore")
    g = g.sort_values(list(group_keys) + list(order_keys))

    # default: all numeric team_* columns
    if cols is None:
        cols = [c for c in g.columns if c.startswith("team_") and pd.api.types.is_numeric_dtype(g[c])]

    def prior_name(col, lag):
        if name_style == "suffix":
            return f"{col}_prior{lag}"
        if col.startswith("team_"):
            return col.replace("team_", f"team_prior{'' if lag==1 else f'{lag}_'}", 1)
        return f"{col}_prior{lag}"

    # compute lags per team-season
    for lag in lags:
        lagged = (
            g.groupby(list(group_keys), group_keys=False)[cols]
             .shift(lag)
             .rename(columns={c: prior_name(c, lag) for c in cols})
        )
        g = pd.concat([g, lagged], axis=1)

    # optional fill for first game(s) of season
    if fill is not None:
        new_cols = [prior_name(c, lag) for c in cols for lag in lags]
        if fill == "ffill":
            g[new_cols] = (
                g.groupby(list(group_keys), group_keys=False)[new_cols]
                 .apply(lambda x: x.ffill())
            )
        else:
            g[new_cols] = g[new_cols].fillna(fill)

    # --- NEW: drop duplicate columns ---
    g = g.loc[:, ~g.columns.duplicated()]

    return g

# ---- Team stats to lag ----
team_cols_to_lag = [
    "team_pass_att",
    "team_pass_cmp",
    "team_pass_yds",
    "team_pass_td",
    "team_pass_int",
    "team_pass_sacked",
    "team_pass_sacked_yds",
    
    "team_rush_att",
    "team_rush_yds",
    "team_rush_td",
    
    "team_first_down",
    "team_turnovers",
    
    "team_penalties",
    "team_penalties_yds",
    
    "team_fga", "team_fgm",       # field goals
    "team_xpa", "team_xpm",       # extra points
    
    "team_punt",
    "team_punt_yds",
    
    "team_plays_offense",
    "team_score"
]

# ---- Opponent stats to lag ----
opp_cols_to_lag = [
    "opp_pass_att",
    "opp_pass_cmp",
    "opp_pass_yds",
    "opp_pass_td",
    "opp_pass_int",
    "opp_pass_sacked",
    "opp_pass_sacked_yds",
    
    "opp_rush_att",
    "opp_rush_yds",
    "opp_rush_td",
    
    "opp_first_down",
    "opp_turnovers",
    
    "opp_penalties",
    "opp_penalties_yds",
    
    "opp_fga", "opp_fgm",
    "opp_xpa", "opp_xpm",
    
    "opp_punt",
    "opp_punt_yds",
    
    "opp_plays_offense",
    "opp_score"
]

# Team priors
df = add_prior_features(
    df,
    cols=team_cols_to_lag,
    group_keys=("team","season"),
    name_style="suffix"
)

# Opponent priors
df = add_prior_features(
    df,
    cols=opp_cols_to_lag,
    group_keys=("opponent","season"),
    name_style="suffix"
)

In [71]:
import numpy as np

# Example: rolling averages for selected features
team_roll = [
    "team_pass_yds", "team_rush_yds", "team_pass_td",
    "team_rush_td", "team_turnovers", "team_score",
    # extras
    "team_pass_att", "team_rush_att",
    "team_first_down", "team_penalties",
    "team_plays_offense",
    "team_pass_sacked",
    "team_fga"
]

opp_roll = [
    "opp_pass_yds", "opp_rush_yds", "opp_pass_td",
    "opp_rush_td", "opp_turnovers", "opp_score",
    # extras
    "opp_pass_att", "opp_rush_att",
    "opp_first_down", "opp_penalties",
    "opp_plays_offense",
    "opp_pass_sacked"
]

# --- roll3 / roll5 / roll10 ---
for w in (3, 5, 10):
    for col in team_roll:
        df[f"{col}_roll{w}"] = (
            df.groupby("team")[col]
              .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
        )
    for col in opp_roll:
        df[f"{col}_roll{w}"] = (
            df.groupby("opponent")[col]
              .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
        )

# --- season-to-date mean (before current game) ---
for col in team_roll:
    grp = df.groupby(["team","season"], sort=False)[col]
    csum = grp.cumsum().shift(1)
    cnt  = grp.cumcount()
    df[f"{col}_season"] = np.where(cnt > 0, csum / cnt, np.nan)

for col in opp_roll:
    grp = df.groupby(["opponent","season"], sort=False)[col]
    csum = grp.cumsum().shift(1)
    cnt  = grp.cumcount()
    df[f"{col}_season"] = np.where(cnt > 0, csum / cnt, np.nan)

# --- exponentially weighted momentum ---
alpha = 0.3
for col in team_roll:
    df[f"{col}_ewm"] = (
        df.groupby("team")[col]
          .apply(lambda x: x.shift(1).ewm(alpha=alpha, adjust=False, min_periods=1).mean())
    )
for col in opp_roll:
    df[f"{col}_ewm"] = (
        df.groupby("opponent")[col]
          .apply(lambda x: x.shift(1).ewm(alpha=alpha, adjust=False, min_periods=1).mean())
    )

# ---------- Safe division helper ----------
def safe_div(num, den):
    return np.where(den.astype(float) != 0, num.astype(float) / den.astype(float), np.nan)

# ---------- Windows to compute ----------
windows = ["roll3", "roll5", "roll10", "season"]

def rate_if_exists(lhs_base, rhs_base, out_base, windows):
    for w in windows:
        a, b, out = f"{lhs_base}_{w}", f"{rhs_base}_{w}", f"{out_base}_{w}"
        if a in df.columns and b in df.columns:
            df[out] = safe_div(df[a], df[b])

# ---------- Team + Opp Efficiency Rates ----------
# Yards/attempt
rate_if_exists("team_pass_yds", "team_pass_att", "team_pass_ypa", windows)
rate_if_exists("opp_pass_yds",  "opp_pass_att",  "opp_pass_ypa",  windows)

# TD rate
rate_if_exists("team_pass_td", "team_pass_att", "team_pass_td_rate", windows)
rate_if_exists("opp_pass_td",  "opp_pass_att",  "opp_pass_td_rate", windows)

# INT rate
rate_if_exists("team_pass_int", "team_pass_att", "team_int_rate", windows)
rate_if_exists("opp_pass_int",  "opp_pass_att",  "opp_int_rate",  windows)

# Rush yards/carry
rate_if_exists("team_rush_yds", "team_rush_att", "team_rush_ypc", windows)
rate_if_exists("opp_rush_yds",  "opp_rush_att",  "opp_rush_ypc",  windows)

# Play mix
rate_if_exists("team_pass_att", "team_plays_offense", "team_pass_rate", windows)
rate_if_exists("opp_pass_att",  "opp_plays_offense",  "opp_pass_rate",  windows)
rate_if_exists("team_rush_att", "team_plays_offense", "team_rush_rate", windows)
rate_if_exists("opp_rush_att",  "opp_plays_offense",  "opp_rush_rate",  windows)

# First-down rate
rate_if_exists("team_first_down", "team_plays_offense", "team_fd_rate", windows)
rate_if_exists("opp_first_down",  "opp_plays_offense",  "opp_fd_rate",  windows)

# FG attempt rate
rate_if_exists("team_fga", "team_plays_offense", "team_fga_rate", windows)
rate_if_exists("opp_fga",  "opp_plays_offense",  "opp_fga_rate",  windows)

# ---------- Differentials (matchup framing) ----------
# Raw stats (example subset)
raw_pairs = [
    "pass_yds","rush_yds","pass_td","rush_td","turnovers","score","first_down"
]
for stat in raw_pairs:
    for w in windows:
        tcol, ocol, out = f"team_{stat}_{w}", f"opp_{stat}_{w}", f"diff_{stat}_{w}"
        if tcol in df.columns and ocol in df.columns:
            df[out] = df[tcol] - df[ocol]

# Efficiency differentials
eff_pairs = ["pass_ypa","rush_ypc","pass_td_rate","int_rate","fd_rate","fga_rate"]
for stat in eff_pairs:
    for w in windows:
        tcol, ocol, out = f"team_{stat}_{w}", f"opp_{stat}_{w}", f"diff_{stat}_{w}"
        if tcol in df.columns and ocol in df.columns:
            df[out] = df[tcol] - df[ocol]

# Mix differentials
for stat in ["pass_rate","rush_rate"]:
    for w in windows:
        tcol, ocol, out = f"team_{stat}_{w}", f"opp_{stat}_{w}", f"diff_{stat}_{w}"
        if tcol in df.columns and ocol in df.columns:
            df[out] = df[tcol] - df[ocol]

# ---------- Point Differential windows ----------
if "point_diff" in df.columns:
    for w in (3, 5, 10):
        df[f"point_diff_roll{w}"] = (
            df.groupby("team")["point_diff"]
              .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
        )
    g = df.groupby(["team","season"], sort=False)["point_diff"]
    df["point_diff_season"] = np.where(
        g.cumcount() > 0, g.cumsum().shift(1) / g.cumcount(), np.nan
    )

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_k

In [72]:
# 1) Opponent win % BEFORE this game
df["team_win_pre"]   = df.groupby("team")["team_win"].shift(1).cumsum()
df["team_games_pre"] = df.groupby("team").cumcount()
df["team_win_pct_pre"] = np.where(
    df["team_games_pre"] > 0, df["team_win_pre"] / df["team_games_pre"], np.nan
)

# opponent's win% prior to this game (must be lagged!)
df["opponent_win_pct_pre"] = df.groupby("opponent")["team_win_pct_pre"].shift(1)

# 2) SoS: mean opponent win% over last N games (team & opp mirrors)
for w in (3, 5, 10):
    df[f"team_sos_win_pct_roll{w}"] = (
        df.groupby("team")["opponent_win_pct_pre"]
          .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
    )
    df[f"opp_sos_win_pct_roll{w}"] = (
        df.groupby("opponent")["team_win_pct_pre"]
          .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
    )

# 3) Team rolling point diff (if not already built elsewhere)
for w in (3, 5, 10):
    col = f"team_point_diff_roll{w}"
    if col not in df.columns:
        df[col] = (
            df.groupby("team")["point_diff"]
              .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        )

# 4) League-average point diff per week, then lagged rolling by SEASON
#    (avoids same-week peeking)
league_week = (
    df.groupby(["season","week"], as_index=False)["point_diff"]
      .mean()
      .rename(columns={"point_diff": "league_avg_pd"})
      .sort_values(["season","week"])
)

for w in (3, 5, 10):
    league_week[f"league_point_diff_roll{w}"] = (
        league_week.groupby("season")["league_avg_pd"]
                   .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
    )

# attach back
df = df.merge(league_week, on=["season","week"], how="left")

# 5) Adjusted PD = team rolling PD − league rolling PD (same window)
for w in (3, 5, 10):
    df[f"team_adj_point_diff_roll{w}"] = (
        df[f"team_point_diff_roll{w}"] - df[f"league_point_diff_roll{w}"]
    )

  df["team_win_pre"]   = df.groupby("team")["team_win"].shift(1).cumsum()
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
To preserve the previous behavi

In [73]:
# --- Win & loss streaks ---
def calc_streak(series, win_val=1):
    streak = []
    count = 0
    for val in series:
        if val == win_val:
            count += 1
        else:
            count = 0
        streak.append(count)
    return pd.Series(streak, index=series.index)

df["team_win_streak_pre"] = (
    df.groupby("team")["team_win"].apply(lambda x: calc_streak(x.shift(1).fillna(0), 1))
)
df["team_loss_streak_pre"] = (
    df.groupby("team")["team_win"].apply(lambda x: calc_streak(x.shift(1).fillna(1), 0))
)

# --- Blowout & close-game rates ---
for w in (3, 5, 10):
    # blowouts: |PD| >= 14
    df[f"blowout_rate_roll{w}"] = (
        df.groupby("team")["point_diff"]
          .apply(lambda x: x.shift(1).rolling(w, min_periods=1).apply(
              lambda s: np.mean(np.abs(s) >= 14)))
    )
    # close games: |PD| <= 3
    df[f"close_game_rate_roll{w}"] = (
        df.groupby("team")["point_diff"]
          .apply(lambda x: x.shift(1).rolling(w, min_periods=1).apply(
              lambda s: np.mean(np.abs(s) <= 3)))
    )


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df.groupby("team")["team_win"].apply(lambda x: calc_streak(x.shift(1).fillna(0), 1))
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df.groupby("team")["team_win"].apply(lambda x: calc_streak(x.shift(1).fillna(1), 0))
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).apply(
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).apply(
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(w, min_periods=1).apply(
To preserve the previous behavior, use

	>>> .groupby(..., gro

In [74]:
# =========================
# 7) HOME / VENUE / WEATHER
# =========================

def to_flag(s):  # -> 0/1 Int8, NA -> 0
    return s.fillna(False).astype("Int8")

if "temp" in df.columns:
    df["temp_bin_le_32"] = to_flag(df["temp"] <= 32)
    df["temp_bin_33_60"] = to_flag((df["temp"] > 32) & (df["temp"] <= 60))
    df["temp_bin_61_80"] = to_flag((df["temp"] > 60) & (df["temp"] <= 80))
    df["temp_bin_gt_80"] = to_flag(df["temp"] > 80)
    df["extreme_cold"]   = to_flag(df["temp"] <= 32)

if "wind" in df.columns:
    df["wind2"] = df["wind"].astype(float) ** 2  # non-linear wind effect

if {"is_home","temp"}.issubset(df.columns):
    # leave float to allow NA in temp
    df["is_home_x_temp"] = df["is_home"].astype(float) * df["temp"]

if {"is_home","short_rest"}.issubset(df.columns):
    df["is_home_x_short_rest"] = to_flag(df["is_home"].astype(bool) & df["short_rest"].astype(bool))

if "roof" in df.columns:
    roof_l = df["roof"].astype("string").str.lower()
    df["roof_dome"] = to_flag(roof_l.isin(["dome","indoor","closed"]))

if "surface" in df.columns:
    surf_l = df["surface"].astype("string").str.lower()
    df["surface_turf"] = to_flag(surf_l.str.contains("turf|artificial", na=False, regex=True))

# =================================
# 8) SEASON CONTEXT & GAME TIMING
# =================================

if "week" in df.columns:
    df["week_number"] = df["week"].astype(float)
    df["week_scaled"] = df["week_number"] / 18.0
    df["late_season"] = to_flag(df["week_number"] >= 14)

if {"team","season","week","team_win"}.issubset(df.columns):
    df["team_win_pre"]   = df.groupby("team")["team_win"].shift(1).cumsum()
    df["team_games_pre"] = df.groupby("team").cumcount()
    df["team_wins_pre"]  = df["team_win_pre"].fillna(0).astype(int)

    if "late_season" in df.columns:
        df["must_win_proxy"] = to_flag((df["team_wins_pre"] >= 8) & (df["late_season"] == 1))

# Divisional flag if you have division info
if {"team_division","opponent_division"}.issubset(df.columns):
    df["is_divisional"] = to_flag(df["team_division"] == df["opponent_division"])

# Travel/timezone proxies if available
if {"team_tz","game_tz"}.issubset(df.columns):
    df["west_to_east"] = to_flag(df["team_tz"] > df["game_tz"])
    df["east_to_west"] = to_flag(df["team_tz"] < df["game_tz"])

In [75]:
import numpy as np
import pandas as pd

def safe_div(a, b):
    a = a.astype(float)
    b = b.astype(float)
    return np.where(b != 0, a / b, np.nan)

# =========================
# 10) SPECIAL TEAMS PROXIES
# =========================
# FG per-play rate + differentials (uses your existing roll windows)
for w in (3, 5, 10):
    fga_t   = f"team_fga_roll{w}"
    plays_t = f"team_plays_offense_roll{w}"
    fga_o   = f"opp_fga_roll{w}"
    plays_o = f"opp_plays_offense_roll{w}"

    if fga_t in df.columns and plays_t in df.columns:
        df[f"team_fg_per_play_rate_roll{w}"] = safe_div(df[fga_t], df[plays_t])

    if fga_o in df.columns and plays_o in df.columns:
        df[f"opp_fg_per_play_rate_roll{w}"]  = safe_div(df[fga_o], df[plays_o])

    tcol = f"team_fg_per_play_rate_roll{w}"
    ocol = f"opp_fg_per_play_rate_roll{w}"
    if tcol in df.columns and ocol in df.columns:
        df[f"diff_fg_per_play_rate_roll{w}"] = df[tcol] - df[ocol]

# (Optional FG "make opportunity" = attempts per game-in-window)
# If you want it later: team_fga_roll{w} / w

# ================================================
# 11) SEASON-TO-DATE Z-SCORES (normalize in-season)
# ================================================
# We’ll z-score the season-to-date versions you already created:
#   team_pass_ypa_season, team_rush_ypc_season, team_fd_rate_season
# Mean/STD are computed per season using only PRIOR weeks (shifted).

season_stats = ["team_pass_ypa", "team_rush_ypc", "team_fd_rate"]

for base in season_stats:
    col = f"{base}_season"
    if col not in df.columns:
        continue

    # per-season expanding mean/std, both lagged so week N uses data up to N-1
    grp = df.groupby("season")[col]
    mu  = grp.apply(lambda s: s.expanding(min_periods=1).mean().shift(1)).reset_index(level=0, drop=True)
    sd  = grp.apply(lambda s: s.expanding(min_periods=2).std(ddof=0).shift(1)).reset_index(level=0, drop=True)

    zname = f"{base}_season_z"
    df[zname] = np.where((sd.astype(float) > 0) & (~pd.isna(sd)),
                         (df[col].astype(float) - mu.astype(float)) / sd.astype(float),
                         np.nan)

# ==================================================
# 12) HIGH-VALUE INTERACTIONS (kept deliberately few)
# ==================================================
# is_home * team_win_pct_pre
if "is_home" in df.columns and "team_win_pct_pre" in df.columns:
    df["int_is_home__team_win_pct_pre"] = df["is_home"].astype(float) * df["team_win_pct_pre"].astype(float)

# is_home * diff_pass_ypa_roll5
if "is_home" in df.columns and "diff_pass_ypa_roll5" in df.columns:
    df["int_is_home__diff_pass_ypa_roll5"] = df["is_home"].astype(float) * df["diff_pass_ypa_roll5"].astype(float)

# short_rest * diff_rush_ypc_roll3
if "short_rest" in df.columns and "diff_rush_ypc_roll3" in df.columns:
    # keep as float so missing short_rest doesn’t crash
    df["int_short_rest__diff_rush_ypc_roll3"] = df["short_rest"].astype(float) * df["diff_rush_ypc_roll3"].astype(float)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  mu  = grp.apply(lambda s: s.expanding(min_periods=1).mean().shift(1)).reset_index(level=0, drop=True)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  sd  = grp.apply(lambda s: s.expanding(min_periods=2).std(ddof=0).shift(1)).reset_index(level=0, drop=True)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  mu  = grp.apply(lambda s: s.expanding(min_periods=1).mean().shift(1)).reset_index(level=0, drop=True)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  sd  = grp.apply(lambda s: s.expanding(min_periods=2).std(ddof=0).shift(1)).reset_index(level=0, drop=True)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)

In [76]:
# --- one-hot encode roof & surface early (cleaning phase) ---
if "roof" in df.columns:
    df["roof"] = df["roof"].astype(str).str.strip().str.lower()
if "surface" in df.columns:
    df["surface"] = df["surface"].astype(str).str.strip().str.lower()

df = pd.get_dummies(
    df,
    columns=[c for c in ["roof","surface"] if c in df.columns],
    prefix=["roof","surface"],
    dummy_na=False
)

# optional: enforce consistent columns (avoids KeyErrors across train/test)
expected_roof = ["roof_closed","roof_open","roof_outdoors","roof_retractable","roof_dome"]
expected_surface = ["surface_grass","surface_turf","surface_fieldturf","surface_astroturf"]

for col in expected_roof + expected_surface:
    if col not in df.columns:
        df[col] = 0


Add point differential metrics

In [77]:
# --- Prior game point differential (1-game lag)
df["point_diff_prior1"] = (
    df.groupby("team")["point_diff"].shift(1)
)

# --- Rolling 3-game average point differential
df["point_diff_roll3"] = (
    df.groupby("team")["point_diff"]
      .apply(lambda x: x.shift(1).rolling(3, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda x: x.shift(1).rolling(3, min_periods=1).mean())


In [78]:
#impute averages

#impute probablitiy feature without vig
import numpy as np

def moneyline_to_prob(ml):
    """Convert American odds to implied probability (with vig)."""
    if ml < 0:
        return -ml / (-ml + 100)
    else:
        return 100 / (ml + 100)

def remove_vig_prob(team_ml, opp_ml):
    """
    Convert team/opponent moneylines into normalized probabilities.
    Returns (team_prob, opp_prob) with vig removed.
    """
    p_team = moneyline_to_prob(team_ml)
    p_opp = moneyline_to_prob(opp_ml)
    total = p_team + p_opp
    return p_team / total, p_opp / total

# --- Apply to your DataFrame ---
df["team_prob_novig"], df["opp_prob_novig"] = zip(
    *df[["team_moneyline", "opp_moneyline"]].apply(
        lambda x: remove_vig_prob(x["team_moneyline"], x["opp_moneyline"]), axis=1
    )
)

General Cleaning
1. Check for low variability columns
2. Check for missingness

In [79]:
print("Constant columns:", df.columns[df.nunique(dropna=True) <= 1].tolist())

const_cols = [
    'team_pass_sacked_yds', 'team_punt', 'team_punt_yds',
    'opp_pass_sacked_yds', 'opp_punt', 'opp_punt_yds',
    'team_pass_sacked_yds_prior1', 'team_punt_prior1', 'team_punt_yds_prior1',
    'opp_pass_sacked_yds_prior1', 'opp_punt_prior1', 'opp_punt_yds_prior1'
]

df = df.drop(columns=const_cols)

print("Constant columns:", df.columns[df.nunique(dropna=True) <= 1].tolist())

df = df.sort_values(["season", "week", "team"]).reset_index(drop=True)

# Assume df is your DataFrame
missing = (
    df.isna()
      .sum()
      .reset_index()
      .rename(columns={"index": "column", 0: "missing_count"})
)

# Add % missing
missing["missing_pct"] = (missing["missing_count"] / len(df)) * 100

# Sort by % missing
missing = missing.sort_values("missing_pct", ascending=False).reset_index(drop=True)

print(missing.head(20))  # Top 20 columns with missing values

Constant columns: ['team_pass_sacked_yds', 'team_punt', 'team_punt_yds', 'opp_pass_sacked_yds', 'opp_punt', 'opp_punt_yds', 'sev_', 'sev__prior1', 'team_pass_sacked_yds_prior1', 'team_punt_prior1', 'team_punt_yds_prior1', 'opp_pass_sacked_yds_prior1', 'opp_punt_prior1', 'opp_punt_yds_prior1', 'league_avg_pd', 'league_point_diff_roll3', 'league_point_diff_roll5', 'league_point_diff_roll10', 'roof_retractable']
Constant columns: ['sev_', 'sev__prior1', 'league_avg_pd', 'league_point_diff_roll3', 'league_point_diff_roll5', 'league_point_diff_roll10', 'roof_retractable']
                      column  missing_count  missing_pct
0             is_home_x_temp           2008    33.355482
1                       wind           2008    33.355482
2                       temp           2008    33.355482
3                      wind2           2008    33.355482
4   diff_pass_td_rate_season            989    16.428571
5      diff_rush_rate_season            989    16.428571
6       diff_pass_yds_seaso

Adding feature for number of rest days since last game: rest_days

In [80]:
import numpy as np

# Make sure date is datetime
df['date'] = pd.to_datetime(df['date'])

# Sort so diffs are correct
df = df.sort_values(['team', 'season', 'week', 'date'])

# Previous game date and season for each team
df['__prev_date'] = df.groupby('team')['date'].shift(1)
df['__prev_season'] = df.groupby('team')['season'].shift(1)

# Rest days only within the same season (avoid giant off-season gaps)
rest_days = (df['date'] - df['__prev_date']).dt.days
df['rest_days'] = np.where(df['season'].eq(df['__prev_season']), rest_days, np.nan)

# Helpful indicators (set to <NA> when rest_days is NaN)
df['short_rest'] = (df['rest_days'] <= 6).astype('Int64')
df['bye_week']   = (df['rest_days'] >= 13).astype('Int64')
df.loc[df['rest_days'].isna(), ['short_rest', 'bye_week']] = pd.NA

# Drop helper columns
df.drop(columns=['__prev_date','__prev_season'], inplace=True)


Game and Season Record Data

In [81]:
import numpy as np

# Chronological order
df = df.sort_values(["team", "season", "week"])
g  = df.groupby(["team", "season"], sort=False)

# Previous week's win (within-season)
df["team_win_prev1"] = g["team_win"].shift(1).fillna(0).astype(int)

# Games played BEFORE this game (0 for season opener)
df["team_games_played_pre"] = g.cumcount().astype(int)

# Wins BEFORE this game: shift THEN cumsum (within-season)
df["team_wins_pre"] = g["team_win"].shift(1).fillna(0).astype(int)
df["team_wins_pre"] = g["team_wins_pre"].cumsum().astype(int)

# Losses BEFORE this game
df["team_losses_pre"] = (df["team_games_played_pre"] - df["team_wins_pre"]).astype(int)

# Win% BEFORE this game (NaN for opener; fill 0.0 if you prefer)
df["team_win_pct_pre"] = np.where(
    df["team_games_played_pre"] > 0,
    df["team_wins_pre"] / df["team_games_played_pre"],
    np.nan
).astype(float)

Print to the new intermediate dataset

In [82]:
# save
df.to_csv("../intermediate/schedules_cleaned.csv", index=False)
print(df.head())

             game_id  season  week       date team opponent  is_home  \
0     2014_01_SD_ARI    2014     1 2014-09-08  ARI       SD        1   
32   2014_02_ARI_NYG    2014     2 2014-09-14  ARI      NYG        0   
64    2014_03_SF_ARI    2014     3 2014-09-21  ARI       SF        1   
122  2014_05_ARI_DEN    2014     5 2014-10-05  ARI      DEN        0   
152  2014_06_WAS_ARI    2014     6 2014-10-12  ARI      WAS        1   

     team_score  opp_score  team_win  ...  roof_retractable point_diff_prior1  \
0          18.0       17.0         1  ...                 0              14.0   
32         25.0       14.0         1  ...                 0               8.0   
64         23.0       14.0         1  ...                 0             -12.0   
122        20.0       41.0         0  ...                 0              12.0   
152        30.0       20.0         1  ...                 0              25.0   

    team_prob_novig  opp_prob_novig  rest_days short_rest  bye_week  \
0        

In [83]:
print(df.columns.tolist())

['game_id', 'season', 'week', 'date', 'team', 'opponent', 'is_home', 'team_score', 'opp_score', 'team_win', 'point_diff', 'home_team', 'away_team', 'home_score', 'away_score', 'Winner', 'spread_line', 'total_line', 'stadium', 'temp', 'wind', 'game_type', 'weekday', 'gametime', 'location', 'referee', 'team_coach', 'team_fga', 'team_fgm', 'team_first_down', 'team_fumbles_lost', 'team_moneyline', 'team_pass_att', 'team_pass_cmp', 'team_pass_int', 'team_pass_sacked', 'team_pass_td', 'team_pass_yds', 'team_penalties', 'team_penalties_yds', 'team_plays_offense', 'team_rush_att', 'team_rush_td', 'team_rush_yds', 'team_turnovers', 'team_xpa', 'team_xpm', 'opp_coach', 'opp_fga', 'opp_fgm', 'opp_first_down', 'opp_fumbles_lost', 'opp_moneyline', 'opp_pass_att', 'opp_pass_cmp', 'opp_pass_int', 'opp_pass_sacked', 'opp_pass_td', 'opp_pass_yds', 'opp_penalties', 'opp_penalties_yds', 'opp_plays_offense', 'opp_rush_att', 'opp_rush_td', 'opp_rush_yds', 'opp_turnovers', 'opp_xpa', 'opp_xpm', 'inj_', 'inj