# 03. Dynamic Feature Generation (Rolling EPA + In-Season Style Clusters)

**Goal:**  
Create per-game dynamic features based on rolling EPA (last 4 games) and assign in-season dynamic style clusters (Balanced / Offense-heavy / Struggling).

**Contents:**  
- Load play-by-play data and aggregate it into team-game EPA  
- Compute team_game_number (1, 2, 3, … per team within each season)  
- Rolling 4-game EPA: offense/defense/net  
- Label each game with an in-season window (G1–4, G5–8, G9–12, G13+)  
- Cluster the rolling EPA features (a single K-Means fit over all team-games) to derive dynamic styles  
- Produce full team-game dynamic dataset  

**Output File Saved:**  
- `data/processed/team_game_dynamic_clusters_rollingEPA.csv`  

Used directly in Notebook 04 (win prediction).

## 1.  Imports & paths

In [49]:
# Cell 1 — Setup paths and load play-by-play data

from pathlib import Path
import pandas as pd
import numpy as np

# All paths hang off the current working directory, which is expected to be
# .../nfl-epa-analysis/notebooks (the data folder lives inside "notebooks").
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR  # "notebooks" doubles as the root for data
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("DATA_DIR:", DATA_DIR)
print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)

PBP_PARQUET = RAW_DIR.joinpath("pbp_2019_2024.parquet")

# Fail early, with a pointer to the expected location, when the input is missing.
if not PBP_PARQUET.exists():
    raise FileNotFoundError(
        f"Play-by-play parquet not found at {PBP_PARQUET}.\n"
        "Make sure pbp_2019_2024.parquet exists in notebooks/data/raw."
    )

pbp = pd.read_parquet(PBP_PARQUET)

# Quick look at what was loaded.
print("PBP shape:", pbp.shape)
print(pbp.head())

NOTEBOOK_DIR: /Users/minseobeom/Desktop/nfl-epa-analysis/notebooks
DATA_DIR: /Users/minseobeom/Desktop/nfl-epa-analysis/notebooks/data
RAW_DIR: /Users/minseobeom/Desktop/nfl-epa-analysis/notebooks/data/raw
PROCESSED_DIR: /Users/minseobeom/Desktop/nfl-epa-analysis/notebooks/data/processed
PBP shape: (293478, 10)
   play_id          game_id  season  week posteam defteam home_team away_team  \
0      1.0  2019_01_ATL_MIN    2019     1    None    None       MIN       ATL   
1     36.0  2019_01_ATL_MIN    2019     1     ATL     MIN       MIN       ATL   
2     51.0  2019_01_ATL_MIN    2019     1     ATL     MIN       MIN       ATL   
3     79.0  2019_01_ATL_MIN    2019     1     ATL     MIN       MIN       ATL   
4    100.0  2019_01_ATL_MIN    2019     1     ATL     MIN       MIN       ATL   

        epa play_type  
0 -0.000000      None  
1 -0.000000   kickoff  
2 -1.658763      pass  
3 -0.538914       run  
4  0.142138       run  


## 2.  Team-game EPA (offense/defense)

In [50]:
# Cell 2 — Build team-game offensive and defensive EPA

# Restrict to plays that have an offense (posteam) and, when the column is
# available, to run/pass plays only.
pbp_off = pbp.loc[pbp["posteam"].notna()].copy()

if "play_type" in pbp_off.columns:
    is_run_or_pass = pbp_off["play_type"].isin(["run", "pass"])
    pbp_off = pbp_off[is_run_or_pass]

print("Filtered offensive plays shape:", pbp_off.shape)

game_keys = ["season", "week", "game_id"]

# Mean EPA of the plays each team ran on offense, one row per team-game.
off_epa = pbp_off.groupby(game_keys + ["posteam"], as_index=False)["epa"].mean()
off_epa = off_epa.rename(columns={"posteam": "team", "epa": "off_epa_mean"})

# Mean EPA of the plays each team faced on defense (EPA allowed).
def_epa = pbp_off.groupby(game_keys + ["defteam"], as_index=False)["epa"].mean()
def_epa = def_epa.rename(columns={"defteam": "team", "epa": "def_epa_mean"})

# Outer join so a team-game present on only one side is still kept.
team_game = pd.merge(
    off_epa,
    def_epa,
    on=game_keys + ["team"],
    how="outer",
)

# Net EPA = offensive EPA minus EPA allowed.
team_game["net_epa_mean"] = team_game["off_epa_mean"].sub(team_game["def_epa_mean"])

# Chronological order within each season/team, with a fresh 0..n-1 index.
team_game = team_game.sort_values(
    by=["season", "team", "week", "game_id"],
    ignore_index=True,
)

print("team_game shape:", team_game.shape)
team_game.head(10)

Filtered offensive plays shape: (209597, 10)
team_game shape: (3350, 7)


Unnamed: 0,season,week,game_id,team,off_epa_mean,def_epa_mean,net_epa_mean
0,2019,1,2019_01_DET_ARI,ARI,-0.091637,0.037971,-0.129608
1,2019,2,2019_02_ARI_BAL,ARI,0.110496,0.168465,-0.057969
2,2019,3,2019_03_CAR_ARI,ARI,-0.088133,0.274922,-0.363055
3,2019,4,2019_04_SEA_ARI,ARI,-0.112383,0.156417,-0.2688
4,2019,5,2019_05_ARI_CIN,ARI,0.282629,0.09804,0.184588
5,2019,6,2019_06_ATL_ARI,ARI,0.262744,0.318894,-0.05615
6,2019,7,2019_07_ARI_NYG,ARI,0.082157,-0.194107,0.276264
7,2019,8,2019_08_ARI_NO,ARI,-0.327491,0.227629,-0.55512
8,2019,9,2019_09_SF_ARI,ARI,0.210213,0.221313,-0.0111
9,2019,10,2019_10_ARI_TB,ARI,0.053874,0.012992,0.040882


## 3. Game index per team + segment labels

In [51]:
# Cell 3 — Add game index per team and segment labels

# Re-establish chronological order per season/team so the running count below
# follows the order in which the games were played.
team_game = team_game.sort_values(
    by=["season", "team", "week", "game_id"],
    ignore_index=True,
)

# 1-based running game counter within each (season, team) group.
games_by_team_season = team_game.groupby(["season", "team"])
team_game["team_game_number"] = games_by_team_season.cumcount().add(1)

# Map a team's game index to its in-season window label.
def assign_segment(n):
    """Return the in-season window label for game index ``n``.

    Games up to 4 map to "G1-4", up to 8 to "G5-8", up to 12 to "G9-12",
    and anything beyond that to "G13+".
    """
    # (inclusive upper bound, label) pairs, checked in ascending order.
    windows = ((4, "G1-4"), (8, "G5-8"), (12, "G9-12"))
    for upper_bound, label in windows:
        if n <= upper_bound:
            return label
    return "G13+"

# Label every team-game with its in-season window.
game_numbers = team_game["team_game_number"]
team_game["segment"] = game_numbers.map(assign_segment)

print("team_game with segments:")
team_game[["season", "team", "team_game_number", "segment"]].head(15)

team_game with segments:


Unnamed: 0,season,team,team_game_number,segment
0,2019,ARI,1,G1-4
1,2019,ARI,2,G1-4
2,2019,ARI,3,G1-4
3,2019,ARI,4,G1-4
4,2019,ARI,5,G5-8
5,2019,ARI,6,G5-8
6,2019,ARI,7,G5-8
7,2019,ARI,8,G5-8
8,2019,ARI,9,G9-12
9,2019,ARI,10,G9-12


## 4. Rolling EPA (last 4 games, leakage-safe)

In [52]:
# Cell 4 — Compute rolling EPA over last 4 games (leakage-safe)

# Work on a copy of team_game that carries zero-filled versions of the three
# EPA columns; the rolling step reads the *_filled columns.
tg = team_game.assign(
    off_epa_mean_filled=team_game["off_epa_mean"].fillna(0.0),
    def_epa_mean_filled=team_game["def_epa_mean"].fillna(0.0),
    net_epa_mean_filled=team_game["net_epa_mean"].fillna(0.0),
)

# Order rows by game number within each season/team and renumber the index.
tg = tg.sort_values(
    by=["season", "team", "team_game_number"],
    ignore_index=True,
)

def rolling_last4(s):
    """Return the mean of up to four values preceding each position in ``s``.

    The series is shifted by one first, so the value at a position never
    contributes to its own average; the first position therefore has no
    history and comes back as NaN.
    """
    prior_values = s.shift(1)
    trailing_window = prior_values.rolling(window=4, min_periods=1)
    return trailing_window.mean()

# Rolling features are computed independently for each (season, team) group.
# transform() is used instead of apply(): it always returns a result aligned to
# tg's own index, whereas apply() on a like-indexed result depends on the
# groupby `group_keys` default (the source of the warning this cell used to
# emit) and can hand back a group-keyed index that no longer lines up with tg.
tg["rolling_off_epa_4"] = (
    tg
    .groupby(["season", "team"])["off_epa_mean_filled"]
    .transform(rolling_last4)
)

tg["rolling_def_epa_4"] = (
    tg
    .groupby(["season", "team"])["def_epa_mean_filled"]
    .transform(rolling_last4)
)

tg["rolling_net_epa_4"] = (
    tg
    .groupby(["season", "team"])["net_epa_mean_filled"]
    .transform(rolling_last4)
)

print("tg shape with rolling features:", tg.shape)
tg[[
    "season", "team", "team_game_number",
    "rolling_off_epa_4", "rolling_def_epa_4", "rolling_net_epa_4"
]].head(15)

tg shape with rolling features: (3350, 15)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(rolling_last4)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(rolling_last4)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(rolling_last4)


Unnamed: 0,season,team,team_game_number,rolling_off_epa_4,rolling_def_epa_4,rolling_net_epa_4
0,2019,ARI,1,,,
1,2019,ARI,2,-0.091637,0.037971,-0.129608
2,2019,ARI,3,0.00943,0.103218,-0.093789
3,2019,ARI,4,-0.023091,0.160453,-0.183544
4,2019,ARI,5,-0.045414,0.159444,-0.204858
5,2019,ARI,6,0.048152,0.174461,-0.126309
6,2019,ARI,7,0.086214,0.212068,-0.125854
7,2019,ARI,8,0.128787,0.094811,0.033976
8,2019,ARI,9,0.07501,0.112614,-0.037605
9,2019,ARI,10,0.056906,0.143432,-0.086527


## 5. Prepare data for clustering

In [53]:
# Cell 5 — Prepare feature matrix for K-Means clustering

from sklearn.preprocessing import StandardScaler

# The three rolling EPA columns form the clustering feature matrix.
feature_cols = [
    "rolling_off_epa_4",
    "rolling_def_epa_4",
    "rolling_net_epa_4",
]

# Any NaN rolling value becomes 0.0 so every row can be scaled and clustered.
cluster_features = tg[feature_cols].fillna(0.0)

# Standardize each feature before K-Means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_features)

print("X_scaled shape:", X_scaled.shape)
X_scaled[:5]

X_scaled shape: (3350, 3)


array([[-7.39993056e-04,  5.24986229e-02, -3.61843157e-02],
       [-6.99415479e-01,  3.60750437e-01, -7.50663011e-01],
       [ 7.11549393e-02,  8.90434247e-01, -5.53204134e-01],
       [-1.76797431e-01,  1.35507009e+00, -1.04799165e+00],
       [-3.46995783e-01,  1.34687938e+00, -1.16548739e+00]])

## 6. K-Means & dynamic cluster names

In [54]:
# Cell 6 — Run K-Means and assign dynamic cluster names

from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means on the standardized features and store each row's label.
kmeans = KMeans(n_clusters=3, n_init=20, random_state=42)
tg["dyn_cluster"] = kmeans.fit_predict(X_scaled)

# Cluster centers, one row per cluster id. The model was fit on X_scaled, so
# these values are in standardized units rather than raw EPA.
centers = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=["rolling_off_epa_4", "rolling_def_epa_4", "rolling_net_epa_4"],
)
centers = centers.assign(cluster=centers.index)

print("Cluster centers:")
print(centers)

# Names are assigned purely by rank of the net-EPA center:
# lowest -> "Struggling", middle -> "Balanced", highest -> "Offense-heavy".
centers_sorted = centers.sort_values("rolling_net_epa_4")
lowest, middle, highest = (int(cid) for cid in centers_sorted["cluster"])

cluster_name_map = dict(
    zip(
        (lowest, middle, highest),
        ("Struggling", "Balanced", "Offense-heavy"),
    )
)

tg["dyn_cluster_name"] = tg["dyn_cluster"].map(cluster_name_map)

print("Cluster label distribution:")
print(tg["dyn_cluster_name"].value_counts())
tg[[
    "season", "team", "team_game_number",
    "rolling_off_epa_4", "rolling_def_epa_4", "rolling_net_epa_4",
    "dyn_cluster", "dyn_cluster_name",
]].head(20)

Cluster centers:
   rolling_off_epa_4  rolling_def_epa_4  rolling_net_epa_4  cluster
0           0.444182           0.414829           0.039464        0
1          -1.069581           0.428505          -1.064311        1
2           0.464433          -1.147848           1.115245        2
Cluster label distribution:
Balanced         1458
Struggling        994
Offense-heavy     898
Name: dyn_cluster_name, dtype: int64


Unnamed: 0,season,team,team_game_number,rolling_off_epa_4,rolling_def_epa_4,rolling_net_epa_4,dyn_cluster,dyn_cluster_name
0,2019,ARI,1,,,,0,Balanced
1,2019,ARI,2,-0.091637,0.037971,-0.129608,1,Struggling
2,2019,ARI,3,0.00943,0.103218,-0.093789,0,Balanced
3,2019,ARI,4,-0.023091,0.160453,-0.183544,1,Struggling
4,2019,ARI,5,-0.045414,0.159444,-0.204858,1,Struggling
5,2019,ARI,6,0.048152,0.174461,-0.126309,0,Balanced
6,2019,ARI,7,0.086214,0.212068,-0.125854,0,Balanced
7,2019,ARI,8,0.128787,0.094811,0.033976,0,Balanced
8,2019,ARI,9,0.07501,0.112614,-0.037605,0,Balanced
9,2019,ARI,10,0.056906,0.143432,-0.086527,0,Balanced


## 7. Build final dynamic table and save CSV

In [55]:
# Cell 7 — Build final dynamic cluster table and save CSV

# Select the columns exported to the processed dataset, ordered by game number
# within each season/team.
dynamic_table = tg[[
    "season",
    "week",
    "game_id",
    "team",
    "team_game_number",
    "segment",
    "rolling_off_epa_4",
    "rolling_def_epa_4",
    "rolling_net_epa_4",
    "dyn_cluster",
    "dyn_cluster_name",
]].sort_values(
    ["season", "team", "team_game_number"]
).reset_index(drop=True)

print("Dynamic table shape:", dynamic_table.shape)
print(dynamic_table.head(15))

OUT_CSV = PROCESSED_DIR / "team_game_dynamic_clusters_rollingEPA.csv"

# Only the raw parquet is checked for existence earlier in the notebook, so
# make sure the output folder exists before writing (no-op when it already does).
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
dynamic_table.to_csv(OUT_CSV, index=False)

print(f"Saved dynamic clusters to: {OUT_CSV}")

Dynamic table shape: (3350, 11)
    season  week          game_id team  team_game_number segment  \
0     2019     1  2019_01_DET_ARI  ARI                 1    G1-4   
1     2019     2  2019_02_ARI_BAL  ARI                 2    G1-4   
2     2019     3  2019_03_CAR_ARI  ARI                 3    G1-4   
3     2019     4  2019_04_SEA_ARI  ARI                 4    G1-4   
4     2019     5  2019_05_ARI_CIN  ARI                 5    G5-8   
5     2019     6  2019_06_ATL_ARI  ARI                 6    G5-8   
6     2019     7  2019_07_ARI_NYG  ARI                 7    G5-8   
7     2019     8   2019_08_ARI_NO  ARI                 8    G5-8   
8     2019     9   2019_09_SF_ARI  ARI                 9   G9-12   
9     2019    10   2019_10_ARI_TB  ARI                10   G9-12   
10    2019    11   2019_11_ARI_SF  ARI                11   G9-12   
11    2019    13   2019_13_LA_ARI  ARI                12   G9-12   
12    2019    14  2019_14_PIT_ARI  ARI                13    G13+   
13    2019    15

In [56]:
# Cell 8 — Sanity check: reload dynamic clusters from CSV

import pandas as pd

# Read the saved CSV back to confirm the file is present and parses.
DYN_CSV = PROCESSED_DIR.joinpath("team_game_dynamic_clusters_rollingEPA.csv")
dyn_check = pd.read_csv(DYN_CSV)

# Shape and first rows should match what was written by the previous cell.
print("Reloaded dynamic clusters shape:", dyn_check.shape)
print(dyn_check.head(10))

Reloaded dynamic clusters shape: (3350, 11)
   season  week          game_id team  team_game_number segment  \
0    2019     1  2019_01_DET_ARI  ARI                 1    G1-4   
1    2019     2  2019_02_ARI_BAL  ARI                 2    G1-4   
2    2019     3  2019_03_CAR_ARI  ARI                 3    G1-4   
3    2019     4  2019_04_SEA_ARI  ARI                 4    G1-4   
4    2019     5  2019_05_ARI_CIN  ARI                 5    G5-8   
5    2019     6  2019_06_ATL_ARI  ARI                 6    G5-8   
6    2019     7  2019_07_ARI_NYG  ARI                 7    G5-8   
7    2019     8   2019_08_ARI_NO  ARI                 8    G5-8   
8    2019     9   2019_09_SF_ARI  ARI                 9   G9-12   
9    2019    10   2019_10_ARI_TB  ARI                10   G9-12   

   rolling_off_epa_4  rolling_def_epa_4  rolling_net_epa_4  dyn_cluster  \
0                NaN                NaN                NaN            0   
1          -0.091637           0.037971          -0.129608          