In [52]:
import sys
import logging
from tqdm import tqdm

import pandas as pd
import numpy as np
import nfl_data_py as nfl
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import optuna

import warnings
warnings.filterwarnings("ignore")

sys.path.append('../py')
import preprocess
from nflplotlib import nflplot as nfp

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

LOG = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

RANDOM_SEED = 2

In [57]:
sup_data = pd.read_csv('../data/supplementary_data.csv')
tracking_input, tracking_output = pd.DataFrame(), pd.DataFrame()
for week in tqdm(range(1, 19), desc="Loading weekly data"):
    tracking_input = pd.concat([tracking_input, pd.read_csv(f'../data/train/input_2023_w{week:02d}.csv')], axis=0)
    tracking_output = pd.concat([tracking_output, pd.read_csv(f'../data/train/output_2023_w{week:02d}.csv')], axis=0)
LOG.info(f'Tracking input shape: {tracking_input.shape}, output shape: {tracking_output.shape}')

Loading weekly data: 100%|██████████| 18/18 [00:08<00:00,  2.01it/s]
2025-10-29 11:16:55,480 - INFO - Tracking input shape: (4880579, 23), output shape: (562936, 6)


In [58]:
games, plays, players, tracking = preprocess.process_data(tracking_input, tracking_output, sup_data)
team_desc = preprocess.fetch_team_desc()

2025-10-29 11:17:10,274 - INFO - Joined input and output tracking data: 14108 unique plays, 1384 unique nfl_ids
2025-10-29 11:17:10,276 - INFO - Standardizing direction of play and players to be left to right
2025-10-29 11:17:12,348 - INFO - Approximating missing speed, acceleration and direction values
2025-10-29 11:17:16,391 - INFO - Correlation results for imputations: s_approx: speed R²=0.9966 | a_approx: accel R²=0.0831 | dir_approx: dir R²=0.0587
2025-10-29 11:17:17,331 - INFO - Joining supplemental data to plays DataFrame
2025-10-29 11:17:17,499 - INFO - Loading NFL PBP data for season 2023
2025-10-29 11:17:17,499 - INFO - Loading pbp from local parquet file
2025-10-29 11:17:17,796 - INFO - Mapping player IDs to nfl_id using seasonal rosters
2025-10-29 11:17:17,796 - INFO - Rosters for season 2023 already cached, loading from parquet
2025-10-29 11:17:35,317 - INFO - Defaulting passer to QB for play without a passer: 2023111203_1068
2025-10-29 11:17:37,354 - INFO - Defaulting pas

In [61]:
tracking.query('position.isin(["FS","SS","S"]) and pass_thrown').gpid.unique()[50:100]

array(['2023091000_842', '2023091000_927', '2023091001_1042',
       '2023091001_1232', '2023091001_1368', '2023091001_1438',
       '2023091001_1470', '2023091001_1574', '2023091001_1980',
       '2023091001_2033', '2023091001_2151', '2023091001_2351',
       '2023091001_2432', '2023091001_2523', '2023091001_2759',
       '2023091001_3086', '2023091001_3514', '2023091001_3879',
       '2023091001_3902', '2023091001_4018', '2023091001_407',
       '2023091001_4191', '2023091001_4216', '2023091001_4239',
       '2023091001_4322', '2023091001_4564', '2023091001_4589',
       '2023091001_4639', '2023091001_559', '2023091001_893',
       '2023091002_1209', '2023091002_1412', '2023091002_1464',
       '2023091002_1540', '2023091002_1974', '2023091002_2872',
       '2023091002_2942', '2023091002_3253', '2023091002_3314',
       '2023091002_3337', '2023091002_3409', '2023091002_3571',
       '2023091002_3812', '2023091002_3835', '2023091002_4036',
       '2023091002_444', '2023091002_767', '2

In [62]:
# gpid="2023091100_993"
# gpid="2023091003_410"
# gpid="2023091003_1706"
gpid="2023090700_1837"
nfp.animate_play(
    tracking.query('gpid==@gpid'),
    plays.query('gpid==@gpid'),
    games.query(f'game_id=={gpid.split("_")[0]}'),
    team_desc,
    # save_path='animation.gif',
    plot_positions=True,
    highlight_postpass_players=True,
    show_postpass_paths=True
)

2025-10-29 11:19:01,825 - INFO - Animation.save using <class 'matplotlib.animation.HTMLWriter'>


# i. Baseline Model 
Model predicts an EPA point estimate at any frame of the pass play

In [65]:
def get_all_nearest_defenders(tracking, n_values=(1, 2, 3)):
    """
    Vectorized: compute the n-th nearest defenders to the receiver per frame for all plays.
    Returns one DataFrame with ['gpid','frame_id', defender_dist_n, defender_x_n, defender_y_n, ...].
    """
    defenders = tracking.query('player_side == "Defense"')[['gpid','frame_id','x','y']]
    receivers = tracking.query('is_receiver')[['gpid','frame_id','x','y']].rename(
        columns={'x':'receiver_x','y':'receiver_y'}
    )

    # Merge defenders with receiver positions per frame
    merged = defenders.merge(receivers, on=['gpid','frame_id'], how='inner')
    merged['dist'] = np.sqrt(
        (merged['x'] - merged['receiver_x'])**2 + 
        (merged['y'] - merged['receiver_y'])**2
    )

    # Rank defenders by distance *within each play and frame*
    merged['rank'] = merged.groupby(['gpid','frame_id'])['dist'].rank(method='first')

    # Build result incrementally for all n
    results = []
    for n in n_values:
        nth = (
            merged[merged['rank'] == n][['gpid','frame_id','x','y','dist']]
            .rename(columns={
                'x': f'defender_x_{n}',
                'y': f'defender_y_{n}',
                'dist': f'defender_dist_{n}'
            })
        )
        results.append(nth)

    # Combine all nth results into one wide frame
    out = results[0]
    for r in results[1:]:
        out = out.merge(r, on=['gpid','frame_id'], how='outer')

    return out


def get_ball_flight_pct(df):
    """
    Compute percent of ball flight for all plays in one pass (vectorized).
    Returns a copy with new 'ball_flight_pct' column.
    """
    df = df.sort_values(['gpid','frame_id']).copy()

    # Find throw frames and end frames per play
    throw_frame = (
        df.loc[df['pass_thrown'], ['gpid','frame_id']]
        .groupby('gpid')['frame_id']
        .min()
        .rename('throw_frame')
    )
    end_frame = df.groupby('gpid')['frame_id'].max().rename('end_frame')

    df = df.merge(throw_frame, on='gpid', how='left').merge(end_frame, on='gpid', how='left')

    # Compute flight pct
    df['ball_flight_pct'] = 0.0
    in_flight = df['frame_id'] >= df['throw_frame']
    df.loc[in_flight, 'ball_flight_pct'] = (
        (df.loc[in_flight, 'frame_id'] - df.loc[in_flight, 'throw_frame'])
        / (df.loc[in_flight, 'end_frame'] - df.loc[in_flight, 'throw_frame']).clip(lower=1)
    ) * 100

    return df.drop(columns=['throw_frame','end_frame'])

LOG.info("Preparing base data")

# Base ball + receiver merge
data = (
    tracking
    .query('position == "Ball"')[['gpid','frame_id','pass_thrown','x','y']]
    .drop_duplicates(subset=['gpid','frame_id'])
    .rename(columns={'x':'ball_x','y':'ball_y'})
    .merge(
        tracking.query('is_receiver')[['gpid','frame_id','x','y']]
        .rename(columns={'x':'receiver_x','y':'receiver_y'}),
        on=['gpid','frame_id'],
        how='left'
    )
    .assign(
        dist_ball_to_receiver=lambda df: np.sqrt(
            (df.ball_x - df.receiver_x)**2 + (df.ball_y - df.receiver_y)**2
        )
    )
)

LOG.info("Finding nearest defenders (1–3)")
nearest_defenders = get_all_nearest_defenders(tracking, n_values=(1,2,3))
data = data.merge(nearest_defenders, on=['gpid','frame_id'], how='left')

LOG.info("Calculating ball flight percentage")
data = get_ball_flight_pct(data)

LOG.info("Joining in EPA values")
data = data.merge(plays[['gpid','expected_points_added']], on='gpid', how='left')

LOG.info("Data preparation complete")

final_cols = [
    'gpid', 'frame_id', 'pass_thrown', 'ball_flight_pct',
    'dist_ball_to_receiver', 'defender_dist_1', 'defender_dist_2', 'defender_dist_3',
    'expected_points_added'
]
data = data[final_cols]

2025-10-29 11:30:27,656 - INFO - Preparing base data
2025-10-29 11:30:28,387 - INFO - Finding nearest defenders (1–3)
2025-10-29 11:30:31,697 - INFO - Calculating ball flight percentage
2025-10-29 11:30:32,113 - INFO - Joining in EPA values
2025-10-29 11:30:32,200 - INFO - Data preparation complete


In [66]:
data.isna().sum()

gpid                         0
frame_id                     0
pass_thrown                  0
ball_flight_pct              0
dist_ball_to_receiver        0
defender_dist_1           8640
defender_dist_2          34530
defender_dist_3          85340
expected_points_added        0
dtype: int64

In [68]:
def train_and_evaluate(data,
                       feature_cols = [
                           'dist_ball_to_receiver',
                           'defender_dist_1',
                           'defender_dist_2',
                           'defender_dist_3',
                       ],
                       id_col='gpid',
                       target_col='expected_points_added',
                       test_size=0.15,
                       val_size=0.15,
                       n_trials=50):
    # Basic checks
    assert id_col in data.columns, f"{id_col} not in data"
    assert target_col in data.columns, f"{target_col} not in data"
    for c in feature_cols:
        if c not in data.columns:
            raise ValueError(f"Feature column missing: {c}")

    df = data.dropna(subset=[target_col]).copy()

    # Split by gpid (group-aware split to avoid leakage)
    gpids = df[id_col].unique()
    g_train_val, g_test = train_test_split(gpids, test_size=test_size, random_state=RANDOM_SEED)
    # further split train_val into train and val
    relative_val = val_size / (1.0 - test_size)
    g_train, g_val = train_test_split(g_train_val, test_size=relative_val, random_state=RANDOM_SEED)

    train = df[df[id_col].isin(g_train)].reset_index(drop=True)
    val   = df[df[id_col].isin(g_val)].reset_index(drop=True)
    test  = df[df[id_col].isin(g_test)].reset_index(drop=True)

    print(f"Plays -> train: {len(g_train)}, val: {len(g_val)}, test: {len(g_test)}")
    print(f"Frames -> train: {len(train)}, val: {len(val)}, test: {len(test)}")

    X_train = train[feature_cols].values
    X_val   = val[feature_cols].values
    X_test  = test[feature_cols].values

    y_train = train[target_col].values
    y_val   = val[target_col].values
    y_test  = test[target_col].values

    # --- XGBoost with Optuna tuning (optimize validation MAE) ---
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)

    def objective(trial):
        params = {
            "verbosity": 0,
            "objective": "reg:squarederror",
            "booster": "gbtree",
            "tree_method": "hist",
            "seed": RANDOM_SEED,
            "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
            "eta": trial.suggest_float("eta", 1e-3, 0.5, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-8, 10.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        }

        # use xgboost.cv or train with early stopping on validation
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dval, "validation")],
            early_stopping_rounds=30,
            verbose_eval=False
        )

        preds = bst.predict(dval, iteration_range=(0, bst.best_iteration))
        mae = mean_absolute_error(y_val, preds)
        return mae

    study = optuna.create_study(
        direction="minimize", 
        sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED)
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print("Optuna best MAE on val:", study.best_value)
    print("Best params:", study.best_params)

    # Train final xgboost with best params on train+val combined
    best_params = study.best_params.copy()
    best_params.update({
        "verbosity": 0,
        "objective": "reg:squarederror",
        "booster": "gbtree",
        "tree_method": "hist",
        "seed": RANDOM_SEED
    })

    X_trainval = np.vstack([X_train, X_val])
    y_trainval = np.concatenate([y_train, y_val])

    dtrain_full = xgb.DMatrix(X_trainval, label=y_trainval)
    dtest = xgb.DMatrix(X_test, label=y_test)

    bst_final = xgb.train(
        best_params,
        dtrain_full,
        num_boost_round=2000,
        evals=[(dtrain_full, "train")],
        verbose_eval=False,
        early_stopping_rounds=50
    )

    y_pred_xgb = bst_final.predict(dtest, iteration_range=(0, bst_final.best_iteration))

    # --- Metrics ---
    def print_metrics(y_true, y_pred, prefix="Model"):
        mae = mean_absolute_error(y_true, y_pred)
        rmse = mean_squared_error(y_true, y_pred)**0.5
        r2 = r2_score(y_true, y_pred)
        print(f"\n{prefix} metrics on TEST:")
        print(f"MAE:  {mae:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"R2:   {r2:.4f}")

    print_metrics(y_test, y_pred_xgb, prefix="XGBoost (Optuna-tuned)")

    return {
        "xgb_model": bst_final,
        "optuna_study": study,
        "test_df": test,
        "y_test": y_test,
        "y_pred_xgb": y_pred_xgb
    }

results = train_and_evaluate(
    data,
    feature_cols=[
        'pass_thrown', 
        'ball_flight_pct',
        'dist_ball_to_receiver',
        'defender_dist_1',
        'defender_dist_2',
        'defender_dist_3',
    ],
    id_col='gpid',
    target_col='expected_points_added',
)

Plays -> train: 9863, val: 2114, test: 2114
Rows -> train: 389229, val: 83904, test: 83225


[I 2025-10-29 11:31:12,978] A new study created in memory with name: no-name-f4bfce99-373b-47c7-8e7e-a49ba94e8fcc


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-10-29 11:31:17,103] Trial 0 finished with value: 1.2723348373840344 and parameters: {'lambda': 8.393713070692229e-05, 'alpha': 1.7113391527987203e-08, 'eta': 0.030445460008040798, 'max_depth': 5, 'min_child_weight': 6.071698793016848e-05, 'subsample': 0.5982008926023245, 'colsample_bytree': 0.5227891804227055}. Best is trial 0 with value: 1.2723348373840344.
[I 2025-10-29 11:31:24,171] Trial 1 finished with value: 1.27332110604779 and parameters: {'lambda': 0.003744886823496263, 'alpha': 4.976133957791116e-06, 'eta': 0.005249994057689375, 'max_depth': 7, 'min_child_weight': 0.0005784673461006173, 'subsample': 0.48074796720696017, 'colsample_bytree': 0.7081468727594479}. Best is trial 0 with value: 1.2723348373840344.
[I 2025-10-29 11:31:24,650] Trial 2 finished with value: 1.272670203247868 and parameters: {'lambda': 4.570448196536859e-07, 'alpha': 0.11695433673868773, 'eta': 0.20176865513948422, 'max_depth': 5, 'min_child_weight': 0.41596172275214177, 'subsample': 0.4477872862

In [70]:
# Predict epa on each frame of data
data['predicted_epa'] = results['xgb_model'].predict(
    xgb.DMatrix(
        data[
            [
                'pass_thrown', 
                'ball_flight_pct',
                'dist_ball_to_receiver',
                'defender_dist_1',
                'defender_dist_2',
                'defender_dist_3',
            ]
        ].values
    ),
    iteration_range=(0, results['xgb_model'].best_iteration)
)
data.head()

Unnamed: 0,gpid,frame_id,pass_thrown,ball_flight_pct,dist_ball_to_receiver,defender_dist_1,defender_dist_2,defender_dist_3,expected_points_added,predicted_epa
0,2023090700_1001,1,False,0.0,6.931876,2.913589,4.664118,5.781228,1.195112,-0.699719
1,2023090700_1001,2,False,0.0,6.983552,2.894305,4.621796,5.76184,1.195112,-0.50737
2,2023090700_1001,3,False,0.0,7.057833,2.853086,4.550275,5.742752,1.195112,-0.471145
3,2023090700_1001,4,False,0.0,7.122612,2.842006,4.506995,5.727347,1.195112,-0.335215
4,2023090700_1001,5,False,0.0,7.201389,2.853384,4.461255,5.743953,1.195112,-0.412875
