In [13]:
import logging
import os
import sys
import joblib
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from tqdm import tqdm
import nflreadpy as nfl

sys.path.append('../py')
from preprocess import preprocess

pd.set_option('display.max_columns', None)

LOG = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

N_WEEKS = 18

In [2]:
##############  Load and preprocess the tracking + play data ##############

sup_data = pd.read_csv('../data/supplementary_data.csv')
tracking_input, tracking_output = pd.DataFrame(), pd.DataFrame()
for week in tqdm(range(1, N_WEEKS+1), desc="Loading weekly data"):
    tracking_input = pd.concat([tracking_input, pd.read_csv(f'../data/train/input_2023_w{week:02d}.csv')], axis=0)
    tracking_output = pd.concat([tracking_output, pd.read_csv(f'../data/train/output_2023_w{week:02d}.csv')], axis=0)
LOG.info(f'Tracking input shape: {tracking_input.shape}, output shape: {tracking_output.shape}')

games, plays, players, tracking = preprocess.process_data(tracking_input, tracking_output, sup_data)
team_desc = preprocess.fetch_team_desc()

Loading weekly data: 100%|██████████| 18/18 [00:09<00:00,  1.82it/s]
2025-12-18 22:04:06,010 - INFO - Tracking input shape: (4880579, 23), output shape: (562936, 6)
2025-12-18 22:04:15,022 - INFO - Joined input and output tracking data: 14108 unique plays, 1384 unique nfl_ids
2025-12-18 22:04:15,022 - INFO - Standardizing direction of play and players to be left to right
2025-12-18 22:04:16,461 - INFO - Approximating missing speed, acceleration and direction values
2025-12-18 22:04:19,777 - INFO - Correlation results for imputations: s_approx: speed R²=0.9897 | a_approx: accel R²=0.0445 | dir_approx: dir R²=0.0442
2025-12-18 22:04:20,482 - INFO - Joining supplemental data to plays DataFrame
2025-12-18 22:04:20,530 - INFO - Loading NFL PBP data for season 2023
2025-12-18 22:04:20,530 - INFO - Loading pbp from local parquet file
2025-12-18 22:04:21,269 - INFO - Mapping player IDs to nfl_id using seasonal rosters
2025-12-18 22:04:21,270 - INFO - Rosters for season 2023 already cached, loa

In [6]:
import nfl_data_py as nfl

In [9]:
##############  Simple clock runoff model  ##############
pbp_raw = nfl.import_pbp_data(years=list(range(2020, 2021)))

pbp = (
    pbp_raw
    .sort_values(['game_id', 'play_id'])
    .assign(
        next_half_seconds_remaining=lambda x: x.half_seconds_remaining.shift(-1),
        next_game_id=lambda x: x.game_id.shift(-1)
    )
    .query('game_id == next_game_id and half_seconds_remaining > next_half_seconds_remaining')
    [['game_id', 'pass', 'play_id', 'incomplete_pass', 'yards_gained', 'next_half_seconds_remaining', 'half_seconds_remaining']]
    .assign(
        clock_runoff=lambda x: x.half_seconds_remaining - x.next_half_seconds_remaining
    )
)
pbp = pbp[pbp['pass'] == 1].reset_index(drop=True)
del pbp_raw

from sklearn.linear_model import LinearRegression

model_inc = LinearRegression()
model_comp = LinearRegression()

X_inc = pbp[pbp['incomplete_pass'] == 1][['yards_gained']]
y_inc = pbp[pbp['incomplete_pass'] == 1]['clock_runoff']

X_comp = pbp[pbp['incomplete_pass'] == 0][['yards_gained']]
y_comp = pbp[pbp['incomplete_pass'] == 0]['clock_runoff']
model_inc.fit(X_inc, y_inc)
model_comp.fit(X_comp, y_comp)

LOG.info(f'Incomplete pass clock runoff model coef: {model_inc.coef_}, intercept: {model_inc.intercept_}')
LOG.info(f'Complete pass clock runoff model coef: {model_comp.coef_}, intercept: {model_comp.intercept_}')

2020 done.
Downcasting floats.


2025-12-18 22:10:08,089 - INFO - Incomplete pass clock runoff model coef: [0.], intercept: 5.455140590667725
2025-12-18 22:10:08,090 - INFO - Complete pass clock runoff model coef: [0.04596147], intercept: 27.813861846923828


In [10]:
##############  Load the GNN Predictions ##############

cmp_results = (
    pd.concat([
        pd.read_parquet('../data/results/cmp_preds2.parquet'),
        pd.read_parquet('../data/results/cmp_preds.parquet').query('gpid != "2024010711_1919"')
    ], ignore_index=True)
    .assign(
        x_key=lambda x: (x['x'] * 100).astype(int).astype(str),
        y_key=lambda x: (x['y'] * 100).astype(int).astype(str)
    )
    .assign(
        row_key=lambda x: x.gpid + x.safety_nfl_id.astype(int).astype(str) + x.sample_type + x.x_key + x.y_key
    )
    .drop_duplicates('row_key', ignore_index=True)
)

int_results = (
    pd.concat([
        pd.read_parquet('../data/results/int_preds2.parquet'),
        pd.read_parquet('../data/results/int_preds.parquet').query('gpid != "2024010711_1919"')
    ], ignore_index=True)
    .assign(
        x_key=lambda x: (x['x'] * 100).astype(int).astype(str),
        y_key=lambda x: (x['y'] * 100).astype(int).astype(str)
    )
    .assign(
        row_key=lambda x: x.gpid + x.safety_nfl_id.astype(int).astype(str) + x.sample_type + x.x_key + x.y_key
    )
    .drop_duplicates('row_key', ignore_index=True)
)

yac_results = (
    pd.concat([
        pd.read_parquet('../data/results/yac_preds2.parquet'),  
        pd.read_parquet('../data/results/yac_preds.parquet').query('gpid != "2024010711_1919"')
    ], ignore_index=True)
    .assign(
        x_key=lambda x: (x['x'] * 100).astype(int).astype(str),
        y_key=lambda x: (x['y'] * 100).astype(int).astype(str)
    )
    .assign(
        row_key=lambda x: x.gpid + x.safety_nfl_id.astype(int).astype(str) + x.sample_type + x.x_key + x.y_key
    )
    .drop_duplicates('row_key', ignore_index=True)
)

ep_model = joblib.load('/Users/lukeneuendorf/projects/nfl-big-data-bowl-2026/data/models/gam_drive_points_model.pkl')

In [11]:
results = (
    cmp_results.merge(
        yac_results[['row_key','predicted_yac']],
        on='row_key', 
        how='left'
    ).merge(
        int_results[['row_key','predicted_int']],
        on='row_key',
        how='left'
    )
    .drop(columns=['x_key','y_key','row_key'])
    .merge(
        plays[['gpid', 'absolute_yardline_number', 'ball_land_x', 'ball_land_y','num_frames_output']],
        on='gpid', 
        how='left'
    ).assign(
        x=lambda x: x['absolute_yardline_number'] + x['x'],
        start_x=lambda x: x['absolute_yardline_number'] + x['start_x'],
    )
)

In [15]:
pbp = (
    nfl.load_pbp(seasons=[2023])
    .to_pandas()
    .assign(
        gpid=lambda x: x.old_game_id.astype(str) + '_' + x.play_id.astype(int).astype(str)
    )
    [['gpid','game_id','play_id','yardline_100','half_seconds_remaining','down','ydstogo',
        'posteam_timeouts_remaining','defteam_timeouts_remaining']]
    .dropna()
    .drop_duplicates(ignore_index=True)
)

In [16]:
ep = pbp[['gpid','yardline_100','half_seconds_remaining','down','ydstogo',
        'posteam_timeouts_remaining','defteam_timeouts_remaining']].copy()

ep['EP_0'] = ep_model.predict(
    ep[['yardline_100','half_seconds_remaining','down','ydstogo',
        'posteam_timeouts_remaining','defteam_timeouts_remaining']].values
)

In [17]:
for c in pbp.columns:
    if c == 'gpid':
        continue
    if c in results.columns:
        results = results.drop(columns=[c])

results = results.merge(
    pbp,
    on='gpid',
    how='left'
).assign(
    yards_gained=0
)

pred_cols = ['yardline_100','half_seconds_remaining','down','ydstogo',
             'posteam_timeouts_remaining','defteam_timeouts_remaining']

LOG.info(f'Starting EP calculations on {results.shape[0]} rows')
##############  INCOMPLETE EP (OFFENSE) ##############
results['EP_INCOMP_OFF'] = ep_model.predict(
    results
        .assign(
            half_seconds_remaining=lambda x: np.maximum(
                x.half_seconds_remaining - model_inc.predict(x[['yards_gained']]),
                0
            ),
            down=lambda x: np.minimum(x.down + 1, 4)
        )
        [pred_cols]
        .values
)
LOG.info("Finished calculating EP_INCOMP_OFF")

##############  INCOMPLETE EP (DEFENSE – TOD) ##############
results['EP_INCOMP_DEF'] = ep_model.predict(
    results
        .assign(
            half_seconds_remaining=lambda x: np.maximum(
                x.half_seconds_remaining - model_inc.predict(x[['yards_gained']]),
                0
            ),
            down=1,
            yardline_100=lambda x: 100 - x.yardline_100,
            ydstogo=10
        )
        [pred_cols]
        .values
)
LOG.info("Finished calculating EP_INCOMP_DEF")

###############  FINAL INCOMPLETE EP ##############
results['EP_INCOMP'] = np.where(
    results['down'] == 4,
    -results['EP_INCOMP_DEF'],
    results['EP_INCOMP_OFF']
)

results.drop(columns=['EP_INCOMP_OFF','EP_INCOMP_DEF'], inplace=True)

2025-12-18 22:16:10,210 - INFO - Starting EP calculations on 8191832 rows
2025-12-18 22:17:59,511 - INFO - Finished calculating EP_INCOMP_OFF
2025-12-18 22:19:35,989 - INFO - Finished calculating EP_INCOMP_DEF


In [18]:
for c in pbp.columns:
    if c == 'gpid':
        continue
    if c in results.columns:
        results = results.drop(columns=[c])

results = results.merge(
    pbp,
    on='gpid',
    how='left'
).assign(
    yards_gained=lambda x: x.ball_land_x - x.absolute_yardline_number
)

pred_cols = ['yardline_100','half_seconds_remaining','down','ydstogo',
             'posteam_timeouts_remaining','defteam_timeouts_remaining']

LOG.info(f'Starting EP calculations on {results.shape[0]} rows')
##############  Interception EP ##############
results['EP_INT'] = -ep_model.predict(
    results
        .assign(
            half_seconds_remaining=lambda x: np.maximum(
                x.half_seconds_remaining - model_inc.predict(x[['yards_gained']]),
                0
            ),
            down=1,
            yardline_100=lambda x: x.ball_land_x - 10,
            ydstogo=10
        )
        [pred_cols]
        .values
)
LOG.info("Finished calculating EP_INT")

2025-12-18 22:19:42,557 - INFO - Starting EP calculations on 8191832 rows
2025-12-18 22:21:19,191 - INFO - Finished calculating EP_INT


In [19]:
for c in list(pbp.columns) + ['pred_yardline_100']:
    if c == 'gpid':
        continue
    if c in results.columns:
        results = results.drop(columns=[c])

results = results.merge(
    pbp,
    on='gpid',
    how='left'
).assign(
    yards_gained=lambda x: x.ball_land_x + x.predicted_yac - x.absolute_yardline_number,
    pred_yardline_100=lambda x: 110 - (x.ball_land_x + x.predicted_yac)
)

pred_cols = ['yardline_100','half_seconds_remaining','down','ydstogo',
             'posteam_timeouts_remaining','defteam_timeouts_remaining']

LOG.info(f'Starting EP calculations on {results.shape[0]} rows')
##############  COMPLETION EP ##############
results['EP_COMP_OFF'] = ep_model.predict(
    results
        .assign(
            half_seconds_remaining=lambda x: np.maximum(
                x.half_seconds_remaining - model_comp.predict(x[['yards_gained']]),
                0
            ),
            down=lambda x: np.minimum(
                np.where(
                    x.yards_gained >= x.ydstogo,
                    1,
                    x.down + 1
                ), 4
            ),
            yardline_100=lambda x: np.maximum(
                x.pred_yardline_100,
                1
            ),
            ydstogo=lambda x: np.where(
                x.yards_gained >= x.ydstogo,
                10,
                x.ydstogo - x.yards_gained
            )
        )
        [pred_cols]
        .values
)
LOG.info("Finished calculating EP_COMP_OFF")

##############  Completion failed 4th down ##############
results['EP_COMP_DEF'] = ep_model.predict(
    results
        .assign(
            half_seconds_remaining=lambda x: np.maximum(
                x.half_seconds_remaining - model_inc.predict(x[['yards_gained']]),
                0
            ),
            down=1,
            yardline_100=lambda x: 100 - np.maximum(
                x.pred_yardline_100,
                1
            ),
            ydstogo=10
        )
        [pred_cols]
        .values
)
LOG.info("Finished calculating EP_COMP_OFF and EP_COMP_DEF")

###############  FINAL COMPLETION EP ##############
results['EP_COMP'] = np.where(
    results.pred_yardline_100 < 1,
    7,
    np.where(
        (results['down'] == 4) & (results.yards_gained < results.ydstogo),
        -results['EP_COMP_DEF'],
        results['EP_COMP_OFF']
    )
)

results.drop(columns=['EP_COMP_OFF','EP_COMP_DEF'], inplace=True)

2025-12-18 22:21:25,592 - INFO - Starting EP calculations on 8191832 rows
2025-12-18 22:23:14,164 - INFO - Finished calculating EP_COMP_OFF
2025-12-18 22:24:42,483 - INFO - Finished calculating EP_COMP_OFF and EP_COMP_DEF


In [20]:
results.columns

Index(['gpid', 'safety_nfl_id', 'sample_type', 'x', 'y', 'vx', 'vy', 'start_x',
       'start_y', 'start_dir', 'start_s', 'predicted_cmp', 'predicted_yac',
       'predicted_int', 'absolute_yardline_number', 'ball_land_x',
       'ball_land_y', 'num_frames_output', 'yards_gained', 'EP_INCOMP',
       'EP_INT', 'game_id', 'play_id', 'yardline_100',
       'half_seconds_remaining', 'down', 'ydstogo',
       'posteam_timeouts_remaining', 'defteam_timeouts_remaining',
       'pred_yardline_100', 'EP_COMP'],
      dtype='object')

In [21]:
ep.columns

Index(['gpid', 'yardline_100', 'half_seconds_remaining', 'down', 'ydstogo',
       'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'EP_0'],
      dtype='object')

In [22]:
###############  EPA Calculations ##############
results = results.merge(
    ep[['gpid','EP_0']],
    on='gpid',
    how='left'
)

results['EPA_INT'] = results['predicted_int'] * (results['EP_INT'] - results['EP_0'])
results['EPA_COMP'] = (
    (1 - results['predicted_int']) * (
        (results['predicted_cmp'] * (results['EP_COMP'] - results['EP_0'])) +
        ((1 - results['predicted_cmp']) * (results['EP_INCOMP'] - results['EP_0']))
    )
)
results['EPA'] = results['EPA_INT'] + results['EPA_COMP']

In [23]:
def decision_eff_rank_group(df, pred_col, model_type='epa'):
    """
    Compute rank-based decision efficiency for defense.
    Lower EPA values are better (defense wants to minimize EPA).
    
    Returns percentile where:
    0 = actual decision is worse than all simulated decisions (higher EPA)
    1 = actual decision is better than all simulated decisions (lower EPA)
    """
    original = df.loc[df["sample_type"] == "original", pred_col].values
    simulated = df.loc[df["sample_type"] == "simulated", pred_col].values

    if len(original) == 0 or len(simulated) < 2:
        return np.nan

    act_val = original[0]
    
    # Count how many simulated decisions are BETTER than actual decision
    # For defense, better = lower EPA
    if model_type == 'epa':
        better_sim_count = np.sum(simulated < act_val)
    elif model_type == 'int':
        better_sim_count = np.sum(simulated > act_val)
    
    # Calculate percentile: proportion of simulated decisions that are worse
    # than the actual decision (since worse means higher EPA)
    percentile = (len(simulated) - better_sim_count) / len(simulated)
    
    return percentile

epa_rank_df = (
    results
    .groupby(["gpid", "safety_nfl_id"], as_index=False)
    .apply(
        lambda g: decision_eff_rank_group(
            g, pred_col="EPA"
        )
    )
    .rename(columns={None: "decision_eff_rank_epa"})
)

epa_int_rank_df = (
    results
    .groupby(["gpid", "safety_nfl_id"], as_index=False)
    .apply(
        lambda g: decision_eff_rank_group(
            g, pred_col="EPA_INT"
        )
    )
    .rename(columns={None: "decision_eff_rank_epa_int"})
)   

epa_comp_rank_df = (
    results
    .groupby(["gpid", "safety_nfl_id"], as_index=False)
    .apply(
        lambda g: decision_eff_rank_group(
            g, pred_col="EPA_COMP"
        )
    )
    .rename(columns={None: "decision_eff_rank_epa_comp"})
)

int_rank_df = (
    results
    .groupby(["gpid", "safety_nfl_id"], as_index=False)
    .apply(
        lambda g: decision_eff_rank_group(
            g, pred_col="predicted_int", model_type='int'
        )
    )
    .rename(columns={None: "decision_eff_rank_int"})
)

In [24]:
cols = ['gpid', 'safety_nfl_id', 'sample_type', 'x', 'y', 'vx', 'vy', 'start_x',
       'start_y', 'start_dir', 'start_s', 'predicted_cmp', 'predicted_yac',
       'predicted_int', 'absolute_yardline_number', 'ball_land_x',
       'ball_land_y', 'num_frames_output', 'EPA_INT', 'EPA_COMP', 'EPA']

# save to results
(
    results[cols]
    .merge(
        epa_rank_df,
        on=['gpid', 'safety_nfl_id'],
        how='left'
    )
    .merge(
        epa_int_rank_df,
        on=['gpid', 'safety_nfl_id'],
        how='left'
    )
    .merge(
        epa_comp_rank_df,
        on=['gpid', 'safety_nfl_id'],
        how='left'
    )
    .merge(
        int_rank_df,
        on=['gpid', 'safety_nfl_id'],
        how='left'
    )
    .reset_index(drop=True)
    .to_parquet('../data/results/epa_predictions.parquet', index=False)
)