In [1]:
import os
import sys
from os.path import join
import json

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nfl_data_py as nfl

ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, os.path.join(ROOT_DIR,'py'))

import util

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

with open("paths.json", 'r') as f:
    paths = json.load(f)

PROCESSED_DATA_PATH = paths['processed_data']

In [2]:
# Code to add line_set events to tracking data
# A play can have multiple line_set events

def add_line_set_event(
    df_tracking: pd.DataFrame,
    events_col: str = 'event_new'
) -> pd.DataFrame:

    WINDOW_WIDTH = 3
    LINE_SET_MEAN_SPEED_THRESHOLD = 0.3
    SPEED_MULTIPLIER_THRESHOLD = 2.0
    SPEED_MULTIPLIER = 3
    MIN_AREA_PCT_ABOVE_THRESH = .01
    
    def set_lineset_frame_id(play: pd.DataFrame) -> pd.DataFrame:    
        off_players = play[(play.offense) & (play.frame_type == 'BEFORE_SNAP')].copy()

        off_players['s'] = np.where(
            off_players['s'] >= SPEED_MULTIPLIER_THRESHOLD, 
            off_players['s'] * SPEED_MULTIPLIER, 
            off_players['s']
        )
        off_team = off_players[['frame_id', 's']].groupby('frame_id')['s'].mean().reset_index(drop=True)
        
        off_team_smoothed = off_team.rolling(window=WINDOW_WIDTH, min_periods=1, center=True).mean()

        differences = off_team_smoothed - LINE_SET_MEAN_SPEED_THRESHOLD
        df = pd.DataFrame(
            differences, 
            columns=['s'], 
            index=(off_players.frame_id.unique())
        ).rename({'s':'diff_with_speed_thresh'},axis=1)

        # Group by frames by the area groups above and below the thresholds
        df['above_thresh'] = np.where(df.diff_with_speed_thresh > 0, True, False)
        df['prev_above_thresh'] = df.above_thresh.shift(1).fillna(False)
        df['new_group'] = ((df.above_thresh & ~df.prev_above_thresh) | (~df.above_thresh & df.prev_above_thresh))
        df['group'] = df.new_group.cumsum()
        df.drop(['prev_above_thresh','new_group'],axis=1, inplace=True)
        df['area'] = df.groupby('group')['diff_with_speed_thresh'].transform('sum')
        tot_area = df.drop_duplicates('group')['area'].abs().sum()
        df['area_pct'] = df['area'].abs() / tot_area
        df['diff_with_speed_thresh'] = np.where(
            (df.area_pct < MIN_AREA_PCT_ABOVE_THRESH) & df.above_thresh,
            -1e-10, # set to very small negative so these points aren't selected as lineset 
            df.diff_with_speed_thresh
        )

        # If ah area above the thresh is too small (minor player movements), join it with area below the thresh (line_set)
        df['above_thresh'] = np.where(df.diff_with_speed_thresh > 0, True, False)
        df['prev_above_thresh'] = df.above_thresh.shift(1).fillna(False)
        df['new_group'] = ((df.above_thresh & ~df.prev_above_thresh) | (~df.above_thresh & df.prev_above_thresh))
        df['group'] = df.new_group.cumsum()
        df.drop(['prev_above_thresh','new_group'],axis=1, inplace=True)
        df['area'] = df.groupby('group')['diff_with_speed_thresh'].transform('sum')
        df['diff_with_speed_thresh'] = df['diff_with_speed_thresh'] * (df.index/1e3 + 1)
        line_set_frame_ids = df[~df.above_thresh].groupby('group').diff_with_speed_thresh.idxmin().values.tolist()
        line_set_frame_ids = [v for v in line_set_frame_ids if v == v] # remove np.nan values

        if line_set_frame_ids:
            play.loc[play[play['frame_id'].isin(line_set_frame_ids)].index, events_col] = 'line_set'
        else:
            frame_before_snap = play[play['frame_type']=="SNAP"].frame_id.iloc[0] - 1
            play.loc[play['frame_id'] == frame_before_snap, events_col] = 'line_set'
            gpid = play.game_play_id.iloc[0]
            print(f'WARNING: defaulted {gpid}\'s line_set value to one frame before ball snap.')
        
        return play

    return df_tracking.groupby('game_play_id', group_keys=False).apply(set_lineset_frame_id)
    
def create_events(
    df_tracking: pd.DataFrame, 
    events_col: str = 'event_new'
) -> pd.DataFrame:
    '''Create events for line set and ball snap.

    Args:
        df_tracking: Tracking data
        events_col: Column name for the events
    Returns:
        DataFrame with the new events column
    '''

    if events_col in df_tracking.columns:
        df_tracking.drop(columns=events_col, inplace=True)
    
    df_tracking[events_col] = np.nan

    df_tracking = add_line_set_event(df_tracking, events_col=events_col)

    # Add 'ball_snap' event
    df_tracking.loc[df_tracking['frame_type'] == 'SNAP', events_col] = 'ball_snap'

    return df_tracking

In [3]:
df_player = pd.read_pickle(join(PROCESSED_DATA_PATH, 'players.pkl'))
df_team = pd.read_pickle(join(PROCESSED_DATA_PATH, 'teams.pkl'))

for wk in tqdm(range(1,10)):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking.pkl'))
    df_game = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play.pkl'))
    df_player_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play.pkl'))

    df_tracking['position'] = np.where(
        df_tracking.display_name == "Taysom Hill",
        "TE",
        df_tracking.position
    )

    # Drop all other plays with multiple QBS
    plays_with_multiple_qbs = (
        df_tracking.query('position=="QB" and frame_type=="SNAP"')
        [['game_play_id','club']]
        .value_counts()
        .reset_index()
        .rename(columns={0:'count'})
        .query('count > 1')
    )
    if len(plays_with_multiple_qbs) > 0:
        game_play_ids = plays_with_multiple_qbs.game_play_id.unique().tolist()
        game_ids = df_play.query('game_play_id in @game_play_ids').game_id.unique().tolist()
        print(f'Week {wk} game_play_ids with multiple QBs:')
        print(plays_with_multiple_qbs.game_play_id.unique())
        df_tracking = df_tracking.query('game_play_id not in @game_play_ids')
        df_game = df_game.query('game_id not in @game_ids')
        df_play = df_play.query('game_play_id not in @game_play_ids')
        df_player_play = df_player_play.query('game_play_id not in @game_play_ids')

    df_tracking = (
        df_tracking
        .rename(
            columns={
                'lateral_dist_to_ball':'dx_to_ball',
                'vertical_dist_to_ball':'dy_to_ball'
            }
        )
    )

    # create 'event_new' column with line_set and ball_snap events
    df_tracking = create_events(df_tracking)

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

  0%|          | 0/9 [00:00<?, ?it/s]

 11%|█         | 1/9 [00:07<00:57,  7.13s/it]



 33%|███▎      | 3/9 [00:24<00:48,  8.05s/it]



 56%|█████▌    | 5/9 [00:40<00:32,  8.02s/it]



 67%|██████▋   | 6/9 [00:46<00:22,  7.41s/it]

Week 7 game_play_ids with multiple QBs:
['2022102305_430']


 89%|████████▉ | 8/9 [01:00<00:07,  7.28s/it]



100%|██████████| 9/9 [01:14<00:00,  8.26s/it]


In [4]:
# Drop plays with fewer than 5 offensive line players
for wk in range(1,10):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

    offensive_line = ['T','G','C']

    # list plays with fewer than 5 offensive line players
    o_line = df_tracking[
        df_tracking['position'].isin(offensive_line) &
    (df_tracking.frame_type == "SNAP")
    ]
    o_line = o_line.groupby('game_play_id').size().reset_index(name='count')
    o_line = o_line[o_line['count'] != 5]
    drop_gid = o_line.game_play_id.unique().tolist()

    if len(drop_gid) == 0:
        print("Wk {wk}: No plays without exactly 5 offensive linemen.")
    else:
        print(f"Wk {wk}: Dropping {len(drop_gid)} plays without exactly 5 offensive linemen.")
        df_tracking = df_tracking[~df_tracking.game_play_id.isin(drop_gid)]
        df_play = df_play[~df_play.game_play_id.isin(drop_gid)]
        df_player_play = df_player_play[~df_player_play.game_play_id.isin(drop_gid)]
        df_game = df_game[~df_game.game_id.isin(df_play.game_id.unique())]

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

Wk 1: Dropping 22 plays without exactly 5 offensive linemen.
Wk 2: Dropping 38 plays without exactly 5 offensive linemen.
Wk 3: Dropping 33 plays without exactly 5 offensive linemen.
Wk 4: Dropping 46 plays without exactly 5 offensive linemen.
Wk 5: Dropping 43 plays without exactly 5 offensive linemen.
Wk 6: Dropping 31 plays without exactly 5 offensive linemen.
Wk 7: Dropping 29 plays without exactly 5 offensive linemen.
Wk 8: Dropping 46 plays without exactly 5 offensive linemen.
Wk 9: Dropping 26 plays without exactly 5 offensive linemen.


In [5]:
for wk in range(1,10):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

    # Drop offensive_formation == WILDCAT plays
    wildcat_plays = df_play[
        df_play['offense_formation'] == "WILDCAT"
    ].game_play_id.unique().tolist()

    # Check for other plays where QB is outside of lineman box (Taysom Hill Wildcat Plays)
    last_line_set_event = (
        df_tracking[df_tracking.event_new == 'line_set']
        .groupby('game_play_id')
        .frame_id.max()
        .reset_index()
        [['game_play_id','frame_id']]
        .rename(columns={'frame_id': 'line_set_fid'})
    )
    lineman_and_qb = df_tracking.query('position in ["T","G","C","QB"]')
    lineman_and_qb = lineman_and_qb.merge(last_line_set_event, on='game_play_id')
    lineman_and_qb = lineman_and_qb[lineman_and_qb.frame_id == lineman_and_qb.line_set_fid]
    min_x = (
        lineman_and_qb
        .query('position in ["T","G","C"]')
        .groupby('game_play_id')
        .agg(min_x=('x','min'))
        .reset_index()
    )
    max_x = (
        lineman_and_qb
        .query('position in ["T","G","C"]')
        .groupby('game_play_id')
        .agg(max_x=('x','max'))
        .reset_index()
    )
    lineman_and_qb = (
        lineman_and_qb
        .query('position == "QB"')
        [['game_play_id','frame_id','club','display_name','x']]
        .merge(min_x, on='game_play_id')
        .merge(max_x, on='game_play_id')
    )

    wildcat_plays += lineman_and_qb.query('x < min_x or x > max_x').game_play_id.unique().tolist()

    if len(wildcat_plays) == 0:
        print("Wk {wk}: No WILDCAT plays to drop")
    else:
        print(f"Wk {wk}: Dropping {len(wildcat_plays)} WILDCAT plays")
        print(wildcat_plays)
        df_tracking = df_tracking[~df_tracking.game_play_id.isin(wildcat_plays)]
        df_play = df_play[~df_play.game_play_id.isin(wildcat_plays)]
        df_player_play = df_player_play[~df_player_play.game_play_id.isin(wildcat_plays)]
        df_game = df_game[~df_game.game_id.isin(df_play.game_id.unique())]

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

Wk 1: Dropping 9 WILDCAT plays
['2022091113_291', '2022091105_818', '2022091113_291', '2022091105_818', '2022091100_546', '2022091100_594', '2022091100_762', '2022091100_2114', '2022091100_2512']
Wk 2: Dropping 6 WILDCAT plays
['2022091800_1233', '2022091800_515', '2022091811_2418', '2022091805_580', '2022091805_580', '2022091804_2685']
Wk 3: Dropping 3 WILDCAT plays
['2022092510_1683', '2022092509_2829', '2022092509_2829']
Wk 4: Dropping 12 WILDCAT plays
['2022100208_3365', '2022100213_614', '2022100208_3397', '2022100208_3340', '2022100213_1652', '2022100212_2238', '2022100213_614', '2022100208_3340', '2022100208_3365', '2022100208_3397', '2022100200_1162', '2022100200_3139']
Wk 5: Dropping 12 WILDCAT plays
['2022100900_3273', '2022100900_1460', '2022100909_2711', '2022100907_2013', '2022100907_1608', '2022100900_1669', '2022100909_2711', '2022100906_235', '2022100906_2763', '2022100900_1460', '2022100900_1669', '2022100900_3273']
Wk 6: Dropping 25 WILDCAT plays
['2022101610_2955', '

In [6]:
# Label Offensive line positions

MOVEMENT_THRESHOLD = 1

for wk in range(1,10):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))
    
    cols = ['game_play_id','frame_id','frame_type','nfl_id','position','display_name','x','y','event_new']
    lineman_and_qb = df_tracking[
        (df_tracking.frame_type == 'BEFORE_SNAP') & 
        df_tracking.offense & 
        df_tracking.position.isin(offensive_line + ['QB'])
    ][cols]

    last_line_set_event = (
        df_tracking[df_tracking.event_new == 'line_set']
        .groupby('game_play_id')
        .frame_id.max()
        .reset_index()
        .rename(columns={'frame_id': 'line_set_fid'})
        [['game_play_id','line_set_fid']]
    )
    lineman_and_qb = (
        lineman_and_qb
        .merge(last_line_set_event, on='game_play_id')
        .query('frame_id == line_set_fid')
    )

    lineman_and_qb['farthest_forward_lineman'] = (
        lineman_and_qb
        .query('position != "QB"')
        .groupby('game_play_id')
        .y.transform('max')
    )
    lineman_and_qb['farthest_forward_lineman'] = np.where(
        lineman_and_qb.farthest_forward_lineman == lineman_and_qb.y,
        True,
        False
    )
    x_qb = (
        lineman_and_qb
        .query('position == "QB"')
        [['game_play_id','x']]
        .rename(columns={'x': 'x_qb'})
    )
    lineman_and_qb = lineman_and_qb.merge(x_qb, on='game_play_id')
    lineman_and_qb['dx'] = np.abs(lineman_and_qb.x - lineman_and_qb.x_qb)
    lineman_and_qb['closest_to_qb'] = (
        lineman_and_qb
        .query('position != "QB"')
        .groupby('game_play_id')
        .dx.transform('min')
    )
    lineman_and_qb['closest_to_qb'] = np.where(
        lineman_and_qb.closest_to_qb == lineman_and_qb.dx,
        True,
        False
    )
    ball = (
        df_tracking.query('club == "football"')
        [['game_play_id','frame_id','x','y']]
        .rename(columns={'x': 'x_ball', 'y': 'y_ball'})
        .merge(last_line_set_event, on='game_play_id')
        .query('frame_id == line_set_fid')
        .drop(columns=['frame_id','line_set_fid'])
    )
    lineman_and_qb = lineman_and_qb.merge(ball, on='game_play_id')
    lineman_and_qb['delta_ball'] = np.sqrt(
        (lineman_and_qb.x - lineman_and_qb.x_ball)**2 + 
        (lineman_and_qb.y - lineman_and_qb.y_ball)**2
    )
    lineman_and_qb['closest_to_ball'] = (
        lineman_and_qb
        .query('position != "QB"')
        .groupby('game_play_id')
        .delta_ball.transform('min')
    )
    lineman_and_qb['closest_to_ball'] = np.where(
        lineman_and_qb.closest_to_ball == lineman_and_qb.delta_ball,
        True,
        False
    )

    # number the lineman from left to right by x position
    lineman_and_qb = lineman_and_qb.sort_values(['game_play_id','x'])
    position_map = {
        0: 'LT',
        1: 'LG',
        2: 'C',
        3: 'RG',
        4: 'RT'
    }
    lineman_and_qb.loc[lineman_and_qb.position.isin(['T','C','G']),'position_by_loc'] = (
        lineman_and_qb
        .query('position != "QB"')
        .groupby('game_play_id')
        .cumcount()
    ).replace(position_map)    

    lineman_and_qb['is_center'] = np.where(
        (lineman_and_qb.closest_to_qb & (lineman_and_qb.position_by_loc=="C")), #| 
        # (lineman_and_qb.closest_to_qb & lineman_and_qb.closest_to_ball),
        True,
        False
    )

    plays_without_balanced_offensive_line = (
        lineman_and_qb
        .groupby('game_play_id')
        .is_center.sum().reset_index()
        .query('is_center != 1')
        .game_play_id.values.tolist()
    )

    if len(plays_without_balanced_offensive_line) != 0:
        print(f"Wk {wk}: Dropping {len(plays_without_balanced_offensive_line)} plays without a balanced offensive line.")
        print(plays_without_balanced_offensive_line)
        df_tracking = df_tracking[~df_tracking.game_play_id.isin(plays_without_balanced_offensive_line)]
        df_play = df_play[~df_play.game_play_id.isin(plays_without_balanced_offensive_line)]
        df_player_play = df_player_play[~df_player_play.game_play_id.isin(plays_without_balanced_offensive_line)]
        df_game = df_game[~df_game.game_id.isin(df_play.game_id.unique())]
    else:
        print(f"Wk {wk}: All plays have a balanced offensive line.")

    # Create a new column for the position of the player based on the location of the player
    df_tracking = (
        df_tracking
        .merge(
            lineman_and_qb[['game_play_id','nfl_id','position_by_loc']].drop_duplicates(),
            on=['game_play_id','nfl_id'],
            how='left'
        )
    )
    df_tracking['position_by_loc'] = np.where(
        df_tracking.position_by_loc.isna(),
        df_tracking.position,
        df_tracking.position_by_loc
    )

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_game.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'games_final.pkl'))
    df_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))
    df_player_play.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'player_play_final.pkl'))

Wk 1: Dropping 11 plays without a balanced offensive line.
['2022091100_3379', '2022091100_996', '2022091101_2365', '2022091101_2386', '2022091104_4410', '2022091104_999', '2022091105_1351', '2022091105_294', '2022091109_4474', '2022091109_656', '2022091111_2901']
Wk 2: Dropping 2 plays without a balanced offensive line.
['2022091803_2300', '2022091901_1082']
Wk 3: Dropping 1 plays without a balanced offensive line.
['2022092509_1716']
Wk 4: Dropping 6 plays without a balanced offensive line.
['2022100201_2798', '2022100209_201', '2022100209_2537', '2022100209_3196', '2022100209_501', '2022100213_3913']
Wk 5: Dropping 4 plays without a balanced offensive line.
['2022100903_1614', '2022100903_2758', '2022100906_717', '2022100911_1524']
Wk 6: Dropping 5 plays without a balanced offensive line.
['2022101600_2125', '2022101603_2354', '2022101603_3535', '2022101603_3601', '2022101606_56']
Wk 7: Dropping 9 plays without a balanced offensive line.
['2022102300_3044', '2022102300_834', '202210

In [7]:
df_tracking.head()

Unnamed: 0,game_id,play_id,game_play_id,nfl_id,week,display_name,frame_id,frame_type,time,jersey_number,club,play_direction,x,y,s,a,dis,o,dir,event,position,absolute_yardline_number,yards_to_go,offense,defense,ball_x,ball_y,euclidean_dist_to_ball,dx_to_ball,dy_to_ball,event_new,position_by_loc
0,2022110700,80,2022110700_80,33131.0,9,Calais Campbell,1,BEFORE_SNAP,2022-11-08 01:16:46.4,93.0,BAL,left,31.73,53.89,0.05,0.05,0.02,241.25,281.88,huddle_break_offense,DE,50,10,False,True,29.92,49.849998,4.42693,1.81,4.040002,,DE
1,2022110700,80,2022110700_80,33131.0,9,Calais Campbell,2,BEFORE_SNAP,2022-11-08 01:16:46.5,93.0,BAL,left,31.72,53.88,0.05,0.05,0.01,240.67,276.83,,DE,50,10,False,True,29.92,49.849998,4.413719,1.8,4.030002,,DE
2,2022110700,80,2022110700_80,33131.0,9,Calais Campbell,3,BEFORE_SNAP,2022-11-08 01:16:46.6,93.0,BAL,left,31.7,53.87,0.04,0.04,0.02,240.67,269.27,,DE,50,10,False,True,29.92,49.849998,4.396454,1.78,4.020002,,DE
3,2022110700,80,2022110700_80,33131.0,9,Calais Campbell,4,BEFORE_SNAP,2022-11-08 01:16:46.7,93.0,BAL,left,31.68,53.86,0.04,0.04,0.02,240.67,262.17,,DE,50,10,False,True,29.92,49.860001,4.37008,1.76,3.999999,,DE
4,2022110700,80,2022110700_80,33131.0,9,Calais Campbell,5,BEFORE_SNAP,2022-11-08 01:16:46.8,93.0,BAL,left,31.67,53.85,0.04,0.04,0.01,240.03,258.52,,DE,50,10,False,True,29.92,49.860001,4.356902,1.75,3.989999,,DE


# Look into weird ball snap plays
Did not end up dropping, as the ball position is not used in the classification

In [8]:
# import matplotlib.pyplot as plt
# from matplotlib.animation import FuncAnimation
# from IPython.display import HTML

# def plot_play(
#     df_tracking, 
#     game_play_id, 
#     every_other_frame=True, 
#     event_col='event',
#     highlight_lineman=False,
#     highlight_qb=False
# ) -> HTML:
#     qry = 'game_play_id==@game_play_id'
#     tracking_play = df_tracking.query(qry).copy().reset_index(drop=True)

#     # Kepe every other frame, the first and last frames, and frames with events
#     first_frame = tracking_play['frame_id'].min()
#     last_frame = tracking_play['frame_id'].max()
#     frames_with_events = tracking_play.groupby('frame_id')[event_col].transform('any')

#     if every_other_frame:
#         tracking_play = tracking_play[
#             (tracking_play['frame_id'] == first_frame) | 
#             (tracking_play['frame_id'] == last_frame) | 
#             (frames_with_events) |
#             (tracking_play['frame_id'] % 2 == 0)  # Keep even frames only
#         ].copy().reset_index(drop=True)

#     frames = tracking_play['frame_id'].unique()
#     current_event = [None]  

#     field_width = 53.3

#     fig, ax = plt.subplots(figsize=(10, 5))

#     padding = 2
#     min_y = tracking_play.y.min() - padding
#     max_y = tracking_play.y.max() + padding

#     los = tracking_play['absolute_yardline_number'].iloc[0]
#     to_go_line = los + tracking_play['yards_to_go'].iloc[0]

#     def update(frame_id):
#         """Update function for each animation frame."""
#         ax.clear()

#         # can you make the field a light grey color?
#         ax.set_facecolor('lightgrey')
        
#         ax.set_yticks(np.arange(10, 110+1, 5))
#         ax.grid(which='major', axis='y', linestyle='-', linewidth='0.5', color='black', zorder=1)

#         for spine in ax.spines.values():
#             spine.set_visible(False)

#         ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

#         ax.set_xlim(0, field_width)
#         ax.set_ylim(min_y, max_y)

#         current_frame = tracking_play.query('frame_id == @frame_id')
#         if highlight_lineman and highlight_qb:
#             offense = current_frame.query('offense and position not in ["QB","T","G","C"]')
#             lineman = current_frame.query('offense and position in ["T","G","C"]')
#             qb = current_frame.query('offense and position == "QB"')
#         elif highlight_lineman:
#             offense = current_frame.query('offense and position not in ["T","G","C"]')
#             lineman = current_frame.query('offense and position in ["T","G","C"]')
#         elif highlight_qb:
#             offense = current_frame.query('offense and position != "QB"')
#             qb = current_frame.query('offense and position == "QB"')
#         else:
#             offense = current_frame.query('offense')
#         defense = current_frame.query('~offense and club != "football"')
#         football = current_frame.query('club == "football"')

#         # plot the los in blue
#         ax.axhline(los, color='blue', linewidth=1.2, linestyle='-', zorder=1)

#         # plot the line to go
#         ax.axhline(to_go_line, color='yellow', linewidth=1.2, linestyle='-', zorder=1)

#         # event = current_frame[event_col].iloc[0] if not current_frame[event_col].isna().all() else None
#         # if event:
#         #     current_event[0] = event

#         ax.scatter(offense.x, offense.y, c='red', edgecolor='black', label='Offense', zorder=2)
#         ax.scatter(defense.x, defense.y, c='blue', edgecolor='black', label='Defense', zorder=2)
#         ax.scatter(football.x, football.y, c='brown', edgecolor='black', label='Football', s=20, zorder=3)

#         if highlight_lineman:
#             ax.scatter(lineman.x, lineman.y, c='green', edgecolor='black', label='Lineman', zorder=2)
#         if highlight_qb:
#             ax.scatter(qb.x, qb.y, c='purple', edgecolor='black', label='QB', zorder=2)

#         event = current_frame[event_col].iloc[0]
#         box_color = 'white'  # Default color
#         alpha_value = 0.8    # Transparency value
        
#         # Set box color based on event type
#         if event == 'line_set':
#             box_color = 'green'
#             alpha_value = 0.5  # More transparency for the green box
#         elif event == 'ball_snap':
#             box_color = 'red'
#             alpha_value = 0.5  # More transparency for the red box
        
#         ax.text(
#             1, max_y + 1.5, f"{event}",
#             fontsize=12, ha='left', color='black',
#             bbox=dict(facecolor=box_color, alpha=alpha_value),
#             zorder=4
#         )

#         ax.text(52.3, max_y + 1.5, f"{frame_id / 10:.01f} s", fontsize=12, ha='right', color='black', 
#                 bbox=dict(facecolor='white', alpha=0.8), zorder=4)

#     ani = FuncAnimation(fig, update, frames=frames, interval=100, repeat=False)

#     plt.subplots_adjust(left=0, right=1, bottom=0, top=.9)

#     plt.close(fig)

#     return HTML(ani.to_jshtml(fps=5))

In [9]:
# import numpy as np
# import pandas as pd

# MOVEMENT_THRESHOLD = 1

# ball_tracking = df_tracking[(df_tracking.club == 'football') & (df_tracking.frame_type == 'BEFORE_SNAP')]

# snap_frames = df_tracking[df_tracking.frame_type == 'SNAP'][['game_play_id', 'frame_id']].rename(columns={'frame_id': 'ball_snap_fid'})
# ball_tracking = ball_tracking.merge(snap_frames, on='game_play_id')

# last_line_set_event = (
#     df_tracking[df_tracking.event_new == 'line_set']
#     .groupby('game_play_id')
#     .frame_id.max()
#     .reset_index()
#     .rename(columns={'frame_id': 'line_set_fid'})
# )
# ball_tracking = ball_tracking.merge(last_line_set_event, on='game_play_id')

# # Filter frames up to 3 frames before the ball_snap_fid
# ball_tracking = ball_tracking[
#     (ball_tracking['frame_id'] <= (ball_tracking['ball_snap_fid'] - 2)) &
#     (ball_tracking['frame_id'] >= ball_tracking['line_set_fid'])
# ]

# # Calculate max and min for x and y for each game_play_id
# ball_movement = ball_tracking.groupby('game_play_id').agg({'x': ['min', 'max'], 'y': ['min', 'max']}).reset_index()
# ball_movement.columns = ['game_play_id', 'x_min', 'x_max', 'y_min', 'y_max']

# # Calculate total movement and filter out excessive movements
# ball_movement['delta_x'] = (ball_movement['x_max'] - ball_movement['x_min'])
# ball_movement['delta_y'] = (ball_movement['y_max'] - ball_movement['y_min'])
# ball_movement['total_movement'] = np.sqrt(ball_movement['delta_x']**2 + ball_movement['delta_y']**2)
# drop_gids = ball_movement[
#     (ball_movement['total_movement'] >= MOVEMENT_THRESHOLD)
# ].game_play_id.tolist()

# print(drop_gids)

In [10]:
# gpid = '2022110300_221'
# plot_play(df_tracking, gpid, event_col='event_new', highlight_lineman=True, highlight_qb=True)

In [11]:
# Check that there is a line_set event for each play
for wk in range(1,10):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))

    line_set_check = df_tracking[df_tracking['event_new'] == 'line_set'].game_play_id.nunique() == df_play.game_play_id.nunique()
    ball_snap_check = df_tracking[df_tracking['event_new'] == 'ball_snap'].game_play_id.nunique() == df_play.game_play_id.nunique()

    print(f'Week {wk}')
    print(f'\tThere is a line_set event for each play: {line_set_check}')
    print(f'\tThere is a ball_snap event for each play: {ball_snap_check}')

Week 1
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 2
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 3
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 4
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 5
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 6
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 7
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 8
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True
Week 9
	There is a line_set event for each play: True
	There is a ball_snap event for each play: True


# Classify motion player and motion frames

In [13]:
SPEED_THRESHOLD = 1.0
MOTION_THRESHOLD = 3
X_LAST_FRAMES_MOVING = 2

for wk in range(1,10):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    df_play = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'play_final.pkl'))

    df_motion = df_tracking.copy()
    df_snap_frames = df_motion[df_motion['frame_type'] == 'SNAP'].groupby('game_play_id')['frame_id'].first().reset_index()
    df_motion = df_motion.merge(df_snap_frames, on='game_play_id', suffixes=('', '_ball_snap'))
    del df_snap_frames
    df_motion = df_motion[
        (df_motion['frame_id_ball_snap'] - 10 <= df_motion['frame_id'] ) &
        (df_motion['frame_id'] <= df_motion['frame_id_ball_snap']) &
        df_motion.offense
    ]
    df_motion['frame_weight'] = np.where(
        df_motion['frame_id'] == df_motion['frame_id_ball_snap'],
        10,
        (df_motion['frame_id'] - df_motion['frame_id_ball_snap']) % 10
    )

    # groupby game_play_id and nfl_id. count the number of times their s is avoe SPEED_THRESHOLD
    df_motion['moving'] = df_motion['s'] >= SPEED_THRESHOLD
    df_motion['weighted_moving'] = df_motion['s'] * (1 + (df_motion['frame_weight'] * 0.1)) >= SPEED_THRESHOLD
    df_motion['moving_and_last_X_frames'] = np.where(
        (df_motion['frame_weight'] > 10 - X_LAST_FRAMES_MOVING) & df_motion['moving'],
        True,
        False
    )

    x_y_first_one_sec_before_snap = (
        df_motion[['game_play_id','nfl_id','x','y']]
        .drop_duplicates(['game_play_id','nfl_id'], keep='first')
        .rename(columns={'x': 'x_first_one_sec_before_snap', 'y': 'y_first_one_sec_before_snap'})
    )

    x_y_frame_before_snap = (
        df_motion[['game_play_id','nfl_id','x','y']]
        .drop_duplicates(['game_play_id','nfl_id'], keep='last')
        .rename(columns={'x': 'x_frame_before_snap', 'y': 'y_frame_before_snap'})
    )

    df_player_in_motion = (
        df_motion.groupby(['game_play_id', 'nfl_id'])
        .agg(
            num_frames_moving=('moving', 'sum'), 
            num_frames_moving_weighted=('weighted_moving', 'sum'),
            num_frames_moving_and_last_X_frames=('moving_and_last_X_frames', 'sum'),
            avg_speed=('s', 'mean')
        ).reset_index()
    )

    df_player_in_motion = df_player_in_motion.merge(x_y_first_one_sec_before_snap, on=['game_play_id','nfl_id'])
    df_player_in_motion = df_player_in_motion.merge(x_y_frame_before_snap, on=['game_play_id','nfl_id'])
    df_player_in_motion['euclidean_distance'] = np.sqrt(
        (df_player_in_motion.x_first_one_sec_before_snap - df_player_in_motion.x_frame_before_snap)**2 +
        (df_player_in_motion.y_first_one_sec_before_snap - df_player_in_motion.y_frame_before_snap)**2
    )
    df_player_in_motion.drop(
        columns=['x_first_one_sec_before_snap','y_first_one_sec_before_snap',
                'x_frame_before_snap','y_frame_before_snap'], 
        inplace=True
    )

    df_player_in_motion['in_motion'] = np.where(
        (
            (df_player_in_motion['num_frames_moving'] >= MOTION_THRESHOLD) |
            (df_player_in_motion['num_frames_moving_weighted'] >= MOTION_THRESHOLD) |
            (df_player_in_motion['num_frames_moving_and_last_X_frames'] >= X_LAST_FRAMES_MOVING)
        ) &
        (df_player_in_motion['num_frames_moving'] > 1),
        True,
        False
    )

    # Filter by eligible positions
    df_player_in_motion = df_player_in_motion.merge(df_player[['nfl_id', 'position','display_name']], on='nfl_id')
    motion_eligaible_positions = ['WR', 'TE', 'RB', 'FB']
    df_player_in_motion['in_motion'] = np.where(
        (
            df_player_in_motion.in_motion & 
            (
                df_player_in_motion.position.isin(motion_eligaible_positions) |
                (df_player_in_motion.display_name == 'Taysom Hill')
            )
        ),
        True, 
        False
    )

    # If a play has multiple players in motion, keep the player with the highest speed
    gid_multiple_players_in_motion = (
        df_player_in_motion
        .query('in_motion')
        .game_play_id.value_counts()
        .reset_index()
        .query('game_play_id > 1')
        ['index'].tolist()
    )
    for gid in gid_multiple_players_in_motion:
        players_in_motion = df_player_in_motion.query('game_play_id == @gid and in_motion').copy()
        max_speed = players_in_motion.avg_speed.max()
        players_in_motion['in_motion'] = np.where(
            players_in_motion.avg_speed == max_speed,
            True,
            False
        )
        df_player_in_motion.loc[players_in_motion.index, 'in_motion'] = players_in_motion['in_motion']

    df_player_in_motion = (
        df_player_in_motion.query('in_motion')
        .drop(columns=[
            'position','display_name','num_frames_moving','num_frames_moving_weighted',
            'num_frames_moving_and_last_X_frames','avg_speed','euclidean_distance'
        ]).rename(columns={'in_motion':'motion_player'})
    )

    if 'motion_player' in df_tracking.columns:
        df_tracking.drop(columns='motion_player', inplace=True)

    df_tracking = df_tracking.merge(df_player_in_motion, on=['game_play_id', 'nfl_id'], how='left')
    df_tracking['motion_player'] = df_tracking['motion_player'].fillna(False)

    pct_motion = df_player_in_motion.query('motion_player').shape[0] / df_play.game_play_id.nunique()
    print(f'Week {wk}: {pct_motion:.2%} of run plays have a player in motion.')

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))

Week 1: 44.26% of run plays have a player in motion.
Week 2: 41.94% of run plays have a player in motion.
Week 3: 39.52% of run plays have a player in motion.
Week 4: 39.27% of run plays have a player in motion.
Week 5: 36.79% of run plays have a player in motion.
Week 6: 40.58% of run plays have a player in motion.
Week 7: 43.30% of run plays have a player in motion.
Week 8: 42.02% of run plays have a player in motion.
Week 9: 41.27% of run plays have a player in motion.


In [14]:
for wk in tqdm(range(1,10)):
    df_tracking = pd.read_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))
    
    if 'ball_snap_fid' in df_tracking.columns:
        df_tracking.drop(columns='ball_snap_fid', inplace=True)

    if 'oline_box_left' in df_tracking.columns:
        df_tracking.drop(columns='oline_box_left', inplace=True)

    if 'oline_box_right' in df_tracking.columns:
        df_tracking.drop(columns='oline_box_right', inplace=True)

    if 'last_line_set_fid' in df_tracking.columns:
        df_tracking.drop(columns='last_line_set_fid', inplace=True)

    if 'first_line_set_fid' in df_tracking.columns:
        df_tracking.drop(columns='first_line_set_fid', inplace=True)

    if 'center_x_at_line_set' in df_tracking.columns:
        df_tracking.drop(columns='center_x_at_line_set', inplace=True)
        
    last_line_set_event = (
        df_tracking[df_tracking.event_new == 'line_set']
        [['game_play_id','frame_id']]
        .groupby('game_play_id')
        .frame_id.max()
        .reset_index()
        .rename(
            columns={
                'frame_id': 'last_line_set_fid',
            }
        )  
    )

    oline_box = (
        df_tracking
        .query('position_by_loc in ["LT","RT"]')
        .merge(last_line_set_event, on='game_play_id')
        .query('frame_id == last_line_set_fid')
        [['game_play_id','last_line_set_fid','position_by_loc','x']]
    )

    ball_snap_fid = (
        df_tracking[df_tracking.event_new == 'ball_snap']
        [['game_play_id','frame_id']]
        .rename(columns={'frame_id': 'ball_snap_fid'})
        .drop_duplicates()
    )

    df_tracking = df_tracking.merge(ball_snap_fid, on='game_play_id', how='left')

    # Add oline_box_left
    df_tracking = (
        df_tracking
        .merge(
            (
                oline_box
                .query('position_by_loc=="LT"')
                [['game_play_id','last_line_set_fid','x']]
                .rename(columns={'x':'oline_box_left'})
            ), 
            on='game_play_id', 
            how='left'
        )
    )

    # Add oline_box_right
    df_tracking = (
        df_tracking
        .merge(
            (
                oline_box
                .query('position_by_loc=="RT"')
                [['game_play_id','x']]
                .rename(columns={'x':'oline_box_right'})
            ),
            on='game_play_id',
            how='left'
        )
    )

    # Add center_x_at_line_set
    df_tracking = (
        df_tracking
        .merge(
            (
                df_tracking
                .query('frame_id == last_line_set_fid and position_by_loc == "C"')
                .rename(columns={'x':'center_x_at_line_set'})
                [['game_play_id','center_x_at_line_set']]
                .drop_duplicates()
            ),
            on='game_play_id',
            how='left'
        )
    )

    first_line_set_event = (
        df_tracking[df_tracking.event_new == 'line_set']
        [['game_play_id','frame_id']]
        .groupby('game_play_id')
        .frame_id.min()
        .reset_index()
        .rename(
            columns={
                'frame_id': 'first_line_set_fid',
            }
        )
    )

    df_tracking = df_tracking.merge(first_line_set_event, on='game_play_id', how='left')

    extra_on_oline = df_tracking.query('last_line_set_fid == frame_id')
    y_min_oline_left = extra_on_oline.query('position_by_loc.isin(["LT","LG","C"])').groupby('game_play_id').y.min().reset_index()
    y_min_oline_right = extra_on_oline.query('position_by_loc.isin(["RT","RG","C"])').groupby('game_play_id').y.min().reset_index()
    extra_on_oline = extra_on_oline.merge(y_min_oline_left, on='game_play_id', suffixes=('','_min_oline_left'))
    extra_on_oline = extra_on_oline.merge(y_min_oline_right, on='game_play_id', suffixes=('','_min_oline_right'))
    extra_on_oline = extra_on_oline.query(
        'offense and ' +\
        '((x < oline_box_left and x > oline_box_left - 2 and y_min_oline_left - 0.1 < y) or ' +\
        ' (x > oline_box_right and x < oline_box_right + 2 and y_min_oline_right - 0.1 < y))'
    ).drop_duplicates()[['game_play_id','nfl_id','x','y_min_oline_left','y_min_oline_right','oline_box_left','oline_box_right']]

    extra_on_oline['extra_on_oline'] = True
    extra_on_oline['extra_oline_box_left'] = np.where(
        extra_on_oline['x'] < extra_on_oline['oline_box_left'],
        extra_on_oline['x'],
        None
    )
    extra_on_oline['extra_oline_box_right'] = np.where(
        extra_on_oline['x'] > extra_on_oline['oline_box_right'],
        extra_on_oline['x'],
        None
    )

    # Merge extra oline data to the tracking data
    if 'extra_on_oline' in df_tracking.columns:
        df_tracking.drop(columns='extra_on_oline', inplace=True)
    if 'extra_oline_box_left' in df_tracking.columns:
        df_tracking.drop(columns='extra_oline_box_left', inplace=True)
    if 'extra_oline_box_right' in df_tracking.columns:
        df_tracking.drop(columns='extra_oline_box_right', inplace=True)
    if 'y_min_oline_left' in df_tracking.columns:
        df_tracking.drop(columns='y_min_oline_left', inplace=True)
    if 'y_min_oline_right' in df_tracking.columns:
        df_tracking.drop(columns='y_min_oline_right', inplace=True)
    df_tracking = df_tracking.merge(
        extra_on_oline[['game_play_id','nfl_id','y_min_oline_left','y_min_oline_right',
                        'extra_on_oline','extra_oline_box_left','extra_oline_box_right']],
        on=['game_play_id','nfl_id'], 
        how='left'
    )
    df_tracking['extra_oline_box_left'] = df_tracking.groupby('game_play_id')['extra_oline_box_left'].transform('max')
    df_tracking['extra_oline_box_right'] = df_tracking.groupby('game_play_id')['extra_oline_box_right'].transform('max')
    df_tracking['extra_oline_box_left'] = df_tracking['extra_oline_box_left'].fillna(df_tracking['oline_box_left'])
    df_tracking['extra_oline_box_right'] = df_tracking['extra_oline_box_right'].fillna(df_tracking['oline_box_right'])
    df_tracking['extra_on_oline'] = df_tracking['extra_on_oline'].fillna(False)

    # Repeat the process for extra oline outside of the additional oline box
    extra_on_oline = df_tracking.query('extra_on_oline and (last_line_set_fid == frame_id)')
    extra_on_oline = extra_on_oline.query(
        'offense and ' +\
        '((~extra_oline_box_left.isna() and x < extra_oline_box_left and x > extra_oline_box_left - 2 and y > y_min_oline_left - 0.1) or ' +\
        ' (~extra_oline_box_right.isna() and x > extra_oline_box_right and x < extra_oline_box_right + 2 and y > y_min_oline_right - 0.1))'
    ).drop_duplicates()[['game_play_id','nfl_id','x','y_min_oline_left','y_min_oline_right','extra_oline_box_left','extra_oline_box_right']]

    extra_on_oline['extra_on_oline'] = True
    extra_on_oline['extra_oline_box_left'] = np.where(
        extra_on_oline['x'] < extra_on_oline['extra_oline_box_left'],
        extra_on_oline['x'],
        None
    )
    extra_on_oline['extra_oline_box_right'] = np.where(
        extra_on_oline['x'] > extra_on_oline['extra_oline_box_right'],
        extra_on_oline['x'],
        None
    )

    if not extra_on_oline.empty:
        df_tracking = df_tracking.merge(
            extra_on_oline[['game_play_id','nfl_id',
                            'extra_on_oline','extra_oline_box_left','extra_oline_box_right']],
            on=['game_play_id','nfl_id'], 
            how='left',
            suffixes=('','_x')
        )

        df_tracking['extra_oline_box_left_x'] = df_tracking.groupby('game_play_id')['extra_oline_box_left_x'].transform('max')
        df_tracking['extra_oline_box_right_x'] = df_tracking.groupby('game_play_id')['extra_oline_box_right_x'].transform('max')
        df_tracking['extra_on_oline_x'] = df_tracking['extra_on_oline_x'].fillna(False)

        df_tracking['extra_on_oline'] = df_tracking['extra_on_oline'] | df_tracking['extra_on_oline_x']
        df_tracking['extra_oline_box_left'] = df_tracking['extra_oline_box_left'].fillna(df_tracking['extra_oline_box_left_x'])
        df_tracking['extra_oline_box_right'] = df_tracking['extra_oline_box_right'].fillna(df_tracking['extra_oline_box_right_x'])

        df_tracking.drop(columns=['extra_on_oline_x','extra_oline_box_left_x','extra_oline_box_right_x'], inplace=True)

    df_tracking.to_pickle(join(PROCESSED_DATA_PATH, f'wk{wk}', 'tracking_final.pkl'))

100%|██████████| 9/9 [01:13<00:00,  8.18s/it]
