In [1]:
from os.path import join
from tqdm import tqdm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from py import util

# Remove pandas' display limits so DataFrame output shows every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Directory (relative path) that the input CSV files are read from.
DATA_DIR = 'data'

## 1. Load the data

In [2]:
# Weeks to load; range(1, 2) contains only week 1.
WEEKS = range(1,2)

In [3]:
# Read the schedule, normalise its column names with the project helper
# (presumably camelCase -> snake_case; see util.uncamelcase_columns), and keep
# only the games played in the selected weeks.
games_path = join(DATA_DIR, 'games.csv')
games = util.uncamelcase_columns(pd.read_csv(games_path))
in_selected_weeks = games['week'].isin(WEEKS)
games = games.loc[in_selected_weeks].reset_index(drop=True)

# Game ids are reused below to filter the play-level tables.
game_ids = games['game_id'].unique()
games.head()

Unnamed: 0,game_id,season,week,game_date,game_time_eastern,home_team_abbr,visitor_team_abbr,home_final_score,visitor_final_score
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23


In [4]:
# Columns kept from plays.csv once the run-play filters have been applied.
cols = ['game_id', 'play_id', 'play_description', 'quarter', 'down',
       'yards_to_go', 'possession_team', 'defensive_team', 'game_clock', 
       'pre_snap_home_score', 'pre_snap_visitor_score', 
       'absolute_yardline_number', 'pre_snap_home_team_win_probability',
       'pre_snap_visitor_team_win_probability', 'expected_points',
       'offense_formation', 'receiver_alignment', 'play_clock_at_snap',
       'rush_location_type',
       'yards_gained', 'home_team_win_probability_added',
       'visitor_team_win_probility_added', 'expected_points_added',
       'pff_run_concept_primary',
       'pff_pass_coverage', 'pff_man_zone']

# Row filters, applied one after another in this order.
run_play_filters = [
    # plays from the selected games only
    'game_id.isin(@game_ids)',
    # a labelled run concept that is neither a trick play nor undefined
    'pff_run_concept_primary.notnull() and ~pff_run_concept_primary.isin(["TRICK","UNDEFINED"])',
    # not wiped out by a penalty, and yardage not altered by one
    'play_nullified_by_penalty == "N"',
    'yards_gained == pre_penalty_yards_gained',
    # no kneels, sneaks or spikes
    'qb_kneel == 0 and qb_sneak != True and qb_spike != True',
    # designed runs only
    'pass_result.isna() and is_dropback == False',
    # no run-pass options
    'pff_run_pass_option == 0',
    'dropback_type.isna()',
]

plays = util.uncamelcase_columns(pd.read_csv(join(DATA_DIR, 'plays.csv')))
for condition in run_play_filters:
    plays = plays.query(condition)
plays = plays.reset_index(drop=True)[cols]

# Composite "<game_id>_<play_id>" key, placed as the first column.
play_keys = plays['game_id'].astype(str) + '_' + plays['play_id'].astype(str)
plays.insert(0, 'game_play_id', play_keys)

game_play_ids = plays['game_play_id'].unique()

print(f'Number of non-RPO run plays: {len(game_play_ids)}')
print(plays.pff_run_concept_primary.value_counts())
plays.head()

Number of non-RPO run plays: 477
pff_run_concept_primary
OUTSIDE ZONE    156
INSIDE ZONE      96
MAN              93
PULL LEAD        44
COUNTER          35
POWER            29
TRAP             13
DRAW              9
FB RUN            2
Name: count, dtype: int64


Unnamed: 0,game_play_id,game_id,play_id,play_description,quarter,down,yards_to_go,possession_team,defensive_team,game_clock,pre_snap_home_score,pre_snap_visitor_score,absolute_yardline_number,pre_snap_home_team_win_probability,pre_snap_visitor_team_win_probability,expected_points,offense_formation,receiver_alignment,play_clock_at_snap,rush_location_type,yards_gained,home_team_win_probability_added,visitor_team_win_probility_added,expected_points_added,pff_run_concept_primary,pff_pass_coverage,pff_man_zone
0,2022091105_3712,2022091105,3712,(2:13) J.Taylor right guard to HST 15 for 14 y...,4,1,10,IND,HOU,02:13,20,13,81,0.778418,0.221582,3.682266,SINGLEBACK,3x1,20.0,INSIDE_LEFT,14,-0.064115,0.064115,1.109916,OUTSIDE ZONE,Cover-3,Zone
1,2022091104_1094,2022091104,1094,(12:56) D.Swift right end to DET 26 for 1 yard...,2,1,10,DET,PHI,12:56,7,7,35,0.395236,0.604764,1.608981,SINGLEBACK,2x2,5.0,INSIDE_RIGHT,1,-0.011404,0.011404,-0.539352,OUTSIDE ZONE,Quarters,Zone
2,2022091101_3923,2022091101,3923,(1:24) C.McCaffrey up the middle to CLV 15 for...,4,2,11,CAR,CLE,01:24,21,23,95,0.741719,0.258281,4.186713,SINGLEBACK,2x2,11.0,OUTSIDE_RIGHT,0,-0.094092,0.094092,-0.929233,INSIDE ZONE,Cover-1,Man
3,2022091100_501,2022091100,501,(7:46) A.Kamara left end to NO 28 for 5 yards ...,1,1,10,NO,ATL,07:46,3,0,87,0.381722,0.618278,1.365245,SINGLEBACK,3x1,19.0,INSIDE_LEFT,5,-0.007771,0.007771,0.087899,OUTSIDE ZONE,Cover-2,Zone
4,2022091110_729,2022091110,729,(3:20) (Shotgun) J.Conner up the middle to ARZ...,1,1,10,ARI,KC,03:20,0,14,35,0.075713,0.924287,1.270582,SHOTGUN,2x1,16.0,INSIDE_RIGHT,2,-0.002861,0.002861,-0.587148,OUTSIDE ZONE,Cover-3,Zone


In [5]:
# Load the tracking file of every selected week, keeping only rows that belong
# to the retained run plays. The filtered frames are collected in a list and
# concatenated once: growing a DataFrame with pd.concat inside the loop copies
# all previously loaded rows on every iteration, and concatenating onto an
# empty DataFrame is deprecated dtype-inference behaviour in recent pandas.
weekly_frames = []
for i in tqdm(WEEKS, desc='Loading tracking data', unit='file'):
    week_df = util.uncamelcase_columns(pd.read_csv(join(DATA_DIR, f'tracking_week_{i}.csv')))
    week_df.insert(
        0,
        'game_play_id',
        week_df['game_id'].astype(str) + '_' + week_df['play_id'].astype(str)
    )
    weekly_frames.append(
        week_df.query('game_play_id in @game_play_ids').drop(columns=['jersey_number'])
    )
    # Release the unfiltered week before the next file is read.
    del week_df

# With no weeks selected there is nothing to concatenate; fall back to an
# empty frame, matching the original starting value.
tracking = pd.concat(weekly_frames, ignore_index=True) if weekly_frames else pd.DataFrame()
del weekly_frames

# Project helper; from its name it presumably normalises play direction in both
# tables (see util.standardize_direction for the exact transformation).
tracking, plays = util.standardize_direction(tracking, plays)

tracking.head()

Loading tracking data:   0%|          | 0/1 [00:00<?, ?file/s]

Loading tracking data: 100%|██████████| 1/1 [00:09<00:00,  9.78s/file]


Unnamed: 0,game_play_id,game_id,play_id,nfl_id,display_name,frame_id,frame_type,time,club,play_direction,x,y,s,a,dis,o,dir,event
0,2022091200_64,2022091200,64,35459.0,Kareem Jackson,1,BEFORE_SNAP,2022-09-13 00:16:03.5,DEN,right,24.75,51.06,0.72,0.37,0.07,293.83,111.66,huddle_break_offense
1,2022091200_64,2022091200,64,35459.0,Kareem Jackson,2,BEFORE_SNAP,2022-09-13 00:16:03.6,DEN,right,24.73,51.13,0.71,0.36,0.07,294.59,108.79,
2,2022091200_64,2022091200,64,35459.0,Kareem Jackson,3,BEFORE_SNAP,2022-09-13 00:16:03.7,DEN,right,24.71,51.2,0.69,0.23,0.07,295.55,110.1,
3,2022091200_64,2022091200,64,35459.0,Kareem Jackson,4,BEFORE_SNAP,2022-09-13 00:16:03.8,DEN,right,24.68,51.26,0.67,0.22,0.07,295.55,112.02,
4,2022091200_64,2022091200,64,35459.0,Kareem Jackson,5,BEFORE_SNAP,2022-09-13 00:16:03.9,DEN,right,24.65,51.32,0.65,0.34,0.07,294.26,117.17,


In [6]:
# Keep only the frames at or after the ball snap of each play.
# First frame carrying the 'ball_snap' event, indexed by play.
is_snap_event = tracking['event'] == 'ball_snap'
ball_snap_frames = (
    tracking.loc[is_snap_event]
    .groupby('game_play_id')['frame_id']
    .min()
    .rename('ball_snap_frame_id')
)

# Look up each row's snap frame; plays without a snap event get NaN, so the
# comparison is False and those rows are dropped.
snap_frame_of_row = tracking['game_play_id'].map(ball_snap_frames)
tracking = tracking[tracking['frame_id'] >= snap_frame_of_row].reset_index(drop=True)
del is_snap_event, snap_frame_of_row

In [7]:
# Player master data (one row per nfl_id), with normalised column names.
players_path = join(DATA_DIR, 'players.csv')
players = util.uncamelcase_columns(pd.read_csv(players_path))
players.head()

Unnamed: 0,nfl_id,height,weight,birth_date,college_name,position,display_name
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
2,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
3,30842,6-6,267,1984-05-19,UCLA,TE,Marcedes Lewis
4,33084,6-4,217,1985-05-17,Boston College,QB,Matt Ryan


In [8]:
# Per-player, per-play statistics, restricted to the retained run plays.
player_play = util.uncamelcase_columns(pd.read_csv(join(DATA_DIR, 'player_play.csv')))

# Same "<game_id>_<play_id>" key that is used on the other tables.
player_play_keys = player_play['game_id'].astype(str) + '_' + player_play['play_id'].astype(str)
player_play.insert(0, 'game_play_id', player_play_keys)

is_retained_play = player_play['game_play_id'].isin(game_play_ids)
player_play = player_play[is_retained_play].reset_index(drop=True)
player_play.head()

Unnamed: 0,game_play_id,game_id,play_id,nfl_id,team_abbr,had_rush_attempt,rushing_yards,had_dropback,passing_yards,sack_yards_as_offense,had_pass_reception,receiving_yards,was_targetted_receiver,yardage_gained_after_the_catch,fumbles,fumble_lost,fumble_out_of_bounds,assisted_tackle,forced_fumble_as_defense,half_sack_yards_as_defense,pass_defensed,quarterback_hit,sack_yards_as_defense,safety_as_defense,solo_tackle,tackle_assist,tackle_for_a_loss,tackle_for_a_loss_yardage,had_interception,interception_yards,fumble_recoveries,fumble_recovery_yards,penalty_yards,penalty_names,was_initial_pass_rusher,caused_pressure,time_to_pressure_as_pass_rusher,get_off_time_as_pass_rusher,in_motion_at_ball_snap,shift_since_lineset,motion_since_lineset,was_running_route,route_ran,blocked_player_n_f_l_id1,blocked_player_n_f_l_id2,blocked_player_n_f_l_id3,pressure_allowed_as_blocker,time_to_pressure_allowed_as_blocker,pff_defensive_coverage_assignment,pff_primary_defensive_coverage_matchup_nfl_id,pff_secondary_defensive_coverage_matchup_nfl_id
0,2022090800_101,2022090800,101,35472,BUF,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,False,,,False,False,False,,,,,,,,,,
1,2022090800_101,2022090800,101,42392,BUF,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,False,,,False,False,False,,,,,,,,,,
2,2022090800_101,2022090800,101,42818,BUF,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,False,,,False,False,False,,,,,,,,,,
3,2022090800_101,2022090800,101,44875,BUF,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,False,,,False,False,False,,,,,,,,,,
4,2022090800_101,2022090800,101,46076,BUF,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,False,,,False,False,False,,,,,,,,,,


## 2. Filter to relevant play frames

In [9]:
# Find distance from each player to the ball
# Football coordinates per (play, frame), renamed so they can sit next to x / y.
ball_coords = (
    tracking.loc[tracking['club'] == 'football', ['game_play_id', 'frame_id', 'x', 'y']]
    .rename(columns={'x': 'ball_x', 'y': 'ball_y'})
)

# Remove results of a previous run so the cell can be re-executed safely.
cols = ['euclidean_dist_to_ball', 'ball_x', 'ball_y', 'lateral_distance_to_ball', 'vertical_dist_to_ball']
tracking = tracking.drop(columns=cols, errors='ignore')

tracking = tracking.merge(ball_coords, on=['game_play_id', 'frame_id'], how='left')

# Signed offsets from the ball along x and y, and the straight-line distance.
offset_x = tracking['x'] - tracking['ball_x']
offset_y = tracking['y'] - tracking['ball_y']
tracking['euclidean_dist_to_ball'] = ((offset_x ** 2 + offset_y ** 2) ** 0.5).round(2)
tracking['lateral_distance_to_ball'] = offset_x.round(2)
tracking['vertical_dist_to_ball'] = offset_y.round(2)

# The raw ball coordinates were only needed for the calculation.
tracking = tracking.drop(columns=['ball_x', 'ball_y'])
del ball_coords, offset_x, offset_y


# Label the ball carrier
cols = ['is_ball_carrier']
# Drop the flag from a previous run so the merge below does not create suffixed duplicates.
tracking = tracking.drop(columns=cols, errors='ignore')

# Players credited with a rush attempt on each play.
rushers = (
    player_play.loc[
        player_play['had_rush_attempt'] == 1,
        ['game_play_id', 'nfl_id', 'had_rush_attempt']
    ]
    .rename(columns={'had_rush_attempt': 'is_ball_carrier'})
)
tracking = tracking.merge(rushers, on=['game_play_id', 'nfl_id'], how='left')
# Rows without a match (everyone else, including the football) become 0.
tracking['is_ball_carrier'] = tracking['is_ball_carrier'].fillna(0).astype(int)

# drop plays with multiple ball carriers or no ball carrier
snap_rows = tracking[tracking['frame_type'] == 'SNAP']
n_ball_carriers = (
    snap_rows.groupby(['game_play_id'])['is_ball_carrier']
    .sum()
    .rename('n_ball_carriers')
    .reset_index()
)
tracking = tracking.merge(n_ball_carriers, on='game_play_id', how='left')
has_single_carrier = tracking['n_ball_carriers'] == 1
tracking = (
    tracking[has_single_carrier]
    .drop(columns=['n_ball_carriers'])
    .reset_index(drop=True)
)
del snap_rows, has_single_carrier

# Propagate the surviving plays / games to the other tables.
game_play_ids = tracking['game_play_id'].unique()
game_ids = tracking['game_id'].unique()
games = games[games['game_id'].isin(game_ids)].reset_index(drop=True)
plays = plays[plays['game_play_id'].isin(game_play_ids)].reset_index(drop=True)
player_play = player_play[player_play['game_play_id'].isin(game_play_ids)].reset_index(drop=True)


# Join roster assigned positions
cols = ['roster_position']
# Remove the column from a previous run before joining again.
tracking = tracking.drop(columns=cols, errors='ignore')

roster = players[['nfl_id', 'position']].rename(columns={'position': 'roster_position'})
tracking = tracking.merge(roster, on='nfl_id', how='left')

# Manual override: rows for Taysom Hill are treated as a tight end.
is_taysom_hill = tracking['display_name'] == "Taysom Hill"
tracking['roster_position'] = tracking['roster_position'].mask(is_taysom_hill, "TE")
del roster, is_taysom_hill

tracking.head()

Unnamed: 0,game_play_id,game_id,play_id,nfl_id,display_name,frame_id,frame_type,time,club,play_direction,x,y,s,a,dis,o,dir,event,euclidean_dist_to_ball,lateral_distance_to_ball,vertical_dist_to_ball,is_ball_carrier,roster_position
0,2022091200_64,2022091200,64,35459.0,Kareem Jackson,114,SNAP,2022-09-13 00:16:14.8,DEN,right,23.87,50.29,1.13,1.62,0.11,299.94,46.84,ball_snap,12.91,-5.32,11.76,0,SS
1,2022091200_64,2022091200,64,35459.0,Kareem Jackson,115,AFTER_SNAP,2022-09-13 00:16:14.9,DEN,right,23.96,50.39,1.37,1.61,0.13,299.94,48.81,,13.31,-5.24,12.23,0,SS
2,2022091200_64,2022091200,64,35459.0,Kareem Jackson,116,AFTER_SNAP,2022-09-13 00:16:15,DEN,right,24.06,50.52,1.65,1.76,0.16,301.53,50.88,,13.74,-5.22,12.71,0,SS
3,2022091200_64,2022091200,64,35459.0,Kareem Jackson,117,AFTER_SNAP,2022-09-13 00:16:15.1,DEN,right,24.18,50.67,1.99,1.94,0.2,304.34,51.68,,14.23,-5.27,13.22,0,SS
4,2022091200_64,2022091200,64,35459.0,Kareem Jackson,118,AFTER_SNAP,2022-09-13 00:16:15.2,DEN,right,24.32,50.85,2.32,2.03,0.23,308.28,52.75,,14.82,-5.39,13.8,0,SS


In [10]:
# Drop plays without exactly 5 offensive linemen
offensive_line = ['T','G','C']

# Count T/G/C players on the snap frame of every play. The counts are
# reindexed over *all* play ids so that a play with no lineman rows at all is
# counted as 0 (and dropped) instead of silently missing from the groupby.
all_play_ids = tracking['game_play_id'].unique()
snap_linemen = tracking[
    tracking['roster_position'].isin(offensive_line) &
    (tracking.frame_type == "SNAP")
]
o_line = (
    snap_linemen.groupby('game_play_id').size()
    .reindex(all_play_ids, fill_value=0)
    .rename_axis('game_play_id')
    .reset_index(name='count')
)
o_line = o_line[o_line['count'] != 5]
drop_gid = o_line.game_play_id.unique().tolist()

# Both pieces of the message must be f-strings; previously the second literal
# lacked the prefix and the placeholder was printed verbatim.
print(f"Dropping {len(drop_gid)} plays without exactly 5 offensive linemen out of "
      f"{len(all_play_ids)} total plays")

# Remove the plays from every table and keep only games that still have plays.
tracking = tracking[~tracking.game_play_id.isin(drop_gid)]
plays = plays[~plays.game_play_id.isin(drop_gid)]
player_play = player_play[~player_play.game_play_id.isin(drop_gid)]
games = games[games.game_id.isin(plays.game_id.unique())]

Dropping 20 plays without exactly 5 offensive linemen out of {len(tracking.game_play_id.unique())} total plays


In [11]:
# Label offensive line positions (LT, LG, C, RG, RT) from where each lineman
# stands on the snap frame.

cols = ['game_play_id','frame_id','frame_type','nfl_id','roster_position','display_name','x','y','event']
snap_line_mask = (
    (tracking.frame_type == 'SNAP') &
    tracking.roster_position.isin(offensive_line + ['QB'])
)
# .copy(): columns are added below, so work on an independent frame rather than
# a slice of `tracking` (avoids SettingWithCopyWarning / ambiguous writes).
lineman_and_qb = tracking.loc[snap_line_mask, cols].copy()

# Flag the lineman with the largest y in each play. QB rows receive NaN from
# the index-aligned assignment and therefore end up False.
lineman_and_qb['farthest_forward_lineman'] = (
    lineman_and_qb
    .query('roster_position != "QB"')
    .groupby('game_play_id')
    .y.transform('max')
)
lineman_and_qb['farthest_forward_lineman'] = (
    lineman_and_qb.farthest_forward_lineman == lineman_and_qb.y
)

# Ball location on the snap frame. Only x / y are taken so the merge does not
# produce frame_id_x / frame_id_y duplicates.
ball = (
    tracking.query('club == "football" and frame_type == "SNAP"')
    [['game_play_id','x','y']]
    .rename(columns={'x': 'x_ball', 'y': 'y_ball'})
)
lineman_and_qb = lineman_and_qb.merge(ball, on='game_play_id')
lineman_and_qb['delta_ball'] = np.sqrt(
    (lineman_and_qb.x - lineman_and_qb.x_ball)**2 + 
    (lineman_and_qb.y - lineman_and_qb.y_ball)**2
)

# Flag the lineman nearest to the ball in each play (QB rows end up False).
lineman_and_qb['closest_to_ball'] = (
    lineman_and_qb
    .query('roster_position != "QB"')
    .groupby('game_play_id')
    .delta_ball.transform('min')
)
lineman_and_qb['closest_to_ball'] = (
    lineman_and_qb.closest_to_ball == lineman_and_qb.delta_ball
)

# number the lineman from left to right by x position
lineman_and_qb = lineman_and_qb.sort_values(['game_play_id','x'])
position_map = {
    0: 'LT',
    1: 'LG',
    2: 'C',
    3: 'RG',
    4: 'RT'
}
is_lineman = lineman_and_qb.roster_position.isin(['T','C','G'])
lineman_and_qb.loc[is_lineman, 'position_by_loc'] = (
    lineman_and_qb[is_lineman]
    .groupby('game_play_id')
    .cumcount()
    .map(position_map)
)

# The line counts as balanced when the middle lineman by x is also the lineman
# closest to the ball.
lineman_and_qb['is_center'] = (
    lineman_and_qb.closest_to_ball & (lineman_and_qb.position_by_loc == "C")
)

plays_without_balanced_offensive_line = (
    lineman_and_qb
    .groupby('game_play_id')
    .is_center.sum().reset_index()
    .query('is_center != 1')
    .game_play_id.values.tolist()
)
if len(plays_without_balanced_offensive_line) != 0:
    print(f"Dropping {len(plays_without_balanced_offensive_line)} plays without a balanced offensive line.")
    print(plays_without_balanced_offensive_line)
    tracking = tracking[~tracking.game_play_id.isin(plays_without_balanced_offensive_line)]
    plays = plays[~plays.game_play_id.isin(plays_without_balanced_offensive_line)]
    player_play = player_play[~player_play.game_play_id.isin(plays_without_balanced_offensive_line)]
    games = games[games.game_id.isin(plays.game_id.unique())]
else:
    print("All plays have a balanced offensive line.")

# Create a new column for the position of the player based on the location of the player.
# Drop a column left by a previous run first so the merge cannot create
# position_by_loc_x / _y (same re-run guard the other cells use).
tracking = (
    tracking
    .drop(columns=['position_by_loc'], errors='ignore')
    .merge(
        lineman_and_qb[['game_play_id','nfl_id','position_by_loc']].drop_duplicates(),
        on=['game_play_id','nfl_id'],
        how='left'
    )
)
# Everyone who is not one of the five linemen keeps the roster position.
tracking['position_by_loc'] = tracking['position_by_loc'].fillna(tracking['roster_position'])

Dropping 18 plays without a balanced offensive line.
['2022091101_2386', '2022091102_322', '2022091102_3981', '2022091102_467', '2022091104_2269', '2022091104_4410', '2022091104_999', '2022091105_1309', '2022091105_1351', '2022091105_294', '2022091105_3366', '2022091105_4512', '2022091108_1089', '2022091108_2724', '2022091109_1636', '2022091109_3463', '2022091109_656', '2022091111_2901']


In [12]:
# List the distinct roster positions present in the tracking data (NaN included).
tracking.roster_position.unique()

array(['SS', 'QB', 'G', 'CB', 'OLB', 'WR', 'DE', 'ILB', 'FS', 'C', 'NT',
       'RB', 'TE', 'T', nan, 'DT', 'MLB', 'FB', 'LB', 'DB'], dtype=object)

In [13]:
offense_positions = ['QB', 'RB', 'FB', 'WR', 'TE', 'G', 'C', 'T']
defense_positions = ['DE', 'DT', 'NT', 'OLB', 'ILB', 'MLB', 'CB', 'SS', 'FS', 'DB', 'LB']

# Validate by set membership instead of comparing counts: a count comparison
# raises when the data simply lacks one of the listed positions and passes when
# an unknown position happens to replace a known one. Rows with a missing
# position (NaN) are ignored here, as nunique() ignored them before.
known_positions = set(offense_positions + defense_positions)
observed_positions = set(tracking.roster_position.dropna().unique())
unknown_positions = observed_positions - known_positions
if unknown_positions:
    raise Exception(
        "There are players with positions not in the offense or defense position lists."
        f" Unknown positions: {sorted(unknown_positions)}"
    )

# 1 for offensive roster positions, 0 for everything else (defense, NaN rows).
tracking['offense'] = np.where(
    tracking.roster_position.isin(offense_positions),
    1,
    0
)

### Identify players who extend the core five offensive linemen

In [14]:
# Helper columns produced by this cell; remove leftovers from a previous run.
cols = ['y_min_oline_left', 'y_min_oline_right', 'extra_on_oline', 
        'extra_oline_box_left', 'extra_oline_box_right', 'oline_box_left', 
        'oline_box_right', 'center_x_at_ball_snap']
tracking = tracking.drop(columns=cols, errors='ignore')

# x position of both tackles on the snap frame of every play.
is_snap_tackle = (
    tracking['position_by_loc'].isin(["LT", "RT"]) &
    (tracking['frame_type'] == "SNAP")
)
oline_box_at_snap = tracking.loc[is_snap_tackle, ['game_play_id','position_by_loc','x']]

# Add oline_box_left / oline_box_right (at the snap): the LT's and RT's x,
# broadcast to every row of the play.
for tackle, box_col in (('LT', 'oline_box_left'), ('RT', 'oline_box_right')):
    box_edge = (
        oline_box_at_snap
        .loc[oline_box_at_snap['position_by_loc'] == tackle, ['game_play_id', 'x']]
        .rename(columns={'x': box_col})
    )
    tracking = tracking.merge(box_edge, on='game_play_id', how='left')

# Add center_x_at_ball_snap: the center's x on the snap frame.
is_snap_center = (
    (tracking['position_by_loc'] == "C") &
    (tracking['frame_type'] == "SNAP")
)
center_at_snap = (
    tracking.loc[is_snap_center, ['game_play_id', 'x']]
    .rename(columns={'x': 'center_x_at_ball_snap'})
    .drop_duplicates()
)
tracking = tracking.merge(center_at_snap, on='game_play_id', how='left')
del is_snap_tackle, is_snap_center, center_at_snap, box_edge

# First pass: find offensive players lined up directly outside a tackle at the snap.
snap_rows = tracking.query('frame_type == "SNAP"')

# Smallest y among the left-side (LT, LG, C) and right-side (RT, RG, C) linemen of each play.
y_min_oline_left = (
    snap_rows.query('position_by_loc.isin(["LT","LG","C"])')
    .groupby('game_play_id').y.min()
    .rename('y_min_oline_left').reset_index()
)
y_min_oline_right = (
    snap_rows.query('position_by_loc.isin(["RT","RG","C"])')
    .groupby('game_play_id').y.min()
    .rename('y_min_oline_right').reset_index()
)
extra_on_oline = (
    snap_rows
    .merge(y_min_oline_left, on='game_play_id')
    .merge(y_min_oline_right, on='game_play_id')
)

extra_on_oline = extra_on_oline[extra_on_oline['offense'] == 1]
# Candidate: less than 2 units outside the tackle in x, and no more than 0.1
# below that side's minimum lineman y.
left_condition = (
    (extra_on_oline['x'] < extra_on_oline['oline_box_left']) & 
    (extra_on_oline['x'] > extra_on_oline['oline_box_left'] - 2) & 
    (extra_on_oline['y'] > extra_on_oline['y_min_oline_left'] - 0.1)
)
right_condition = (
    (extra_on_oline['x'] > extra_on_oline['oline_box_right']) & 
    (extra_on_oline['x'] < extra_on_oline['oline_box_right'] + 2) & 
    (extra_on_oline['y'] > extra_on_oline['y_min_oline_right'] - 0.1)
)
extra_on_oline = (
    extra_on_oline[left_condition | right_condition]
    .drop_duplicates()
    [['game_play_id', 'nfl_id', 'x', 'y_min_oline_left',
      'y_min_oline_right', 'oline_box_left', 'oline_box_right']]
    .copy()  # columns are added below; avoid writing into a slice
)

extra_on_oline['extra_on_oline'] = True
# New box edge contributed by each extra player, NaN on the side they are not
# on. Series.where keeps these columns numeric (np.where(..., None) produced
# object columns that were then aggregated with max / compared as numbers).
extra_on_oline['extra_oline_box_left'] = extra_on_oline['x'].where(
    extra_on_oline['x'] < extra_on_oline['oline_box_left']
)
extra_on_oline['extra_oline_box_right'] = extra_on_oline['x'].where(
    extra_on_oline['x'] > extra_on_oline['oline_box_right']
)

# Merge extra oline data to the tracking data (drop leftovers from a previous
# run first so the merge cannot create suffixed duplicates).
tracking = tracking.drop(
    columns=['extra_on_oline', 'extra_oline_box_left', 'extra_oline_box_right',
             'y_min_oline_left', 'y_min_oline_right'],
    errors='ignore'
)
tracking = tracking.merge(
    extra_on_oline[['game_play_id','nfl_id','y_min_oline_left','y_min_oline_right',
                    'extra_on_oline','extra_oline_box_left','extra_oline_box_right']],
    on=['game_play_id','nfl_id'], 
    how='left'
)
# Broadcast the extended box edges to every row of the play.
# NOTE(review): 'max' on the left side keeps the extra player nearest the
# tackle rather than the outermost one — confirm this is intended.
tracking['extra_oline_box_left'] = tracking.groupby('game_play_id')['extra_oline_box_left'].transform('max')
tracking['extra_oline_box_right'] = tracking.groupby('game_play_id')['extra_oline_box_right'].transform('max')
# Plays without an extra player on a side fall back to the tackle's x.
tracking['extra_oline_box_left'] = tracking['extra_oline_box_left'].fillna(tracking['oline_box_left'])
tracking['extra_oline_box_right'] = tracking['extra_oline_box_right'].fillna(tracking['oline_box_right'])
# True only for rows matched above; .eq(True) maps the unmatched NaN rows to
# False and yields a real bool column (fillna(False) on an object column is
# deprecated downcasting behaviour in recent pandas).
tracking['extra_on_oline'] = tracking['extra_on_oline'].eq(True)
del snap_rows

# Repeat the process for extra oline outside of the additional oline box
# NOTE(review): this pass starts from rows that are already flagged
# extra_on_oline, so it can only re-confirm flagged players, never add new
# ones — confirm whether it should scan all offensive players instead.
snap_extras = tracking[
    tracking['extra_on_oline'].eq(True) & (tracking['frame_type'] == "SNAP")
]

# Coerce the box edges to numeric so the comparisons below do not depend on
# the columns' storage dtype.
box_left = pd.to_numeric(snap_extras['extra_oline_box_left'])
box_right = pd.to_numeric(snap_extras['extra_oline_box_right'])
beyond_left = (
    box_left.notna() &
    (snap_extras['x'] < box_left) &
    (snap_extras['x'] > box_left - 2) &
    (snap_extras['y'] > snap_extras['y_min_oline_left'] - 0.1)
)
beyond_right = (
    box_right.notna() &
    (snap_extras['x'] > box_right) &
    (snap_extras['x'] < box_right + 2) &
    (snap_extras['y'] > snap_extras['y_min_oline_right'] - 0.1)
)
extra_on_oline = (
    snap_extras[(snap_extras['offense'] == 1) & (beyond_left | beyond_right)]
    .drop_duplicates()
    [['game_play_id','nfl_id','x','y_min_oline_left','y_min_oline_right',
      'extra_oline_box_left','extra_oline_box_right']]
    .copy()  # columns are overwritten below; avoid writing into a slice
)

extra_on_oline['extra_on_oline'] = True
# Box edge contributed by each of these players; NaN on the other side.
extra_on_oline['extra_oline_box_left'] = extra_on_oline['x'].where(
    extra_on_oline['x'] < pd.to_numeric(extra_on_oline['extra_oline_box_left'])
)
extra_on_oline['extra_oline_box_right'] = extra_on_oline['x'].where(
    extra_on_oline['x'] > pd.to_numeric(extra_on_oline['extra_oline_box_right'])
)

if not extra_on_oline.empty:
    # Second-pass values arrive with an '_x' suffix next to the first-pass columns.
    tracking = tracking.merge(
        extra_on_oline[['game_play_id','nfl_id',
                        'extra_on_oline','extra_oline_box_left','extra_oline_box_right']],
        on=['game_play_id','nfl_id'], 
        how='left',
        suffixes=('','_x')
    )

    tracking['extra_oline_box_left_x'] = tracking.groupby('game_play_id')['extra_oline_box_left_x'].transform('max')
    tracking['extra_oline_box_right_x'] = tracking.groupby('game_play_id')['extra_oline_box_right_x'].transform('max')

    # A player is an extra lineman if either pass flagged them.
    tracking['extra_on_oline'] = (
        tracking['extra_on_oline'].eq(True) | tracking['extra_on_oline_x'].eq(True)
    )
    tracking['extra_oline_box_left'] = tracking['extra_oline_box_left'].fillna(tracking['extra_oline_box_left_x'])
    tracking['extra_oline_box_right'] = tracking['extra_oline_box_right'].fillna(tracking['extra_oline_box_right_x'])

    tracking = tracking.drop(columns=['extra_on_oline_x','extra_oline_box_left_x','extra_oline_box_right_x'])

# Final defaults. These statements belong at top level; they were previously
# indented by two spaces, which matches no enclosing block.
tracking['extra_oline_box_left'] = tracking['extra_oline_box_left'].fillna(tracking['oline_box_left'])
tracking['extra_oline_box_right'] = tracking['extra_oline_box_right'].fillna(tracking['oline_box_right'])
tracking['extra_on_oline'] = tracking['extra_on_oline'].eq(True)
del snap_extras, box_left, box_right, beyond_left, beyond_right


In [15]:
# Intermediate box / threshold columns are no longer needed once
# extra_on_oline has been derived.
drop_col = [
    'extra_oline_box_left', 'extra_oline_box_right',
    'y_min_oline_right', 'y_min_oline_left',
    'oline_box_left', 'oline_box_right',
    'center_x_at_ball_snap',
]

tracking = tracking.drop(columns=drop_col)

In [16]:
# A player is part of the offensive line when labelled as one of the five
# location-based line positions or flagged as an extra lineman.
core_lineman = tracking['position_by_loc'].isin(['LT','LG','C','RG','RT'])
tracking['is_part_of_oline'] = (core_lineman | tracking['extra_on_oline']).astype(bool)
del core_lineman

### Identify the frame where the ball carrier moves beyond the offensive line

In [17]:
# Show the first ten play keys still present in the tracking data.
tracking.game_play_id.unique()[:10]

array(['2022091200_64', '2022091200_180', '2022091200_315',
       '2022091200_375', '2022091200_446', '2022091200_601',
       '2022091200_622', '2022091200_643', '2022091200_741',
       '2022091200_810'], dtype=object)

In [35]:
# Key ("<game_id>_<play_id>") of the single play inspected in the following cells.
game_play_id = '2022090800_1187'

In [36]:
# Text description of the selected play.
plays.query('game_play_id == @game_play_id').play_description.values[0]

'(8:48) D.Henderson right tackle to LA 44 for 4 yards (V.Miller, D.Jones).'

In [37]:
# Independent copy of the selected play's tracking rows, used by the animations below.
df = tracking.query('game_play_id == @game_play_id').copy()
df.head()

Unnamed: 0,game_play_id,game_id,play_id,nfl_id,display_name,frame_id,frame_type,time,club,play_direction,x,y,s,a,dis,o,dir,event,euclidean_dist_to_ball,lateral_distance_to_ball,vertical_dist_to_ball,is_ball_carrier,roster_position,position_by_loc,offense,extra_on_oline,is_part_of_oline,breakthrough_frameid
482540,2022090800_1187,2022090800,1187,34452.0,Matthew Stafford,125,SNAP,2022-09-09 01:08:54.7,LA,left,29.54,48.28,0.05,0.85,0.01,87.54,304.47,ball_snap,1.65,-0.07,-1.65,0,QB,QB,1,False,False,
482541,2022090800_1187,2022090800,1187,34452.0,Matthew Stafford,126,AFTER_SNAP,2022-09-09 01:08:54.8,LA,left,29.55,48.26,0.3,2.26,0.02,89.72,309.27,,1.67,-0.06,-1.67,0,QB,QB,1,False,False,
482542,2022090800_1187,2022090800,1187,34452.0,Matthew Stafford,127,AFTER_SNAP,2022-09-09 01:08:54.9,LA,left,29.58,48.23,0.63,3.14,0.05,91.55,309.41,,1.65,-0.03,-1.65,0,QB,QB,1,False,False,
482543,2022090800_1187,2022090800,1187,34452.0,Matthew Stafford,128,AFTER_SNAP,2022-09-09 01:08:55,LA,left,29.64,48.15,1.08,3.73,0.09,93.09,308.52,,1.68,0.02,-1.68,0,QB,QB,1,False,False,
482544,2022090800_1187,2022090800,1187,34452.0,Matthew Stafford,129,AFTER_SNAP,2022-09-09 01:08:55.1,LA,left,29.73,48.04,1.52,3.67,0.14,94.34,308.8,,1.65,0.07,-1.65,0,QB,QB,1,False,False,


In [22]:
from shapely import MultiPoint
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Frames of the selected play in chronological order.
frame_ids = np.sort(df['frame_id'].unique())
yardline = plays.query('game_play_id == @game_play_id').absolute_yardline_number.values[0]

# The y-limits depend on the whole play, so compute them once.
y_low = df['y'].min() - 1
y_high = df['y'].max() + 1

fig, ax = plt.subplots(figsize=(8, 8))

def update(frame_id):
    """Redraw one frame: the line's convex hull, the linemen, the ball carrier and the defense."""
    ax.clear()
    frame_df = df[df['frame_id'] == frame_id]
    oline = frame_df[frame_df['is_part_of_oline']]

    # Plot convex hull if enough points
    points = list(zip(oline['x'], oline['y']))
    if len(points) >= 3:
        hull_x, hull_y = MultiPoint(points).convex_hull.exterior.xy
        ax.plot(hull_x, hull_y, color='blue', linewidth=2, label='Convex Hull')

    # Plot players
    groups = (
        (oline, 'oline', 'green'),
        (frame_df[frame_df['is_ball_carrier'] == 1], 'ball carrier', 'purple'),
        (frame_df[(frame_df['offense'] == 0) & (frame_df['club'] != 'football')], 'defense', 'red'),
    )
    for members, label, color in groups:
        ax.scatter(members['x'], members['y'], label=label, s=50, color=color)

    ax.hlines(yardline, 0, 53.3, color='blue', linestyle='--', label='Yard Line')
    ax.set_aspect('equal', adjustable='box')
    ax.set_xlim(0, 53.3)
    ax.set_ylim(y_low, y_high)
    ax.set_title(f'Convex Hull of Offensive Linemen\nFrame {frame_id}')
    ax.legend(bbox_to_anchor=(.8, 1.7), loc='upper left')

ani = FuncAnimation(fig, update, frames=frame_ids, interval=200, repeat=False)
# Close the static figure so only the HTML animation is displayed.
plt.close()
HTML(ani.to_jshtml())

In [38]:
from shapely import MultiPoint, Point
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

frames_per_second = 10 # Adjust if your data uses a different frame rate
# Number of consecutive frames the ball carrier must stay outside the hull:
# half a second of frames (5 at 10 fps). Rounded up to an integer so the
# counter below can always reach it, whatever the frame rate.
outside_threshold = int(np.ceil(.5 * frames_per_second))

frame_ids = df['frame_id'].unique()
frame_ids.sort()
yardline = plays.query('game_play_id == @game_play_id').absolute_yardline_number.values[0]

rb_outside_counter = 0
rb_was_inside = False
broke_frame = None

# Precompute RB inside/outside status for each frame
rb_status = []
for frame_id in frame_ids:
    frame_df = df[df['frame_id'] == frame_id]
    points = [tuple(v) for v in frame_df.query('is_part_of_oline')[['x', 'y']].values.tolist()]
    rb = frame_df[frame_df['is_ball_carrier'] == 1]
    if len(points) >= 3 and not rb.empty:
        hull = MultiPoint(points).convex_hull
        rb_point = Point(rb.iloc[0]['x'], rb.iloc[0]['y'])
        inside = hull.contains(rb_point)
    else:
        # Too few linemen for a hull, or no ball-carrier row in this frame.
        inside = False
    rb_status.append(inside)

# The ball carrier "breaks past" the line at the first frame of a run of
# `outside_threshold` consecutive outside frames that follows an inside frame.
for i, inside in enumerate(rb_status):
    if inside:
        rb_outside_counter = 0
        rb_was_inside = True
    elif rb_was_inside:
        if rb_outside_counter == 0:
            # Remember where the current outside run started.
            left_frame = frame_ids[i]
        rb_outside_counter += 1
        # >= instead of ==: robust even if the threshold were not an exact integer.
        if rb_outside_counter >= outside_threshold:
            broke_frame = left_frame
            print(f"RB broke past the oline at frame {broke_frame}")
            break

fig, ax = plt.subplots(figsize=(8, 8))

def update(frame_id):
    """Redraw one frame and highlight the ball carrier once they have broken past the line."""
    ax.clear()
    frame_df = df[df['frame_id'] == frame_id]
    points = [tuple(v) for v in frame_df.query('is_part_of_oline')[['x', 'y']].values.tolist()]
    rb = frame_df[frame_df['is_ball_carrier'] == 1]
    has_broken = broke_frame is not None and frame_id >= broke_frame
    # Plot convex hull if enough points
    if len(points) >= 3:
        convex_hull = MultiPoint(points).convex_hull
        # Collinear points give a LineString, which has no exterior ring to draw.
        if convex_hull.geom_type == 'Polygon':
            x, y = convex_hull.exterior.xy
            ax.plot(x, y, color='blue', linewidth=2, label='Convex Hull')
        # Highlight RB if outside hull after breaking. The emptiness check
        # prevents an IndexError on frames without a ball-carrier row.
        if has_broken and not rb.empty and not convex_hull.contains(Point(rb.iloc[0]['x'], rb.iloc[0]['y'])):
            ax.scatter(rb['x'], rb['y'], label='RB (broke past oline)', s=100, color='orange', edgecolor='black', zorder=5)
    # Plot players
    oline = frame_df[frame_df['is_part_of_oline']]
    ax.scatter(oline['x'], oline['y'], label='oline', s=50, color='green')
    if rb.shape[0] > 0:
        ax.scatter(rb['x'], rb['y'], label='ball carrier', s=50, color='purple')
    defense = frame_df[(frame_df['offense'] == 0) & (frame_df['club'] != 'football')]
    ax.scatter(defense['x'], defense['y'], label='defense', s=50, color='red')
    ax.hlines(yardline, 0, 53.3, color='blue', linestyle='--', label='Yard Line')
    ax.set_aspect('equal', adjustable='box')
    ax.set_xlim(0, 53.3)
    ax.set_ylim(df['y'].min() - 1, df['y'].max() + 1)
    title = f'Convex Hull of Offensive Linemen\nFrame {frame_id}'
    if has_broken:
        title += " (RB broke past oline)"
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(.8, 1.7), loc='upper left')

ani = FuncAnimation(fig, update, frames=frame_ids, interval=200, repeat=False)
# Close the static figure so only the HTML animation is displayed.
plt.close()
HTML(ani.to_jshtml())

In [25]:
from shapely import MultiPoint, Point

frames_per_second = 10  # Adjust if your data uses a different frame rate
# Consecutive frames the RB must spend outside the oline hull (after having
# been inside it) before the exit counts as a breakthrough: 0.5 seconds.
# Stored as a whole number of frames because it is compared with a frame
# counter; a fractional value (e.g. 0.5 * 11) could never be matched.
outside_threshold = max(1, round(.5 * frames_per_second))  # 0.5 seconds

# game_play_id -> frame_id at which the RB left the oline hull for good,
# or None when no sustained exit was found for that play.
breakthrough_dict = {}

cols = ['game_play_id', 'frame_id', 'x', 'y', 'is_ball_carrier', 'is_part_of_oline']

for game_play_id, play_df in tqdm(
        iterable=tracking[cols].groupby('game_play_id'),
        total=tracking['game_play_id'].nunique(),
        desc='Finding Breakthroughs'
    ):

    rb_outside_counter = 0
    rb_was_inside = False
    left_frame = None
    breakthrough_frame = None

    # groupby yields the frames in ascending frame_id order and slices the
    # play once, instead of re-filtering the whole play for every frame.
    for frame_id, frame_df in play_df.groupby('frame_id'):
        points = [tuple(v) for v in frame_df.query('is_part_of_oline')[['x', 'y']].values.tolist()]
        rb = frame_df[frame_df['is_ball_carrier'] == 1]

        # The RB only counts as "inside" when a hull can be built (>= 3
        # linemen) and a ball-carrier row exists in this frame.
        inside = False
        if len(points) >= 3 and not rb.empty:
            hull = MultiPoint(points).convex_hull
            inside = hull.contains(Point(rb.iloc[0]['x'], rb.iloc[0]['y']))

        if inside:
            # Re-entering the hull cancels any exit in progress.
            rb_outside_counter = 0
            rb_was_inside = True
        elif rb_was_inside:
            # Frames before the RB has ever been inside the hull are ignored.
            if rb_outside_counter == 0:
                left_frame = frame_id  # first frame of this outside streak
            rb_outside_counter += 1
            if rb_outside_counter >= outside_threshold:
                breakthrough_frame = left_frame
                break

    breakthrough_dict[game_play_id] = breakthrough_frame

# Add the breakthrough_frameid column to tracking
tracking['breakthrough_frameid'] = tracking['game_play_id'].map(breakthrough_dict)

Finding Breakthroughs: 100%|██████████| 438/438 [00:18<00:00, 23.08it/s]


In [None]:
# Shape of the tracking rows that carry a breakthrough frame, reduced to one
# row per play; the first element is the number of plays with a breakthrough.
tracking.loc[tracking['breakthrough_frameid'].notna()].drop_duplicates(subset='game_play_id').shape

(171, 28)

In [40]:
# Preview the first five tracking rows, including the new breakthrough_frameid column.
tracking.head(n=5)

Unnamed: 0,game_play_id,game_id,play_id,nfl_id,display_name,frame_id,frame_type,time,club,play_direction,x,y,s,a,dis,o,dir,event,euclidean_dist_to_ball,lateral_distance_to_ball,vertical_dist_to_ball,is_ball_carrier,roster_position,position_by_loc,offense,extra_on_oline,is_part_of_oline,breakthrough_frameid
0,2022091200_64,2022091200,64,35459.0,Kareem Jackson,114,SNAP,2022-09-13 00:16:14.8,DEN,right,23.87,50.29,1.13,1.62,0.11,299.94,46.84,ball_snap,12.91,-5.32,11.76,0,SS,SS,0,False,False,
1,2022091200_64,2022091200,64,35459.0,Kareem Jackson,115,AFTER_SNAP,2022-09-13 00:16:14.9,DEN,right,23.96,50.39,1.37,1.61,0.13,299.94,48.81,,13.31,-5.24,12.23,0,SS,SS,0,False,False,
2,2022091200_64,2022091200,64,35459.0,Kareem Jackson,116,AFTER_SNAP,2022-09-13 00:16:15,DEN,right,24.06,50.52,1.65,1.76,0.16,301.53,50.88,,13.74,-5.22,12.71,0,SS,SS,0,False,False,
3,2022091200_64,2022091200,64,35459.0,Kareem Jackson,117,AFTER_SNAP,2022-09-13 00:16:15.1,DEN,right,24.18,50.67,1.99,1.94,0.2,304.34,51.68,,14.23,-5.27,13.22,0,SS,SS,0,False,False,
4,2022091200_64,2022091200,64,35459.0,Kareem Jackson,118,AFTER_SNAP,2022-09-13 00:16:15.2,DEN,right,24.32,50.85,2.32,2.03,0.23,308.28,52.75,,14.82,-5.39,13.8,0,SS,SS,0,False,False,


In [None]:
from shapely import MultiPoint, Point

frames_per_second = 10  # Adjust if your data uses a different frame rate
outside_threshold = 2 * frames_per_second  # 2 seconds: RB laterally outside the oline box
# The convex-hull exit uses its own, shorter window (0.5 seconds). Reusing the
# 2-second lateral window for it would demand a far longer exit than intended.
hull_exit_threshold = max(1, round(.5 * frames_per_second))  # 0.5 seconds
box_margin_yards = 2  # lateral clearance required beyond the outermost linemen


def _find_breakthrough_frame(play_df, hull_exit_frames, outside_box_frames, box_margin):
    """Return the frame_id at which the RB escapes the oline's influence.

    Stage 1 (convex hull): once the RB has been inside the convex hull of the
    offensive linemen, the first frame of a streak of `hull_exit_frames`
    consecutive outside frames is returned.

    Stage 2 (only when stage 1 finds nothing), scanning frames in order and
    returning on whichever condition triggers first:
      * the RB's x is more than `box_margin` beyond the linemen's min/max x
        for `outside_box_frames` consecutive evaluated frames -> the first
        frame of that streak;
      * the RB's y exceeds the largest lineman y -> that frame.

    Parameters
    ----------
    play_df : DataFrame
        Rows of one play with columns frame_id, x, y, is_ball_carrier,
        is_part_of_oline.
    hull_exit_frames, outside_box_frames : int
        Streak lengths, in frames, for the hull and lateral rules.
    box_margin : float
        Margin added to the linemen's x-extent for the lateral rule.

    Returns
    -------
    The matching frame_id, or None when no rule triggers.
    """
    # Slice the play into frames once (ascending frame_id); both stages reuse it.
    frames = []
    for frame_id, frame_df in play_df.groupby('frame_id'):
        oline = frame_df[frame_df['is_part_of_oline']]
        rb = frame_df[frame_df['is_ball_carrier'] == 1]
        frames.append((frame_id, oline, rb))

    # --- Stage 1: RB enters the hull, then stays out of it ---
    rb_was_inside = False
    outside_run = 0
    left_frame = None
    for frame_id, oline, rb in frames:
        inside = False
        # A hull needs >= 3 linemen; without a ball-carrier row the frame
        # counts as "outside".
        if len(oline) >= 3 and not rb.empty:
            points = list(zip(oline['x'].tolist(), oline['y'].tolist()))
            hull = MultiPoint(points).convex_hull
            inside = hull.contains(Point(rb.iloc[0]['x'], rb.iloc[0]['y']))
        if inside:
            rb_was_inside = True
            outside_run = 0  # re-entry cancels an exit in progress
        elif rb_was_inside:
            if outside_run == 0:
                left_frame = frame_id
            outside_run += 1
            if outside_run >= hull_exit_frames:
                return left_frame

    # --- Stage 2: lateral escape (sustained) or passing the line (instant) ---
    outside_box_run = 0
    first_outside_box_frame = None
    for frame_id, oline, rb in frames:
        if rb.empty or oline.empty:
            # Frame cannot be evaluated; the lateral streak is neither
            # advanced nor reset.
            continue
        rb_x = rb.iloc[0]['x']
        rb_y = rb.iloc[0]['y']
        # Check if RB is at least `box_margin` yards outside the oline box (continuous)
        if rb_x < oline['x'].min() - box_margin or rb_x > oline['x'].max() + box_margin:
            if outside_box_run == 0:
                first_outside_box_frame = frame_id
            outside_box_run += 1
            if outside_box_run >= outside_box_frames:
                return first_outside_box_frame
        else:
            outside_box_run = 0
            first_outside_box_frame = None
        # Check if RB is ahead of all oline in y (instantaneous)
        if rb_y > oline['y'].max():
            return frame_id

    return None


# game_play_id -> breakthrough frame_id (None when no rule triggered)
breakthrough_dict = {}

cols = ['game_play_id', 'frame_id', 'x', 'y', 'is_ball_carrier', 'is_part_of_oline']

for game_play_id, play_df in tqdm(
        iterable=tracking[cols].groupby('game_play_id'),
        total=tracking['game_play_id'].nunique(),
        desc='Finding Breakthroughs'
    ):
    breakthrough_dict[game_play_id] = _find_breakthrough_frame(
        play_df,
        hull_exit_frames=hull_exit_threshold,
        outside_box_frames=outside_threshold,
        box_margin=box_margin_yards,
    )

# Add the breakthrough_frameid column to tracking
tracking['breakthrough_frameid'] = tracking['game_play_id'].map(breakthrough_dict)

(500250, 28)

# Rules for RB moving outside the oline's influence
1. The RB is laterally at least 2 yards outside the oline box for 2 continuous seconds.
2. The RB passes ahead of all offensive linemen.
3. The RB breaks through the front of the oline convex hull (the RB must enter the hull and then exit it for at least 0.5 seconds).