In [1]:
# Libs
import pandas as pd
import numpy as np
import os
import gc
from datetime import datetime
from scipy.spatial.distance import pdist 

os.chdir('/home/petep/Documents/Projects/BigDataBowl')

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

### Predicting the Blitz

A blitz is when a defense sends extra players to pressure the quarterbacks.

Potential predictor variables: 
- pre-snap positioning
- defensive personnel: formations, alignments, number of players at each position
- downs and distance
- score and time
- quarterback characteristics: mobile vs. pocketpasser

### Data preprocessing
- Defender proximity to line of scrimmage
- Relative distance between defenders
- Changes in poitioning just before the snap

### Priors
- Initial prior belief about blitz probability... could be based on historical data
- Define priors for each predictor
    - Discrete priors: estimating the probability that a formation with multiple defenders close to the line is a blitz
    - Continuous priors: for distances of defenders to the line of scrimmage, you might assume a prior distribution based on observed data

### Model Choice
- Hierarchical model: Blitz behavior may vary by team or player. A hierarchical Bayesian model can capture team-level differences in blitz tendencies while pooling data to improve generalization



In [2]:
def load_data(file_path, usecols=None, dtype=None):
    return pd.read_csv(file_path, usecols=usecols, dtype=dtype)

# Load essential data
plays = load_data(
    'data/plays.csv', 
    usecols=[
        'gameId', 'playId', 'possessionTeam', 
        'absoluteYardlineNumber', 'yardlineSide', 
        'down', 'yardsToGo', 'passResult', 
        'prePenaltyYardsGained']
)
games = load_data(
    'data/games.csv', 
    usecols=['gameId', 'homeTeamAbbr', 'visitorTeamAbbr']
)
players = load_data(
    'data/players.csv',
    usecols=['nflId', 'position']
)

In [39]:
def process_tracking(week, plays):
    tracking = load_data(
        f'data/tracking_week_{week}.csv',
        usecols = [
            'gameId', 'playId', 'nflId', 'time',
            'x', 'y', 's', 'a', 'event', 'club',
            'frameType', 'o', 'dir'
        ]
    )
    

    tracking['time'] = pd.to_datetime(tracking['time'], errors='coerce')

    tracking = tracking.merge(
        plays[['gameId', 'playId', 'possessionTeam', 'absoluteYardlineNumber', 'yardlineSide']],
        on = ['gameId', 'playId'],
        how  = 'left'
    )
    tracking['isDefensivePlayer'] = tracking['club'] != tracking['possessionTeam']

    # playDirection based on yardlineSide and posessionTeam
    tracking['playDirection'] = np.where(
        tracking['yardlineSide'] == tracking['possessionTeam'], 'right', 'left'
    )

    def_tracking = tracking[tracking['isDefensivePlayer']]

    # Calculate time_since_snap in seconds
    def_tracking['time_since_snap'] = def_tracking.groupby(['gameId', 'playId', 'nflId'])['time'].transform(
        lambda x: (x - x.iloc[0]).dt.total_seconds()
    )

    del tracking
    gc.collect()

    return def_tracking

## Blitz criteria


## Successful blitz criteria
- sack
- tackle for loss (or minimal gain)
- interception
- incomplete pass


In [42]:
def identify_blitz(group):
    los = group['absoluteYardlineNumber'].iloc[0]
    threshold_time = 1.5
    df = group[(group['time_since_snap'] <= threshold_time) & (group['time_since_snap'] >= 0)]

    if df.empty or 'playDirection' not in df.columns:
        num_rushers = 0
    else:
        play_direction = df['playDirection'].iloc[0]
        if play_direction == 'left':
            num_rushers = df[df['x'] >= los]['nflId'].nunique()
        elif play_direction == 'right':
            num_rushers = df[df['x'] <= los]['nflId'].nunique()
        else:
            num_rushers = 0

    is_blitz = num_rushers > 4
    return pd.Series({'gameId': group.name[0], 'playId': group.name[1], 'num_pass_rushers': num_rushers, 'isBlitz': is_blitz})

def create_features(group):
    # Filter to BEFORE_SNAP frame
    before_snap = group[group['frameType'] == 'BEFORE_SNAP']
    
    if before_snap.empty:
        # If no BEFORE_SNAP frames are present, return default or NaN values
        return pd.Series({
            'gameId': group.name[0],
            'playId': group.name[1],
            'avg_def_speed': 0,
            'std_def_acc': 0,
            'def_players_near_los': 0,
            'avg_def_spacing': 0
        })
    
    # Number of defensive players near LOS within 5 yards
    los = group['absoluteYardlineNumber'].iloc[0]
    def_players_near_los = before_snap[
        (before_snap['x'] >= los - 5) & 
        (before_snap['x'] <= los + 5)
    ]['nflId'].nunique()
    
    # Average speed of defensive players before snap
    avg_def_speed = before_snap['s'].mean()
    
    # Standard deviation of defensive players' acceleration before snap
    std_def_acc = before_snap['a'].std()
    
    # Average spacing between defensive players
    if before_snap.shape[0] > 1:
        coords = before_snap[['x', 'y']].values
        distances = pdist(coords)
        avg_spacing = distances.mean()
    else:
        avg_spacing = 0
    
    # Merge with players data to get positions
    before_snap = before_snap.merge(
        players[['nflId', 'position']], 
        on='nflId', 
        how='left'
    )
    
    positions = ['OLB', 'ILB', 'MLB', 'LB', 'CB', 'SS', 'FS']
    feature_dict = {}
    
    for pos in positions:
        pos_players = before_snap[before_snap['position'] == pos]
        if not pos_players.empty:
            feature_dict[f'avg_{pos.lower()}_orientation'] = pos_players['o'].mean()
            feature_dict[f'avg_{pos.lower()}_motion_angle'] = pos_players['dir'].mean()
        else:
            feature_dict[f'avg_{pos.lower()}_orientation'] = 0
            feature_dict[f'avg_{pos.lower()}_motion_angle'] = 0
    
    return pd.Series({
        'gameId': group.name[0],
        'playId': group.name[1],
        'avg_def_speed': avg_def_speed,
        'std_def_acc': std_def_acc,
        'def_players_near_los': def_players_near_los,
        'avg_def_spacing': avg_spacing,
        **feature_dict
    })

In [43]:
blitz_results = []
features_results = []

data_folder = os.getcwd() + '/data'

# Determine the number of weeks dynamically
# num_weeks = len([f for f in data_folder if f.startswith('tracking_week_')])
num_weeks = 5

for week in range(1, num_weeks + 1):
    print(f"Processing week {week}")
    def_tracking = process_tracking(week, plays)
    
    # Blitz Identification
    blitz_df = def_tracking.groupby(['gameId', 'playId']).apply(identify_blitz)
    blitz_df = blitz_df.reset_index(level=[0, 1], drop=True)
    blitz_results.append(blitz_df)
    
    # Feature Creation - Apply to All Defensive Tracking Data
    features_df = def_tracking.groupby(['gameId', 'playId']).apply(create_features)
    features_df = features_df.reset_index(level=[0, 1], drop=True)
    features_results.append(features_df)
    
    del def_tracking, features_df
    gc.collect()

# Combine results
blitz_df = pd.concat(blitz_results, ignore_index=True)
features_df = pd.concat(features_results, ignore_index=True)

del blitz_results, features_results
gc.collect()

Processing week 1
Processing week 2
Processing week 3
Processing week 4
Processing week 5


0

In [44]:
blitz_df.head()

Unnamed: 0,gameId,playId,num_pass_rushers,isBlitz
0,2022090800,56,11,True
1,2022090800,80,11,True
2,2022090800,101,11,True
3,2022090800,122,11,True
4,2022090800,167,0,False


In [45]:
features_df.head()

Unnamed: 0,gameId,playId,avg_def_speed,std_def_acc,def_players_near_los,avg_def_spacing,avg_olb_orientation,avg_olb_motion_angle,avg_ilb_orientation,avg_ilb_motion_angle,...,avg_mlb_motion_angle,avg_lb_orientation,avg_lb_motion_angle,avg_cb_orientation,avg_cb_motion_angle,avg_ss_orientation,avg_ss_motion_angle,avg_fs_orientation,avg_fs_motion_angle,0
0,2022091000.0,56.0,0.697701,0.483102,10.0,9.295926,0.0,0.0,85.948621,223.893621,...,0.0,0.0,0.0,128.305448,223.638103,112.438241,178.974828,121.888897,226.845793,
1,2022091000.0,80.0,0.426925,0.422062,6.0,11.460355,0.0,0.0,100.393793,188.953621,...,0.0,0.0,0.0,89.986782,223.559023,82.968506,180.400287,92.128391,190.794253,
2,2022091000.0,101.0,0.632603,0.323081,8.0,8.306874,110.759048,143.224857,87.843905,175.839524,...,0.0,0.0,0.0,93.98819,142.137905,68.982571,237.331286,0.0,0.0,
3,2022091000.0,122.0,0.429606,0.431985,7.0,9.62834,58.702946,117.148393,76.408214,213.636875,...,0.0,0.0,0.0,99.879167,167.784077,70.67317,231.068393,0.0,0.0,
4,2022091000.0,167.0,0.592632,0.389714,7.0,10.649911,76.908119,140.387723,97.498515,182.719604,...,0.0,0.0,0.0,110.931749,179.210264,68.57302,206.424505,0.0,0.0,


In [48]:
plays.head()

Unnamed: 0,gameId,playId,down,yardsToGo,possessionTeam,yardlineSide,absoluteYardlineNumber,passResult,prePenaltyYardsGained
0,2022102302,2655,1,10,CIN,CIN,31,C,9
1,2022091809,3698,1,10,CIN,CIN,18,C,4
2,2022103004,3146,3,12,HOU,HOU,30,C,6
3,2022110610,348,2,10,KC,TEN,33,C,4
4,2022102700,2799,2,8,BAL,TB,37,,-1


In [None]:
# Diagnostic prints
print("Plays columns:", plays.columns)
print("Blitz_df columns:", blitz_df.columns)
print("Features_df columns:", features_df.columns)

# Check for empty dataframes
print("Plays shape:", plays.shape)
print("Blitz_df shape:", blitz_df.shape)
print("Features_df shape:", features_df.shape)

Plays columns: Index(['gameId', 'playId', 'down', 'yardsToGo', 'possessionTeam',
       'yardlineSide', 'absoluteYardlineNumber', 'passResult',
       'prePenaltyYardsGained'],
      dtype='object')
Blitz_df columns: Index(['gameId', 'playId', 'num_pass_rushers', 'isBlitz'], dtype='object')
Features_df columns: Index([              'gameId',               'playId',        'avg_def_speed',
                'std_def_acc', 'def_players_near_los',      'avg_def_spacing',
        'avg_olb_orientation', 'avg_olb_motion_angle',  'avg_ilb_orientation',
       'avg_ilb_motion_angle',  'avg_mlb_orientation', 'avg_mlb_motion_angle',
         'avg_lb_orientation',  'avg_lb_motion_angle',   'avg_cb_orientation',
        'avg_cb_motion_angle',   'avg_ss_orientation',  'avg_ss_motion_angle',
         'avg_fs_orientation',  'avg_fs_motion_angle',                      0],
      dtype='object')
Plays shape: (16124, 9)
Blitz_df shape: (9456, 4)
Features_df shape: (81020, 21)


In [50]:
# Merge and prepare final dataset
model_data = plays.merge(blitz_df, on=['gameId', 'playId'], how='left')
model_data = model_data.merge(features_df, on=['gameId', 'playId'], how='left')

In [55]:
model_data.to_csv('data/model_data.csv')

In [3]:
df = pd.read_csv('data/model_data.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,gameId,playId,down,yardsToGo,possessionTeam,yardlineSide,absoluteYardlineNumber,passResult,prePenaltyYardsGained,...,avg_mlb_motion_angle,avg_lb_orientation,avg_lb_motion_angle,avg_cb_orientation,avg_cb_motion_angle,avg_ss_orientation,avg_ss_motion_angle,avg_fs_orientation,avg_fs_motion_angle,0
0,0,2022102302,2655,1,10,CIN,CIN,31,C,9,...,,,,,,,,,,
1,1,2022091809,3698,1,10,CIN,CIN,18,C,4,...,,,,,,,,,,
2,2,2022103004,3146,3,12,HOU,HOU,30,C,6,...,,,,,,,,,,
3,3,2022110610,348,2,10,KC,TEN,33,C,4,...,,,,,,,,,,
4,4,2022102700,2799,2,8,BAL,TB,37,,-1,...,,,,,,,,,,


In [4]:
df.columns

Index(['Unnamed: 0', 'gameId', 'playId', 'down', 'yardsToGo', 'possessionTeam',
       'yardlineSide', 'absoluteYardlineNumber', 'passResult',
       'prePenaltyYardsGained', 'num_pass_rushers', 'isBlitz', 'avg_def_speed',
       'std_def_acc', 'def_players_near_los', 'avg_def_spacing',
       'avg_olb_orientation', 'avg_olb_motion_angle', 'avg_ilb_orientation',
       'avg_ilb_motion_angle', 'avg_mlb_orientation', 'avg_mlb_motion_angle',
       'avg_lb_orientation', 'avg_lb_motion_angle', 'avg_cb_orientation',
       'avg_cb_motion_angle', 'avg_ss_orientation', 'avg_ss_motion_angle',
       'avg_fs_orientation', 'avg_fs_motion_angle', '0'],
      dtype='object')

In [5]:
column_subset = ['gameId', 'playId', 'down', 'yardsToGo', 'possessionTeam',
       'yardlineSide', 'absoluteYardlineNumber', 'passResult',
       'prePenaltyYardsGained', 'num_pass_rushers', 'isBlitz']

In [6]:
model_data = df[column_subset]
model_data.head()

Unnamed: 0,gameId,playId,down,yardsToGo,possessionTeam,yardlineSide,absoluteYardlineNumber,passResult,prePenaltyYardsGained,num_pass_rushers,isBlitz
0,2022102302,2655,1,10,CIN,CIN,31,C,9,,
1,2022091809,3698,1,10,CIN,CIN,18,C,4,0.0,False
2,2022103004,3146,3,12,HOU,HOU,30,C,6,,
3,2022110610,348,2,10,KC,TEN,33,C,4,,
4,2022102700,2799,2,8,BAL,TB,37,,-1,,


In [7]:
model_data.shape

(16124, 11)

In [8]:
model_data['isBlitz'].value_counts()

isBlitz
False    5156
True     4300
Name: count, dtype: int64

In [16]:
df_blitz = model_data[model_data['isBlitz'] == True]

In [17]:
df_blitz.head()

Unnamed: 0,gameId,playId,down,yardsToGo,possessionTeam,yardlineSide,absoluteYardlineNumber,passResult,prePenaltyYardsGained,num_pass_rushers,isBlitz
7,2022100203,3994,3,12,ARI,CAR,82,,-1,11.0,True
10,2022100912,445,3,5,DAL,LA,95,I,0,11.0,True
20,2022092505,3437,3,6,NE,NE,77,C,28,11.0,True
22,2022091111,923,1,10,LAC,LAC,63,C,42,11.0,True
23,2022092506,1412,1,4,CIN,NYJ,106,I,0,11.0,True


## Successful blitz criteria

- If 1st or 2nd down: `prePenaltyYardsGained` < 3
    - The average play gains ~4.5-5 yards in the NFL. Limiting the  offense to fewer than 3 yards represents distrubtion
- If 3rd or 4th down: `prePenaltyYardsGained` < `yardsToGo`
    - A blitz is successful if it prevents a first down 
- If `passResult` = interception, incomplete pass, or sack

In [19]:
df_blitz['successfulBlitz'] = (df_blitz['passResult'].isin(['I', 'S', 'IN'])).astype(int)

first_two_down_condition = (
    ((df_blitz['down'].isin([1, 2])) & (df_blitz['prePenaltyYardsGained'] < 3))
)

third_fourth_down_condition = (
    ((df_blitz['down'].isin([3, 4])) & (df_blitz['prePenaltyYardsGained'] < df_blitz['yardsToGo']))
)


df_blitz.loc[first_two_down_condition, 'successfulBlitz'] = 1
df_blitz.loc[third_fourth_down_condition, 'successfulBlitz'] = 1

df_blitz['successfulBlitz'] = df_blitz['successfulBlitz'].fillna(0).astype(int)

In [20]:
df_blitz.head()

Unnamed: 0,gameId,playId,down,yardsToGo,possessionTeam,yardlineSide,absoluteYardlineNumber,passResult,prePenaltyYardsGained,num_pass_rushers,isBlitz,successfulBlitz
7,2022100203,3994,3,12,ARI,CAR,82,,-1,11.0,True,1
10,2022100912,445,3,5,DAL,LA,95,I,0,11.0,True,1
20,2022092505,3437,3,6,NE,NE,77,C,28,11.0,True,0
22,2022091111,923,1,10,LAC,LAC,63,C,42,11.0,True,0
23,2022092506,1412,1,4,CIN,NYJ,106,I,0,11.0,True,1
