# Team Tendency Analysis (v2) - Oct 28, 2024

Goal here is to get longer-term data on teams (and later players) to bake into the play-level data.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Project root is the parent directory of the notebook's working directory
root_dir = os.path.dirname(os.getcwd())

## Import data

We load all relevant data to understand team pass rates

In [2]:
# Read the four raw CSV tables from <root_dir>/data.
# The unpacked names follow the order of the relative paths listed below.
df_play, df_player_play, df_games, df_players = (
    pd.read_csv(os.path.join(root_dir, rel_path))
    for rel_path in (
        'data/plays.csv',
        'data/player_play.csv',
        'data/games.csv',
        'data/players.csv',
    )
)

In [3]:
# Tag every play with the week of its game (left join keeps every play row)
play_cols = ['gameId','playId','isDropback','possessionTeam','defensiveTeam']
df_weeks = pd.merge(df_play[play_cols], df_games[['gameId','week']], how='left', on=['gameId'])


In [4]:
# Peek at one row to confirm the `week` column was attached
df_weeks.head(1)

Unnamed: 0,gameId,playId,isDropback,possessionTeam,defensiveTeam,week
0,2022102302,2655,True,CIN,ATL,7


In [5]:
# Count fully duplicated rows in df_weeks
df_weeks.duplicated().sum()

0

# Aggregate team snap data

We want to know both the raw number of snaps each team took, and how many of them were passes, for each week

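NOTE: DO NOT DEDUPE. Once `gameId`/`playId` are dropped, the rows are play-by-play data without ID labels, so identical-looking rows are distinct plays and deduplicating would discard real snaps.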

In [6]:
# Strip the play identifiers; only dropback flag, teams and week are needed for counting
df_working = df_weeks.drop(['gameId','playId'], axis='columns')
df_working.head(1)

Unnamed: 0,isDropback,possessionTeam,defensiveTeam,week
0,True,CIN,ATL,7


Group by team/week, get pass & snap count for each team/week combo:

In [7]:
# Per (team, week): passes = sum of the isDropback flag,
# snaps = count of non-null isDropback values.
dropback_total = pd.NamedAgg(column='isDropback', aggfunc='sum')
snap_total = pd.NamedAgg(column='isDropback', aggfunc='count')

# Offense: grouped on the team in possession
off_df = df_working.groupby(['possessionTeam','week'], as_index=False).agg(
    off_passes=dropback_total, off_snaps=snap_total)

# Defense: grouped on the defending team
def_df = df_working.groupby(['defensiveTeam','week'], as_index=False).agg(
    def_passes=dropback_total, def_snaps=snap_total)


Merge offense and defense, then compute the cumulative pass rate on both sides of the ball

In [8]:
# Merge offensive, defensive snap stats (inner join: team/week rows present on both sides)
df_full = pd.merge(
    off_df, def_df, how='inner',
    left_on=['possessionTeam','week'], right_on=['defensiveTeam','week'],
).drop(columns=['defensiveTeam'])

# Get running snap/pass stats over the season for the current week
# (cumsum also accumulates `week`, which is dropped and replaced by the real week)
season_totals = df_full.groupby('possessionTeam').cumsum().drop(columns='week')
df_cs = pd.concat([df_full[['possessionTeam','week']], season_totals], axis=1)

# Calc pass rate for offense & defense from the running totals
df_cs['pass_rate_off'] = df_cs['off_passes'].div(df_cs['off_snaps'])
df_cs['pass_rate_def'] = df_cs['def_passes'].div(df_cs['def_snaps'])

In [9]:
# Inspect the first rows of the running totals and pass rates
df_cs.head(3)

Unnamed: 0,possessionTeam,week,off_passes,off_snaps,def_passes,def_snaps,pass_rate_off,pass_rate_def
0,ARI,1,39,57,40,64,0.684211,0.625
1,ARI,2,88,130,81,124,0.676923,0.653226
2,ARI,3,146,206,108,169,0.708738,0.639053


### Forward fill bye week data

We want to forward fill bye week data for merge reasons. We use a cartesian product to do so.

In [10]:
# get team, week cartesian product so weeks a team did not play show up as explicit rows.
# The week range is taken from the data (1 .. last observed week) instead of a
# hard-coded 1-9, so the grid follows whatever weeks are loaded.
last_team_week = int(df_cs['week'].max())
ind = pd.MultiIndex.from_product(
    [range(1, last_team_week + 1), df_cs['possessionTeam'].unique()],
    names=['week', 'possessionTeam'],
).to_frame(index=False)

Merge to get each team-week combo possible

In [11]:
# merge onto product: left join keeps every (week, team) pair, pairs with no
# matching df_cs row get NaN stats
cm = pd.merge(ind, df_cs, how='left', on=['week','possessionTeam']).reset_index(drop=True)
cm.loc[cm['possessionTeam'].eq('LV')]

Unnamed: 0,week,possessionTeam,off_passes,off_snaps,def_passes,def_snaps,pass_rate_off,pass_rate_def
18,1,LV,42.0,55.0,34.0,61.0,0.763636,0.557377
50,2,LV,83.0,115.0,83.0,134.0,0.721739,0.619403
82,3,LV,131.0,180.0,110.0,188.0,0.727778,0.585106
114,4,LV,170.0,254.0,141.0,235.0,0.669291,0.6
146,5,LV,201.0,308.0,187.0,302.0,0.652597,0.619205
178,6,LV,,,,,,
210,7,LV,229.0,363.0,228.0,366.0,0.630854,0.622951
242,8,LV,271.0,416.0,259.0,429.0,0.651442,0.60373
274,9,LV,308.0,469.0,292.0,494.0,0.656716,0.591093


In [12]:
# forward fill within each team
team_filled = cm.groupby(['possessionTeam']).ffill()
# re-attach the team label from cm (rows stay aligned on the shared index)
cmg = team_filled.reset_index(drop=True).assign(possessionTeam=cm['possessionTeam'])

In [13]:
# check na ct filled: total rows in cmg minus the rows of cm that had no NaN,
# i.e. how many rows of cm contained at least one NaN (cmg has one row per cm row)
len(cmg)-len(cm.dropna())

18

In [14]:
# Spot-check one team (LV) after the forward fill
cmg[cmg['possessionTeam'] == 'LV']

Unnamed: 0,week,off_passes,off_snaps,def_passes,def_snaps,pass_rate_off,pass_rate_def,possessionTeam
18,1,42.0,55.0,34.0,61.0,0.763636,0.557377,LV
50,2,83.0,115.0,83.0,134.0,0.721739,0.619403,LV
82,3,131.0,180.0,110.0,188.0,0.727778,0.585106,LV
114,4,170.0,254.0,141.0,235.0,0.669291,0.6,LV
146,5,201.0,308.0,187.0,302.0,0.652597,0.619205,LV
178,6,201.0,308.0,187.0,302.0,0.652597,0.619205,LV
210,7,229.0,363.0,228.0,366.0,0.630854,0.622951,LV
242,8,271.0,416.0,259.0,429.0,0.651442,0.60373,LV
274,9,308.0,469.0,292.0,494.0,0.656716,0.591093,LV


# Integrate pass rate to game-play level

Above we have pass rate data for each team and week; now we bake it into plays. 

First we increment the `week` value by one, so that each week's plays are matched with stats through the previous week (a "lookback" of one week).

In [15]:
# Team pass rates re-keyed to the following week: the row labelled week N
# holds the rates accumulated through week N-1
tp_df = (
    cmg[['possessionTeam','week','pass_rate_off','pass_rate_def']]
    .assign(week=lambda rates: rates['week'] + 1)
    .rename(columns={'possessionTeam':'team'})
)

In [16]:
# Peek at the shifted team pass-rate table
tp_df.head(1)

Unnamed: 0,team,week,pass_rate_off,pass_rate_def
0,ARI,2,0.684211,0.625


In [17]:
# Count missing values per column in tp_df
tp_df.isna().sum()

team             0
week             0
pass_rate_off    0
pass_rate_def    0
dtype: int64

### Merge in offensive, defensive pass rate data to play-by-play dataframe

In [18]:
# offense: attach the possession team's shifted pass rate to each play.
# tp_df weeks start at 2 after the +1 shift, so week-1 plays find no match in
# this left join and keep NaN for pass_rate_off.
off_rates = tp_df[['team','pass_rate_off','week']]
play_out = pd.merge(
    df_weeks.drop(columns=['isDropback']), off_rates,
    how='left', left_on=['possessionTeam','week'], right_on=['team','week'],
).drop(columns=['possessionTeam','team'])

In [19]:
# Peek at the play table after the offensive pass-rate merge
play_out.head(1)

Unnamed: 0,gameId,playId,defensiveTeam,week,pass_rate_off
0,2022102302,2655,ATL,7,0.654054


In [20]:
# defense: same lookup keyed on the defending team.
# This overwrites play_out and drops defensiveTeam, so re-running this cell on
# its own fails until the offensive merge cell has been run again.
def_rates = tp_df[['team','pass_rate_def','week']]
play_out = pd.merge(
    play_out, def_rates,
    how='left', left_on=['defensiveTeam','week'], right_on=['team','week'],
).drop(columns=['defensiveTeam','team'])

In [21]:
# Final team pass-rate table: one row per play, week no longer needed
prate_out = play_out.drop('week', axis='columns')
prate_out.head(1)

Unnamed: 0,gameId,playId,pass_rate_off,pass_rate_def
0,2022102302,2655,0.654054,0.648794


# Calculate player-weighted pass tendencies

We start by getting each player's weekly snap counts:

In [22]:
# Player-play data, flagged for if it's a dropback
play_flags = df_play[['playId','gameId','isDropback']]
db_flagged = pd.merge(df_player_play, play_flags, how='left', on=['playId','gameId'])

# Add in week info to each play: one (gameId, week) row per game, joined on gameId
game_weeks = df_weeks[['gameId','week']].drop_duplicates('gameId')
db_weeks_id = pd.merge(
    db_flagged[['gameId','playId','nflId','isDropback','teamAbbr']],
    game_weeks, how='left', on=['gameId'])

# Get weekly pass counts for each player:
# pass_ct = sum of the dropback flag, snap_ct = count of non-null flags
pass_cts = (
    db_weeks_id
    .groupby(['nflId','week','teamAbbr'], as_index=False)
    .agg(pass_ct=('isDropback','sum'), snap_ct=('isDropback','count'))
)


In [23]:
# Inspect the first rows of the weekly per-player counts
pass_cts.head(3)

Unnamed: 0,nflId,week,teamAbbr,pass_ct,snap_ct
0,25511,1,TB,26,58
1,25511,2,TB,35,63
2,25511,3,TB,42,56


# Get rolling snap values for players

We aggregate on player for pass and snap count, getting sum first

This lets us calculate mean-snap ratios if we divide by week

In [24]:
# Order rows by player, then week, so the per-player cumulative sum runs in week order
pass_cts = pass_cts.sort_values(by=['nflId','week'])

In [25]:
# get cumulative sum
# Select the two count columns explicitly: teamAbbr is a string column, and
# pandas >= 2.0 no longer silently drops non-numeric columns in a groupby
# cumsum (it raises instead). This also avoids accumulating `week` only to
# drop it afterwards.
id_cols = pass_cts.drop(columns=['pass_ct','snap_ct'])
running_counts = pass_cts.groupby(['nflId'])[['pass_ct','snap_ct']].cumsum()
cum_snap_df = pd.concat([id_cols, running_counts], axis=1)
                       

In [26]:
# Inspect the first rows of the per-player running totals
cum_snap_df.head(3)

Unnamed: 0,nflId,week,teamAbbr,pass_ct,snap_ct
0,25511,1,TB,26,58
1,25511,2,TB,61,121
2,25511,3,TB,103,177


### Forward-fill player snap counts to account for bye, other issues

Same cartesian product/merge shenanigans as usual

In [27]:
# get cartesian product of (week, player). The week range comes from the data
# (1 .. last observed week) instead of a hard-coded 1-9.
last_player_week = int(cum_snap_df['week'].max())
cs_ind = pd.MultiIndex.from_product(
    [range(1, last_player_week + 1), cum_snap_df['nflId'].unique()],
    names=['week', 'nflId'],
).to_frame(index=False)
# order rows player-by-player with weeks ascending, so a forward fill on the
# merged frame runs in week order
cs_ind = cs_ind.sort_values(by=['nflId','week'],ascending=True)

In [28]:
# Peek at the sorted (week, player) grid
cs_ind.head(2)

Unnamed: 0,week,nflId
0,1,25511
1697,2,25511


#### Merge into cartesian product, forward fill player pass/snap counts:

In [29]:
# merge onto cartesian product to get each player-week combo
# (weeks with no row in cum_snap_df come through as NaN)
csm = pd.merge(cs_ind, cum_snap_df, how='left', on=['week','nflId'])

csm.loc[csm['nflId'].eq(33099)]

Unnamed: 0,week,nflId,teamAbbr,pass_ct,snap_ct
45,1,33099,NYJ,57.0,72.0
46,2,33099,NYJ,102.0,134.0
47,3,33099,NYJ,156.0,208.0
48,4,33099,,,
49,5,33099,,,
50,6,33099,,,
51,7,33099,,,
52,8,33099,,,
53,9,33099,,,


In [30]:
# group by player, forward fill, then put the player id back alongside the filled columns
filled_counts = csm.groupby(['nflId']).ffill().reset_index(drop=True)
csg = pd.concat([csm['nflId'], filled_counts], axis=1)

# get ratio of cumulative passes to cumulative snaps, then drop the raw counts
csg['pass_ratio'] = csg['pass_ct'].div(csg['snap_ct'])
csg = csg.drop(columns=['pass_ct','snap_ct'])

csg.loc[csg['nflId'] == 33099]

Unnamed: 0,nflId,week,teamAbbr,pass_ratio
45,33099,1,NYJ,0.791667
46,33099,2,NYJ,0.761194
47,33099,3,NYJ,0.75
48,33099,4,NYJ,0.75
49,33099,5,NYJ,0.75
50,33099,6,NYJ,0.75
51,33099,7,NYJ,0.75
52,33099,8,NYJ,0.75
53,33099,9,NYJ,0.75


### Merge player cumulative pass ratio into game-play data

We use ratio here b/c a snap-mean of ratios upweights specialist (third tackle/te) snaps


In [31]:
# Play identifiers plus the two teams involved in each play
play_trunc = df_play.loc[:, ['gameId','playId','possessionTeam','defensiveTeam']]

In [32]:
# shift week forward by one, so the row labelled week N carries the ratio
# accumulated through week N-1.
# The shifted value is derived from csm, which is row-aligned with csg and still
# holds the unshifted week. Unlike `csg['week'] += 1`, re-running this cell
# does not shift the weeks a second time.
csg['week'] = csm['week'] + 1
csg.head(2)

Unnamed: 0,nflId,week,teamAbbr,pass_ratio
0,25511,2,TB,0.448276
1,25511,3,TB,0.504132


In [33]:
# merge in player data: first the play's two teams, then each player's
# shifted pass ratio keyed on (week, nflId)
pt_pre = pd.merge(db_weeks_id, play_trunc, how='left', on=['gameId','playId'])
player_ratios = csg.drop(columns=['teamAbbr'])
pt_df = pd.merge(pt_pre, player_ratios, how='left', on=['week','nflId']).drop(columns=['isDropback'])

.58 is the mean of pass rates for 2021 across all teams, so it's a player's default expectation:

In [34]:
# Player rows with no matched ratio fall back to a flat default expectation
default_pass_ratio = .58
pt_df['pass_ratio'] = pt_df['pass_ratio'].fillna(default_pass_ratio)

In [35]:
# Spot-check three random rows; fixed seed so the displayed rows are the same
# on every Restart & Run All
pt_df.sample(3, random_state=0)

Unnamed: 0,gameId,playId,nflId,teamAbbr,week,possessionTeam,defensiveTeam,pass_ratio
229385,2022101607,560,42377,TB,6,TB,PIT,0.743056
241348,2022101611,2301,54489,DAL,6,DAL,PHI,0.539286
24172,2022091107,2791,46226,NYJ,1,NYJ,BAL,0.58


We can subset offensive plays to where teamAbbr matches possessionTeam, and inverse for defensive

In [36]:
# Split player rows by side of the ball, keeping only that side's team column
on_offense = pt_df['teamAbbr'] == pt_df['possessionTeam']
on_defense = pt_df['teamAbbr'] == pt_df['defensiveTeam']
pt_off = pt_df.loc[on_offense].drop(columns=['teamAbbr','defensiveTeam'])
pt_def = pt_df.loc[on_defense].drop(columns=['teamAbbr','possessionTeam'])

In [37]:
# Peek at one defensive player row
pt_def.head(1)

Unnamed: 0,gameId,playId,nflId,week,defensiveTeam,pass_ratio
11,2022090800,56,38577,1,LA,0.58


## Aggregate to team level, doing offense/defense individually

We want to get defensive & offensive xpass ratios individually

In [38]:
# aggregate for off/def: mean pass_ratio over that side's player rows, per play
play_keys = ['week','gameId','playId']
pog = (
    pt_off.groupby(['possessionTeam'] + play_keys, as_index=False)
          .agg(off_xpass=('pass_ratio','mean'))
)
pdg = (
    pt_def.groupby(['defensiveTeam'] + play_keys, as_index=False)
          .agg(def_xpass=('pass_ratio','mean'))
)

# merge the two sides play-by-play and keep only ids plus the two xpass columns
xpass_out = pd.merge(pog, pdg, how='left', on=['gameId','playId','week'])
xpass_out = xpass_out[['gameId','playId','off_xpass','def_xpass']]

In [39]:
# Peek at the per-play offensive/defensive xpass table
xpass_out.head(1)

Unnamed: 0,gameId,playId,off_xpass,def_xpass
0,2022091110,347,0.58,0.58


In [40]:
# Count missing values per column in xpass_out
xpass_out.isna().sum()

gameId       0
playId       0
off_xpass    0
def_xpass    0
dtype: int64

In [41]:
# Re-display the team pass-rate table ahead of the (commented-out) exports
prate_out.head(1)

Unnamed: 0,gameId,playId,pass_rate_off,pass_rate_def
0,2022102302,2655,0.654054,0.648794


In [42]:
#prate_out.to_csv(os.path.join(root_dir, 'data/team_pass_rates.csv'))

In [43]:
#xpass_out.to_csv(os.path.join(root_dir, 'data/team_xpass_ratios.csv'))