In [1]:
import pandas as pd
import os
root_dir = os.getcwd()
from data_loading import load_ftn, load_stadium_data, load_weather_data
from data_cleaning import feature_engineering


#### Load data

In [2]:
# Game-level frame: games.csv with stadium and weather data attached via left joins.
# Stadium rows are matched through their 'old_game_id' column; weather rows share 'gameId'.
df_games = (
    pd.read_csv('data/games.csv')
    .merge(load_stadium_data(), how='left', left_on='gameId', right_on='old_game_id')
    .merge(load_weather_data(), how='left', on='gameId')
)

# Play-level frame: the raw plays.csv run through the feature_engineering step.
df_plays = feature_engineering(pd.read_csv('data/plays.csv'))

# load ftn: done in the next cell via load_ftn()


2022 done.
Downcasting floats.


In [33]:
%%capture
# The %%capture cell magic above swallows this cell's stdout/stderr, so anything
# load_ftn() prints is not shown in the notebook.
# ftn_merged is joined onto merged_base by (gameId, playId) in the FTN section below.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# confirm the notebook still works under Restart Kernel -> Run All.
ftn_merged = load_ftn()

Add external data and recover the merged IDs from `get_merge_pre` (i.e., the product of half of `aggregate_data`).

In [4]:
# Play identifiers recovered from the upstream merge step.
merged_id_df = pd.read_csv('data/merged_ids.csv')

# Locations of the externally computed feature tables, resolved against root_dir.
xp_fname, pr_fname, cf_fname, cu_fname = (
    os.path.join(root_dir, rel_path)
    for rel_path in (
        "data/team_xpass_ratios.csv",
        "data/team_pass_rates.csv",
        "data/coverages_faced.csv",
        'data/coverages_used.csv',
    )
)

# Expected-pass and pass-rate tables; the saved index column ('Unnamed: 0') is discarded.
xp_df = pd.read_csv(xp_fname).drop(columns=['Unnamed: 0'])
pr_df = pd.read_csv(pr_fname).drop(columns=['Unnamed: 0'])

In [5]:
# Preview the expected-pass table: off_xpass / def_xpass keyed by (gameId, playId).
xp_df.head(3)

Unnamed: 0,gameId,playId,off_xpass,def_xpass
0,2022091110,347,0.58,0.58
1,2022091110,400,0.58,0.58
2,2022091110,729,0.58,0.58


In [6]:
# Count missing values per column of the expected-pass table.
xp_df.isna().sum()

gameId       0
playId       0
off_xpass    0
def_xpass    0
dtype: int64

#### Merge in week, team info to our base dataframe

In [7]:
# Build the base frame: one left join per lookup, each lookup de-duplicated first.
merged_base = (
    merged_id_df
    # add week info (looked up by game)
    .merge(
        df_games[['gameId', 'week']].drop_duplicates(),
        on=['gameId'],
        how='left',
    )
    # add in team info (looked up by game + play)
    .merge(
        df_plays[['gameId', 'playId', 'possessionTeam', 'defensiveTeam']].drop_duplicates(),
        on=['gameId', 'playId'],
        how='left',
    )
)

In [8]:
# Spot-check that the week and team columns were attached to the ID frame.
merged_base.head(3)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam
0,2022090800,56,1,BUF,LA
1,2022090800,80,1,BUF,LA
2,2022090800,101,1,BUF,LA


#### Define IDs for each side of the week 1 / week 2-onward split:

In [9]:
# Columns carried forward for both halves of the split.
_id_cols = ['gameId', 'playId', 'week', 'possessionTeam', 'defensiveTeam']

# Week 1 plays (later filled with 2021 values) vs. plays from week 2 onward.
w1_ids = merged_base.loc[merged_base['week'] == 1, _id_cols]
w2_on_ids = merged_base.loc[merged_base['week'] > 1, _id_cols]

#### Import 2021 data

In [10]:
# 2021 coverage table; columns whose names contain '_off' are filtered out further down.
cov_21 = pd.read_csv('data_21/cov_21.csv')
# 2021 pass rates per team (possessionTeam, pass_rate_def, pass_rate_off).
team_pr_21 = pd.read_csv('data_21/team_pr_21.csv')

# Calculate team pass rates

Note: we use team pass rate from last year as xpass for week 1 (due to 2021 data incompleteness)

In [11]:
# Show the layout of the 2021 pass-rate table before it is merged.
team_pr_21.head(1)

Unnamed: 0,possessionTeam,pass_rate_def,pass_rate_off
0,ARI,0.573372,0.573464


In [12]:
# Show the layout of the week-1 ID frame that the 2021 rates get merged onto.
w1_ids.head(1)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam
0,2022090800,56,1,BUF,LA


#### Merge in '21 pass ratios to '22 week one df

We do offensive and defensive rates separately

In [13]:
# Attach 2021 pass rates to every week-1 play, one left join per side of the ball.
w1_pr = (
    w1_ids
    # defensive: the 2021 table is keyed by 'possessionTeam', so relabel the key
    # in order to match on the defending team
    .merge(
        team_pr_21
        .drop(columns=['pass_rate_off'])
        .rename(columns={'possessionTeam': 'defensiveTeam'}),
        how='left',
        on='defensiveTeam',
    )
    # offensive: match directly on the team with possession
    .merge(
        team_pr_21.drop(columns=['pass_rate_def']),
        how='left',
        on='possessionTeam',
    )
)

We bake in xpass here because it's just going to replicate pass_rate_off/def for week 1

In [14]:
# For week 1 the expected-pass columns are straight copies of the 2021 pass rates.
w1_pr = w1_pr.assign(
    off_xpass=w1_pr['pass_rate_off'],
    def_xpass=w1_pr['pass_rate_def'],
)

In [15]:
# Confirm off_xpass / def_xpass mirror pass_rate_off / pass_rate_def.
w1_pr.head(1)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,pass_rate_def,pass_rate_off,off_xpass,def_xpass
0,2022090800,56,1,BUF,LA,0.615627,0.619126,0.619126,0.615627


# Calculate coverage data

Here we just use defensive rates for simplicity, can revisit later

In [16]:
# Keep every 2021 coverage column whose name lacks '_off', then relabel so the frame is
# keyed by the defending team and uses the 'Cover-2_def' / 'Cover-0_def' spellings
# found in the 2022 coverage table.
cov_def = (
    cov_21
    .loc[:, [col for col in cov_21.columns if '_off' not in col]]
    .rename(columns={
        'possessionTeam': 'defensiveTeam',
        'cover_2_def': 'Cover-2_def',
        'cover_0_def': 'Cover-0_def',
    })
)

# merge into the running week-1 dataframe
w1_merged = w1_pr.merge(cov_def, on='defensiveTeam', how='left')

In [17]:
# Check that the defensive coverage columns now sit on the week-1 frame.
w1_merged.head(1)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,pass_rate_def,pass_rate_off,off_xpass,def_xpass,Man_def,Zone_def,Cover-0_def,cover_1_def,Cover-2_def,cover_3_def,Quarters_def,cover_6_def,Other_def
0,2022090800,56,1,BUF,LA,0.615627,0.619126,0.619126,0.615627,7.117647,37.117647,0.705882,6.352941,0.647059,20.823529,7.647059,7.705882,0.352941


## Integrate '22 coverage data (i.e., for week 2 on)

In [18]:
# First row of the week > 1 ID frame (its index labels are inherited from merged_base).
w2_on_ids.head(1)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam
1950,2022091500,55,2,KC,LAC


Load coverage data, merge:

In [19]:
# Defensive coverage usage keyed by (defensiveTeam, week).
# Reads through cu_fname, which is built alongside the other data paths, instead of
# repeating the path literal here, so the two cannot drift apart.
# 'week' is cast to int so it can serve as a merge key alongside defensiveTeam.
cu_df = pd.read_csv(cu_fname).astype({'week': int})
cu_df.head(1)

Unnamed: 0,week,cover_3_def,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,Cover-0_def,Man_def,Other_def,Zone_def,defensiveTeam
0,2,21.0,2.0,21.0,7.0,4.0,3.0,26.0,4.0,34.0,ARI


In [20]:
# Attach each defence's coverage usage for the matching week (key names are shared,
# so a single `on=` list covers both sides of the join).
cu_w2_on = w2_on_ids.merge(cu_df, on=['defensiveTeam', 'week'], how='left')

In [21]:
# Check the coverage columns that were joined on (defensiveTeam, week).
cu_w2_on.head(1)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,cover_3_def,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,Cover-0_def,Man_def,Other_def,Zone_def
0,2022091500,55,2,KC,LAC,20.0,8.0,9.0,3.0,8.0,1.0,10.0,6.0,39.0


### Add pass ratio, expected pass data

In [22]:
# Layer the play-level pass-rate and expected-pass tables onto the week-2-onward frame.
df_w2_on = (
    cu_w2_on
    .merge(pr_df, on=['gameId', 'playId'], how='left')
    .merge(xp_df, on=['gameId', 'playId'], how='left')
)

df_w2_on.head(2)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,cover_3_def,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,Cover-0_def,Man_def,Other_def,Zone_def,pass_rate_off,pass_rate_def,off_xpass,def_xpass
0,2022091500,55,2,KC,LAC,20.0,8.0,9.0,3.0,8.0,1.0,10.0,6.0,39.0,0.625,0.763636,0.686433,0.738111
1,2022091500,76,2,KC,LAC,20.0,8.0,9.0,3.0,8.0,1.0,10.0,6.0,39.0,0.625,0.763636,0.658727,0.738111


In [23]:
# Put the week-1 frame in the same column set and order as the week-2-onward frame
# so the two can be stacked.
w1_merged = w1_merged.loc[:, df_w2_on.columns]

In [24]:
# Sanity check: the earliest week in the week-2-onward frame.
df_w2_on['week'].min()

2

## Integrate week 1-imputed, week 2 onward data

In [35]:
# Stack the week-1 rows (filled from 2021 values) on top of the week-2-onward rows.
# ignore_index=True issues a fresh 0..n-1 index; without it both inputs keep their own
# 0-based labels and the stacked frame carries duplicate index values.
# NOTE(review): the week-1 coverage columns come from the 2021 table (the sample output
# shows Zone_def around 17-18), while a week-8 row shows Zone_def = 335.0. The two
# segments look like they are on different scales; confirm before modelling.
merged_base = pd.concat([w1_merged, df_w2_on], axis=0, ignore_index=True)

In [36]:
# Random spot-check across both segments; seeded so the displayed rows are the same
# on every run of the notebook.
merged_base.sample(3, random_state=0)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,cover_3_def,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,Cover-0_def,Man_def,Other_def,Zone_def,pass_rate_off,pass_rate_def,off_xpass,def_xpass
381,2022091102,1162,1,SF,CHI,6.882353,6.352941,7.352941,1.882353,2.411765,1.764706,9.823529,0.764706,17.588235,0.507347,0.508403,0.507347,0.508403
948,2022091106,1297,1,NE,MIA,13.235294,0.294118,10.705882,1.176471,3.411765,5.529412,16.529412,0.529412,18.352941,0.536585,0.597837,0.536585,0.597837
11063,2022103000,3381,8,JAX,DEN,209.0,80.0,61.0,41.0,5.0,6.0,70.0,14.0,335.0,0.586957,0.642353,0.543273,0.622404


Define the final column list (`cols_final`) so that everything lines up later (excluding the offensive coverage features).

In [37]:
# Final column selection and order for the combined frame.
cols_final = [
    # play identifiers
    'gameId', 'playId',
    # columns that arrive with the FTN merge
    'n_offense_backfield', 'n_defense_box', 'is_no_huddle', 'is_motion',
    # team pass rates and expected-pass values
    'pass_rate_off', 'pass_rate_def', 'off_xpass', 'def_xpass',
    # week and the two teams involved
    'week', 'possessionTeam', 'defensiveTeam',
    # defensive coverage columns
    'cover_3_def', 'cover_6_def', 'cover_1_def', 'Quarters_def',
    'Cover-2_def', 'Cover-0_def', 'Man_def', 'Other_def', 'Zone_def',
    # snaps-lost columns from the injury table
    'off_snaps_lost', 'def_snaps_lost',
]

In [38]:
# Path to the snaps-lost-to-injury table, resolved against root_dir.
inj_fname = os.path.join(root_dir, 'data/snaps_lost_injury.csv')
# Loaded as-is; the merges below use its team, week, off_snaps_lost and def_snaps_lost columns.
inj_df = pd.read_csv(inj_fname)

In [39]:
# Preview the injury table: snaps lost on offense and defense by team and week.
inj_df.head(2)

Unnamed: 0,team,week,off_snaps_lost,def_snaps_lost
0,ARI,1,0.0,0.0
1,ARI,2,21.0,0.0


In [40]:
# Attach snaps lost to injury for both sides of the ball, matched on (team, week).
# Snaps-lost columns left over from an earlier run of this cell are dropped first, so
# re-running it does not create *_x / *_y duplicates that break the cols_final selection.
merged_base = merged_base.drop(columns=['off_snaps_lost', 'def_snaps_lost'], errors='ignore')

# merge in offensive snaps (matched on the team with possession)
merged_base = merged_base.merge(
    inj_df.drop(columns=['def_snaps_lost']),
    how='left',
    left_on=['possessionTeam', 'week'],
    right_on=['team', 'week'],
).drop(columns=['team'])  # 'team' only served as the join key

# merge in defensive snaps (matched on the defending team)
merged_base = merged_base.merge(
    inj_df.drop(columns=['off_snaps_lost']),
    how='left',
    left_on=['defensiveTeam', 'week'],
    right_on=['team', 'week'],
).drop(columns=['team'])

In [41]:
# Count missing values per column after the injury merges (FTN columns are not added yet).
merged_base.isna().sum()

gameId            0
playId            0
week              0
possessionTeam    0
defensiveTeam     0
cover_3_def       0
cover_6_def       0
cover_1_def       0
Quarters_def      0
Cover-2_def       0
Cover-0_def       0
Man_def           0
Other_def         0
Zone_def          0
pass_rate_off     0
pass_rate_def     0
off_xpass         0
def_xpass         0
off_snaps_lost    0
def_snaps_lost    0
dtype: int64

### Integrate FTN data

In [42]:
# Join the FTN columns onto each play by (gameId, playId).
# Non-key columns that ftn_merged contributes are removed from merged_base first, so
# re-running this cell replaces them instead of creating *_x / *_y suffixed copies.
# On a first run the drop is a no-op as long as the two frames share only the key columns.
_ftn_cols = [col for col in ftn_merged.columns if col not in ('gameId', 'playId')]
merged_base = (
    merged_base
    .drop(columns=_ftn_cols, errors='ignore')
    .merge(ftn_merged, how='left', on=['gameId', 'playId'])
)

In [43]:
# Peek at the frame now that the FTN columns are appended on the right.
merged_base.head(2)

Unnamed: 0,gameId,playId,week,possessionTeam,defensiveTeam,cover_3_def,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,...,pass_rate_off,pass_rate_def,off_xpass,def_xpass,off_snaps_lost,def_snaps_lost,n_offense_backfield,n_defense_box,is_no_huddle,is_motion
0,2022090800,56,1,BUF,LA,20.823529,7.705882,6.352941,7.647059,0.647059,...,0.619126,0.615627,0.619126,0.615627,0.0,0.0,1.0,6.0,False,False
1,2022090800,80,1,BUF,LA,20.823529,7.705882,6.352941,7.647059,0.647059,...,0.619126,0.615627,0.619126,0.615627,0.0,0.0,1.0,6.0,True,True


In [44]:
# Display the frame restricted to, and ordered by, cols_final.
# NOTE(review): the selection is only displayed here, not assigned, so merged_base keeps
# its existing column order. Confirm whether a later cell is meant to store it.
merged_base[cols_final]

Unnamed: 0,gameId,playId,n_offense_backfield,n_defense_box,is_no_huddle,is_motion,pass_rate_off,pass_rate_def,off_xpass,def_xpass,...,cover_6_def,cover_1_def,Quarters_def,Cover-2_def,Cover-0_def,Man_def,Other_def,Zone_def,off_snaps_lost,def_snaps_lost
0,2022090800,56,1.0,6.0,False,False,0.619126,0.615627,0.619126,0.615627,...,7.705882,6.352941,7.647059,0.647059,0.705882,7.117647,0.352941,37.117647,0.000000,0.000000
1,2022090800,80,1.0,6.0,True,True,0.619126,0.615627,0.619126,0.615627,...,7.705882,6.352941,7.647059,0.647059,0.705882,7.117647,0.352941,37.117647,0.000000,0.000000
2,2022090800,101,1.0,7.0,False,True,0.619126,0.615627,0.619126,0.615627,...,7.705882,6.352941,7.647059,0.647059,0.705882,7.117647,0.352941,37.117647,0.000000,0.000000
3,2022090800,122,2.0,6.0,False,False,0.619126,0.615627,0.619126,0.615627,...,7.705882,6.352941,7.647059,0.647059,0.705882,7.117647,0.352941,37.117647,0.000000,0.000000
4,2022090800,167,0.0,5.0,False,False,0.619126,0.615627,0.619126,0.615627,...,7.705882,6.352941,7.647059,0.647059,0.705882,7.117647,0.352941,37.117647,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14551,2022103100,3596,1.0,8.0,False,False,0.547297,0.604762,0.568401,0.579605,...,32.000000,85.000000,65.000000,47.000000,28.000000,116.000000,19.000000,282.000000,59.142857,90.285714
14552,2022103100,3674,1.0,8.0,False,False,0.547297,0.604762,0.564558,0.501192,...,32.000000,85.000000,65.000000,47.000000,28.000000,116.000000,19.000000,282.000000,59.142857,90.285714
14553,2022103100,3697,1.0,8.0,False,False,0.547297,0.604762,0.564558,0.501192,...,32.000000,85.000000,65.000000,47.000000,28.000000,116.000000,19.000000,282.000000,59.142857,90.285714
14554,2022103100,3727,0.0,0.0,False,False,0.547297,0.604762,0.609726,0.501192,...,32.000000,85.000000,65.000000,47.000000,28.000000,116.000000,19.000000,282.000000,59.142857,90.285714
