In [1]:
import pandas as pd
import os
from utility_db_25 import get_momentum_cols, create_momentum_index,  get_motion_cols, motion_complexity_score
from data_loading import load_ftn

# TODO: just make it work for df_play

# Load data

We load train & play data, then flag man/zone discrepancies for df_play

In [2]:
# All input tables are read relative to the notebook's working directory.
root_dir = os.getcwd()
plays_path = os.path.join(root_dir, 'data/plays.csv')
player_play_path = os.path.join(root_dir, 'data/player_play.csv')
players_path = os.path.join(root_dir, 'data/players.csv')

# Plays are ordered by game, offense and play id; the per-group shift used
# for the box-count feature further down depends on this row order.
df_play = pd.read_csv(plays_path)
df_play = df_play.sort_values(by=['gameId', 'possessionTeam', 'playId'])

df_player_play = pd.read_csv(player_play_path)
df_players = pd.read_csv(players_path)

# Get box-count EWM

At the team-game level, we compute the exponentially weighted mean (EWM) of the defensive box count:

2022 done.
Downcasting floats.
Downcasting floats.


In [None]:
# Load the FTN frame through the project helper and keep only the play keys
# plus the defensive box count.
ftn_df = load_ftn()
ftn_box_counts = ftn_df[['gameId','playId','n_defense_box']]

# No `on=` is passed, so pandas joins on every column the two frames share.
df_play = df_play.merge(ftn_box_counts, how='left')

In [9]:
# Count of plays whose box count is missing.
df_play['n_defense_box'].isnull().sum()

0

In [10]:
team_game_keys = ['gameId', 'possessionTeam']

# Running exponentially weighted mean of the box count within each team-game.
# This value includes the current play, so it is only an intermediate column.
df_play['box_ewm_pre'] = (
    df_play
    .groupby(team_game_keys)['n_defense_box']
    .transform(lambda box_counts: box_counts.ewm(alpha=.1).mean())
)

# Lag by one row inside each team-game so the feature only reflects earlier plays.
# Rows left without a value (e.g. the first play of each group) are filled with 6.
lagged_ewm = df_play.groupby(team_game_keys)['box_ewm_pre'].shift(1)
df_play['box_ewm'] = lagged_ewm.fillna(6)

df_play[['gameId','playId','possessionTeam','n_defense_box','box_ewm']].head(6)

Unnamed: 0,gameId,playId,possessionTeam,n_defense_box,box_ewm
0,2022090800,56,BUF,6.0,6.0
1,2022090800,80,BUF,6.0,6.0
2,2022090800,101,BUF,7.0,6.0
3,2022090800,122,BUF,6.0,6.369004
4,2022090800,167,BUF,5.0,6.261704
5,2022090800,191,BUF,6.0,5.953603


Note: the plain (non-EWM) mean box count has since been removed from this notebook; when the two were compared, the EWM version significantly outperformed it (almost 2:1).

In [13]:
# Pairwise correlation (pandas default: Pearson) between box_ewm and isDropback.
df_play.loc[:, ['box_ewm', 'isDropback']].corr()

Unnamed: 0,box_ewm,isDropback
box_ewm,1.0,-0.088705
isDropback,-0.088705,1.0


# Process BMI data

First, we convert height to inches, then get BMI:

In [14]:
# calc height, bmi
# Split the "<feet>-<inches>" height string into its two parts. The parts are
# assigned as columns instead of being concatenated onto df_players, so this
# cell can be re-run safely: a second concat would add duplicate
# h_ft / h_in_pre columns and break the integer conversion below.
height_parts = df_players['height'].str.split('-', n=1, expand=True)
df_players['h_ft'] = height_parts[0]
df_players['h_in_pre'] = height_parts[1]
df_players['height_inches'] = df_players['h_ft'].astype(int)*12 + df_players['h_in_pre'].astype(int)

# weight / height squared
# NOTE(review): if weight is in pounds, this omits the 703 factor of the
# imperial BMI formula, so values are proportional to BMI rather than on the
# usual BMI scale. Left unchanged so existing feature values stay the same.
df_players['bmi'] = df_players['weight'] / (df_players['height_inches']**2)

# incorporate data back into player-play
# nflId is the only column the two selections share, so the key is spelled out.
# The join is pandas' default inner join: player-play rows whose nflId is not
# found in df_players are dropped.
df_bmi = df_player_play[['gameId','playId','nflId']].merge(
    df_players[['nflId','bmi','height_inches','weight','position']],
    on='nflId',
)
df_bmi.head(1)

Unnamed: 0,gameId,playId,nflId,bmi,height_inches,weight,position
0,2022090800,56,35472,0.054815,77,325,G


### Get BMI by position

For each play, we get the mean BMI, weight, and height, by position group:

In [15]:
def mean_body_metrics(player_metrics, positions, label):
    """Average weight, height and BMI per play for one position group.

    Parameters
    ----------
    player_metrics : pd.DataFrame
        One row per player per play, with columns gameId, playId, position,
        weight, height_inches and bmi.
    positions : list of str
        Position labels that make up the group.
    label : str
        Group tag used in the output column names (e.g. 'OL').

    Returns
    -------
    pd.DataFrame
        One row per (gameId, playId) that has at least one player in the
        group, with columns mean_<label>_weight, mean_<label>_height and
        mean_<label>_bmi.
    """
    metric_cols = ['weight', 'height_inches', 'bmi']
    renamed = {
        'weight': f'mean_{label}_weight',
        'height_inches': f'mean_{label}_height',
        'bmi': f'mean_{label}_bmi',
    }
    in_group = player_metrics['position'].isin(positions)
    return (
        player_metrics[in_group]
        .groupby(['gameId', 'playId'])[metric_cols]
        .mean()
        .reset_index()
        .rename(columns=renamed)
    )


# One frame per position group; plays with no player from a group are simply
# absent from that group's frame.
# NOTE(review): the LB group lists 'LB', 'OLB' and 'ILB' only - confirm whether
# the position column also uses an 'MLB' label that should be included.
ol_df = mean_body_metrics(df_bmi, ['C','G','T'], 'OL')
dl_df = mean_body_metrics(df_bmi, ['DT','NT','DE'], 'DL')
lb_df = mean_body_metrics(df_bmi, ['LB','OLB','ILB'], 'LB')
cb_df = mean_body_metrics(df_bmi, ['CB'], 'CB')
wr_df = mean_body_metrics(df_bmi, ['WR'], 'WR')
te_df = mean_body_metrics(df_bmi, ['TE'], 'TE')

Then we merge all of these positional means (weight, height, and BMI) into the play-level frame:

In [16]:
# Left-join each position group's per-play means onto the play frame, in the
# same order as before. No `on=` is passed, so pandas joins on every column
# the two frames share.
for position_means in (ol_df, dl_df, lb_df, cb_df, wr_df, te_df):
    df_play = df_play.merge(position_means, how='left')

Next, we calculate rough BMI "deltas" between opposing position groups (e.g., WR vs. CB), and also add "box" features that average the DL and LB groups.

In [13]:
# Per-play mean BMI for the groups that are reused below.
ol_bmi = df_play['mean_OL_bmi']
dl_bmi = df_play['mean_DL_bmi']
lb_bmi = df_play['mean_LB_bmi']
te_bmi = df_play['mean_TE_bmi']

# BMI gaps between opposing groups (first group minus second).
df_play['wr_cb_bmi_delta'] = df_play['mean_WR_bmi'] - df_play['mean_CB_bmi']
df_play['ol_dl_bmi_delta'] = ol_bmi - dl_bmi
df_play['ol_box_delta'] = ol_bmi - ((dl_bmi + lb_bmi) / 2)
df_play['ol_plus_box_delta'] = ((ol_bmi + te_bmi) / 2) - ((dl_bmi + lb_bmi) / 2)

# "Box" values are the simple average of the DL and LB group means.
df_play['box_weight'] = (df_play['mean_DL_weight'] + df_play['mean_LB_weight']) / 2
df_play['box_bmi'] = (dl_bmi + lb_bmi) / 2

### Compare new features to final features in model

We want to see if there's too much cross-correlation between our new features and our extant useful ones:

In [15]:
# NOTE(review): `train_data` is not defined in any cell visible above, so this
# cell raises NameError on a fresh kernel (Restart & Run All). Load or derive
# it explicitly before this point - see the TODO about making this work for
# df_play.
# Column subsets chosen by the utility_db_25 helpers from the frame's column
# names (the selection rules live in that module, not here).
motion_cols=get_motion_cols(train_data.columns)
momentum_cols=get_momentum_cols(train_data.columns)
# Each helper receives the frame plus its column subset; the returned object
# is rebound to train_data.
train_data=create_momentum_index(train_data, momentum_cols)
train_data=motion_complexity_score(train_data, motion_cols)
# Features the new columns are compared against (described in the text above
# as the final features in the model).
final_features=['xpass_situational',  'QB_RB1_offset','off_xpass','n_offense_backfield','motion-momentum','neg_Formations', 'mean_pairwise_dist']

Create a few more composite features, trying to reconcile box count with DL/box weight and BMI:

In [16]:
# Interaction features: the lagged box-count EWM multiplied by each size metric.
box_ewm = train_data['box_ewm']
composite_sources = (
    ('box_ewm_dl_weight', 'mean_DL_weight'),
    ('box_ewm_dl_bmi', 'mean_DL_bmi'),
    ('box_ewm_weight', 'box_weight'),
    ('box_ewm_bmi', 'box_bmi'),
)
for composite_col, size_col in composite_sources:
    train_data[composite_col] = box_ewm * train_data[size_col]