In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import chain
from utility_db_25 import calc_tempo

In [None]:
# Data files are resolved relative to the directory the notebook is launched from
root_dir = os.getcwd()
train_data = pd.read_csv(os.path.join(root_dir,'data/train_data.csv'))
test_data = pd.read_csv(os.path.join(root_dir,'data/test_data.csv'))

# TODO: subset plays to train_only?
df_plays = pd.read_csv(os.path.join(root_dir,'data/plays.csv'))
# Order plays by game, then by play id. drop=True discards the pre-sort index instead of
# keeping it as a stray 'index' column that would leak into later numeric summaries (e.g. corr()).
df_plays = df_plays.sort_values(by=['gameId','playId']).reset_index(drop=True)
#df_plays = df_plays.merge(train_data[['gameId','playId']],how='inner')

In [None]:
# Disabled experiment, kept as a bare string literal so it is never executed.
# The code inside would restrict train_data/test_data to the test columns whose
# name does not contain '_def', plus 'pass_rate_def'.
'''
include_columns=[x for x in test_data.columns if '_def' not in x]+['pass_rate_def']
train_data=train_data[include_columns]
test_data = test_data[include_columns]
'''

### Track current drive pass rate stats

We want to track pass rate for the current drive, to dovetail with our "Tempo" idea

In [None]:
# Values given to the first play of every drive, which has no earlier play to look back on
DEFAULT_PASS_RATE = .6   # default pass rate
DEFAULT_PLAY_CLOCK = 10  # assume 10 seconds left on play clock, can adjust later
DEFAULT_EPA = .0         # inelegantly impute an EPA of zero for the first timestep


def _lagged_running_mean(values, first_value):
    """Return the running mean of `values`, shifted one play later.

    Element i (for i >= 1) is the mean of values[:i], so each play only sees the
    plays that preceded it in the drive; element 0 is `first_value`. The result
    has the same length as `values` (for non-empty input).
    """
    running_mean = np.cumsum(values) / np.arange(1, len(values) + 1)
    return [first_value] + list(running_mean)[:-1]


# positional lookup, so this does not depend on the frame carrying a 0-based index
last_team = df_plays['possessionTeam'].iloc[0] # monitor what the last team updated was, implies switch if different
pnum = 0 # play number of drive
pc = 0 # pass count
pnum_ls = [] # play number list
flag_ls = [] # switch flag list
curr_pr_ls = [] # pass rate for current drive
pr_ls = [] # overall pass rate list (one sub-list per drive)
curr_clock_ls = [] # play clock values for current drive
clock_ls = [] # play clock tracker (one sub-list per drive)
curr_epa_ls = [] # list of current drive epa
epa_ls = [] # overall epa list (one sub-list per drive)

# loop over plays
# NOTE(review): a new drive is detected only by a change in possessionTeam; consecutive
# drives by the same team (e.g. across a game boundary) would be merged — confirm acceptable.
for _, row in df_plays.iterrows():

    curr_team = row['possessionTeam']
    flag = 0

    # if we've switched teams, add the finished drive's info to the running lists and reset
    if last_team != curr_team:

        last_team = curr_team # reset team to know we're on current drive now
        pc = 0 # reset pass count, etc.
        pnum = 0
        flag = 1

        # curr_pr_ls already holds the running pass rate, so it only needs the one-play lookback
        pr_ls.append([DEFAULT_PASS_RATE] + curr_pr_ls[:-1])
        clock_ls.append(_lagged_running_mean(curr_clock_ls, DEFAULT_PLAY_CLOCK))
        epa_ls.append(_lagged_running_mean(curr_epa_ls, DEFAULT_EPA))

        # reset current drive stat lists
        curr_pr_ls = []
        curr_clock_ls = []
        curr_epa_ls = []

    # update current drive's pass rate with this play
    if row['isDropback']:
        pc += 1
    pnum += 1
    pr = pc/pnum

    # raw per-play values; they are averaged when the drive is closed
    # NOTE(review): a missing value here would propagate through cumsum to the rest of the drive — confirm inputs
    clock = row['playClockAtSnap']
    epa = row['expectedPointsAdded']

    # update pass rate, play number, possession, etc. for current drive
    pnum_ls.append(pnum)
    flag_ls.append(flag)
    curr_pr_ls.append(pr)
    curr_clock_ls.append(clock)
    curr_epa_ls.append(epa)

# The last drive is never followed by a team switch, so close it here using the same
# lagged running means as every other drive (previously the raw values were appended).
if curr_pr_ls:
    pr_ls.append([DEFAULT_PASS_RATE] + curr_pr_ls[:-1])
    clock_ls.append(_lagged_running_mean(curr_clock_ls, DEFAULT_PLAY_CLOCK))
    epa_ls.append(_lagged_running_mean(curr_epa_ls, DEFAULT_EPA))

We then flatten our aggregated lists, to use as features

In [None]:
# Collapse each list-of-drives into a single per-play list
pr_flat, clock_flat, epa_flat = (
    list(chain.from_iterable(per_drive)) for per_drive in (pr_ls, clock_ls, epa_ls)
)

# Attach the drive-level features to the plays frame, one column at a time
drive_features = {
    'drive_play_num': pnum_ls,
    'pos_switch_flag': flag_ls,
    'drive_pass_rate': pr_flat,
    'mean_clocksnap': clock_flat,
    'mean_epa': epa_flat,
}
for feature_name, feature_values in drive_features.items():
    df_plays[feature_name] = feature_values

Then examine an example drive

In [None]:
# Show the first 11 plays with the new drive columns next to the dropback indicator
drive_preview_cols = [
    'possessionTeam',
    'drive_play_num',
    'pos_switch_flag',
    'drive_pass_rate',
    'mean_clocksnap',
    'mean_epa',
    'isDropback',
]
df_plays.loc[:, drive_preview_cols].head(11)

### Explore relation between drive-based columns and pass likelihood

Drive pass rate and mean snap time are among the most informative `df_plays` features.

We want no overlap with the down-and-distance features, which seems to be the case.

While 'mean_epa' shows a strong negative correlation with pass rate, it is also highly correlated with 'down' and is therefore redundant.

In [None]:
# Rank columns by their correlation with the dropback indicator and keep the top 9.
# The frame is restricted to numeric/boolean columns first: corr() on a frame that still
# holds text columns (such as possessionTeam) raises in pandas >= 2.0, whereas older
# versions silently dropped them. select_dtypes behaves the same on both.
numeric_plays = df_plays.select_dtypes(include=['number', 'bool'])
top_cols = numeric_plays.corr()['isDropback'].sort_values(ascending=False).head(9).index

# Drop the two excluded columns and add 'mean_epa' for comparison
heatmap_cols = [x for x in top_cols if 'prePenaltyYardsGained' not in x and 'playAction' not in x]+['mean_epa']

plt.figure(figsize=(15,3.5))
sns.heatmap(df_plays[heatmap_cols].corr(), cmap="crest")

# trailing semicolons suppress the matplotlib return-value repr in the cell output
plt.title('Top Plays Data Feature Correlations');
plt.xticks(rotation=0);

In [None]:
# Correlation matrix behind the heatmap: the top columns minus the two excluded ones, plus 'mean_epa'
kept_top_cols = []
for col in top_cols:
    if 'prePenaltyYardsGained' in col or 'playAction' in col:
        continue
    kept_top_cols.append(col)
corr_table_cols = kept_top_cols + ['mean_epa']
df_plays[corr_table_cols].corr()

In [None]:
# Unscaled tempo: a tenth of the mean snap clock, plus drive pass rate, minus mean EPA
tempo_unscaled = .1*df_plays['mean_clocksnap'] + df_plays['drive_pass_rate'] - df_plays['mean_epa']
# Scale by the largest value so the maximum tempo becomes 1
df_plays['tempo'] = tempo_unscaled/tempo_unscaled.max()

In [None]:
import pandas as pd
from sklearn import preprocessing

In [None]:
# Re-read the raw plays file; this replaces df_plays and drops every column added above
plays_csv_path = os.path.join(root_dir,'data/plays.csv')
df_plays = pd.read_csv(plays_csv_path)

In [None]:
# calc_tempo comes from utility_db_25; its return value is not assigned here.
# NOTE(review): later cells read 'mean_clocksnap', 'drive_pass_rate' and 'tempo' from
# df_plays, so this presumably adds them in place — confirm; if it returns a new frame,
# the result must be assigned back to df_plays.
calc_tempo(df_plays)

In [None]:
# Peek at the first three rows of the tempo-related columns
tempo_preview_cols = ['mean_clocksnap','drive_pass_rate','tempo','isDropback']
df_plays.loc[:, tempo_preview_cols].head(3)

In [None]:
# Correlation of each tempo-related column with the dropback indicator
tempo_corr_cols = ['mean_clocksnap','drive_pass_rate','tempo','isDropback']
tempo_corr_matrix = df_plays[tempo_corr_cols].corr()
tempo_corr_matrix['isDropback']

In [None]:
# Left-join the tempo features onto the training rows, then correlate them with the 'pass' column
join_keys = ['gameId','playId']
tempo_feature_cols = ['mean_clocksnap','drive_pass_rate','tempo']
train_with_tempo = train_data.merge(
    df_plays[join_keys + tempo_feature_cols],
    how='left',
    on=join_keys,
)
train_with_tempo[tempo_feature_cols + ['pass']].corr()

In [None]:
# Count missing values per column after left-joining the tempo features onto the training rows
join_keys = ['gameId','playId']
tempo_feature_cols = ['mean_clocksnap','drive_pass_rate','tempo']
train_with_tempo = train_data.merge(
    df_plays[join_keys + tempo_feature_cols],
    how='left',
    on=join_keys,
)
train_with_tempo[tempo_feature_cols + ['pass']].isna().sum()