In [1]:
import pandas as pd
pd.options.display.max_colwidth = 1000
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools, offline
import os  # imported here so the credential lookup below is self-contained
# Plotly credentials are taken from the environment (PLOTLY_USERNAME / PLOTLY_API_KEY)
# so no secret is stored in the notebook. A missing variable raises KeyError.
# NOTE(review): the API key that used to be hardcoded on this line has been exposed
# in the notebook history and should be regenerated.
plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                                  api_key=os.environ['PLOTLY_API_KEY'])
import re
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML

In [2]:
%run progress_bar.ipynb
%run field_layout.ipynb

In [3]:
# Load the seven CSV tables of the punt data set, one DataFrame per file.
(game_data,
 play_info,
 play_player_role,
 player_punt,
 video_footage_control,
 video_footage_injury,
 video_review) = [pd.read_csv(csv_name) for csv_name in (
    'game_data.csv',
    'play_information.csv',
    'play_player_role_data.csv',
    'player_punt_data.csv',
    'video_footage-control.csv',
    'video_footage-injury.csv',
    'video_review.csv',
)]

In [4]:
def read_NGS_data(file_lst):
    """Read every CSV in ``file_lst`` and stack the rows into one DataFrame.

    Parameters
    ----------
    file_lst : iterable of str
        Paths of the CSV files to read, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, concatenated in order. Each file keeps its own
        row index (``ignore_index`` is not set), so index labels repeat
        across files.

    Raises
    ------
    ValueError
        If ``file_lst`` is empty.
    """
    frames = []
    for file in file_lst:
        print('Reading in {}'.format(file))
        frames.append(pd.read_csv(file))
    if not frames:
        raise ValueError('file_lst must contain at least one file')
    # One concat at the end: concatenating inside the loop re-copies every
    # previously read row for each additional file.
    return pd.concat(frames)

In [6]:
# Next Gen Stats tracking files, in the order they are read and stacked
file_lst = [
    'NGS-2016-pre.csv',
    'NGS-2016-reg-wk1-6.csv',
    'NGS-2016-reg-wk7-12.csv',
    'NGS-2016-reg-wk13-17.csv',
    'NGS-2016-post.csv',
    'NGS-2017-pre.csv',
    'NGS-2017-reg-wk1-6.csv',
    'NGS-2017-reg-wk7-12.csv',
    'NGS-2017-reg-wk13-17.csv',
    'NGS-2017-post.csv',
]

In [7]:
# Read every file in file_lst and stack them into the single tracking frame NGS_df.
#Takes 10-15 mins to read in all Next Gen Stats data
NGS_df = read_NGS_data(file_lst)

Reading in NGS-2016-pre.csv
Reading in NGS-2016-reg-wk1-6.csv
Reading in NGS-2016-reg-wk7-12.csv
Reading in NGS-2016-reg-wk13-17.csv
Reading in NGS-2016-post.csv
Reading in NGS-2017-pre.csv
Reading in NGS-2017-reg-wk1-6.csv
Reading in NGS-2017-reg-wk7-12.csv
Reading in NGS-2017-reg-wk13-17.csv
Reading in NGS-2017-post.csv


### Injury Plays EDA

In [8]:
# Reviewed injury records as a percentage of all punt plays, to two decimals
injury_play_pct = round(len(video_review) / float(len(play_info)) * 100, 2)
'There are concussion injuries on ' + str(injury_play_pct) + '% of ' + 'punt plays'

'There are concussion injuries on 0.55% of punt plays'

##### It is good for the NFL and its players that concussions occur on fewer than 1% of punt plays, but so few injury plays make for a small sample that will be hard to analyze

#### How are players being injured?

In [9]:
def _review_count_bar(field):
    """Bar trace with the number of video_review rows (by PlayID count) per value of ``field``."""
    counts = video_review.groupby([field], as_index=False)['PlayID'].count()
    return go.Bar(x=counts[field], y=counts['PlayID'])


# One bar chart per categorical review field
trace1 = _review_count_bar('Player_Activity_Derived')
trace2 = _review_count_bar('Primary_Impact_Type')
trace3 = _review_count_bar('Friendly_Fire')
trace4 = _review_count_bar('Primary_Partner_Activity_Derived')

fig = tools.make_subplots(rows=2, cols=2, subplot_titles=('Player Activity Derived', 'Primary Impact Type',
                                                          'Friendly Fire', 'Primary Partner Activity Derived'))

# Fill the 2x2 grid row by row: (1,1), (1,2), (2,1), (2,2)
for slot, bar in enumerate((trace1, trace2, trace3, trace4)):
    fig.append_trace(bar, slot // 2 + 1, slot % 2 + 1)

fig['layout'].update(showlegend=False)

py.iplot(fig, filename='injury-eda')


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



### Quick Conclusions
    - No player activity stands out
    - Helmet to player is primary impact type
    - Friendly fire is only clear 16% of the time
    - No Partner Activity Stands Out

#### Who's getting hurt?

In [10]:
#Categorize player punt roles into the sides of the ball
# Role codes labelled 'return' by the side-of-ball columns built later in the notebook
return_roles = ['PDL1','PDL2','PDL3','PDL4','PDL5','PDL6','PDM','PDR1','PDR2','PDR3','PDR4','PDR5','PDR6'
                ,'PFB','PLL','PLL1','PLL2','PLL3','PLM','PLM1','PLR','PLR1','PLR2','PLR3','PR','VL','VLi'
                ,'VLo','VR','VRi','VRo']

# Role codes labelled 'coverage'; any role in neither list gets an empty side-of-ball label
coverage_roles = ['GL','GLi','GLo','GR','GRi','GRo','P','PC','PLG','PLS','PLT','PLW','PPL','PPLi','PPLo'
                 ,'PPR','PPRi','PPRo','PRG','PRT','PRW']

# Subset of return_roles (the V* codes); counted per play as the number of gunner blockers
gunner_blockers = ['VL','VLi','VLo','VR','VRi','VRo']

In [11]:
# Attach each reviewed player's punt role for the play; the inner join drops
# review rows that have no matching role record.
inj_players = (
    video_review
    .merge(play_player_role, how='inner', on=['Season_Year', 'GameKey', 'PlayID', 'GSISID'])
    .rename(columns={'Role': 'inj_role'})
)

In [12]:
# Label the injured player's side of the ball; the return list is checked first and
# a role in neither list gets an empty string.
inj_players['inj_side_of_ball'] = np.select(
    [inj_players.inj_role.isin(return_roles), inj_players.inj_role.isin(coverage_roles)],
    ['return', 'coverage'],
    default='')

In [13]:
# Look up the role and side of ball of each injury's primary partner and attach
# them to the injured-player rows.
partner_keys = ['Season_Year', 'GameKey', 'PlayID']

# .copy() makes this an independent frame, so the column assignment below does not
# write into a slice of video_review. GSISID (the reviewed player) is kept so the
# partner columns can be joined back by key at the end.
partner_players = video_review[partner_keys + ['GSISID', 'Primary_Partner_GSISID']].copy()
# 'Unclear' and missing partner ids become 0
# (presumably no real player has id 0, so these rows get no role — TODO confirm)
partner_players['Primary_Partner_GSISID'] = partner_players['Primary_Partner_GSISID'] \
                                                .replace('Unclear', '0').fillna(0).astype(int)
partner_players = partner_players.merge(
    play_player_role.rename(columns={'GSISID': 'Primary_Partner_GSISID', 'Role': 'partner_role'}),
    how='left', on=partner_keys + ['Primary_Partner_GSISID'])
partner_players['partner_side_of_ball'] = np.where(partner_players.partner_role.isin(return_roles), 'return',
                                          np.where(partner_players.partner_role.isin(coverage_roles), 'coverage', ''))
# Join on play + reviewed player rather than concatenating side by side: a positional
# concat pairs rows by index only, so it silently misaligns as soon as either merge
# drops or duplicates a row.
inj_partner_df = inj_players.merge(
    partner_players[partner_keys + ['GSISID', 'partner_role', 'partner_side_of_ball']],
    how='left', on=partner_keys + ['GSISID'])

In [14]:
def _side_count_bar(side_col):
    """Bar trace with the number of inj_partner_df rows (by PlayID count) per value of ``side_col``."""
    counts = inj_partner_df.groupby([side_col], as_index=False)['PlayID'].count()
    return go.Bar(x=counts[side_col], y=counts['PlayID'])


trace1 = _side_count_bar('inj_side_of_ball')
trace2 = _side_count_bar('partner_side_of_ball')

fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Injured Side of Ball', 'Partner Side of Ball'))

# Injured side on the left, partner side on the right
for col_no, bar in enumerate((trace1, trace2), start=1):
    fig.append_trace(bar, 1, col_no)

fig['layout'].update(showlegend=False)

py.iplot(fig, filename='side-of-ball')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [15]:
# Cross-tab of injured player's role (rows) against primary partner's role (columns).
# Cells count non-null GSISID values; margins=True adds the 'All' totals and
# empty combinations are shown as '-'.
pd.pivot_table(inj_partner_df, index=['inj_role'], columns=['partner_role'],
              values='GSISID', aggfunc='count', margins=True).fillna('-')

partner_role,GR,PDL1,PDR1,PDR2,PDR3,PLG,PLL1,PLS,PLT,PLW,PPR,PR,PRG,PRT,PRW,VLo,All
inj_role,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
GL,1,-,-,-,1,-,1,-,-,-,-,1,-,-,-,-,4.0
GR,-,-,-,-,-,-,-,-,-,-,-,1,-,-,-,-,1.0
P,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1,1.0
PDL2,-,-,-,-,-,-,-,1,-,-,-,-,-,-,-,-,1.0
PDR1,-,-,-,-,-,-,-,-,-,-,-,-,1,-,-,-,1.0
PFB,-,-,-,-,-,-,-,-,-,-,-,-,-,1,-,-,1.0
PLG,-,-,1,-,-,-,-,-,-,1,-,-,-,-,1,-,3.0
PLL,1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
PLS,-,1,-,-,-,-,-,-,-,-,-,1,-,-,-,-,2.0
PLT,-,-,-,1,-,-,-,-,-,-,-,1,-,-,-,-,2.0


### Animation

In [195]:
# Tracking rows of one player (GSISID 28620) on the example play, in time order
ex_play_mask = (
    (NGS_df.Season_Year == 2016)
    & (NGS_df.GameKey == 234)
    & (NGS_df.PlayID == 3278)
    & (NGS_df.GSISID == 28620)
)
ex_play = NGS_df.loc[ex_play_mask].sort_values('Time')

In [196]:
# Earliest timestamp tagged with the 'ball_snap' event on the example play
ball_snap_time = ex_play.loc[ex_play.Event == 'ball_snap', 'Time'].min()

In [197]:
# Keep only samples from the snap onwards and renumber rows from 0
ex_play = ex_play[ex_play.Time >= ball_snap_time].reset_index(drop=True)

In [198]:
# Carry each event label forward onto the following unlabeled samples
ex_play['Event'] = ex_play['Event'].ffill()

In [199]:
# Same play, second tracked player (GSISID 27860), from the snap onwards
ret_play_mask = (
    (NGS_df.Season_Year == 2016)
    & (NGS_df.GameKey == 234)
    & (NGS_df.PlayID == 3278)
    & (NGS_df.GSISID == 27860)
)
ret_play = NGS_df.loc[ret_play_mask].sort_values('Time')
ret_play = ret_play[ret_play.Time >= ball_snap_time]

In [200]:
x = np.array(ex_play.x)
y = np.array(ex_play.y)
xx = np.array(ex_play.x)
yy = np.array(ex_play.y)

In [201]:
x1 = np.array(ret_play.x)
y1 = np.array(ret_play.y)
xx1 = np.array(ret_play.x)
yy1 = np.array(ret_play.y)

In [206]:
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML

init_notebook_mode(connected=True)

# Animate only samples that exist for both players, so the frame loop cannot index
# past the end of the shorter track.
N = min(len(x), len(x1))


def _separation(k):
    """Straight-line distance between the two tracked players at sample k, rounded to 0 decimals."""
    return round(np.sqrt((xx1[k] - xx[k])**2 + (yy1[k] - yy[k])**2), 0)


# Initial traces. Frames update traces by position, so the order, names and marker
# colours here mirror the per-frame traces below (partner, injured, distance);
# otherwise pressing Play swaps which trace is which.
data=[dict(x=x1, y=y1,
           name='Partner Player',
           mode='markers',
           marker=dict(color='#013369', size=15)
          ),
      dict(x=x, y=y,
           name='Injured Player',
           mode='markers',
           marker=dict(color='orange', size=15)
          ),
      dict(x=x, y=y,
           name='Distance',
           mode='lines',
           textposition='bottom center',
           line=dict(width=2, color=None)
          )
    ]

# load_field is not defined in this notebook's visible cells
# (presumably provided by `%run field_layout.ipynb` — TODO confirm)
layout = load_field()
layout['hovermode'] = 'closest'
layout['updatemenus'] = [{'type': 'buttons',
                           'buttons': [{'label': 'Play',
                                        'method': 'animate',
                                        'args': [None]}]}]

# One frame every 5th sample: both players as single markers, a line between them,
# the current event as the title and the separation as an annotation.
frames = []
for k in range(0, N, 5):
    gap = _separation(k)
    frames.append(dict(
        data=[dict(x=[x1[k]],
                   y=[y1[k]],
                   mode='markers',
                   marker=dict(color='#013369', size=15),
                   name='Partner Player'
                  ),
              dict(x=[x[k]],
                   y=[y[k]],
                   mode='markers',
                   marker=dict(color='orange', size=15),
                   name='Injured Player'
                  ),
              dict(x=[xx[k], xx1[k], None, xx[k], xx1[k]],
                   y=[yy[k], yy1[k], None, yy[k], yy1[k]],
                   mode='lines',
                   text='Distance: {}'.format(gap),
                   textposition='bottom center',
                   line=dict(color='#2c3539', width=2),
                   name='Distance'
                  )
             ],
        layout=dict(title=ex_play.Event[k],
                    annotations=[
                        dict(x=100,
                             y=5,
                             showarrow=False,
                             font=dict(
                                 family='Courier New, monospace',
                                 size=14,
                                 color='#ffffff'),
                             align='center',
                             bordercolor='#c7c7c7',
                             borderwidth=2,
                             borderpad=4,
                             bgcolor='#2c3539',
                             opacity=0.8,
                             text='{} Yds'.format(gap),
                            )
                    ]
                   )
    ))

figure1=dict(data=data, layout=layout, frames=frames)
iplot(figure1)

#### On what types of plays are players getting hurt?

In [24]:
#Create a column in the play_info dataframe to determine the type of play
#Create a column in the play_info dataframe to determine the type of play
descriptions = play_info['PlayDescription']


def _desc_has(pattern):
    """Case-insensitive regex search of every play description; missing text counts as no match."""
    return descriptions.str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False).astype(bool)


# Ordered rules: the first rule whose mask is True decides the outcome, which is the
# same priority the nested np.where chain had. With na=False a missing description
# matches no rule and gets the ' ' default, instead of being read as truthy by the
# first rule and labelled 'aborted'.
outcome_rules = [
    ('aborted', _desc_has('aborted|Fumbled snap|FUMBLES, and recovers')),
    ('fake', _desc_has('fake|pass|right end|left end|up the middle|Direct snap|right guard')),
    ('muff', _desc_has('muffs')),
    ('fair_catch', _desc_has('fair catch by')),
    ('touchback', _desc_has('touchback')),
    ('blocked', _desc_has('blocked|deflected')),
    # Literal, case-sensitive substring match: `flags` has no effect when regex=False,
    # so the IGNORECASE flag the original passed here is dropped.
    ('oob', descriptions.str.contains('out of bounds.', regex=False, na=False).astype(bool)),
    ('downed', _desc_has('downed')),
    ('safety', _desc_has('safety')),
    ('return', _desc_has('[0-9]+ for [-+]?[0-9]+ yards?|for no gain|touchdown|(to [A-Z]+ [0-9]+ for [-+]?[0-9]+ yards?)|(to [0-9]+ for [-+]?[0-9]+ yards?)')),
    ('no_play', _desc_has('- no play|delay of game|false start, declined|penalty enforced')),
]

play_info['outcome'] = np.select([mask for _, mask in outcome_rules],
                                 [label for label, _ in outcome_rules],
                                 default=' ')

In [25]:
review_keys = ['Season_Year', 'GameKey', 'PlayID']

# pi: every punt play, with injury = 1 when a review record exists for the play, else 0
pi = play_info.merge(video_review[review_keys + ['GSISID']], how='left', on=review_keys)
pi['injury'] = pi.GSISID.notnull().astype(int)
pi = pi.drop('GSISID', axis=1)

# vr: only plays that have a review record, with the play and review columns combined
vr = play_info.merge(video_review, how='inner', on=review_keys)

In [26]:
vr.outcome.value_counts(normalize=True)

return        0.783784
downed        0.081081
fair_catch    0.054054
muff          0.054054
fake          0.027027
Name: outcome, dtype: float64

#### Conclusion: players are getting hurt on returns 

In [27]:
#Is it a higher proportion compared to all plays?

In [28]:
# Plays and injuries per outcome, largest outcome first.
# Aggregate with a list and rename afterwards: passing a {new_name: func} dict to
# SeriesGroupBy.agg was deprecated and then removed in pandas 1.0.
pi_inj_grouped = pi.groupby('outcome')['injury'] \
    .agg(['count', 'sum']) \
    .rename(columns={'count': 'total_plays', 'sum': 'injuries'}) \
    .reset_index() \
    .sort_values('total_plays', ascending = False) \
    .reset_index(drop=True)

# Injuries as a percentage of plays, formatted as text with one decimal
pi_inj_grouped['injury_percentage'] = (pi_inj_grouped['injuries'] / pi_inj_grouped['total_plays'] * 100).round(1).astype(str) + '%'
pi_inj_grouped

Unnamed: 0,outcome,total_plays,injuries,injury_percentage
0,return,2741,29,1.1%
1,fair_catch,1659,2,0.1%
2,downed,796,3,0.4%
3,oob,639,0,0.0%
4,touchback,407,0,0.0%
5,muff,203,2,1.0%
6,no_play,139,0,0.0%
7,fake,45,1,2.2%
8,blocked,39,0,0.0%
9,aborted,9,0,0.0%


#### Conclusion: just over 1% of punt plays with a return (1.1%) have an injury
    - **This is roughly 10x higher than when a fair catch is called (0.1%)**

#### Calculate hang time of punts

In [29]:
def get_hang_time(ngs_df, start_event='punt', *stop_events):
    """Compute, per play, the seconds between a start event and the first stop event.

    Parameters
    ----------
    ngs_df : pandas.DataFrame
        Tracking rows with columns Season_Year, GameKey, PlayID, Time and Event.
        Time must be text in '%Y-%m-%d %H:%M:%S.%f' format.
    start_event : str
        Event value that starts the clock. Because ``stop_events`` is variadic,
        this must be passed positionally whenever stop events are supplied.
    *stop_events : str
        Event values that stop the clock; the earliest of them is used.

    Returns
    -------
    pandas.DataFrame
        One row per play that has both a start and a stop event, with columns
        Season_Year, GameKey, PlayID, punt_time, receiving_time and hang_time
        (receiving_time minus punt_time, in seconds).
    """
    play_keys = ['Season_Year', 'GameKey', 'PlayID']
    time_format = '%Y-%m-%d %H:%M:%S.%f'

    def earliest_per_play(event_mask, time_col):
        # Minimum Time per play among the selected rows, then parsed to datetimes
        earliest = ngs_df.loc[event_mask].groupby(play_keys, as_index=False)['Time'].min()
        earliest = earliest.rename(columns={'Time': time_col})
        earliest[time_col] = pd.to_datetime(earliest[time_col], format=time_format)
        return earliest

    kicked = earliest_per_play(ngs_df.Event == start_event, 'punt_time')
    caught = earliest_per_play(ngs_df.Event.isin(stop_events), 'receiving_time')

    # Inner join: plays missing either event are dropped
    punt_df = kicked.merge(caught, how='inner', on=play_keys).reset_index(drop=True)
    elapsed = punt_df['receiving_time'] - punt_df['punt_time']
    punt_df['hang_time'] = elapsed.dt.total_seconds()

    return punt_df

In [30]:
punt_df = get_hang_time(NGS_df, 'punt', 'punt_received', 'fair_catch')

In [31]:
# Histogram of the per-play hang times computed by get_hang_time
data = [go.Histogram(x=punt_df.hang_time)]

layout = go.Layout(title='Hang Time Histogram',
                   xaxis={'title': 'Seconds'},
                   yaxis={'title': 'Count'})

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic histogram')

In [32]:
round(punt_df['hang_time'].mean(), 1)

4.5

In [33]:
round(punt_df['hang_time'].median(), 1)

4.5

In [34]:
str(round(len(punt_df.loc[punt_df.hang_time < 5.5]) / len(punt_df) * 100, 1)) + '% of hang times are less than 5 1/2 seconds'

'96.9% of hang times are less than 5 1/2 seconds'

### Show correct vs incorrect alignment

In [118]:
def visualize_alignment(next_gen_df, game_id_lst, role_df):
    """Plot player positions at one event of a play, next to a copy with one player moved.

    Parameters
    ----------
    next_gen_df : pandas.DataFrame
        Tracking rows with Season_Year, GameKey, PlayID, GSISID, Event, x and y.
    game_id_lst : sequence
        ``[Season_Year, GameKey, PlayID, Event]`` identifying the snapshot to draw.
    role_df : pandas.DataFrame
        Player roles keyed by Season_Year, GameKey, PlayID and GSISID, with a
        ``Role`` column.

    Returns
    -------
    Whatever ``py.iplot`` returns for the two-panel figure
    ('Legal Alignment' | 'Illegal Alignment').
    """
    season_year, game_key, play_id, event = game_id_lst[:4]
    alignment_df = next_gen_df.loc[(next_gen_df.Season_Year == season_year) & \
                        (next_gen_df.GameKey == game_key) & \
                        (next_gen_df.PlayID == play_id) & \
                        (next_gen_df.Event == event)].sort_values('y').reset_index(drop=True)

    # Use the role table that was passed in; previously the role_df argument was
    # ignored and the module-level play_player_role was read instead.
    align_merged = alignment_df.merge(role_df, how='left', \
                                      on=['Season_Year', 'GameKey', 'PlayID', 'GSISID'])

    align_merged['side_of_ball'] = np.where(align_merged.Role.isin(return_roles), 'return',
                                          np.where(align_merged.Role.isin(coverage_roles), 'coverage', ''))

    def _side_traces(positions):
        # One marker trace per side of the ball: return team first, then coverage team
        return [go.Scatter(
                    x = positions.loc[positions.side_of_ball == side].x,
                    y = positions.loc[positions.side_of_ball == side].y,
                    mode = 'markers',
                    marker = dict(color=color, size=10),
                    name = label
                )
                for side, color, label in (('return', '#013369', 'Return'),
                                           ('coverage', 'orange', 'Coverage'))]

    legal_traces = _side_traces(align_merged)

    #Change alignment of one of the players
    # Row 2 is the player with the third-smallest y (rows are sorted by y above);
    # the original snapshot stays untouched because the edit is made on a copy.
    moved = align_merged.copy()
    moved.at[2,'y'] = 7
    illegal_traces = _side_traces(moved)

    fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Legal Alignment', 'Illegal Alignment'))

    for trace in legal_traces:
        fig.append_trace(trace, 1, 1)
    for trace in illegal_traces:
        fig.append_trace(trace, 1, 2)

    fig['layout'].update(showlegend=False)

    return py.iplot(fig, filename='alignments')

In [120]:
visualize_alignment(NGS_df, [2016, 234, 3278, 'ball_snap'], play_player_role)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



### Calculate speed of returner and coverage team at 1-6 seconds

In [37]:
#Create a function to calculate the space between the returner and the closest coverage player for every second between the punt and the catch
def coverage_returner_space(play_df, ngs_df):
    """Pair every coverage player with the punt returner at whole seconds after the punt.

    For each play in ``play_df`` whose outcome is 'fair_catch' or 'return', the
    tracking rows between the 'punt' event and the first 'punt_received' /
    'fair_catch' event are kept, and each coverage-role row is joined to the
    returner ('PR') row with the same timestamp. Only timestamps exactly 1 to 6
    seconds after the punt are returned.

    Parameters
    ----------
    play_df : pandas.DataFrame
        Plays with Season_Year, GameKey, PlayID, outcome and injury columns.
    ngs_df : pandas.DataFrame
        Tracking rows with Season_Year, GameKey, PlayID, GSISID, Time, Event,
        x, y, dis, o and dir.

    Returns
    -------
    pandas.DataFrame
        One row per (play, captured second, coverage player) with cov_* and ret_*
        columns plus dis_from_ret, time_since_punt, gunner_blockers, outcome,
        injury, cov_speed and ret_speed.

    Notes
    -----
    Reads the module-level play_player_role, coverage_roles and gunner_blockers,
    and calls log_progress and convert_to_mph, which must be defined before the
    function is called. ``pd.concat`` raises ValueError if no play produced rows.
    """
    player_keys = ['Season_Year', 'GameKey', 'PlayID', 'GSISID']
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    tracked_cols = ['x', 'y', 'GSISID', 'dis', 'o', 'dir', 'Role']

    cov_ret_lst = []
    play_df = play_df.loc[(play_df.outcome == 'fair_catch') | (play_df.outcome == 'return')].reset_index(drop=True)
    for i in log_progress(range(0, len(play_df)), every=25):
        season_key = play_df['Season_Year'][i]
        game_key = play_df['GameKey'][i]
        play_id = play_df['PlayID'][i]
        outcome = play_df['outcome'][i]
        injury = play_df['injury'][i]

        # No `key in ngs_df.GameKey` pre-check: `in` on a Series tests index labels,
        # not values, so that guard never verified the play was present. An absent
        # play just yields an empty selection, which is skipped below.
        filtered_play = ngs_df.loc[(ngs_df.GameKey == game_key) & (ngs_df.PlayID == play_id)]
        # Also pin the season (on the already-small selection, so it costs nothing)
        filtered_play = filtered_play.loc[filtered_play.Season_Year == season_key] \
                                     .sort_values('Time').reset_index(drop=True)
        filtered_play = filtered_play.merge(play_player_role, how='inner', on=player_keys)
        if len(filtered_play) == 0:
            continue

        filtered_play['Time'] = pd.to_datetime(filtered_play['Time'], format=time_format)
        punt_event_time = filtered_play.loc[filtered_play.Event == 'punt'].Time.min()
        receiving_event_time = filtered_play.loc[(filtered_play.Event == 'punt_received') | \
                                                 (filtered_play.Event == 'fair_catch')].Time.min()
        # Counted over the whole play, before trimming to the punt-to-catch window
        gunner_blocker_count = len(filtered_play.loc[filtered_play['Role'].isin(gunner_blockers)]['Role'].unique())
        filtered_play = filtered_play.loc[(filtered_play.Time >= punt_event_time) & \
                                          (filtered_play.Time <= receiving_event_time)]

        coverage_df = filtered_play.loc[filtered_play['Role'].isin(coverage_roles)].sort_values('Time')
        coverage_df = coverage_df.rename(columns={col: 'cov_' + col for col in tracked_cols})

        returner_df = filtered_play.loc[filtered_play['Role'] == 'PR'].sort_values('Time')
        returner_df = returner_df.rename(columns={col: 'ret_' + col for col in tracked_cols})
        returner_df = returner_df.drop('Event', axis = 1)

        cov_ret_df = coverage_df.merge(returner_df, how ='inner', on = ['Season_Year','GameKey','PlayID','Time'])
        cov_ret_df['dis_from_ret'] = np.sqrt((cov_ret_df['cov_x'] - cov_ret_df['ret_x']) ** 2 \
                                             + (cov_ret_df['cov_y'] - cov_ret_df['ret_y']) ** 2)
        cov_ret_df['time_since_punt'] = cov_ret_df['Time'] - punt_event_time
        times_to_capture = [punt_event_time + pd.Timedelta(seconds=sec) for sec in range(1, 7)]
        # .copy() so the columns added below go onto an independent frame, not a slice
        cov_ret_df = cov_ret_df.loc[cov_ret_df['Time'].isin(times_to_capture)].copy()
        cov_ret_df['gunner_blockers'] = gunner_blocker_count
        cov_ret_df['outcome'] = outcome
        cov_ret_df['injury'] = injury
        # 20.455 presumably converts yards per 0.1 s sample to mph — TODO confirm units of `dis`
        cov_ret_df['cov_speed'] = convert_to_mph(cov_ret_df.cov_dis, 20.455)
        cov_ret_df['ret_speed'] = convert_to_mph(cov_ret_df.ret_dis, 20.455)
        if len(cov_ret_df) > 0:
            cov_ret_lst.append(cov_ret_df)

    cov_ret_df = pd.concat(cov_ret_lst).reset_index(drop=True)
    return cov_ret_df

In [38]:
def convert_to_mph(dis_vector, converter):
    """Return ``dis_vector`` multiplied by the conversion factor ``converter``."""
    return dis_vector * converter

In [39]:
cov_ret_df = coverage_returner_space(pi, NGS_df)

VBox(children=(HTML(value=''), IntProgress(value=0, max=4400)))

In [183]:
#Since it is very rare that the return team has 0, 1, or 5 gunner blockers, we filter those plays out
# Prints the share of rows per gunner-blocker count, then keeps only counts 2, 3 and 4.
# NOTE: cov_ret_df is overwritten in place, so re-running this cell prints the
# already-filtered distribution (the saved output lists only 2, 3 and 4).
print(round(cov_ret_df.gunner_blockers.value_counts(normalize=True), 2))
cov_ret_df = cov_ret_df.loc[cov_ret_df.gunner_blockers.isin([2, 3, 4])]

2    0.47
3    0.31
4    0.22
Name: gunner_blockers, dtype: float64


In [184]:
#Visualize play counts by alignment with a pie graph
# Distinct PlayID values per gunner-blocker count, computed once and reused for both
# labels and values. nunique() + reset_index(name=...) replaces the {name: func}
# dict form of SeriesGroupBy.agg, which was removed in pandas 1.0.
# NOTE(review): this counts distinct PlayID values only; the notebook keys plays by
# (Season_Year, GameKey, PlayID) elsewhere, so equal PlayIDs in different games
# collapse into one here — confirm that is intended.
plays_by_alignment = cov_ret_df.groupby('gunner_blockers')['PlayID'] \
                               .nunique().reset_index(name='play_count')

labels = plays_by_alignment['gunner_blockers']

values = plays_by_alignment['play_count']

trace = go.Pie(labels=labels, values=values)

py.iplot([trace], filename='alignment_pie')

In [192]:
#Which return alignments (gunner blockers 2, 3, or 4) lead to more fair catches?
# nunique() + reset_index(name=...) replaces the {name: func} dict form of
# SeriesGroupBy.agg, which was removed in pandas 1.0.
cov_ret_outcome = cov_ret_df.groupby(['gunner_blockers', 'outcome'])['PlayID'] \
                            .nunique().reset_index(name='play_count')

#Pivot the dataframe for easier viewing
cov_ret_outcome = pd.pivot_table(cov_ret_outcome, index='gunner_blockers', \
                                 columns='outcome', values='play_count', aggfunc='sum')

#Add a ratio column to show the % of fair catches for each alignment
cov_ret_outcome['fair_catch_percentage'] = (cov_ret_outcome['fair_catch'] / \
                                        (cov_ret_outcome['fair_catch'] + cov_ret_outcome['return']) * 100).round(1).astype(str) + '%'

cov_ret_outcome

outcome,fair_catch,return,fair_catch_percentage
gunner_blockers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,941,800,54.0%
3,326,875,27.1%
4,182,710,20.4%


#### *** Much more likely to have a fair catch when there are only 2 gunner blockers on the field

In [193]:
# We can look at the same ratios by alignment but with injuries
#Which return alignments (gunner blockers 2, 3, or 4) lead to more Injuries?
# nunique() + reset_index(name=...) replaces the {name: func} dict form of
# SeriesGroupBy.agg, which was removed in pandas 1.0.
cov_ret_injury = cov_ret_df.groupby(['gunner_blockers', 'injury'])['PlayID'] \
                           .nunique().reset_index(name='play_count')

#Pivot the dataframe for easier viewing
cov_ret_injury = pd.pivot_table(cov_ret_injury, index='gunner_blockers', \
                                 columns='injury', values='play_count', aggfunc='sum')

#Rename Columns
# Map by flag value (0 -> No, 1 -> Yes) instead of assigning names by position, so
# the labels cannot be swapped or fail on a length mismatch.
cov_ret_injury = cov_ret_injury.rename(columns={0: 'No', 1: 'Yes'})
cov_ret_injury.columns.name = None

#Add a ratio column to show the % of Injuries for each alignment
cov_ret_injury['injury_percentage'] = (cov_ret_injury['Yes'] / \
                                        (cov_ret_injury['No'] + cov_ret_injury['Yes']) * 100).round(1).astype(str) + '%'

cov_ret_injury

Unnamed: 0_level_0,No,Yes,injury_percentage
gunner_blockers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1559,9,0.6%
3,1110,11,1.0%
4,842,10,1.2%


In [194]:
#Concatenate two dataframes for the summary slides
# Both frames are indexed by gunner_blockers, so axis=1 lines them up by alignment;
# only the two percentage columns are kept.
pd.concat([cov_ret_injury, cov_ret_outcome], axis = 1)[['fair_catch_percentage', 'injury_percentage']]

Unnamed: 0_level_0,fair_catch_percentage,injury_percentage
gunner_blockers,Unnamed: 1_level_1,Unnamed: 2_level_1
2,54.0%,0.6%
3,27.1%,1.0%
4,20.4%,1.2%


#### *** Less likely to have injuries when there are only 2 gunner blockers on the field

In [208]:
#Speed vs Number of Gunner Blockers at each second.

# Rows for the two gunner roles only (GR and GL)
gunner_rows = cov_ret_df.loc[cov_ret_df.cov_Role.isin(['GR', 'GL'])]
pd.pivot_table(gunner_rows,
               index=['gunner_blockers'],
               columns=['time_since_punt'],
               values='cov_speed',
               aggfunc=[np.mean]).round(1)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean
time_since_punt,00:00:01,00:00:02,00:00:03,00:00:04,00:00:05,00:00:06
gunner_blockers,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2,17.6,18.4,17.4,15.0,12.4,12.0
3,16.1,17.3,17.2,16.1,13.9,10.9
4,14.5,15.8,16.2,15.8,14.1,12.5


In [46]:
# Average distance for all player at the X second mark.  Return vs Fair Catch
round(pd.pivot_table(cov_ret_df, values='dis_from_ret',\
                     index=['time_since_punt'], columns=['outcome'], aggfunc=[np.mean, np.median]), 1)

Unnamed: 0_level_0,mean,mean,median,median
outcome,fair_catch,return,fair_catch,return
time_since_punt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
00:00:01,44.8,48.4,45.5,48.5
00:00:02,39.5,43.6,40.2,43.4
00:00:03,32.4,37.4,32.5,37.0
00:00:04,25.5,31.8,24.8,31.0
00:00:05,21.9,27.1,20.4,25.5
00:00:06,22.7,23.1,22.8,20.3


In [48]:
round(pd.pivot_table(cov_ret_df, values='dis_from_ret',\
                     index=['time_since_punt'], columns=['outcome','gunner_blockers'], aggfunc=np.mean), 1)

outcome,fair_catch,fair_catch,fair_catch,fair_catch,fair_catch,return,return,return,return,return,return
gunner_blockers,0,1,2,3,4,0,1,2,3,4,5
time_since_punt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
00:00:01,47.6,45.9,44.0,46.4,46.3,47.4,48.7,48.2,48.6,48.4,49.9
00:00:02,44.4,41.1,38.9,40.9,40.6,43.9,44.5,43.4,43.7,43.5,44.9
00:00:03,38.5,33.9,32.1,33.4,32.4,38.9,38.5,37.4,37.6,37.2,38.2
00:00:04,31.8,27.0,25.5,25.9,24.5,33.4,32.7,32.1,31.9,31.3,32.6
00:00:05,27.7,21.4,22.7,21.1,20.8,23.1,31.7,27.2,26.9,27.3,28.1
00:00:06,,,,,22.7,,,23.1,22.3,25.2,


In [49]:
#### At each second of each play... Who is the closest to the returner?  How far away are they?  Return vs Fair Catch

In [50]:
min_dist_from_ret = cov_ret_df.groupby(['Season_Year', 'GameKey','PlayID','time_since_punt'], as_index = False)['dis_from_ret'].min()

In [51]:
min_dist_from_ret.head()

Unnamed: 0,Season_Year,GameKey,PlayID,time_since_punt,dis_from_ret
0,2016,3,455,00:00:01,39.06856
1,2016,3,455,00:00:02,31.370344
2,2016,3,455,00:00:03,21.64588
3,2016,3,455,00:00:04,11.321985
4,2016,3,1542,00:00:01,38.607731


In [52]:
# Keep, for every play and second, the coverage row(s) at the minimum distance from
# the returner. time_since_punt is part of the join key so a minimum found at one
# second cannot match a row from a different second of the same play.
closest_df = cov_ret_df.merge(min_dist_from_ret, how = 'inner',
                              on=['Season_Year', 'GameKey', 'PlayID', 'time_since_punt', 'dis_from_ret'])

In [69]:
# How often each coverage role is the closest player at each second, and how far
# away it is on average. Aggregate with a list and rename afterwards: the
# {name: func} dict form of SeriesGroupBy.agg was removed in pandas 1.0.
closest_df.groupby(['time_since_punt', 'cov_Role'])['dis_from_ret'] \
        .agg(['count', 'mean']) \
        .rename(columns={'count': '# of times closest', 'mean': 'avg distance'}) \
        .reset_index() \
        .sort_values('# of times closest', ascending =False).head()

Unnamed: 0,time_since_punt,cov_Role,# of times closest,avg distance
2,00:00:01,GR,1984,35.854444
15,00:00:02,GR,1954,27.315203
29,00:00:03,GR,1921,18.2403
0,00:00:01,GL,1871,35.530742
13,00:00:02,GL,1827,26.823751


In [75]:
# Mean distance from the returner and row count per coverage role and outcome.
# Aggregate with a list and rename afterwards: the {name: func} dict form of
# SeriesGroupBy.agg was removed in pandas 1.0.
pos_grouped_df = cov_ret_df.groupby(['cov_Role', 'outcome'])['dis_from_ret'] \
                           .agg(['mean', 'count']) \
                           .rename(columns={'mean': 'mean dist'}) \
                           .reset_index()

# Drop the PPLi and PPLo roles
pos_grouped_df = pos_grouped_df.loc[~pos_grouped_df.cov_Role.isin(['PPLi', 'PPLo'])]

In [None]:
# NOTE(review): unexecuted scratch cell. This dict literal is not assigned to a name
# and nothing visible uses it; it looks like the start of a role-grouping map.
{'GL':['GLi', 'GLo', '']}

In [78]:
pos_grouped_df.replace('GLi', 'GL')

Unnamed: 0,cov_Role,outcome,mean dist,count
0,GL,fair_catch,22.666812,6590
1,GL,return,30.369937,10409
2,GL,fair_catch,27.09629,26
3,GL,return,34.447334,31
4,GLo,fair_catch,18.970336,26
5,GLo,return,27.706286,31
6,GR,fair_catch,22.575439,6564
7,GR,return,29.739586,10338
8,GRi,fair_catch,29.831213,27
9,GRi,return,33.946978,45


In [77]:
pos_grouped_df.sort_values('count')

Unnamed: 0,cov_Role,outcome,mean dist,count
14,PC,fair_catch,23.350382,5
15,PC,return,38.629537,8
2,GLi,fair_catch,27.09629,26
4,GLo,fair_catch,18.970336,26
10,GRo,fair_catch,23.929376,27
8,GRi,fair_catch,29.831213,27
3,GLi,return,34.447334,31
5,GLo,return,27.706286,31
9,GRi,return,33.946978,45
11,GRo,return,27.160889,49


In [73]:
# Average distance from the returner per coverage role, fair catches vs returns.
# pos_grouped_df stores the average in the 'mean dist' column; it has no
# 'dis_from_ret' column, so reading that name raised KeyError on a fresh run.
fair_catch_rows = pos_grouped_df.loc[pos_grouped_df.outcome == 'fair_catch']
return_rows = pos_grouped_df.loc[pos_grouped_df.outcome == 'return']

trace1 = go.Bar(
        x=fair_catch_rows['cov_Role'],
        y=fair_catch_rows['mean dist'],
        name='fair catch'
    )

trace2 = go.Bar(
        x=return_rows['cov_Role'],
        y=return_rows['mean dist'],
        name='return'
    )

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    xaxis=dict(title='Position'),
    yaxis= dict(title='Avg Distance From Returner (Yds)')
)

fig=go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

#### Punts Landing inside the 20

In [None]:
def parse_play_description(df, outcome_lst):
    """Extract punt landing spot, punt distance and return distance from play text.

    Parameters
    ----------
    df : pandas.DataFrame
        Play-level data containing at least the columns ``Season_Year``,
        ``GameKey``, ``PlayID``, ``PlayDescription`` and ``outcome``.
    outcome_lst : list of str
        Outcomes to keep; rows with any other ``outcome`` are dropped.

    Returns
    -------
    pandas.DataFrame
        The kept rows (index reset) with three added columns:

        * ``punt_to``     - last number of the "punts N yards to ..." phrase,
          or 0 for a touchback.
        * ``punt_dist``   - first number of the "punts N yards" phrase.
        * ``return_dist`` - last number of the matched return phrase; 20 for a
          touchback, 0 for downed / fair catch / out of bounds / "no gain".

        A value that cannot be determined is stored as NaN (and a message is
        printed) so the three columns always stay aligned with the rows.
    """
    missing = float('nan')

    # Filter the frame that was passed in, not a notebook-level global.
    parsed_df = df.loc[df.outcome.isin(outcome_lst),
                       ['Season_Year', 'GameKey', 'PlayID',
                        'PlayDescription', 'outcome']].reset_index(drop=True)
    punt_to_lst = []
    punt_dist_lst = []
    return_dist_lst = []
    punt_regex = '(punts [0-9]+ yards? to [A-Z]* [-+]?[0-9]+)| (punts [0-9]+ yards? to [-+]?[0-9]+)'
    return_regex = '(to [A-Z]* [0-9]+ for [-+]?[0-9]+ yards?)|(to [0-9]+ for [-+]?[0-9]+ yards?)|(ob at [A-Z]* [-+]?[0-9]+ for [-+]?[0-9]+ yards?)|(ob at [0-9]+ for [-+]?[0-9]+ yards?)|(for [-+]?[0-9]+ yards?, TOUCHDOWN)'
    # Fallback for descriptions without a numeric landing spot (e.g. a punt
    # "to end zone"): only the distance can be recovered.
    punt_dist_regex = 'punts ([0-9]+) yards?'

    rows = zip(parsed_df.PlayDescription, parsed_df.outcome)
    for i, (description, outcome) in enumerate(rows):
        punt_search = re.search(punt_regex, description)
        return_search = re.search(return_regex, description)

        # --- punt landing spot and distance ---
        if punt_search:
            punt_snip = re.findall(r'-?\d+', punt_search.group(0))
        else:
            dist_search = re.search(punt_dist_regex, description)
            # Never reuse numbers parsed from a previous row.
            punt_snip = [dist_search.group(1)] if dist_search else []

        punt_to, punt_dist = missing, missing
        if punt_search and outcome in ['downed', 'fair_catch', 'oob', 'return']:
            punt_to = int(punt_snip[-1])
            punt_dist = int(punt_snip[0])
        elif outcome == 'touchback':
            punt_to = 0
            if punt_snip:
                punt_dist = int(punt_snip[0])
        else:
            print('Missing Punt Outcome at Row {}'.format(i))
        punt_to_lst.append(punt_to)
        punt_dist_lst.append(punt_dist)

        # --- return distance ---
        if return_search:
            return_snip = re.findall(r'-?\d+', return_search.group(0))
            return_dist = int(return_snip[-1])
        elif outcome == 'touchback':
            return_dist = 20
        elif outcome in ['downed', 'fair_catch', 'oob'] or 'no gain' in description:
            return_dist = 0
        else:
            print('Missing Return Outcome at Row {}'.format(i))
            return_dist = missing
        return_dist_lst.append(return_dist)

    parsed_df['punt_to'] = punt_to_lst
    parsed_df['punt_dist'] = punt_dist_lst
    parsed_df['return_dist'] = return_dist_lst

    return parsed_df

In [None]:
# Parse the descriptions of every punt that ended in one of these five outcomes.
punt_detail_df = parse_play_description(
    df=play_info,
    outcome_lst=['touchback', 'fair_catch', 'oob', 'downed', 'return'],
)

In [None]:
#Fearing the extra 5 yard penalty, punters will be even more careful to avoid a touchback.
#They will punt the ball shorter and with more hang time, or angle the ball out of bounds.

#From a returner's perspective, if the ball is heading close to the goal line, they will be more likely to let it bounce
#and take the touchback than return it because of the extra 5 yard bonus.

In [None]:
# Peek at the first five parsed punts.
punt_detail_df.head(n=5)

In [None]:
# Three independent, empty scratch lists at notebook scope.
punt_to_lst, punt_dist_lst, return_dist_lst = [], [], []

In [None]:
# Hand-written sample play description for trying regexes interactively.
string = (
    'T.Morstead punts 47 yards to 17, Center-C.Highland. '
    'T.Ervin to HST 16 for -1 yards '
)

In [None]:
# Pull the "to <TEAM> <yard line> for <n> yard(s)" fragment out of the sample text.
snip = re.search(
    '(to [A-Z]* [0-9]+ for [-+]?[0-9]+ yards?)',
    string,
).group()

In [79]:
# Peek at the first five Next Gen Stats tracking rows.
NGS_df.head(n=5)

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Time,x,y,dis,o,dir,Event
0,2016,3,3949,33078.0,2016-08-12 02:27:11.100,58.32,12.9,0.0,347.98999,64.32,
1,2016,3,3949,24417.0,2016-08-12 02:27:11.100,56.740002,13.85,0.02,18.18,359.75,
2,2016,3,3949,32570.0,2016-08-12 02:27:11.100,50.779999,0.34,0.09,286.130005,185.149994,
3,2016,3,3949,27831.0,2016-08-12 02:27:11.200,59.34,12.33,0.05,325.130005,294.48999,
4,2016,3,3949,32575.0,2016-08-12 02:27:11.200,52.950001,-0.37,0.03,322.970001,145.889999,


In [82]:
# Peek at the first five video-review rows.
video_review.head(n=5)

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Player_Activity_Derived,Turnover_Related,Primary_Impact_Type,Primary_Partner_GSISID,Primary_Partner_Activity_Derived,Friendly_Fire
0,2016,5,3129,31057,Tackling,No,Helmet-to-body,32482,Tackled,No
1,2016,21,2587,29343,Blocked,No,Helmet-to-helmet,31059,Blocking,No
2,2016,29,538,31023,Tackling,No,Helmet-to-body,31941,Tackled,No
3,2016,45,1212,33121,Tackling,No,Helmet-to-body,28249,Tackled,No
4,2016,54,1045,32444,Blocked,No,Helmet-to-body,31756,Blocked,Yes


In [83]:
# video_review rows for Season_Year 2017, GameKey 601.
# The dangling `.merge` attribute access was removed: without a call it
# evaluated to the bound method rather than the filtered frame.
video_review.loc[(video_review.Season_Year == 2017) & (video_review.GameKey == 601)]

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Player_Activity_Derived,Turnover_Related,Primary_Impact_Type,Primary_Partner_GSISID,Primary_Partner_Activity_Derived,Friendly_Fire
34,2017,601,602,33260,Tackling,No,Helmet-to-helmet,31697,Tackled,No


In [None]:
NGS_df