In [1]:
import pandas as pd
pd.options.display.max_colwidth = 1000
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools, offline
import os
# Credentials are read from the environment so the API key is never stored in the notebook.
# NOTE(review): any key that was previously committed in this cell should be revoked and reissued.
plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'],
                                  api_key=os.environ['PLOTLY_API_KEY'])
import re
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
%run progress_bar.ipynb

In [3]:
#Read in Data
# One CSV per table; files are read in the same order as the names they are bound to.
(game_data,
 play_info,
 play_player_role,
 player_punt,
 video_footage_control,
 video_footage_injury,
 video_review) = [
    pd.read_csv(csv_name)
    for csv_name in (
        'game_data.csv',
        'play_information.csv',
        'play_player_role_data.csv',
        'player_punt_data.csv',
        'video_footage-control.csv',
        'video_footage-injury.csv',
        'video_review.csv',
    )
]

In [4]:
def read_NGS_data(file_lst):
    """
    Read every CSV in `file_lst` and stack them into one DataFrame.

    file_lst: iterable of CSV paths, read in order.

    Returns the row-wise concatenation of the files. Each file keeps its own
    row labels, so the index of the result is not unique. An empty `file_lst`
    returns an empty DataFrame.
    """
    frames = []
    for file in file_lst:
        print('Reading in {}'.format(file))
        frames.append(pd.read_csv(file))

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end; re-concatenating inside the loop copies the
    # accumulated rows again for every additional file.
    return pd.concat(frames)

In [5]:
# NGS tracking files, in chronological order (2016 season, then 2017).
file_lst = [
    'NGS-2016-pre.csv',
    'NGS-2016-reg-wk1-6.csv',
    'NGS-2016-reg-wk7-12.csv',
    'NGS-2016-reg-wk13-17.csv',
    'NGS-2016-post.csv',
    'NGS-2017-pre.csv',
    'NGS-2017-reg-wk1-6.csv',
    'NGS-2017-reg-wk7-12.csv',
    'NGS-2017-reg-wk13-17.csv',
    'NGS-2017-post.csv',
]

In [6]:
NGS_df = read_NGS_data(file_lst)

Reading in NGS-2016-pre.csv
Reading in NGS-2016-reg-wk1-6.csv
Reading in NGS-2016-reg-wk7-12.csv
Reading in NGS-2016-reg-wk13-17.csv
Reading in NGS-2016-post.csv
Reading in NGS-2017-pre.csv
Reading in NGS-2017-reg-wk1-6.csv
Reading in NGS-2017-reg-wk7-12.csv
Reading in NGS-2017-reg-wk13-17.csv
Reading in NGS-2017-post.csv


### Injury Plays EDA

In [7]:
'There are concussion injuries on ' + str(round(len(video_review) / float(len(play_info)) * 100, 2)) + '% of ' + 'punt plays'

'There are concussion injuries on 0.55% of punt plays'

##### It is good for the NFL and its players that concussions occur on fewer than 1% of punt plays, but such a small number of injury plays will make the analysis harder

#### How are players being injured?

In [8]:
def _injury_count_bar(column):
    """Bar trace of how many video_review rows fall under each value of `column`."""
    # Group once and reuse the result for both axes.
    counts = video_review.groupby([column], as_index=False)['PlayID'].count()
    return go.Bar(x=counts[column], y=counts['PlayID'])


trace1 = _injury_count_bar('Player_Activity_Derived')
trace2 = _injury_count_bar('Primary_Impact_Type')
trace3 = _injury_count_bar('Friendly_Fire')
trace4 = _injury_count_bar('Primary_Partner_Activity_Derived')

fig = tools.make_subplots(rows=2, cols=2, subplot_titles=('Player Activity Derived', 'Primary Impact Type',
                                                          'Friendly Fire', 'Primary Partner Activity Derived'))

# Fill the 2x2 grid row by row, in the same order as the subplot titles.
for panel_trace, (grid_row, grid_col) in zip((trace1, trace2, trace3, trace4),
                                             ((1, 1), (1, 2), (2, 1), (2, 2))):
    fig.append_trace(panel_trace, grid_row, grid_col)

fig['layout'].update(showlegend=False)

py.iplot(fig, filename='make-subplots-multiple-with-titles')

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



### Quick Conclusions
    - No player activity stands out
    - Helmet to player is primary impact type
    - Friendly fire is only clear 16% of the time
    - No Partner Activity Stands Out

#### Who's getting hurt?

In [9]:
#Categorize player punt roles into the sides of the ball
# Role codes labelled 'return' when a player's side of the ball is derived from their role.
return_roles = ['PDL1','PDL2','PDL3','PDL4','PDL5','PDL6','PDM','PDR1','PDR2','PDR3','PDR4','PDR5','PDR6'
                ,'PFB','PLL','PLL1','PLL2','PLL3','PLM','PLM1','PLR','PLR1','PLR2','PLR3','PR','VL','VLi'
                ,'VLo','VR','VRi','VRo']

# Role codes labelled 'coverage'. Roles in neither list get an empty side-of-ball label.
coverage_roles = ['GL','GLi','GLo','GR','GRi','GRo','P','PC','PLG','PLS','PLT','PLW','PPL','PPLi','PPLo'
                 ,'PPR','PPRi','PPRo','PRG','PRT','PRW']

# Subset of return_roles; the number of distinct roles from this list present on a play is
# counted per play in coverage_returner_space.
# NOTE(review): presumably these are the return-team players lined up against the gunners — confirm.
gunner_blockers = ['VL','VLi','VLo','VR','VRi','VRo']

In [10]:
# Attach the injured player's punt role to each reviewed injury; rows without a role record drop out.
inj_players = (
    pd.merge(video_review, play_player_role,
             on=['Season_Year', 'GameKey', 'PlayID', 'GSISID'], how='inner')
    .rename(columns={'Role': 'inj_role'})
)

In [11]:
# First matching condition wins; roles in neither list get an empty label.
inj_players['inj_side_of_ball'] = np.select(
    [inj_players['inj_role'].isin(return_roles), inj_players['inj_role'].isin(coverage_roles)],
    ['return', 'coverage'],
    default='',
)

In [12]:
# Work on an independent copy: assigning a column to a slice of video_review is a
# chained assignment (SettingWithCopy) whose effect is not guaranteed.
partner_players = video_review[['Season_Year', 'GameKey', 'PlayID', 'Primary_Partner_GSISID']].copy()

# 'Unclear' and missing partner ids both become 0, which finds no role in the left merge below.
partner_players['Primary_Partner_GSISID'] = (
    partner_players['Primary_Partner_GSISID'].replace('Unclear', '0').fillna(0).astype(int)
)

partner_players = (
    partner_players
    .merge(play_player_role, how='left',
           left_on=['Season_Year', 'GameKey', 'PlayID', 'Primary_Partner_GSISID'],
           right_on=['Season_Year', 'GameKey', 'PlayID', 'GSISID'])
    .drop('GSISID', axis=1)
    .rename(columns={'Role': 'partner_role'})
)
partner_players['partner_side_of_ball'] = np.where(partner_players.partner_role.isin(return_roles), 'return',
                                          np.where(partner_players.partner_role.isin(coverage_roles), 'coverage', ''))

# NOTE(review): axis=1 concat pairs rows by index label, so this assumes inj_players and
# partner_players each still hold exactly one row per video_review row, in the same order — confirm.
inj_partner_df = pd.concat([inj_players, partner_players[['partner_role', 'partner_side_of_ball']]], axis=1)

In [13]:
inj_partner_df.head()

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Player_Activity_Derived,Turnover_Related,Primary_Impact_Type,Primary_Partner_GSISID,Primary_Partner_Activity_Derived,Friendly_Fire,inj_role,inj_side_of_ball,partner_role,partner_side_of_ball
0,2016,5,3129,31057,Tackling,No,Helmet-to-body,32482,Tackled,No,PLW,coverage,PR,return
1,2016,21,2587,29343,Blocked,No,Helmet-to-helmet,31059,Blocking,No,GL,coverage,PLL1,return
2,2016,29,538,31023,Tackling,No,Helmet-to-body,31941,Tackled,No,GR,coverage,PR,return
3,2016,45,1212,33121,Tackling,No,Helmet-to-body,28249,Tackled,No,PRT,coverage,PR,return
4,2016,54,1045,32444,Blocked,No,Helmet-to-body,31756,Blocked,Yes,PRT,coverage,GR,coverage


In [14]:
partner_players.partner_side_of_ball.value_counts()

return      18
coverage    15
             4
Name: partner_side_of_ball, dtype: int64

#### On what types of plays are players getting hurt?

In [15]:
#Create a column in the play_info dataframe to determine the type of play
# (pattern, is_regex, label) in priority order: the first rule whose pattern is found wins.
outcome_rules = [
    ('aborted|Fumbled snap|FUMBLES, and recovers', True, 'aborted'),
    ('fake|pass|right end|left end|up the middle|Direct snap|right guard', True, 'fake'),
    ('muffs', True, 'muff'),
    ('fair catch by', True, 'fair_catch'),
    ('touchback', True, 'touchback'),
    ('blocked|deflected', True, 'blocked'),
    # Literal match so the trailing '.' is a real full stop rather than a regex wildcard.
    ('out of bounds.', False, 'oob'),
    ('downed', True, 'downed'),
    ('safety', True, 'safety'),
    # Non-capturing groups: str.contains warns about capturing groups it never uses.
    ('[0-9]+ for [-+]?[0-9]+ yards?|for no gain|touchdown'
     '|(?:to [A-Z]+ [0-9]+ for [-+]?[0-9]+ yards?)|(?:to [0-9]+ for [-+]?[0-9]+ yards?)', True, 'return'),
    ('- no play|delay of game|false start, declined|penalty enforced', True, 'no_play'),
]


def classify_play_outcome(descriptions):
    """
    Label each play description with the first matching outcome in `outcome_rules`.

    descriptions: Series of play description strings.

    Matching is case-insensitive for every rule (case=False also works with regex=False,
    where a flags= argument would be ignored). Missing descriptions match nothing.
    Returns an array of labels; plays that match no rule get ' '.
    """
    conditions = [
        descriptions.str.contains(pattern, case=False, na=False, regex=is_regex)
        for pattern, is_regex, _label in outcome_rules
    ]
    labels = [label for _pattern, _is_regex, label in outcome_rules]
    return np.select(conditions, labels, default=' ')


play_info['outcome'] = classify_play_outcome(play_info['PlayDescription'])

In [16]:
# Every punt play, flagged 1 when a reviewed injury exists for the same season/game/play.
pi = pd.merge(play_info,
              video_review[['Season_Year', 'GameKey', 'PlayID', 'GSISID']],
              on=['Season_Year', 'GameKey', 'PlayID'], how='left')
pi['injury'] = pi['GSISID'].notnull().astype(int)
pi = pi.drop(columns='GSISID')

# Only the plays that have a reviewed injury, with the review details attached.
vr = pd.merge(play_info, video_review, on=['Season_Year', 'GameKey', 'PlayID'], how='inner')

In [17]:
vr.outcome.value_counts()

return        29
downed         3
muff           2
fair_catch     2
fake           1
Name: outcome, dtype: int64

#### Conclusion: players are getting hurt on returns 

In [18]:
#Is it a higher proportion compared to all plays?

In [19]:
# Plays and injuries per outcome, most frequent outcome first.
# A list of aggregations plus rename works on old and new pandas alike; passing a
# {new_name: func} dict to a single-column groupby raises on pandas >= 1.0.
pi_inj_grouped = (
    pi.groupby('outcome')['injury']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total_plays', 'sum': 'injuries'})
    .reset_index()
    .sort_values('total_plays', ascending=False)
    .reset_index(drop=True)
)

pi_inj_grouped['injury_percentage'] = (
    (pi_inj_grouped['injuries'] / pi_inj_grouped['total_plays'] * 100).round(1).astype(str) + '%'
)
pi_inj_grouped

Unnamed: 0,outcome,total_plays,injuries,injury_percentage
0,return,2741,29,1.1%
1,fair_catch,1659,2,0.1%
2,downed,796,3,0.4%
3,oob,639,0,0.0%
4,touchback,407,0,0.0%
5,muff,203,2,1.0%
6,no_play,139,0,0.0%
7,fake,45,1,2.2%
8,blocked,39,0,0.0%
9,aborted,9,0,0.0%


#### Conclusion: over 1% of punt plays with a return have an injury
    - **This is roughly 10x higher than when a fair catch is called (1.1% vs. 0.1%)**

#### Plot Player Paths on a Given Play

In [20]:
def load_layout():
    """
    Build a fresh Plot.ly layout dict styled as a football field.

    The x axis spans 0-120 with a tick every 10 units labelled Goal, 10 ... 50 ... 10, Goal;
    the y axis spans -3.3 to 56.3 with no labels. White lines are drawn below the data:
    two horizontal lines at y=0 and y=53.3, and a vertical line every 10 units from
    x=10 to x=110, with the lines at x=10 and x=110 drawn thicker.
    """
    top_edge = 53.3

    def field_line(x0, y0, x1, y1, **stroke):
        # Every marking is a white line rendered underneath the traces.
        return dict(type='line', layer='below', x0=x0, y0=y0, x1=x1, y1=y1,
                    line=dict(color='white', **stroke))

    def framed_axis(**extra):
        # Settings shared by both axes: fixed range, no grid, black mirrored border.
        axis = dict(autorange=False, showgrid=False, zeroline=False, showline=True,
                    linecolor='black', linewidth=1, mirror=True, ticks='')
        axis.update(extra)
        return axis

    horizontal_lines = [field_line(0, y, 120, y, width=2) for y in (0, top_edge)]
    vertical_lines = [
        field_line(x, 0, x, top_edge, width=10) if x in (10, 110) else field_line(x, 0, x, top_edge)
        for x in range(10, 120, 10)
    ]

    return dict(
        title="Player Activity",
        plot_bgcolor='darkseagreen',
        showlegend=True,
        xaxis=framed_axis(
            range=[0, 120],
            tickmode='array',
            tickvals=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
            ticktext=['Goal', 10, 20, 30, 40, 50, 40, 30, 20, 10, 'Goal'],
            showticklabels=True,
        ),
        yaxis=framed_axis(title='', range=[-3.3, 56.3], showticklabels=False),
        shapes=horizontal_lines + vertical_lines,
    )

layout = load_layout()

In [24]:
# Loading and plotting functions

def load_plays_for_game(GameKey):
    """
    Read play_information.csv from the working directory and return only the
    rows whose GameKey equals the given one (original row labels are kept).
    """
    all_plays = pd.read_csv('play_information.csv')
    belongs_to_game = all_plays['GameKey'] == GameKey
    return all_plays.loc[belongs_to_game]


def load_game_and_ngs(ngs_file=None, GameKey=None):
    """
    Load one game's plays and join them onto the tracking rows of an NGS file.

    ngs_file: path of the NGS CSV to read.
    GameKey: game whose plays are loaded via load_plays_for_game.

    Returns the inner join of the NGS rows with that game's plays on
    GameKey, PlayID and Season_Year. If either argument is missing, prints a
    hint for the first missing one and returns None.
    """
    required = ((ngs_file, "Specifiy an NGS file."), (GameKey, 'Specify a GameKey'))
    for supplied, hint in required:
        if supplied is None:
            print(hint)
            return None

    game_plays = load_plays_for_game(GameKey)
    tracking = pd.read_csv(ngs_file, low_memory=False)
    return tracking.merge(game_plays, how="inner", on=["GameKey", "PlayID", "Season_Year"])


def plot_play(game_df, PlayID, player1=None, player2=None, custom_layout=False):
    """
    Plots player movements on the field for a given game, play, and two players

    game_df: tracking rows merged with play information; the PlayID, GSISID, x, y,
        Home_Team_Visit_Team, YardLine and PlayDescription columns are read.
    PlayID: play to draw.
    player1, player2: GSISIDs of the two players to trace. Both are required.
    custom_layout: pass True to draw with the notebook-level `layout` variable as it
        is; any other value builds a layout with load_layout() and titles it with
        the two teams and the play's YardLine.

    Prints the play description and sends the figure to plotly. Returns None; when
    a player id is missing or player1 has no rows on the play, a message is printed
    and nothing is plotted.
    """
    # Validate first: the title and description lookups below need player1's rows.
    if player1 is None or player2 is None:
        print("Specify GSISIDs for player1 and player2")
        return None

    play_df = game_df[game_df.PlayID == PlayID]
    player1_rows = play_df[play_df.GSISID == player1]
    if player1_rows.empty:
        print("No rows for GSISID {} on PlayID {}".format(player1, PlayID))
        return None

    matchup = pd.unique(play_df.Home_Team_Visit_Team)[0].split("-")
    HomeTeam = matchup[0]
    VisitingTeam = matchup[1]
    YardLine = player1_rows['YardLine'].iloc[0]
    play_description = player1_rows.iloc[0]["PlayDescription"]

    both_players = play_df[(play_df['GSISID'] == player1) | (play_df['GSISID'] == player2)]
    traces = []
    for player in pd.unique(both_players.GSISID):
        player = int(player)
        player_path = both_players[both_players.GSISID == player]
        traces.append(go.Scatter(
            x=player_path.x,
            y=player_path.y,
            name='GSISID '+str(player),
            mode='markers'
        ))

    if custom_layout is True:
        # Read the notebook-level `layout`. The local is deliberately named differently:
        # assigning to a local called `layout` would hide the global and leave it unbound here.
        play_layout = layout
    else:
        play_layout = load_layout()
        # Join the first two YardLine tokens with '@'; a single-token value is shown as-is.
        play_layout['title'] = HomeTeam + \
            ' vs. ' + VisitingTeam + \
            '<br>Possession: ' + \
            '@'.join(YardLine.split(" ")[:2])

    fig = dict(data=traces, layout=play_layout)
    print("\n\n\t",play_description)
    py.iplot(fig, filename='jupyter-table1')
    #offline.iplot(fig)

In [25]:
game280 = load_game_and_ngs('NGS-2016-reg-wk13-17.csv',GameKey=280)

In [26]:
plot_play(game_df=game280, PlayID=2918, player1=32120, player2=32725)



	 (3:36) B.Nortman punts 49 yards to HST 30, Center-C.Tinker, fair catch by W.Fuller.
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~mtodisco10/0 or inside your plot.ly account where it is named 'jupyter-table1'


#### Calculate hang time of punts

In [27]:
def get_hang_time(ngs_df, start_event='punt', *stop_events):
    """
    Time, in seconds, between the start event and the first stop event of each play.

    ngs_df: tracking rows with Season_Year, GameKey, PlayID, Event and Time columns;
        Time is text in '%Y-%m-%d %H:%M:%S.%f' form.
    start_event: Event value that starts the clock.
    stop_events: Event values that stop it; the earliest occurrence of any of them is used.

    Returns one row per play that has both a start and a stop event, with columns
    punt_time, receiving_time and hang_time alongside the play keys.
    """
    play_keys = ['Season_Year', 'GameKey', 'PlayID']
    stamp_format = '%Y-%m-%d %H:%M:%S.%f'

    def earliest_per_play(event_rows, time_column):
        # Earliest Time of the selected rows for each play, parsed to a timestamp.
        earliest = (
            ngs_df.loc[event_rows]
            .groupby(play_keys, as_index=False)['Time'].min()
            .rename(columns={'Time': time_column})
        )
        earliest[time_column] = pd.to_datetime(earliest[time_column], format=stamp_format)
        return earliest

    kicked = earliest_per_play(ngs_df.Event == start_event, 'punt_time')
    arrived = earliest_per_play(ngs_df.Event.isin(stop_events), 'receiving_time')

    punt_df = kicked.merge(arrived, how='inner', on=play_keys).reset_index(drop=True)
    airborne = punt_df['receiving_time'] - punt_df['punt_time']
    punt_df['hang_time'] = airborne.dt.total_seconds()
    return punt_df

In [28]:
punt_df = get_hang_time(NGS_df, 'punt', 'punt_received', 'fair_catch')

In [29]:
data = [go.Histogram(x=punt_df.hang_time)]

py.iplot(data, filename='basic histogram')

In [30]:
round(punt_df['hang_time'].mean(), 1)

4.5

In [31]:
round(punt_df['hang_time'].median(), 1)

4.5

In [32]:
str(round(len(punt_df.loc[punt_df.hang_time < 5.5]) / len(punt_df) * 100, 1)) + '% of hang times are less than 5 1/2 seconds'

'96.9% of hang times are less than 5 1/2 seconds'

### Calculate distance between returner and closest coverage man

In [33]:
#Create a function to calculate the space between the returner and the closest coverage player for every second between the punt and the catch
def coverage_returner_space(play_df, ngs_df):
    """
    Distance between every coverage player and the punt returner at each whole
    second after the punt, for plays that ended in a fair catch or a return.

    play_df: plays with GameKey, PlayID, outcome and injury columns.
    ngs_df: tracking rows with Season_Year, GameKey, PlayID, GSISID, Time, Event,
        x, y, dis, o and dir columns.

    Also reads the notebook-level play_player_role, coverage_roles and
    gunner_blockers, and calls log_progress and convert_to_mph.

    Returns one row per coverage player per captured second (1 to 6 seconds after
    the 'punt' event, no later than the first 'punt_received'/'fair_catch' event),
    with cov_*/ret_* tracking columns plus dis_from_ret, time_since_punt,
    gunner_blockers, outcome, injury, cov_speed and ret_speed. Returns an empty
    DataFrame when no play yields any rows.
    """
    play_keys = ['Season_Year', 'GameKey', 'PlayID']
    stamp_format = '%Y-%m-%d %H:%M:%S.%f'
    tracked_columns = ['x', 'y', 'GSISID', 'dis', 'o', 'dir', 'Role']

    play_df = play_df.loc[play_df.outcome.isin(['fair_catch', 'return'])].reset_index(drop=True)

    # `value in series` tests the index labels, not the values, so membership is
    # checked against explicit sets of the values instead.
    ngs_game_keys = set(ngs_df.GameKey.unique())
    ngs_play_ids = set(ngs_df.PlayID.unique())

    cov_ret_lst = []
    for row in log_progress(range(0, len(play_df)), every=25):
        game_key = play_df['GameKey'][row]
        play_id = play_df['PlayID'][row]
        outcome = play_df['outcome'][row]
        injury = play_df['injury'][row]
        if game_key not in ngs_game_keys or play_id not in ngs_play_ids:
            continue

        filtered_play = ngs_df.loc[(ngs_df.GameKey == game_key) & (ngs_df.PlayID == play_id)] \
            .sort_values('Time').reset_index(drop=True)
        filtered_play = filtered_play.merge(play_player_role, how='inner', on=play_keys + ['GSISID'])
        if len(filtered_play) == 0:
            continue

        filtered_play['Time'] = pd.to_datetime(filtered_play['Time'], format=stamp_format)
        punt_event_time = filtered_play.loc[filtered_play.Event == 'punt', 'Time'].min()
        receiving_event_time = filtered_play.loc[
            filtered_play.Event.isin(['punt_received', 'fair_catch']), 'Time'].min()
        # Counted over the whole play, before the punt-to-catch window is applied.
        gunner_blocker_count = filtered_play.loc[filtered_play['Role'].isin(gunner_blockers), 'Role'].nunique()

        in_flight = filtered_play.loc[(filtered_play.Time >= punt_event_time) &
                                      (filtered_play.Time <= receiving_event_time)]

        # rename() returns new frames, so nothing is modified in place on a slice.
        coverage_df = in_flight.loc[in_flight['Role'].isin(coverage_roles)] \
            .sort_values('Time') \
            .rename(columns={name: 'cov_' + name for name in tracked_columns})
        returner_df = in_flight.loc[in_flight['Role'] == 'PR'] \
            .sort_values('Time') \
            .rename(columns={name: 'ret_' + name for name in tracked_columns}) \
            .drop('Event', axis=1)

        cov_ret_df = coverage_df.merge(returner_df, how='inner', on=play_keys + ['Time'])
        cov_ret_df['dis_from_ret'] = np.sqrt((cov_ret_df['cov_x'] - cov_ret_df['ret_x']) ** 2
                                             + (cov_ret_df['cov_y'] - cov_ret_df['ret_y']) ** 2)
        cov_ret_df['time_since_punt'] = cov_ret_df['Time'] - punt_event_time

        # Keep only the samples that fall exactly 1..6 whole seconds after the punt.
        times_to_capture = [punt_event_time + pd.Timedelta(seconds=second) for second in range(1, 7)]
        cov_ret_df = cov_ret_df.loc[cov_ret_df['Time'].isin(times_to_capture)].copy()
        if len(cov_ret_df) == 0:
            continue

        cov_ret_df['gunner_blockers'] = gunner_blocker_count
        cov_ret_df['outcome'] = outcome
        cov_ret_df['injury'] = injury
        # 20.455 ~= 10 * 3 * 3600 / 5280; presumably `dis` is yards covered per 0.1 s sample,
        # which makes this a conversion to mph — TODO confirm the sampling rate.
        cov_ret_df['cov_speed'] = convert_to_mph(cov_ret_df.cov_dis, 20.455)
        cov_ret_df['ret_speed'] = convert_to_mph(cov_ret_df.ret_dis, 20.455)
        cov_ret_lst.append(cov_ret_df)

    if not cov_ret_lst:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    cov_ret_df = pd.concat(cov_ret_lst).reset_index(drop=True)
    return cov_ret_df

In [34]:
def convert_to_mph(dis_vector, converter):
    """Return `dis_vector` multiplied by the conversion factor `converter`."""
    return dis_vector * converter

In [35]:
cov_ret_df = coverage_returner_space(pi, NGS_df)

VBox(children=(HTML(value=''), IntProgress(value=0, max=4400)))

In [36]:
cov_ret_df.gunner_blockers.value_counts(normalize=True)

2    0.461232
3    0.305723
4    0.219262
0    0.007008
1    0.006490
5    0.000285
Name: gunner_blockers, dtype: float64

In [37]:
#Speed vs Number of Gunner Blockers at each second.

pd.pivot_table(cov_ret_df.loc[(cov_ret_df.cov_Role == 'GR') | (cov_ret_df.cov_Role == 'GL')], 
               index=['gunner_blockers'], values='cov_speed',
               columns=['time_since_punt'], aggfunc=[np.mean, np.median])

Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,median,median,median,median,median,median
time_since_punt,00:00:01,00:00:02,00:00:03,00:00:04,00:00:05,00:00:06,00:00:01,00:00:02,00:00:03,00:00:04,00:00:05,00:00:06
gunner_blockers,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,12.371487,14.11395,14.909422,14.271624,11.337914,,13.29575,15.75035,17.079925,15.443525,10.943425,
1,15.902833,16.575988,15.665442,14.309607,8.113817,,17.79585,18.20495,17.5913,16.15945,5.11375,
2,17.551525,18.370466,17.438579,14.979314,12.350347,11.971857,18.0004,19.02315,18.20495,15.954899,13.29575,11.65935
3,16.129727,17.330794,17.194381,16.081668,13.884646,10.874142,16.97765,18.0004,17.79585,16.97765,14.52305,9.61385
4,14.525324,15.82675,16.186738,15.778057,14.083888,12.467323,15.34125,16.56855,16.97765,16.56855,15.34125,12.886651
5,5.011475,7.97745,7.261525,1.6364,6.852425,,5.011475,7.97745,7.261525,1.6364,6.852425,


In [38]:
# Average distance for all player at the X second mark.  Return vs Fair Catch
pd.pivot_table(cov_ret_df, values='dis_from_ret', index=['time_since_punt'], columns=['outcome'], aggfunc=[np.mean, np.median])

Unnamed: 0_level_0,mean,mean,median,median
outcome,fair_catch,return,fair_catch,return
time_since_punt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
00:00:01,44.774631,48.379817,45.463458,48.487881
00:00:02,39.536096,43.550691,40.156185,43.410029
00:00:03,32.449034,37.431737,32.541663,36.995621
00:00:04,25.541349,31.820976,24.829395,30.989835
00:00:05,21.92648,27.101236,20.386059,25.5071
00:00:06,22.651864,23.142722,22.75707,20.309999


In [39]:
pd.pivot_table(cov_ret_df, values='dis_from_ret', index=['time_since_punt'], columns=['outcome','gunner_blockers'], aggfunc=np.mean)

outcome,fair_catch,fair_catch,fair_catch,fair_catch,fair_catch,return,return,return,return,return,return
gunner_blockers,0,1,2,3,4,0,1,2,3,4,5
time_since_punt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
00:00:01,47.566563,45.938112,43.991621,46.354646,46.343894,47.38711,48.682959,48.177614,48.600492,48.364038,49.855188
00:00:02,44.363982,41.105349,38.871983,40.935949,40.561737,43.921659,44.464304,43.394259,43.702018,43.517662,44.907907
00:00:03,38.519668,33.935251,32.078948,33.389366,32.44995,38.856784,38.462795,37.424369,37.581441,37.192809,38.183061
00:00:04,31.803258,26.991194,25.538214,25.865669,24.544524,33.3799,32.705162,32.140858,31.909733,31.298615,32.561592
00:00:05,27.710252,21.354172,22.743397,21.086605,20.809497,23.076005,31.668453,27.186397,26.933362,27.31155,28.111912
00:00:06,,,,,22.651864,,,23.118329,22.25272,25.216619,


In [40]:
#### At each second of each play... Who is the closest to the returner?  How far away are they?  Return vs Fair Catch

In [41]:
min_dist_from_ret = cov_ret_df.groupby(['Season_Year', 'GameKey','PlayID','time_since_punt'], as_index = False)['dis_from_ret'].min()

In [42]:
min_dist_from_ret.head()

Unnamed: 0,Season_Year,GameKey,PlayID,time_since_punt,dis_from_ret
0,2016,3,455,00:00:01,39.06856
1,2016,3,455,00:00:02,31.370344
2,2016,3,455,00:00:03,21.64588
3,2016,3,455,00:00:04,11.321985
4,2016,3,1542,00:00:01,38.607731


In [43]:
# Match on time_since_punt as well, so a row only qualifies when its distance is the
# minimum for its own second of the play, not for some other second of the same play.
closest_df = cov_ret_df.merge(min_dist_from_ret, how='inner',
                              on=['Season_Year', 'GameKey', 'PlayID', 'time_since_punt', 'dis_from_ret'])

In [44]:
#closest_df.groupby(['time_since_punt','cov_Role'], as_index=False)['dis_from_ret'].agg({'# of times closest': 'count','avg distance':np.mean}).sort_values('# of times closest', ascending =False)

In [45]:
pos_grouped_df = cov_ret_df.groupby(['cov_Role','time_since_punt', 'outcome'], as_index=False)['dis_from_ret'].mean()

In [None]:
# Split the per-role averages by outcome once, then draw one bar series per outcome.
fair_catch_rows = pos_grouped_df.loc[pos_grouped_df.outcome == 'fair_catch']
return_rows = pos_grouped_df.loc[pos_grouped_df.outcome == 'return']

trace1 = go.Bar(x=fair_catch_rows['cov_Role'], y=fair_catch_rows['dis_from_ret'], name='fair catch')
trace2 = go.Bar(x=return_rows['cov_Role'], y=return_rows['dis_from_ret'], name='return')

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    xaxis=dict(title='Position'),
    yaxis=dict(title='Avg Distance From Returner (Yds)'),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

#### Punts Landing inside the 20

In [730]:
inside_twenty = play_info.loc[(play_info.outcome.isin(['touchback', 'fair_catch',\
                                                      'oob', 'downed', 'return'])) #& (play_info.PlayID == 817)
                             ].reset_index(drop=True)

In [731]:
#inside_twenty = inside_twenty.loc[inside_twenty.PlayDescription == '(1:34) T.Morstead punts 47 yards to HST 17, Center-C.Highland. T.Ervin to HST 16 for -1 yards (E.Harris).'].reset_index()

In [732]:
punt_to_lst = []
punt_dist_lst = []
return_dist_lst = []

In [734]:
string = 'T.Morstead punts 47 yards to 17, Center-C.Highland. T.Ervin to HST 16 for -1 yards '

In [735]:
snip = re.search('(to [A-Z]* [0-9]+ for [-+]?[0-9]+ yards?)', string).group(0)

In [736]:
# Start from empty lists so re-running this cell cannot append every play a second time.
punt_to_lst = []
punt_dist_lst = []
return_dist_lst = []

# "punts <dist> yard(s) to [<TEAM> ]<yard line>"; the team is absent for the 50.
punt_pattern = re.compile(r'punts [0-9]+ yards? to (?:[A-Z]* )?[-+]?[0-9]+')
return_pattern = re.compile('(to [A-Z]* [0-9]+ for [-+]?[0-9]+ yards?)|(to [0-9]+ for [-+]?[0-9]+ yards?)|(ob at [A-Z]* [-+]?[0-9]+ for [-+]?[0-9]+ yards?)|(ob at [0-9]+ for [-+]?[0-9]+ yards?)|(for [-+]?[0-9]+ yards?, TOUCHDOWN)')

# Exactly one value is appended to each list per play (NaN when it cannot be parsed),
# so the three lists always stay aligned with the rows of inside_twenty.
for i, (description, outcome) in enumerate(zip(inside_twenty.PlayDescription, inside_twenty.outcome)):
    punt_search = punt_pattern.search(description)
    return_search = return_pattern.search(description)

    # First number in the punt phrase is the distance, the last is the yard line reached.
    punt_snip = re.findall(r'-?\d+', punt_search.group(0)) if punt_search else []
    punt_dist = int(punt_snip[0]) if punt_snip else np.nan
    if outcome == 'touchback':
        punt_to = 0
    elif punt_snip and outcome in ['downed', 'fair_catch', 'oob', 'return']:
        punt_to = int(punt_snip[-1])
    elif punt_snip:
        punt_to = np.nan
        print(i, description, 'Unexpected outcome: ' + str(outcome))
    else:
        punt_to = np.nan
        print(i, description, 'No Punt Search')
    punt_to_lst.append(punt_to)
    punt_dist_lst.append(punt_dist)

    if return_search:
        return_snip = re.findall(r'-?\d+', return_search.group(0))
        return_dist = int(return_snip[-1])
    elif outcome == 'touchback':
        return_dist = 20
    elif outcome in ['downed', 'fair_catch', 'oob'] or 'no gain' in description:
        return_dist = 0
    else:
        return_dist = np.nan
        print(i, 'No Return Search')
    return_dist_lst.append(return_dist)

In [740]:
inside_twenty['punt_to'] = punt_to_lst
inside_twenty['punt_dist'] = punt_dist_lst
inside_twenty['return_dist'] = return_dist_lst

In [742]:
inside_twenty.to_csv('inside_twenty.csv')

In [None]:
def parse_play_description(description, outcome):
    """
    Pull the punt landing spot, punt distance and return distance out of one play description.

    description: play description text, e.g.
        '... punts 47 yards to HST 17, Center-C.Highland. T.Ervin to HST 16 for -1 yards ...'
    outcome: the play's outcome label ('touchback', 'fair_catch', 'downed', 'oob', 'return', ...).

    Returns (punt_to, punt_dist, return_dist). A value that cannot be determined is None.
    Touchbacks use punt_to 0 and return_dist 20; downed, fair-catch and out-of-bounds
    plays, and descriptions containing 'no gain', use return_dist 0 when no return
    phrase is found.
    """
    punt_search = re.search(r'punts [0-9]+ yards? to (?:[A-Z]* )?[-+]?[0-9]+', description)
    return_search = re.search('(to [A-Z]* [0-9]+ for [-+]?[0-9]+ yards?)|(to [0-9]+ for [-+]?[0-9]+ yards?)|(ob at [A-Z]* [-+]?[0-9]+ for [-+]?[0-9]+ yards?)|(ob at [0-9]+ for [-+]?[0-9]+ yards?)|(for [-+]?[0-9]+ yards?, TOUCHDOWN)', description)

    # First number in the punt phrase is the distance, the last is the yard line reached.
    punt_numbers = re.findall(r'-?\d+', punt_search.group(0)) if punt_search else []
    punt_dist = int(punt_numbers[0]) if punt_numbers else None
    if outcome == 'touchback':
        punt_to = 0
    elif punt_numbers and outcome in ('downed', 'fair_catch', 'oob', 'return'):
        punt_to = int(punt_numbers[-1])
    else:
        punt_to = None

    if return_search:
        return_dist = int(re.findall(r'-?\d+', return_search.group(0))[-1])
    elif outcome == 'touchback':
        return_dist = 20
    elif outcome in ('downed', 'fair_catch', 'oob') or 'no gain' in description:
        return_dist = 0
    else:
        return_dist = None

    return punt_to, punt_dist, return_dist

In [None]:
#Fearing the extra 5 yard penalty, punters will be even more wary of avoiding a touchback.
#They will punt the ball shorter and with more hang time, or angle the ball out of bounds.

#From a returner's perspective, if the ball is heading close to the goal line, they will be more likely to let it bounce 
#and take the touchback than return it because of the extra 5 yard bonus.