# NHL Shot Data - Data Wrangling

In [166]:
from pathlib import Path
import pandas as pd
import numpy as np
import ast
import json
from statsmodels.distributions.empirical_distribution import ECDF
#from scipy import stats
from collections import Counter
import bz2
import _pickle as cPickle

## Read Existing Frames

### Obtain List of Games with Valid Frames
The script `produce_game_frames.py` downloads and processes individual game reports for the seasons from 2010-11 onward. Initially, this notebook reads and combines valid game frames. Start by obtaining the list of all games.

In [2]:
# Seasons to load, each written as the starting year followed by the ending year
# (e.g. '20182019' for the 2018-19 season).
SEASON_LIST = ['20102011', '20112012', '20122013', '20132014', '20142015', '20152016', '20162017', 
               '20172018', '20182019', '20192020']
def read_game_feed_links(season):
    '''
    Reads the local file containing live feed links for the given season.

    The file is looked up at 'data/schedule_<season>.json', relative to the current
    working directory.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.

    Returns
    -------
    game_feed_links : list of str
        List of links to live feeds of games for the season, as stored in the local file.
        An individual link has the form '/api/v1/game/2018020256/feed/live', where the
        substring '2018020256' represents the NHL's game ID for the game.

    Raises
    ------
    FileNotFoundError
        If the schedule file for the season does not exist.
    '''
    current_dir = Path.cwd()
    relative_path = 'data/schedule_' + season + '.json'
    game_feed_link_path = current_dir.joinpath(relative_path)

    # JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
    with game_feed_link_path.open('r', encoding='utf-8') as infile:
        game_feed_links = json.load(infile)
    return game_feed_links

def get_game_feed_links(seasons):
    '''
    Obtains game feed links for every season indicated in seasons.

    Parameters
    ----------
    seasons : str or list
        If a string, should be the season for the schedule. Example: '20182019' for the 2018-19 season.
        If a list, should be a list of strings in the format listed above. Example: ['20182019', '20152016']
        will obtain the information for the 2015-16 and 2018-19 seasons.

    Returns
    -------
    List of str
        List of links to live feeds of games for the requested seasons, in the order the seasons
        were given. An individual link has the form '/api/v1/game/2018020256/feed/live', where the
        substring '2018020256' represents the NHL's game ID for the game.

    '''
    # A bare string is itself iterable (one character at a time), which would turn
    # '20182019' into eight bogus one-character "seasons". Wrap it so that the
    # documented single-season form works.
    if isinstance(seasons, str):
        seasons = [seasons]

    return [link
            for season in seasons
            for link in read_game_feed_links(season)]

In [3]:
game_links = get_game_feed_links(SEASON_LIST)

In [4]:
len(game_links)

12646

A handful of these links have missing or broken play-by-play reports. The file `bad_links.json` has been produced to contain a list of these bad games. Since no data can be obtained from these games, they will be ignored. Approximately 0.68% of games are affected.

In [5]:
def get_missing_links(live_feed_links):
    '''
    Loads the recorded list of game links whose play-by-play reports are missing or broken.

    The list is read from 'data/bad_links.json', relative to the current working directory.

    Parameters
    ----------
    live_feed_links : list of str
        Not used; the contents of the file are returned regardless of this argument.

    Returns
    -------
    The decoded JSON content of the file (expected to be a list of live feed links).
    '''
    bad_link_path = Path.cwd().joinpath('data/bad_links.json')
    with bad_link_path.open('r') as link_file:
        return json.load(link_file)

In [6]:
missing_links = get_missing_links(game_links)

In [7]:
len(missing_links)

85

In [8]:
# Games to exclude in addition to the ones loaded into missing_links.
broken_links = [
    '/api/v1/game/2010020124/feed/live',
    '/api/v1/game/2013020971/feed/live',
]
bad_links = missing_links + broken_links

### Read Data Frames for Valid Games
Each game not marked as invalid has a data frame which tracks all shots taken during the game. Read those in now and combine all into a single frame.

In [9]:
def read_game_frame(live_feed_link):
    '''
    Loads the pickled shot frame for a single game.

    Parameters
    ----------
    live_feed_link : str
        Live feed link of the form '/api/v1/game/2018020256/feed/live'. The ten characters
        at positions 13 through 22 are taken as the game ID, so the link must have exactly
        this layout.

    Returns
    -------
    The object stored in 'data/games/combined_<game_id>.pkl' (relative to the current working
    directory), as returned by pandas.read_pickle.

    Notes
    -----
    Unpickling executes code embedded in the file, so only locally produced, trusted files
    should be read this way.
    '''
    # '/api/v1/game/' is 13 characters long, so the game ID starts at index 13.
    game_id = live_feed_link[13:23]
    frame_path = Path.cwd().joinpath('data/games/combined_' + game_id + '.pkl')
    return pd.read_pickle(str(frame_path))

In [10]:
# Hash the excluded links once so that each membership test is a constant-time lookup.
excluded_links = set(bad_links)
combined_frame_list = [
    read_game_frame(link)
    for link in game_links
    if link not in excluded_links
]
# ignore_index=True gives the combined frame a fresh 0..n-1 index in the same step.
shot_frame = pd.concat(combined_frame_list, ignore_index=True)

## Clean Shot Data
Now that the data has been read, clean it.

The field `game_id_htmlreport` was included in the individual game summaries for error-checking purposes. It is not needed since it should be redundant with `game_id_livefeed`.

In [11]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428368 entries, 0 to 1428367
Data columns (total 37 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428356 non-null  object 
 1   season              1428356 non-null  object 
 2   type                1428356 non-null  object 
 3   game_time           1428356 non-null  object 
 4   away_code           1428356 non-null  object 
 5   home_code           1428356 non-null  object 
 6   venue               1428356 non-null  object 
 7   period              1428368 non-null  int64  
 8   period_ord          1428356 non-null  object 
 9   period_type         1428356 non-null  object 
 10  time_elapsed        1428368 non-null  object 
 11  cum_time_elapsed    1428356 non-null  float64
 12  event               1428368 non-null  object 
 13  event_team_code     1428356 non-null  object 
 14  event_team_is_home  1428368 non-null  bool   
 15  event_coord_x  

In [12]:
# game_id_htmlreport is redundant with game_id_livefeed and can be removed.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (without it, a second run raises KeyError).
shot_frame.drop(columns='game_id_htmlreport', errors='ignore', inplace=True)

There are a handful of shots which seem to appear in the html play-by-play data, but not in the game live feed. This can occur when postgame evaluation of the play-by-play makes different determinations as to whether events were shots. These will be ignored as well.

In [13]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428368 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428356 non-null  object 
 1   season              1428356 non-null  object 
 2   type                1428356 non-null  object 
 3   game_time           1428356 non-null  object 
 4   away_code           1428356 non-null  object 
 5   home_code           1428356 non-null  object 
 6   venue               1428356 non-null  object 
 7   period              1428368 non-null  int64  
 8   period_ord          1428356 non-null  object 
 9   period_type         1428356 non-null  object 
 10  time_elapsed        1428368 non-null  object 
 11  cum_time_elapsed    1428356 non-null  float64
 12  event               1428368 non-null  object 
 13  event_team_code     1428356 non-null  object 
 14  event_team_is_home  1428368 non-null  bool   
 15  event_coord_x  

In [14]:
# Show every column when a frame is displayed, instead of eliding the middle ones.
pd.options.display.max_columns = None

In [15]:
shot_frame.head()

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
0,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:05,65.0,SHOT,CAR,False,56.0,-15.0,Snap Shot,False,EV,18:55,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",1135.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,37.0,Off. Zone,Save,Snap,36.249138,0.750862
1,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:16,76.0,MISS,CAR,False,35.0,33.0,,False,EV,18:44,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",1124.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,64.0,Off. Zone,Wide of Net,Wrist,63.285069,0.714931
2,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:37,97.0,MISS,MIN,True,87.0,-6.0,,False,EV,18:23,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",1103.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,6.0,Off. Zone,Wide of Net,Wrist,6.324555,0.324555
3,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,03:33,213.0,GOAL,MIN,True,78.0,3.0,Wrist Shot,False,EV,16:27,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'L': 1, 'D': 2, 'G': 1}",987.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,12.0,Off. Zone,,Wrist,11.401754,0.598246
4,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,03:49,229.0,BLOCK,MIN,True,77.0,13.0,,False,EV,16:11,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",971.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,,Off. Zone,Block,Snap,17.691806,


In [16]:
# Rows that carry no live-feed game ID.
shot_frame.loc[shot_frame['game_id_livefeed'].isna()]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
592608,,,,,,,,3,,,16:02,,BLOCK,,False,,,,,EV,03:58,"{'C': 2, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'D': 2, 'G': 1}",238.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,,Off. Zone,Block,Wrist,,
621845,,,,,,,,3,,,19:41,,SHOT,,True,,,,,PP,00:19,"{'C': 2, 'D': 3, 'G': 1}","{'C': 1, 'L': 1, 'D': 2, 'G': 1}",19.0,"{'FWD': 2, 'DEF': 3, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,36.0,Off. Zone,Save,Wrist,,
652166,,,,,,,,3,,,18:44,,SHOT,,True,,,,,PP,01:16,"{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",76.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 4, 'DEF': 2}","{'SKTR': 4, 'GOAL': 1}",{'SKTR': 6},False,True,23.0,Off. Zone,Save,Wrist,,
652167,,,,,,,,3,,,18:45,,SHOT,,True,,,,,PP,01:15,"{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",75.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 4, 'DEF': 2}","{'SKTR': 4, 'GOAL': 1}",{'SKTR': 6},False,True,52.0,Off. Zone,Save,Wrist,,
652168,,,,,,,,3,,,18:47,,SHOT,,True,,,,,PP,01:13,"{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",73.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 4, 'DEF': 2}","{'SKTR': 4, 'GOAL': 1}",{'SKTR': 6},False,True,18.0,Off. Zone,Save,Wrist,,
841902,,,,,,,,3,,,06:00,,MISS,,True,,,,,EV,14:00,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",840.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,40.0,Off. Zone,Wide of Net,Wrist,,
1377738,,,,,,,,1,,,05:01,,BLOCK,,False,,,,,EV,14:59,"{'C': 2, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",899.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,,Off. Zone,Block,Wrist,,
1377739,,,,,,,,1,,,11:06,,SHOT,,True,,,,,SH,08:54,"{'C': 1, 'R': 1, 'L': 2, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",534.0,"{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,11.0,Off. Zone,Save,Backhand,,
1377740,,,,,,,,1,,,15:02,,SHOT,,True,,,,,PP,04:58,"{'C': 2, 'L': 2, 'D': 1, 'G': 1}","{'C': 1, 'L': 1, 'D': 2, 'G': 1}",298.0,"{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,21.0,Off. Zone,Save,Wrist,,
1377741,,,,,,,,2,,,14:10,,SHOT,,True,,,,,EV,05:50,"{'C': 1, 'R': 2, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",350.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,58.0,Off. Zone,Save,Wrist,,


In [17]:
# Discard the rows that have no live-feed game ID (the events listed in the previous cell).
shot_frame.dropna(subset=['game_id_livefeed'], inplace=True)

A few shots are missing coordinates. These will also be ignored.

In [18]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428356 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428356 non-null  object 
 1   season              1428356 non-null  object 
 2   type                1428356 non-null  object 
 3   game_time           1428356 non-null  object 
 4   away_code           1428356 non-null  object 
 5   home_code           1428356 non-null  object 
 6   venue               1428356 non-null  object 
 7   period              1428356 non-null  int64  
 8   period_ord          1428356 non-null  object 
 9   period_type         1428356 non-null  object 
 10  time_elapsed        1428356 non-null  object 
 11  cum_time_elapsed    1428356 non-null  float64
 12  event               1428356 non-null  object 
 13  event_team_code     1428356 non-null  object 
 14  event_team_is_home  1428356 non-null  bool   
 15  event_coord_x  

In [19]:
# Rows where at least one of the two event coordinates is null.
shot_frame.loc[shot_frame[['event_coord_x', 'event_coord_y']].isna().any(axis=1)]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
25893,2010020237,20102011,R,2010-11-14T00:00:00Z,FLA,PHI,Wells Fargo Center,1,1st,REGULAR,05:52,352.0,BLOCK,FLA,False,-65.0,,,False,EV,14:08,"{'C': 1, 'L': 2, 'D': 2, 'G': 1}","{'C': 1, 'L': 2, 'D': 2, 'G': 1}",848.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,,Def. Zone,Block,Wrist,,
95315,2010020863,20102011,R,2011-02-18T00:00:00Z,BOS,NYI,Nassau Coliseum,3,3rd,REGULAR,19:10,3550.0,BLOCK,NYI,True,46.0,,,False,EV,00:50,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 3, 'D': 2, 'G': 1}",50.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,,Off. Zone,Block,Snap,,
149485,2011020031,20112012,R,2011-10-10T22:00:00Z,PHX,DAL,American Airlines Center,3,3rd,REGULAR,05:53,2753.0,SHOT,PHX,False,,,Slap Shot,False,PP,14:07,"{'C': 1, 'R': 2, 'L': 1, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",847.0,"{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,0.0,,Save,Slap,,
263572,2011021075,20112012,R,2012-03-18T02:30:00Z,DET,SJS,HP Pavilion at San Jose,2,2nd,REGULAR,06:50,1610.0,SHOT,DET,False,31.0,,Wrist Shot,False,EV,13:10,"{'C': 1, 'L': 2, 'D': 2, 'G': 1}","{'C': 3, 'D': 2, 'G': 1}",790.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,72.0,Off. Zone,Save,Wrist,,
338102,2012020446,20122013,R,2013-03-21T23:00:00Z,NJD,CAR,PNC Arena,1,1st,REGULAR,03:54,234.0,GOAL,NJD,False,,,,False,PP,16:06,"{'C': 1, 'R': 1, 'L': 2, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",966.0,"{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,0.0,,,,,
341983,2012020482,20122013,R,2013-03-26T23:00:00Z,WPG,CAR,PNC Arena,3,3rd,REGULAR,12:37,3157.0,GOAL,WPG,False,,,,False,EV,07:23,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'D': 2, 'G': 1}",443.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,0.0,,,,,
364236,2012020684,20122013,R,2013-04-23T23:30:00Z,BOS,PHI,Wells Fargo Center,2,2nd,REGULAR,11:31,1891.0,GOAL,PHI,True,,,,False,EV,08:29,"{'C': 2, 'R': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",509.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,0.0,,,,,
394817,2013020143,20132014,R,2013-10-24T23:30:00Z,CHI,TBL,Tampa Bay Times Forum,3,3rd,REGULAR,18:11,3491.0,GOAL,CHI,False,,,,False,EV,01:49,"{'C': 1, 'R': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'D': 2, 'G': 1}",109.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,0.0,,,,,
402275,2013020212,20132014,R,2013-11-03T18:00:00Z,DAL,OTT,Canadian Tire Centre,3,3rd,REGULAR,03:11,2591.0,GOAL,DAL,False,,,Deflected,False,EV,16:49,"{'C': 2, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'L': 1, 'D': 2, 'G': 1}",1009.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,0.0,,,Deflected,,
488918,2013020999,20132014,R,2014-03-14T23:30:00Z,NJD,FLA,BB&T Center,1,1st,REGULAR,17:10,1030.0,MISS,FLA,True,,-13.0,,False,EV,02:50,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'D': 2, 'G': 1}",170.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,15.0,Off. Zone,Over Net,Wrist,,


In [20]:
# Discard events that are missing either coordinate.
shot_frame.dropna(subset=['event_coord_x', 'event_coord_y'], inplace=True)

A few events from one game are missing data as to which players are on the ice. Ignore these as well.

In [21]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428328 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428328 non-null  object 
 1   season              1428328 non-null  object 
 2   type                1428328 non-null  object 
 3   game_time           1428328 non-null  object 
 4   away_code           1428328 non-null  object 
 5   home_code           1428328 non-null  object 
 6   venue               1428328 non-null  object 
 7   period              1428328 non-null  int64  
 8   period_ord          1428328 non-null  object 
 9   period_type         1428328 non-null  object 
 10  time_elapsed        1428328 non-null  object 
 11  cum_time_elapsed    1428328 non-null  float64
 12  event               1428328 non-null  object 
 13  event_team_code     1428328 non-null  object 
 14  event_team_is_home  1428328 non-null  bool   
 15  event_coord_x  

In [22]:
# Rows where the on-ice position data is null for either team.
shot_frame.loc[shot_frame[['pos_a', 'pos_h']].isna().any(axis=1)]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
1412138,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,1,1st,REGULAR,00:00,0.0,GOAL,ANA,True,57.0,-14.0,,False,EV,20:00,,,1200.0,,,,,,,34.0,Off. Zone,,,34.928498,0.928498
1412139,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,1,1st,REGULAR,00:00,0.0,GOAL,ANA,True,57.0,-14.0,,False,EV,20:00,,,1200.0,,,,,,,41.0,Off. Zone,,,34.928498,6.071502
1412140,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,1,1st,REGULAR,00:00,0.0,GOAL,STL,False,57.0,-26.0,,True,EV,20:00,,,1200.0,,,,,,,34.0,Off. Zone,,,41.231056,7.231056
1412141,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,1,1st,REGULAR,00:00,0.0,GOAL,STL,False,57.0,-26.0,,True,EV,20:00,,,1200.0,,,,,,,41.0,Off. Zone,,,41.231056,0.231056


In [23]:
# Discard events that lack on-ice position data for either team.
shot_frame.dropna(subset=['pos_a', 'pos_h'], inplace=True)

There are a handful of remaining fields with null entries. These are `secondary_type`, `shot_dist`, `miss_type`, `shot_type`, and `dist_difference`. The construction of `dist_difference` means that it's only defined when `shot_dist` is defined. Of the others, `shot_dist` is known to be missing when the event type is `BLOCK`. The `miss_type` is missing when the event type is `GOAL`.

In [24]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   period              1428324 non-null  int64  
 8   period_ord          1428324 non-null  object 
 9   period_type         1428324 non-null  object 
 10  time_elapsed        1428324 non-null  object 
 11  cum_time_elapsed    1428324 non-null  float64
 12  event               1428324 non-null  object 
 13  event_team_code     1428324 non-null  object 
 14  event_team_is_home  1428324 non-null  bool   
 15  event_coord_x  

Check `miss_type` to confirm that it is only missing when the event type is `GOAL`. Three `MISS` events also lack a miss type; this shouldn't pose a major concern, so they will be marked as `Miss`. To eliminate the remaining null entries, goals will be marked with the miss type `Goal`.

In [25]:
shot_frame['miss_type'].value_counts()

Save            700536
Block           362167
Wide of Net     251564
Over Net         29796
Goalpost         12634
Hit Crossbar      2410
Name: miss_type, dtype: int64

In [26]:
# Miss types recorded on goal events.
shot_frame.loc[shot_frame['event'] == 'GOAL', 'miss_type'].value_counts()

Series([], Name: miss_type, dtype: int64)

In [27]:
# Event types of the rows that have no miss type.
shot_frame.loc[shot_frame['miss_type'].isna(), 'event'].value_counts()

GOAL    69214
MISS        3
Name: event, dtype: int64

In [28]:
# MISS events that have no miss type.
shot_frame.loc[shot_frame['miss_type'].isna() & shot_frame['event'].eq('MISS')]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
152513,2011020055,20112012,R,2011-10-15T23:00:00Z,WPG,PHX,Jobing.com Arena,3,3rd,REGULAR,12:22,3142.0,MISS,PHX,True,80.0,0.0,,False,PP,07:38,"{'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'L': 2, 'D': 1, 'G': 1}",458.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}",False,False,10.0,Off. Zone,,Tip-In,9.0,1.0
436420,2013020524,20132014,R,2013-12-20T00:00:00Z,PHX,TOR,Air Canada Centre,4,OT,OVERTIME,03:04,3784.0,MISS,PHX,False,35.0,-11.0,,False,EV,01:56,"{'C': 1, 'R': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",116.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,56.0,Off. Zone,,Wrist,55.108983,0.891017
645476,2014021068,20142015,R,2015-03-21T20:00:00Z,VAN,LAK,STAPLES Center,3,3rd,REGULAR,19:37,3577.0,MISS,LAK,True,81.0,-4.0,,True,PP,00:23,"{'C': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",23.0,"{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'FWD': 4, 'DEF': 2}","{'SKTR': 4, 'GOAL': 1}",{'SKTR': 6},False,True,9.0,Off. Zone,,Wrist,8.944272,0.055728


In [31]:
# Give the MISS events without a miss type the generic label 'Miss'.
shot_frame.loc[
    shot_frame['miss_type'].isna() & shot_frame['event'].eq('MISS'),
    'miss_type',
] = 'Miss'

In [32]:
# Re-run the same selection of MISS events without a miss type, now that they have been labelled.
shot_frame.loc[shot_frame['miss_type'].isna() & shot_frame['event'].eq('MISS')]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference


In [34]:
# Event types of the rows that still have no miss type.
shot_frame.loc[shot_frame['miss_type'].isna(), 'event'].value_counts()

GOAL    69214
Name: event, dtype: int64

In [36]:
# Column summary for the rows that still have no miss type.
shot_frame.loc[shot_frame['miss_type'].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69214 entries, 3 to 1428299
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   game_id_livefeed    69214 non-null  object 
 1   season              69214 non-null  object 
 2   type                69214 non-null  object 
 3   game_time           69214 non-null  object 
 4   away_code           69214 non-null  object 
 5   home_code           69214 non-null  object 
 6   venue               69214 non-null  object 
 7   period              69214 non-null  int64  
 8   period_ord          69214 non-null  object 
 9   period_type         69214 non-null  object 
 10  time_elapsed        69214 non-null  object 
 11  cum_time_elapsed    69214 non-null  float64
 12  event               69214 non-null  object 
 13  event_team_code     69214 non-null  object 
 14  event_team_is_home  69214 non-null  bool   
 15  event_coord_x       69214 non-null  float64
 16  ev

In [37]:
# Only goals are expected to lack a miss type at this point. Restricting the update to
# GOAL events keeps any other null from being mislabelled as 'Goal' if this cell is run
# out of order (for example before the MISS events have been labelled).
shot_frame.loc[
    shot_frame['miss_type'].isna() & (shot_frame['event'] == 'GOAL'),
    'miss_type',
] = 'Goal'

In [38]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   period              1428324 non-null  int64  
 8   period_ord          1428324 non-null  object 
 9   period_type         1428324 non-null  object 
 10  time_elapsed        1428324 non-null  object 
 11  cum_time_elapsed    1428324 non-null  float64
 12  event               1428324 non-null  object 
 13  event_team_code     1428324 non-null  object 
 14  event_team_is_home  1428324 non-null  bool   
 15  event_coord_x  

Now, confirm that the missing `shot_dist`/`dist_difference` entries occur for blocks.

In [40]:
# Event types of the rows that have no reported shot distance.
shot_frame.loc[shot_frame['shot_dist'].isna(), 'event'].value_counts()

BLOCK    362167
Name: event, dtype: int64

In [41]:
shot_frame['event'].value_counts()

SHOT     700536
BLOCK    362167
MISS     296407
GOAL      69214
Name: event, dtype: int64

The field `secondary_type` carries the shot-type from the live game feeds. However, this is omitted from the feeds for blocks and misses. It is missing from a handful of additional events as well.

In [42]:
# Event types of the rows that have no secondary_type.
shot_frame.loc[shot_frame['secondary_type'].isna(), 'event'].value_counts()

BLOCK    362167
MISS     296407
GOAL         43
SHOT         19
Name: event, dtype: int64

The field `shot_type` is obtained from the HTML play-by-play report. This is missing for a handful of events.

In [43]:
# Event types of the rows that have no shot_type.
shot_frame.loc[shot_frame['shot_type'].isna(), 'event'].value_counts()

GOAL     51
SHOT     19
BLOCK    16
MISS      3
Name: event, dtype: int64

Most of the shot-types missing from the play-by-play are also missing from the live feed. These will be left alone for now.

In [44]:
# Event types of the rows where both shot-type fields are null.
shot_frame.loc[
    shot_frame[['secondary_type', 'shot_type']].isna().all(axis=1),
    'event',
].value_counts()

GOAL     43
SHOT     19
BLOCK    16
MISS      3
Name: event, dtype: int64

In [46]:
# Goals that have a secondary_type but no shot_type.
shot_frame.loc[
    shot_frame['secondary_type'].notna()
    & shot_frame['shot_type'].isna()
    & shot_frame['event'].eq('GOAL')
]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
981106,2017020083,20172018,R,2017-10-17T23:00:00Z,FLA,PHI,Wells Fargo Center,3,3rd,REGULAR,18:23,3503.0,GOAL,PHI,True,-22.0,23.0,Wrist Shot,False,EV,01:37,"{'C': 1, 'R': 2, 'L': 1, 'D': 2}","{'C': 1, 'L': 2, 'D': 2, 'G': 1}",97.0,"{'FWD': 4, 'DEF': 2}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",{'SKTR': 6},"{'SKTR': 5, 'GOAL': 1}",True,False,113.0,Neu. Zone,Goal,,113.35784,0.35784
997522,2017020222,20172018,R,2017-11-07T03:00:00Z,DET,VAN,Rogers Arena,2,2nd,REGULAR,15:15,2115.0,GOAL,DET,False,74.0,2.0,Wrist Shot,False,PP,04:45,"{'C': 2, 'R': 2, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",285.0,"{'FWD': 4, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}","{'SKTR': 5, 'GOAL': 1}","{'SKTR': 4, 'GOAL': 1}",False,False,16.0,Off. Zone,Goal,,15.132746,0.867254
1064542,2017020791,20172018,R,2018-02-03T01:00:00Z,VGK,MIN,Xcel Energy Center,3,3rd,REGULAR,18:31,3511.0,GOAL,MIN,True,40.0,-16.0,Wrist Shot,False,EV,01:29,"{'C': 1, 'R': 1, 'L': 3, 'D': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",89.0,"{'FWD': 5, 'DEF': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",{'SKTR': 6},"{'SKTR': 5, 'GOAL': 1}",True,False,52.0,Off. Zone,Goal,,51.546096,0.453904
1076528,2017020891,20172018,R,2018-02-17T00:30:00Z,NYI,CAR,PNC Arena,3,3rd,REGULAR,19:28,3568.0,GOAL,NYI,False,67.0,-23.0,Wrist Shot,False,EV,00:32,"{'C': 3, 'D': 2, 'G': 1}","{'C': 3, 'R': 1, 'L': 1, 'D': 1}",32.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 5, 'DEF': 1}","{'SKTR': 5, 'GOAL': 1}",{'SKTR': 6},False,True,32.0,Off. Zone,Goal,,31.827661,0.172339
1124303,2017030153,20172018,P,2018-04-17T02:00:00Z,NSH,COL,Pepsi Center,3,3rd,REGULAR,18:24,3504.0,GOAL,COL,True,75.0,-7.0,Wrist Shot,False,EV,01:36,"{'C': 2, 'L': 2, 'D': 2}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",96.0,"{'FWD': 4, 'DEF': 2}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",{'SKTR': 6},"{'SKTR': 5, 'GOAL': 1}",True,False,16.0,Off. Zone,Goal,,15.652476,0.347524
1152013,2018020173,20182019,R,2018-10-31T00:00:00Z,VGK,NSH,Bridgestone Arena,3,3rd,REGULAR,17:42,3462.0,GOAL,NSH,True,79.0,1.0,Wrist Shot,False,EV,02:18,"{'C': 2, 'L': 2, 'D': 2}","{'C': 1, 'L': 2, 'D': 2, 'G': 1}",138.0,"{'FWD': 4, 'DEF': 2}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",{'SKTR': 6},"{'SKTR': 5, 'GOAL': 1}",True,False,11.0,Off. Zone,Goal,,10.049876,0.950124
1153685,2018020187,20182019,R,2018-11-02T01:00:00Z,COL,CGY,Scotiabank Saddledome,3,3rd,REGULAR,18:57,3537.0,GOAL,COL,False,81.0,1.0,Wrist Shot,True,PP,01:03,"{'C': 3, 'R': 1, 'L': 1, 'D': 1}","{'C': 2, 'D': 2, 'G': 1}",63.0,"{'FWD': 5, 'DEF': 1}","{'FWD': 2, 'DEF': 2, 'GOAL': 1}",{'SKTR': 6},"{'SKTR': 4, 'GOAL': 1}",True,False,9.0,Off. Zone,Goal,,8.062258,0.937742
1155919,2018020206,20182019,R,2018-11-04T02:30:00Z,PHI,SJS,SAP Center at San Jose,4,OT,OVERTIME,00:13,3613.0,GOAL,SJS,True,61.0,-9.0,Wrist Shot,False,EV,04:47,"{'R': 2, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 1, 'G': 1}",287.0,"{'FWD': 2, 'DEF': 1, 'GOAL': 1}","{'FWD': 2, 'DEF': 1, 'GOAL': 1}","{'SKTR': 3, 'GOAL': 1}","{'SKTR': 3, 'GOAL': 1}",False,False,30.0,Off. Zone,Goal,,29.410882,0.589118


The position fields give the positions of the players on the ice at any time. Generally speaking, the number of players/attackers/forwards on the ice is more important than the specific positions.

In [47]:
def _players_on_ice(position_counts):
    '''Return the total head count stored in a per-position counter.

    Assumes `position_counts` exposes `.elements()` (i.e. is a
    collections.Counter) -- TODO confirm against the cell that builds pos_h/pos_a.
    '''
    return len(list(position_counts.elements()))

# Total players on the ice for each side, from the per-position counters.
for target, source in (('players_h', 'pos_h'), ('players_a', 'pos_a')):
    shot_frame[target] = shot_frame[source].apply(_players_on_ice)

# Replace each skater mapping with its bare 'SKTR' count. The columns are
# overwritten, so this cell cannot be run a second time on the same frame.
for column in ('skaters_h', 'skaters_a'):
    shot_frame[column] = shot_frame[column].apply(lambda counts: counts['SKTR'])

# Number of forwards for each side, from the forward/defence mappings.
for target, source in (('fwds_h', 'fwd_def_h'), ('fwds_a', 'fwd_def_a')):
    shot_frame[target] = shot_frame[source].apply(lambda counts: counts['FWD'])

In [48]:
shot_frame.head()

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,secondary_type,is_rebound,strength,time_remaining,pos_a,pos_h,seconds_remaining,fwd_def_a,fwd_def_h,skaters_a,skaters_h,goalie_pulled_a,goalie_pulled_h,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference,players_h,players_a,fwds_h,fwds_a
0,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:05,65.0,SHOT,CAR,False,56.0,-15.0,Snap Shot,False,EV,18:55,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",1135.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",5,5,False,False,37.0,Off. Zone,Save,Snap,36.249138,0.750862,6,6,3,3
1,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:16,76.0,MISS,CAR,False,35.0,33.0,,False,EV,18:44,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",1124.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",5,5,False,False,64.0,Off. Zone,Wide of Net,Wrist,63.285069,0.714931,6,6,3,3
2,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,01:37,97.0,MISS,MIN,True,87.0,-6.0,,False,EV,18:23,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",1103.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",5,5,False,False,6.0,Off. Zone,Wide of Net,Wrist,6.324555,0.324555,6,6,3,3
3,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,03:33,213.0,GOAL,MIN,True,78.0,3.0,Wrist Shot,False,EV,16:27,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'L': 1, 'D': 2, 'G': 1}",987.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",5,5,False,False,12.0,Off. Zone,Goal,Wrist,11.401754,0.598246,6,6,3,3
4,2010020003,20102011,R,2010-10-07T16:00:00Z,CAR,MIN,Hartwall Areena,1,1st,REGULAR,03:49,229.0,BLOCK,MIN,True,77.0,13.0,,False,EV,16:11,"{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",971.0,"{'FWD': 3, 'DEF': 2, 'GOAL': 1}","{'FWD': 3, 'DEF': 2, 'GOAL': 1}",5,5,False,False,,Off. Zone,Block,Snap,17.691806,,6,6,3,3


In [49]:
# Remove the per-position mapping columns from shot_frame (modifies it in place).
shot_frame.drop(
    columns=['pos_h', 'pos_a', 'fwd_def_a', 'fwd_def_h'],
    inplace=True,
)

In [50]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   period              1428324 non-null  int64  
 8   period_ord          1428324 non-null  object 
 9   period_type         1428324 non-null  object 
 10  time_elapsed        1428324 non-null  object 
 11  cum_time_elapsed    1428324 non-null  float64
 12  event               1428324 non-null  object 
 13  event_team_code     1428324 non-null  object 
 14  event_team_is_home  1428324 non-null  bool   
 15  event_coord_x  

Several fields: `is_rebound`, `goalie_pulled_a`, and `goalie_pulled_h` should be boolean-valued.

In [51]:
# Cast the three flag columns to the bool dtype.
for flag_column in ('is_rebound', 'goalie_pulled_a', 'goalie_pulled_h'):
    shot_frame[flag_column] = shot_frame[flag_column].astype(bool)

In [52]:
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 1428367
Data columns (total 36 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   period              1428324 non-null  int64  
 8   period_ord          1428324 non-null  object 
 9   period_type         1428324 non-null  object 
 10  time_elapsed        1428324 non-null  object 
 11  cum_time_elapsed    1428324 non-null  float64
 12  event               1428324 non-null  object 
 13  event_team_code     1428324 non-null  object 
 14  event_team_is_home  1428324 non-null  bool   
 15  event_coord_x  

In [58]:
# Persist the cleaned frame under <working directory>/data/cleaned.pkl.
current_dir = Path.cwd()
relative_path = 'data/cleaned.pkl'
frame_path = current_dir / relative_path
shot_frame.to_pickle(frame_path)

## Analysis of Shot Location

### Shot-Angles
Next, introduce the shot-angle. This measures the angular distance between the shot (as measured from the location of the event to the center of the goal line, located at $(89,0)$) and the $x$-axis. Shots with negative $y$-coordinate are taken to have negative angles. The angle is measured in degrees and should lie between -180 and 180.

In [53]:
# Horizontal offset from the event location to the goal line at x = 89.
dx_to_goal = 89 - shot_frame['event_coord_x']
# arctan2 keeps the sign of y, so shots with negative y get negative angles;
# the result is converted from radians to degrees.
shot_frame['calc_angle'] = np.arctan2(shot_frame['event_coord_y'], dx_to_goal) * 180 / np.pi
shot_frame['calc_angle'].describe()

count    1.428324e+06
mean    -1.056570e+00
std      3.482789e+01
min     -1.800000e+02
25%     -2.740758e+01
50%      0.000000e+00
75%      2.582099e+01
max      1.800000e+02
Name: calc_angle, dtype: float64

### Shot-Location Adjustments
The shot-location data is well known to have an arena bias (see e.g. Schuckers and Curro at http://statsportsconsulting.com/main/wp-content/uploads/Schuckers_Curro_MIT_Sloan_THoR.pdf). Madison Square Garden is considered to be one of the stronger outliers.

In [111]:
shot_frame['calc_dist'].describe()

count    1.428324e+06
mean     3.373279e+01
std      2.074509e+01
min      0.000000e+00
25%      1.788854e+01
50%      3.087070e+01
75%      4.617359e+01
max      1.926344e+02
Name: calc_dist, dtype: float64

The mean shot distance at Madison Square Garden is 4 feet closer to the net than in the full set. The median distance is 6 feet closer.

In [113]:
# Summary of calculated shot distance for events at Madison Square Garden only.
shot_frame.loc[shot_frame['venue'] == 'Madison Square Garden', 'calc_dist'].describe()

count    49132.000000
mean        29.666476
std         20.650588
min          0.000000
25%         14.142136
50%         24.515301
75%         41.593269
max        187.416648
Name: calc_dist, dtype: float64

The article by Schuckers and Curro provides a method for adjusting coordinates to account for the bias.
Each coordinate is adjusted independently. For a coordinate, the empirical distribution functions $F$ and $F_A$ are calculated, where $F$ is the CDF of all shot coordinates and $F_A$ is the CDF looking at all shot coordinates where the shot was taken by the visiting team. There are a handful of neutral-site venues. In these games, the away-team was chosen to be the team determined by the NHL to be the visiting team for that game.
In a similar fashion, empirical distribution functions $F_V$ and $F_{VA}$ are calculated for each venue independently. Here, $F_V$ is the CDF for all shots at that venue and $F_{VA}$ is the CDF for all shots by the designated visiting team at that venue.
Using these ECDFs, an adjusted quantile `q` is calculated for each coordinate `c` to be $q(c) = F_V(c) - (F_{VA}(c) - F_A(c)).$  This quantile is adjusted to be in the interval $[0,1]$ by taking $q'(c) = \min(\max(q(c), 0), 1).$ Finally, the adjusted coordinate `c'` is calculated by $F^{-1}(q')$ using linear interpolation. The adjusted coordinates are assigned to variables `adj_x` and `adj_y`. Note: Schuckers and Curro don't use linear interpolation (or any interpolation); however, they suggest using a smoothed version of the ECDF.

In [None]:
def _coordinate_ecdfs(shots):
    '''Return {'x': ECDF, 'y': ECDF} fitted to the event coordinates in `shots`.'''
    return {
        'x': ECDF(list(shots['event_coord_x'])),
        'y': ECDF(list(shots['event_coord_y'])),
    }

venue_list = list(shot_frame['venue'].unique())
venue_all = {}   # venue -> ECDFs over every event at that venue
venue_away = {}  # venue -> ECDFs over events where event_team_is_home is False
for venue in venue_list:
    venue_shots = shot_frame[shot_frame['venue'] == venue]
    venue_all[venue] = _coordinate_ecdfs(venue_shots)
    venue_away[venue] = _coordinate_ecdfs(venue_shots[~venue_shots['event_team_is_home']])

# The key 'all' holds the ECDFs over every away-team event, regardless of venue.
away_shots = shot_frame[~shot_frame['event_team_is_home']]
venue_away['all'] = _coordinate_ecdfs(away_shots)

# Sorted coordinates of all events, used later to invert the overall distribution.
sorted_x = shot_frame['event_coord_x'].sort_values()
sorted_y = shot_frame['event_coord_y'].sort_values()


<b>Warning!</b> The following cells may take several hours to run.

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# every column added below also appears on shot_frame.
adjusted_frame = shot_frame

# ECDF objects are vectorised step functions, so each one is evaluated once on
# an array of coordinates instead of once per row through DataFrame.apply.
x_values = adjusted_frame['event_coord_x'].to_numpy()
f_vx = np.full(len(adjusted_frame), np.nan)
f_vax = np.full(len(adjusted_frame), np.nan)
# GroupBy.indices maps each venue to the integer positions of its rows.
for venue, positions in adjusted_frame.groupby('venue').indices.items():
    venue_x = x_values[positions]
    f_vx[positions] = venue_all[venue]['x'](venue_x)     # F_V: all events at the venue
    f_vax[positions] = venue_away[venue]['x'](venue_x)   # F_VA: away-team events at the venue
adjusted_frame['f_vx'] = f_vx
adjusted_frame['f_vax'] = f_vax
adjusted_frame['f_ax'] = venue_away['all']['x'](x_values)  # F_A: away-team events at every venue

# q = F_V - (F_VA - F_A), clamped to [0, 1].
adjusted_frame['arg_x'] = np.clip(
    adjusted_frame['f_vx'] - (adjusted_frame['f_vax'] - adjusted_frame['f_ax']), 0, 1)

# Invert the overall distribution with linear interpolation (np.quantile's
# default, so the deprecated `interpolation=` keyword is not needed). Only the
# distinct quantiles are evaluated; the results are then scattered back to rows.
unique_q, inverse = np.unique(adjusted_frame['arg_x'].to_numpy(), return_inverse=True)
adjusted_frame['adj_x'] = np.quantile(sorted_x.to_numpy(), unique_q)[inverse]

In [None]:
# ECDF objects are vectorised step functions, so each one is evaluated once on
# an array of coordinates instead of once per row through DataFrame.apply.
y_values = adjusted_frame['event_coord_y'].to_numpy()
f_vy = np.full(len(adjusted_frame), np.nan)
f_vay = np.full(len(adjusted_frame), np.nan)
# GroupBy.indices maps each venue to the integer positions of its rows.
for venue, positions in adjusted_frame.groupby('venue').indices.items():
    venue_y = y_values[positions]
    f_vy[positions] = venue_all[venue]['y'](venue_y)     # F_V: all events at the venue
    f_vay[positions] = venue_away[venue]['y'](venue_y)   # F_VA: away-team events at the venue
adjusted_frame['f_vy'] = f_vy
adjusted_frame['f_vay'] = f_vay
adjusted_frame['f_ay'] = venue_away['all']['y'](y_values)  # F_A: away-team events at every venue

# q = F_V - (F_VA - F_A), clamped to [0, 1].
adjusted_frame['arg_y'] = np.clip(
    adjusted_frame['f_vy'] - (adjusted_frame['f_vay'] - adjusted_frame['f_ay']), 0, 1)

# Invert the overall distribution with linear interpolation (np.quantile's
# default, so the deprecated `interpolation=` keyword is not needed). Only the
# distinct quantiles are evaluated; the results are then scattered back to rows.
unique_q, inverse = np.unique(adjusted_frame['arg_y'].to_numpy(), return_inverse=True)
adjusted_frame['adj_y'] = np.quantile(sorted_y.to_numpy(), unique_q)[inverse]

In [None]:
adjusted_frame[['event_coord_x','event_coord_y','adj_x','adj_y']].head()

Using the adjusted coordinates, re-calculate the angle of the shot and the distance of the shot. These are determined by using the line segment starting at the midpoint of the goal line and ending at the adjusted shot location. The goal line corresponds to the line $x = 89$, indicating that the midpoint of the goal line is at coordinates $(89, 0)$. The adjusted distance `adj_dist` is simply the length of this segment. The adjusted angle `adj_angle` is the angle between this segment and the ray from the midpoint of the goal line passing through center ice (coordinates $(0,0)$).

In [None]:
# Distance and signed angle (degrees) from the adjusted shot location to the
# midpoint of the goal line at (89, 0). The adjusted coordinates are read from
# adjusted_frame itself; reading them through shot_frame only works while both
# names refer to the same DataFrame object.
adjusted_frame['adj_dist'] = np.sqrt((adjusted_frame['adj_x'] - 89)**2 + adjusted_frame['adj_y']**2)
adjusted_frame['adj_angle'] = np.arctan2(adjusted_frame['adj_y'], 89 - adjusted_frame['adj_x']) * 180 / np.pi
adjusted_frame['adj_angle'].describe()

The temporary columns are no longer needed.

In [65]:
# Remove the intermediate ECDF-value and quantile columns (modifies the frame in place).
adjusted_frame.drop(
    columns=['f_vx', 'f_vax', 'f_ax', 'arg_x', 'f_vy', 'f_vay', 'f_ay', 'arg_y'],
    inplace=True,
)

Re-check the differences for Madison Square Garden.

In [114]:
adjusted_frame['adj_dist'].describe()

count    1.428324e+06
mean     3.336882e+01
std      2.080060e+01
min      0.000000e+00
25%      1.746425e+01
50%      3.046309e+01
75%      4.596738e+01
max      1.926344e+02
Name: adj_dist, dtype: float64

In [115]:
# Summary of adjusted shot distance for events at Madison Square Garden only.
adjusted_frame.loc[adjusted_frame['venue'] == 'Madison Square Garden', 'adj_dist'].describe()

count    49132.000000
mean        29.740026
std         20.758430
min          0.000000
25%         14.212670
50%         24.698178
75%         41.629317
max        189.401690
Name: adj_dist, dtype: float64

In [116]:
adjusted_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 1428367
Data columns (total 41 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   period              1428324 non-null  int64  
 8   period_ord          1428324 non-null  object 
 9   period_type         1428324 non-null  object 
 10  time_elapsed        1428324 non-null  object 
 11  cum_time_elapsed    1428324 non-null  float64
 12  event               1428324 non-null  object 
 13  event_team_code     1428324 non-null  object 
 14  event_team_is_home  1428324 non-null  bool   
 15  event_coord_x  

The majority of NHL games are 60 minutes long, with a sizable minority having additional time for overtime. We would expect the median event time to be roughly 30 minutes into the game.

In [117]:
adjusted_frame['cum_time_elapsed'].describe()

count    1.428324e+06
mean     1.834033e+03
std      1.061119e+03
min      0.000000e+00
25%      9.260000e+02
50%      1.819000e+03
75%      2.728000e+03
max      9.027000e+03
Name: cum_time_elapsed, dtype: float64

In [118]:
adjusted_frame['period_type'].unique()

array(['REGULAR', 'OVERTIME'], dtype=object)

In [119]:
# Cumulative elapsed time for events in REGULAR periods only.
adjusted_frame.loc[adjusted_frame['period_type'] == 'REGULAR', 'cum_time_elapsed'].describe()

count    1.402454e+06
mean     1.796395e+03
std      1.031972e+03
min      0.000000e+00
25%      9.090000e+02
50%      1.787000e+03
75%      2.678750e+03
max      3.600000e+03
Name: cum_time_elapsed, dtype: float64

Likewise, most NHL periods are 20 minutes long. Only regular season overtimes are shorter, at 5 minutes. We would expect the median time remaining in a period to be 10 minutes.

In [120]:
adjusted_frame['seconds_remaining'].describe()

count    1.428324e+06
mean     5.898845e+02
std      3.480634e+02
min      0.000000e+00
25%      2.830000e+02
50%      5.930000e+02
75%      8.930000e+02
max      1.200000e+03
Name: seconds_remaining, dtype: float64

In [121]:
# Seconds remaining in the period for events in REGULAR periods only.
adjusted_frame.loc[adjusted_frame['period_type'] == 'REGULAR', 'seconds_remaining'].describe()

count    1.402454e+06
mean     5.955452e+02
std      3.463804e+02
min      0.000000e+00
25%      2.960000e+02
50%      6.010000e+02
75%      8.960000e+02
max      1.200000e+03
Name: seconds_remaining, dtype: float64

Overtimes are sudden-death, where a single goal ends the game. Since overtimes will often not last the entire scheduled time, we would expect the median regular-season overtime event to take place with over 2:30 remaining and the median playoff overtime event to take place with over 10:00 remaining.

In [125]:
# Seconds remaining for OVERTIME events in games of type 'R'.
adjusted_frame.loc[
    (adjusted_frame['period_type'] == 'OVERTIME') & (adjusted_frame['type'] == 'R'),
    'seconds_remaining',
].describe()

count    20357.000000
mean       154.351329
std         85.067489
min          0.000000
25%         82.000000
50%        162.000000
75%        228.000000
max        298.000000
Name: seconds_remaining, dtype: float64

In [126]:
# Seconds remaining for OVERTIME events in games of type 'P'.
adjusted_frame.loc[
    (adjusted_frame['period_type'] == 'OVERTIME') & (adjusted_frame['type'] == 'P'),
    'seconds_remaining',
].describe()

count    5513.000000
mean      758.071830
std       319.737652
min         0.000000
25%       529.000000
50%       826.000000
75%      1032.000000
max      1195.000000
Name: seconds_remaining, dtype: float64

The valid number of players on the ice should vary from 4 to 6. 

In [132]:
adjusted_frame['players_a'].value_counts()

6     1259169
5      154813
4       14212
7         108
2          10
1           4
8           3
11          2
3           2
9           1
Name: players_a, dtype: int64

In [133]:
adjusted_frame['players_h'].value_counts()

6    1272398
5     142606
4      13237
7         61
2          8
1          8
3          3
8          2
9          1
Name: players_h, dtype: int64

In [134]:
# Keep only events where both teams have between 4 and 6 players recorded on
# the ice (bounds inclusive).
valid_player_counts = (
    adjusted_frame['players_a'].between(4, 6)
    & adjusted_frame['players_h'].between(4, 6)
)
# .copy() makes the filtered result an independent frame, so the columns that
# later cells assign to it do not raise SettingWithCopyWarning.
adjusted_frame = adjusted_frame[valid_player_counts].copy()

In [135]:
adjusted_frame['players_a'].value_counts()

6    1259100
5     154804
4      14211
Name: players_a, dtype: int64

In [137]:
adjusted_frame['players_h'].value_counts()

6    1272278
5     142601
4      13236
Name: players_h, dtype: int64

The number of skaters should vary between 3 and 6, with the most common value being 5. The number of forwards is allowed to vary between 0 and 6, with 3 being the most common.

In [138]:
adjusted_frame['skaters_a'].value_counts()

5    1243032
4     154127
6      16781
3      14175
Name: skaters_a, dtype: int64

In [139]:
adjusted_frame['skaters_h'].value_counts()

5    1256996
4     141938
6      15971
3      13210
Name: skaters_h, dtype: int64

In [140]:
adjusted_frame['fwds_a'].value_counts()

3    1167805
2     170566
4      78216
1       6254
5       5122
6        150
0          2
Name: fwds_a, dtype: int64

In [141]:
adjusted_frame['fwds_h'].value_counts()

3    1175177
2     156863
4      85656
1       5301
5       5007
6        109
0          2
Name: fwds_h, dtype: int64

Determine the attacker and defender strength. This is traditionally referenced as being 'M-on-N', where M is the number of attackers (non-goaltenders on the ice) for the shooting team and N is the number of non-goaltenders for the defending team. Most play is 5-on-5, as both teams have 6 players in most situations and usually have a goaltender on the ice. At most one goaltender is allowed at any one time.

In [142]:
# For each event, pick the home or away count depending on which team took the shot.
home_shooting = adjusted_frame['event_team_is_home']
shooting_columns = {
    'players_shooting': ('players_h', 'players_a'),
    'skaters_shooting': ('skaters_h', 'skaters_a'),
    'fwds_shooting': ('fwds_h', 'fwds_a'),
}
for target, (home_column, away_column) in shooting_columns.items():
    adjusted_frame[target] = np.where(home_shooting,
                                      adjusted_frame[home_column],
                                      adjusted_frame[away_column])

In [158]:
def attacker_state(home_shooting, skaters_h, skaters_a):
    '''
    Builds the 'M-on-N' strength label for a shot.

    Parameters
    ----------
    home_shooting : bool
        Truthy when the home team took the shot.
    skaters_h : int
        Number of skaters for the home team.
    skaters_a : int
        Number of skaters for the away team.

    Returns
    -------
    result : str
        '<shooting team skaters>-on-<other team skaters>', e.g. '5-on-4'.
    '''
    if home_shooting:
        attackers, defenders = skaters_h, skaters_a
    else:
        attackers, defenders = skaters_a, skaters_h
    return '-on-'.join((str(attackers), str(defenders)))

In [159]:
# DataFrame.apply(axis=1) constructs a Series for every row; iterating the
# three needed columns in lock-step yields the same labels far more cheaply.
adjusted_frame['attacker_state'] = [
    attacker_state(is_home, home_skaters, away_skaters)
    for is_home, home_skaters, away_skaters in zip(
        adjusted_frame['event_team_is_home'],
        adjusted_frame['skaters_h'],
        adjusted_frame['skaters_a'],
    )
]

In [160]:
adjusted_frame['attacker_state'].value_counts()

5-on-5    1116417
5-on-4     202220
4-on-5      28795
4-on-4      28525
6-on-5      23272
3-on-3       7867
5-on-3       7089
5-on-6       5676
4-on-3       4133
6-on-4       3269
4-on-6        423
3-on-4        175
3-on-5        142
6-on-3        110
3-on-6          2
Name: attacker_state, dtype: int64

In [161]:
# Persist the processed frame under <working directory>/data/processed.pkl.
current_dir = Path.cwd()
relative_path = 'data/processed.pkl'
frame_path = current_dir / relative_path
adjusted_frame.to_pickle(frame_path)

In [162]:
# Load the processed frame from <working directory>/data/processed.pkl.
current_dir = Path.cwd()
relative_path = 'data/processed.pkl'
frame_path = current_dir / relative_path
adjusted_frame = pd.read_pickle(frame_path)

The pickle is too large for GitHub, but compressing the file works.

In [167]:
# Pickle the processed frame into a bz2-compressed file at
# <working directory>/data/compressed.pbz2.
current_dir = Path.cwd()
relative_path = 'data/compressed.pbz2'
frame_path = current_dir / relative_path
with bz2.open(frame_path, 'wb') as compressed_file:
    cPickle.dump(adjusted_frame, compressed_file)

In [169]:
# Read the compressed pickle back. The context manager closes the file handle
# once loading finishes, and `decompressed` only ever names the loaded frame.
# NOTE(review): unpickling executes arbitrary code; only load files this
# notebook wrote itself.
with bz2.BZ2File(str(frame_path), 'rb') as compressed_file:
    decompressed = cPickle.load(compressed_file)

In [172]:
decompressed.equals(adjusted_frame)

True