In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
# from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
# from sklearn.model_selection import train_test_split

In [2]:
# Local storage: folder (relative to the working directory) and the pickled
# input frame that lives inside it.
DATA_FOLDER = 'data/'
DATAFRAME_FILE = f'{DATA_FOLDER}adjusted_cleaned_data.pkl'

In [3]:
# Resolve the input pickle against the current working directory and load it.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
current_dir = Path.cwd()
frame_path = current_dir / DATAFRAME_FILE
shots_df = pd.read_pickle(str(frame_path))

In [4]:
# Lift the column display limit so every column of the wide frame is shown,
# then preview the first rows.
pd.options.display.max_columns = None
shots_df.head()

Unnamed: 0,game_id,season,is_playoff_game,venue,att_code,def_code,is_home,period,cum_time_elapsed,is_overtime,seconds_remaining,strength,att_score,def_score,lead_size,event,att_players,att_skaters,att_forwards,def_players,def_skaters,def_forwards,is_extra_attacker,is_empty_net,event_zone,shot_type,event_coord_x,event_coord_y,is_rebound,calc_dist,result,is_goal
0,2010020003,20102011,False,Hartwall Areena,CAR,MIN,False,1,65,False,1135,EV,0,0,0,SHOT,6,5,3,6,5,3,False,False,Off. Zone,Snap,56.0,-15.0,False,36.249138,Save,False
1,2010020003,20102011,False,Hartwall Areena,CAR,MIN,False,1,76,False,1124,EV,0,0,0,MISS,6,5,3,6,5,3,False,False,Off. Zone,Wrist,35.0,33.0,False,63.285069,Wide of Net,False
2,2010020003,20102011,False,Hartwall Areena,MIN,CAR,True,1,97,False,1103,EV,0,0,0,MISS,6,5,3,6,5,3,False,False,Off. Zone,Wrist,87.0,-6.0,False,6.324555,Wide of Net,False
3,2010020003,20102011,False,Hartwall Areena,MIN,CAR,True,1,213,False,987,EV,0,0,0,GOAL,6,5,3,6,5,3,False,False,Off. Zone,Wrist,78.0,3.0,False,11.401754,Goal,True
4,2010020003,20102011,False,Hartwall Areena,MIN,CAR,True,1,233,False,967,EV,1,0,1,SHOT,6,5,3,6,5,3,False,False,Off. Zone,Wrist,54.0,38.0,False,51.662365,Save,False


In [5]:
# Summarize the full multi-season frame: column dtypes and non-null counts.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113679 entries, 0 to 1113678
Data columns (total 32 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   game_id            1113679 non-null  object 
 1   season             1113679 non-null  object 
 2   is_playoff_game    1113679 non-null  bool   
 3   venue              1113679 non-null  object 
 4   att_code           1113679 non-null  object 
 5   def_code           1113679 non-null  object 
 6   is_home            1113679 non-null  bool   
 7   period             1113679 non-null  int64  
 8   cum_time_elapsed   1113679 non-null  int64  
 9   is_overtime        1113679 non-null  bool   
 10  seconds_remaining  1113679 non-null  int64  
 11  strength           1113679 non-null  object 
 12  att_score          1113679 non-null  int32  
 13  def_score          1113679 non-null  int32  
 14  lead_size          1113679 non-null  int32  
 15  event              1113679 non-n

In [6]:
# List the distinct season codes present in the data.
shots_df['season'].unique()

array(['20102011', '20112012', '20122013', '20132014', '20142015',
       '20152016', '20162017', '20172018', '20182019', '20192020',
       '20202021'], dtype=object)

Keeping all seasons led to computational difficulties during modeling, so the dataset was reduced to the most recent full season only: 2018-19 (both the 2019-20 and 2020-21 seasons were shortened as a result of the COVID-19 pandemic).

In [7]:
# Keep only the 2018-19 season and renumber the rows from zero.
is_target_season = shots_df['season'] == '20182019'
shots_df = shots_df.loc[is_target_season].reset_index(drop=True)

In [8]:
# Re-check row count, dtypes and non-null counts after restricting to one season.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118280 entries, 0 to 118279
Data columns (total 32 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   game_id            118280 non-null  object 
 1   season             118280 non-null  object 
 2   is_playoff_game    118280 non-null  bool   
 3   venue              118280 non-null  object 
 4   att_code           118280 non-null  object 
 5   def_code           118280 non-null  object 
 6   is_home            118280 non-null  bool   
 7   period             118280 non-null  int64  
 8   cum_time_elapsed   118280 non-null  int64  
 9   is_overtime        118280 non-null  bool   
 10  seconds_remaining  118280 non-null  int64  
 11  strength           118280 non-null  object 
 12  att_score          118280 non-null  int32  
 13  def_score          118280 non-null  int32  
 14  lead_size          118280 non-null  int32  
 15  event              118280 non-null  object 
 16  at

There are several columns that are either redundant or irrelevant to modeling. These are `game_id`, `season` (although it may be worth determining if rule changes have affected goal probability over time), `venue`/`att_code`/`def_code` (although venue could potentially have an impact, we would like to reduce the effect of individual/team skill differences on the model; player and team might be included in a different model), `cum_time_elapsed` (effectively duplicated by `period` and `seconds_remaining`), `event` (derivable from `result`), `strength` (given by `att_skaters` and `def_skaters`), and `event_zone`/`calc_dist` (both represented by `event_coord_x`/`event_coord_y`). The column `result` will also be removed since it can be used by itself to correctly predict `is_goal`.

In [9]:
# Columns judged redundant or irrelevant for modeling; `result` is included
# because it would reveal the `is_goal` target directly.
redundant_columns = [
    'game_id', 'season', 'venue', 'att_code', 'def_code',
    'cum_time_elapsed', 'strength', 'event', 'event_zone',
    'calc_dist', 'result',
]
shots_df.drop(columns=redundant_columns, inplace=True)

In [10]:
# Confirm which columns remain after the drop.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118280 entries, 0 to 118279
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   is_playoff_game    118280 non-null  bool   
 1   is_home            118280 non-null  bool   
 2   period             118280 non-null  int64  
 3   is_overtime        118280 non-null  bool   
 4   seconds_remaining  118280 non-null  int64  
 5   att_score          118280 non-null  int32  
 6   def_score          118280 non-null  int32  
 7   lead_size          118280 non-null  int32  
 8   att_players        118268 non-null  Int64  
 9   att_skaters        118268 non-null  Int64  
 10  att_forwards       118268 non-null  Int64  
 11  def_players        118268 non-null  Int64  
 12  def_skaters        118268 non-null  Int64  
 13  def_forwards       118268 non-null  Int64  
 14  is_extra_attacker  118280 non-null  bool   
 15  is_empty_net       118280 non-null  bool   
 16  sh

In [11]:
# Cast every boolean flag to a 0/1 integer. The converted columns are appended
# after the untouched ones, so they end up at the right-hand side of the frame.
dfo = shots_df.select_dtypes(include='bool').astype(int)
non_bool_part = shots_df.drop(columns=dfo.columns)
shots_df = pd.concat([non_bool_part, dfo], axis=1)

In [12]:
# Verify that the former boolean columns are now integer-typed.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118280 entries, 0 to 118279
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   period             118280 non-null  int64  
 1   seconds_remaining  118280 non-null  int64  
 2   att_score          118280 non-null  int32  
 3   def_score          118280 non-null  int32  
 4   lead_size          118280 non-null  int32  
 5   att_players        118268 non-null  Int64  
 6   att_skaters        118268 non-null  Int64  
 7   att_forwards       118268 non-null  Int64  
 8   def_players        118268 non-null  Int64  
 9   def_skaters        118268 non-null  Int64  
 10  def_forwards       118268 non-null  Int64  
 11  shot_type          118280 non-null  object 
 12  event_coord_x      118277 non-null  float64
 13  event_coord_y      118278 non-null  float64
 14  is_playoff_game    118280 non-null  int32  
 15  is_home            118280 non-null  int32  
 16  is

In [13]:
# One-hot encode the remaining object-typed (categorical) columns and append
# the indicator columns after the numeric ones.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version: from pandas 2.0 on, get_dummies defaults to bool, which would
# undo the bool -> int conversion done earlier in this notebook.
dfo = shots_df.select_dtypes(include=['object'])  # select object type columns
shots_df = pd.concat(
    [shots_df.drop(columns=dfo.columns), pd.get_dummies(dfo, dtype=np.uint8)],
    axis=1,
)

In [14]:
# Display the frame to inspect the newly added shot_type_* indicator columns.
shots_df

Unnamed: 0,period,seconds_remaining,att_score,def_score,lead_size,att_players,att_skaters,att_forwards,def_players,def_skaters,def_forwards,event_coord_x,event_coord_y,is_playoff_game,is_home,is_overtime,is_extra_attacker,is_empty_net,is_rebound,is_goal,shot_type_Backhand,shot_type_Deflected,shot_type_Slap,shot_type_Snap,shot_type_Tip,shot_type_Unknown,shot_type_Wrap,shot_type_Wrist
0,1,1171,0,0,0,6,5,3,6,5,3,78.0,-19.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,1151,0,0,0,6,5,3,6,5,3,37.0,10.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,1140,0,0,0,6,5,3,6,5,3,47.0,-23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,971,0,0,0,6,5,3,6,5,3,73.0,22.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1,966,0,0,0,6,5,3,6,5,3,53.0,14.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118275,3,245,4,0,4,6,5,3,6,5,4,65.0,4.0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
118276,3,190,0,4,-4,6,6,4,6,5,3,42.0,3.0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1
118277,3,175,0,4,-4,6,6,4,6,5,3,57.0,-13.0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0
118278,3,130,0,4,-4,6,6,5,6,5,3,44.0,24.0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1


In [15]:
# Remove the indicator for unknown shot types.
# NOTE(review): presumably this leaves "Unknown" as the implicit baseline
# category (all shot_type_* columns zero) - confirm this is intended.
shots_df.drop(columns='shot_type_Unknown', inplace=True)

In [16]:
# Normalize all column labels to lower case (the dummy columns carry the
# capitalized category names, e.g. shot_type_Wrist).
shots_df.columns = shots_df.columns.map(str.lower)

In [17]:
# Final schema check after dropping the Unknown dummy and lower-casing the column names.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118280 entries, 0 to 118279
Data columns (total 27 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   period               118280 non-null  int64  
 1   seconds_remaining    118280 non-null  int64  
 2   att_score            118280 non-null  int32  
 3   def_score            118280 non-null  int32  
 4   lead_size            118280 non-null  int32  
 5   att_players          118268 non-null  Int64  
 6   att_skaters          118268 non-null  Int64  
 7   att_forwards         118268 non-null  Int64  
 8   def_players          118268 non-null  Int64  
 9   def_skaters          118268 non-null  Int64  
 10  def_forwards         118268 non-null  Int64  
 11  event_coord_x        118277 non-null  float64
 12  event_coord_y        118278 non-null  float64
 13  is_playoff_game      118280 non-null  int32  
 14  is_home              118280 non-null  int32  
 15  is_overtime      

In [18]:
# Count missing values per column. Nothing in this notebook imputes or drops
# them, so any non-zero counts are carried into the saved file.
shots_df.isnull().sum()

period                  0
seconds_remaining       0
att_score               0
def_score               0
lead_size               0
att_players            12
att_skaters            12
att_forwards           12
def_players            12
def_skaters            12
def_forwards           12
event_coord_x           3
event_coord_y           2
is_playoff_game         0
is_home                 0
is_overtime             0
is_extra_attacker       0
is_empty_net            0
is_rebound              0
is_goal                 0
shot_type_backhand      0
shot_type_deflected     0
shot_type_slap          0
shot_type_snap          0
shot_type_tip           0
shot_type_wrap          0
shot_type_wrist         0
dtype: int64

In [19]:
# Persist the processed frame as a pickle inside the data folder, resolved
# against the current working directory.
PROCESSED_DATAFRAME_FILE = f'{DATA_FOLDER}processed_data.pkl'
current_dir = Path.cwd()
frame_path = current_dir / PROCESSED_DATAFRAME_FILE
shots_df.to_pickle(str(frame_path))