In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import bz2
import _pickle as cPickle
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the pickled frame stored (bz2-compressed) at data/compressed_eda_adjusted.pbz2,
# resolved relative to the current working directory.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this project's own pipeline.
current_dir = Path.cwd()
relative_path = 'data/compressed_eda_adjusted.pbz2'
frame_path = current_dir.joinpath(relative_path)
# The context manager closes the compressed file handle as soon as the frame has
# been read, and `shot_frame` is never bound to the file object itself.
with bz2.BZ2File(str(frame_path), 'rb') as compressed_file:
    shot_frame = cPickle.load(compressed_file)

In [3]:
# Lift the column display limit so wide frames render in full, then preview
# the first rows.
pd.options.display.max_columns = None
shot_frame.head()

Unnamed: 0,game_id_livefeed,season,type,game_time,venue,period,period_ord,period_type,time_elapsed,cum_time_elapsed,event,event_team_code,event_team_is_home,event_coord_x,event_coord_y,is_rebound,time_remaining,seconds_remaining,event_zone,miss_type,shot_type,calc_dist,calc_angle,players_shooting,skaters_shooting,fwds_shooting,attacker_state,is_extra_attacker,is_empty_net,players_defending,skaters_defending,fwds_defending,is_goal
0,2010020003,20102011,R,2010-10-07T16:00:00Z,Hartwall Areena,1,1st,REGULAR,01:05,65.0,SHOT,CAR,False,56.0,-15.0,False,18:55,1135.0,Off. Zone,Save,Snap,36.249138,-24.443955,6,5,3,5-on-5,False,False,6,5,3,False
1,2010020003,20102011,R,2010-10-07T16:00:00Z,Hartwall Areena,1,1st,REGULAR,01:16,76.0,MISS,CAR,False,35.0,33.0,False,18:44,1124.0,Off. Zone,Wide of Net,Wrist,63.285069,31.429566,6,5,3,5-on-5,False,False,6,5,3,False
2,2010020003,20102011,R,2010-10-07T16:00:00Z,Hartwall Areena,1,1st,REGULAR,01:37,97.0,MISS,MIN,True,87.0,-6.0,False,18:23,1103.0,Off. Zone,Wide of Net,Wrist,6.324555,-71.565051,6,5,3,5-on-5,False,False,6,5,3,False
3,2010020003,20102011,R,2010-10-07T16:00:00Z,Hartwall Areena,1,1st,REGULAR,03:33,213.0,GOAL,MIN,True,78.0,3.0,False,16:27,987.0,Off. Zone,Goal,Wrist,11.401754,15.255119,6,5,3,5-on-5,False,False,6,5,3,True
4,2010020003,20102011,R,2010-10-07T16:00:00Z,Hartwall Areena,1,1st,REGULAR,03:49,229.0,BLOCK,MIN,True,77.0,13.0,False,16:11,971.0,Off. Zone,Block,Snap,17.691806,47.29061,6,5,3,5-on-5,False,False,6,5,3,False


Several columns are either redundant or irrelevant to modeling and will be dropped: `game_id_livefeed`; `season` (although rule changes could potentially have long-term effects); `game_time` (a more sophisticated model may want to account for fatigue in late-season and back-to-back games); `venue`; `period_ord` (duplicates `period`); `time_elapsed`/`cum_time_elapsed`/`time_remaining` (effectively duplicated by `seconds_remaining`); `event` (derivable from `miss_type`); `event_team_code` (player and team might be included in a different model); `event_coord_x`/`event_coord_y` (represented by `calc_dist`/`calc_angle`); and `attacker_state` (given by `skaters_shooting` and `skaters_defending`). The column `miss_type` will also be removed because it leaks the target: its value alone (e.g. 'Goal') correctly predicts `is_goal`.

In [4]:
# List the distinct season codes present in the frame.
shot_frame['season'].unique()

array(['20102011', '20112012', '20122013', '20132014', '20142015',
       '20152016', '20162017', '20172018', '20182019', '20192020'],
      dtype=object)

Keeping all ten seasons led to computational difficulties while modeling, so the dataset is reduced to the most recent full season: 2018-19 (the 2019-20 season was shortened by the Covid-19 pandemic).

In [5]:
# Keep only the 2018-19 season. `.copy()` makes the subset an independent frame,
# so the in-place edits and column assignments in later cells operate on it
# directly instead of on a slice of the multi-season frame (which would raise
# SettingWithCopyWarning).
shot_frame = shot_frame[shot_frame['season'] == '20182019'].copy()

In [6]:
# Confirm that a single season code remains after filtering.
shot_frame['season'].unique()

array(['20182019'], dtype=object)

In [7]:
# Remove the identifier, duplicated and target-leaking columns listed in the
# markdown above. The result is reassigned rather than dropped in place, so the
# cell does not mutate a frame that was obtained by filtering.
columns_to_drop = [
    'game_id_livefeed', 'season', 'game_time', 'venue', 'period_ord',
    'time_elapsed', 'cum_time_elapsed', 'event', 'event_team_code',
    'event_coord_x', 'event_coord_y', 'time_remaining', 'attacker_state',
    'miss_type',
]
shot_frame = shot_frame.drop(columns=columns_to_drop)

In [17]:
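# Disabled alternative to the drop above: keeps the raw `event_coord_x`/`event_coord_y`
# columns and drops the derived `calc_dist`/`calc_angle` instead.
# shot_frame = shot_frame.drop(labels=['game_id_livefeed', 'season', 'game_time', 'venue', 'period_ord', 'time_elapsed', 'cum_time_elapsed', \
#                         'event', 'event_team_code', 'calc_dist', 'calc_angle', 'time_remaining', 'attacker_state', \
#                         'miss_type'], \
#                 axis=1)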

In [8]:
# Summarize the columns, dtypes and non-null counts left after the column drop.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   type                157336 non-null  object 
 1   period              157336 non-null  int64  
 2   period_type         157336 non-null  object 
 3   event_team_is_home  157336 non-null  bool   
 4   is_rebound          157336 non-null  bool   
 5   seconds_remaining   157336 non-null  float64
 6   event_zone          157336 non-null  object 
 7   shot_type           157336 non-null  object 
 8   calc_dist           157336 non-null  float64
 9   calc_angle          157336 non-null  float64
 10  players_shooting    157336 non-null  int64  
 11  skaters_shooting    157336 non-null  int64  
 12  fwds_shooting       157336 non-null  int64  
 13  is_extra_attacker   157336 non-null  bool   
 14  is_empty_net        157336 non-null  bool   
 15  players_defending   157336 

Two of the remaining fields have only two values: `type` (either 'R' or 'P', meaning regular-season or playoff) and `period_type` (either 'REGULAR' or 'OVERTIME'). These can both be changed to boolean fields.

In [9]:
# List the distinct game-type codes before converting the column to a flag.
shot_frame['type'].unique()

array(['R', 'P'], dtype=object)

In [10]:
# Replace the game-type code with a flag that is True for playoff ('P') rows.
shot_frame['type'] = shot_frame['type'] == 'P'

In [11]:
# Verify that the column now holds only booleans.
shot_frame['type'].unique()

array([False,  True])

In [12]:
# List the distinct period-type labels before converting the column to a flag.
shot_frame['period_type'].unique()

array(['REGULAR', 'OVERTIME'], dtype=object)

In [13]:
# Replace the period-type label with a flag that is True for 'OVERTIME' rows.
shot_frame['period_type'] = shot_frame['period_type'] == 'OVERTIME'

In [14]:
# Verify that the column now holds only booleans.
shot_frame['period_type'].unique()

array([False,  True])

In [15]:
# Rename the two converted flag columns so their names state what True means.
# The result is reassigned (instead of using `inplace=True`) to match the
# `shot_frame = ...` style of the one-hot encoding cell and to avoid mutating a
# frame that may still be flagged as a slice of the original data.
flag_names = {'type': 'is_playoff', 'period_type': 'is_overtime'}
shot_frame = shot_frame.rename(columns=flag_names)

In [16]:
# Summarize the frame after the boolean conversion and renaming of the two flag columns.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   is_playoff          157336 non-null  bool   
 1   period              157336 non-null  int64  
 2   is_overtime         157336 non-null  bool   
 3   event_team_is_home  157336 non-null  bool   
 4   is_rebound          157336 non-null  bool   
 5   seconds_remaining   157336 non-null  float64
 6   event_zone          157336 non-null  object 
 7   shot_type           157336 non-null  object 
 8   calc_dist           157336 non-null  float64
 9   calc_angle          157336 non-null  float64
 10  players_shooting    157336 non-null  int64  
 11  skaters_shooting    157336 non-null  int64  
 12  fwds_shooting       157336 non-null  int64  
 13  is_extra_attacker   157336 non-null  bool   
 14  is_empty_net        157336 non-null  bool   
 15  players_defending   157336 

The data frame is down to two categorical columns. These will now be one-hot-encoded.

In [17]:
# One-hot encode every object-dtype column. The indicator columns are appended
# after the columns that were left untouched.
dfo = shot_frame.select_dtypes(include=['object'])
indicator_columns = pd.get_dummies(dfo)
shot_frame = pd.concat(
    [shot_frame.drop(columns=dfo.columns), indicator_columns],
    axis=1,
)

In [18]:
# Summarize the frame after one-hot encoding the object columns.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   is_playoff             157336 non-null  bool   
 1   period                 157336 non-null  int64  
 2   is_overtime            157336 non-null  bool   
 3   event_team_is_home     157336 non-null  bool   
 4   is_rebound             157336 non-null  bool   
 5   seconds_remaining      157336 non-null  float64
 6   calc_dist              157336 non-null  float64
 7   calc_angle             157336 non-null  float64
 8   players_shooting       157336 non-null  int64  
 9   skaters_shooting       157336 non-null  int64  
 10  fwds_shooting          157336 non-null  int64  
 11  is_extra_attacker      157336 non-null  bool   
 12  is_empty_net           157336 non-null  bool   
 13  players_defending      157336 non-null  int64  
 14  skaters_defending      157336

Next, clean up the remaining column names that contain spaces, periods, or hyphens.

In [19]:
# Give the one-hot columns underscore-only names by replacing the '. ' and '-'
# found in the original category labels. The result is reassigned (instead of
# using `inplace=True`) to match the `shot_frame = ...` style of the encoding cell.
indicator_name_fixes = {
    'event_zone_Def. Zone': 'event_zone_Def_Zone',
    'event_zone_Neu. Zone': 'event_zone_Neu_Zone',
    'event_zone_Off. Zone': 'event_zone_Off_Zone',
    'shot_type_Tip-In': 'shot_type_Tip_In',
    'shot_type_Wrap-around': 'shot_type_Wrap_around',
}
shot_frame = shot_frame.rename(columns=indicator_name_fixes)

In [20]:
# Summarize the frame after renaming the one-hot columns.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   is_playoff             157336 non-null  bool   
 1   period                 157336 non-null  int64  
 2   is_overtime            157336 non-null  bool   
 3   event_team_is_home     157336 non-null  bool   
 4   is_rebound             157336 non-null  bool   
 5   seconds_remaining      157336 non-null  float64
 6   calc_dist              157336 non-null  float64
 7   calc_angle             157336 non-null  float64
 8   players_shooting       157336 non-null  int64  
 9   skaters_shooting       157336 non-null  int64  
 10  fwds_shooting          157336 non-null  int64  
 11  is_extra_attacker      157336 non-null  bool   
 12  is_empty_net           157336 non-null  bool   
 13  players_defending      157336 non-null  int64  
 14  skaters_defending      157336

In [21]:
# Put the `is_` prefix first so this flag is named like the other `is_*` columns.
# The result is reassigned (instead of using `inplace=True`) to match the
# `shot_frame = ...` style of the encoding cell.
shot_frame = shot_frame.rename(columns={'event_team_is_home': 'is_event_team_home'})

In [22]:
# Summarize the frame after renaming `event_team_is_home` to `is_event_team_home`.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   is_playoff             157336 non-null  bool   
 1   period                 157336 non-null  int64  
 2   is_overtime            157336 non-null  bool   
 3   is_event_team_home     157336 non-null  bool   
 4   is_rebound             157336 non-null  bool   
 5   seconds_remaining      157336 non-null  float64
 6   calc_dist              157336 non-null  float64
 7   calc_angle             157336 non-null  float64
 8   players_shooting       157336 non-null  int64  
 9   skaters_shooting       157336 non-null  int64  
 10  fwds_shooting          157336 non-null  int64  
 11  is_extra_attacker      157336 non-null  bool   
 12  is_empty_net           157336 non-null  bool   
 13  players_defending      157336 non-null  int64  
 14  skaters_defending      157336

In [23]:
# Cast the boolean flag columns to uint8. The cast columns are re-attached after
# the remaining columns, so they move to the end of the frame.
dfo = shot_frame.select_dtypes(include=['bool'])
shot_frame = pd.concat(
    [shot_frame.drop(columns=dfo.columns), dfo.astype('uint8')],
    axis=1,
)

In [24]:
# Downcast the int64 columns to uint8. The cast columns are re-attached after
# the remaining columns, so they move to the end of the frame.
dfo = shot_frame.select_dtypes(include=['int64'])

# `astype('uint8')` silently wraps values outside 0..255, so fail loudly if any
# column would not survive the downcast unchanged.
uint8_limits = np.iinfo(np.uint8)
out_of_range = [
    col for col in dfo.columns
    if dfo[col].min() < uint8_limits.min or dfo[col].max() > uint8_limits.max
]
if out_of_range:
    raise ValueError(f'columns do not fit in uint8: {out_of_range}')

shot_frame = pd.concat(
    [shot_frame.drop(columns=dfo.columns), dfo.astype('uint8')],
    axis=1,
)

In [25]:
# Summarize the frame after casting the bool and int64 columns to uint8.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157336 entries, 1131838 to 1289200
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   seconds_remaining      157336 non-null  float64
 1   calc_dist              157336 non-null  float64
 2   calc_angle             157336 non-null  float64
 3   event_zone_Def_Zone    157336 non-null  uint8  
 4   event_zone_Neu_Zone    157336 non-null  uint8  
 5   event_zone_Off_Zone    157336 non-null  uint8  
 6   shot_type_Backhand     157336 non-null  uint8  
 7   shot_type_Deflected    157336 non-null  uint8  
 8   shot_type_Slap         157336 non-null  uint8  
 9   shot_type_Snap         157336 non-null  uint8  
 10  shot_type_Tip_In       157336 non-null  uint8  
 11  shot_type_Wrap_around  157336 non-null  uint8  
 12  shot_type_Wrist        157336 non-null  uint8  
 13  is_playoff             157336 non-null  uint8  
 14  is_overtime            157336

Three continuous variables (`seconds_remaining`, `calc_dist`, and `calc_angle`) remain and need to be scaled. The first of these is technically discrete but can be treated as continuous, as it represents time on a scale with a maximum of 1200 seconds. Because `calc_angle` is signed, it will be scaled by MaxAbsScaler, which maps it into [-1, 1] while keeping zero at zero; the two remaining, non-negative variables will be scaled to [0, 1] by MinMaxScaler.

In [28]:
# shot_frame[['seconds_remaining', 'event_coord_x', 'event_coord_y']].describe().T

In [29]:
# Summary statistics of the continuous features before scaling, one row per feature.
continuous_cols = ['seconds_remaining', 'calc_dist', 'calc_angle']
shot_frame[continuous_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seconds_remaining,157336.0,591.966301,347.854231,0.0,286.0,595.0,895.0,1200.0
calc_dist,157336.0,34.341196,22.904572,0.0,17.691806,30.594117,46.572524,189.65495
calc_angle,157336.0,-0.656727,34.65576,-180.0,-26.565051,0.0,25.740708,159.443955


In [30]:
# Scale the continuous features. The fitted scalers are kept in named variables
# (rather than discarded) so the same transformation can later be applied to
# new data or inverted.
# NOTE(review): the scalers are fitted on the full frame, before any train/test
# split, so held-out rows influence the scaling statistics. Refit on the
# training portion only if a strictly leak-free evaluation is required.
angle_scaler = MaxAbsScaler()
shot_frame['calc_angle'] = angle_scaler.fit_transform(shot_frame[['calc_angle']])

# MinMaxScaler rescales each column independently, so a single instance covers
# both columns and yields the same values as two separate scalers.
minmax_cols = ['seconds_remaining', 'calc_dist']
minmax_scaler = MinMaxScaler()
shot_frame[minmax_cols] = minmax_scaler.fit_transform(shot_frame[minmax_cols])

In [31]:
# Summary statistics of the continuous features after scaling, one row per feature.
continuous_cols = ['seconds_remaining', 'calc_dist', 'calc_angle']
shot_frame[continuous_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seconds_remaining,157336.0,0.493305,0.289879,0.0,0.238333,0.495833,0.745833,1.0
calc_dist,157336.0,0.181072,0.12077,0.0,0.093284,0.161315,0.245565,1.0
calc_angle,157336.0,-0.003648,0.192532,-1.0,-0.147584,0.0,0.143004,0.8858


In [32]:
# Write the preprocessed frame to a bz2-compressed pickle under ./data,
# resolved relative to the current working directory.
current_dir = Path.cwd()
relative_path = 'data/compressed_preprocessed.pbz2'
frame_path = current_dir / relative_path
with bz2.BZ2File(str(frame_path), 'w') as compressed_out:
    cPickle.dump(shot_frame, compressed_out)

In [33]:
# y = shot_frame['is_goal']
# X = shot_frame.drop(labels=['is_goal'], axis=1)

In [70]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=shot_frame['is_goal'],random_state=0)

In [65]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=0)

In [29]:
# df_map = {'x_train': X_train, 'x_test': X_test, 'y_train': y_train, 'y_test': y_test}
# current_dir = Path.cwd()
# for name, df in df_map.items():
#     relative_path = 'data/preprocessed_' + name + '.pbz2'
#     frame_path = current_dir.joinpath(relative_path)
#     with bz2.BZ2File(str(frame_path), 'w') as f: 
#         cPickle.dump(df, f)