# Feature Engineering - Skater + Goalie Data

In [1]:
import pandas as pd
import numpy as np

## Read in data from hockey reference

In [106]:
# Skater data
skaters_2021 = pd.read_csv("../data/stats/skaters_2021.csv")
skaters_2022 = pd.read_csv("../data/stats/skaters_2022.csv")

# Goalie data
goalies_2021 = pd.read_csv("../data/stats/goalies_2021.csv")
goalies_2022 = pd.read_csv("../data/stats/goalies_2022.csv")

In [4]:
#skaters_2021.isnull().sum(axis = 0)
#skaters_2022.isnull().sum(axis = 0)
#goalies_2021.isnull().sum(axis = 0)
#goalies_2022.isnull().sum(axis = 0)

## Function for creating skaters final data
Should create all the cumulative stats for the skaters. We can include all of the same information like player ID, game number, team, etc. But for the features like shots or TOI, we will compute the cumulative sums/averages up to the start of each game and store them here. This way, we can keep separate the raw game by game data from the cumulative data.

In [3]:
def create_skater_cumulative_stats(indiv_games, r_window = None):
    # Make sure data is in correct order
    indiv_games = indiv_games.sort_values(['player_id', 'game_num'])
    
    # Create new data frames containing only certain columns from the old data frames
    cumulative_games = indiv_games.loc[:, ["player_id", "player_name", "age", "season", "game_num", "date", 
                                           "team", "opponent", "home_away_status", "result", "G"]]
    
    # Assign the number of completed games as 1 less than game num
    cumulative_games['n_completed_games'] = cumulative_games['game_num'] - 1

    # For each numerical statistic, compute the sum of that statistic for all games completed by the player prior to the game that is about to start
    prior_stats_totals = indiv_games.apply(lambda x: indiv_games[(indiv_games.player_id == x.player_id) & 
                                                                    (indiv_games.game_num < x.game_num) & 
                                                                    (indiv_games.game_num >= x.game_num - r_window)][['G', 'A', 'P', 'rating', 'PIM', 'EVG', 'PPG', 'SHG', 'GWG', 
                                                                                                                      'EVA', 'PPA', 'SHA', 'S', 'shifts', 'TOI', 'HIT', 'BLK', 'FOW', 'FOL']].agg(np.sum), axis=1)
    
    # Select all columns except TOI. We will compute the stat/60 min for the other columns, but leave the total TOI column unchanged.
    cols = [col for col in prior_stats_totals.columns if col != 'TOI']

    # Divide to compute the per 60 minute statistics
    prior_stats_60 = prior_stats_totals[cols].div(prior_stats_totals['TOI'], axis = 0)
    prior_stats_60 = 60 * prior_stats_60
    prior_stats_60.columns = [col + '_60' for col in cols]

    # Add total TOI back in
    prior_stats_final = prior_stats_60.copy()
    prior_stats_final['total_TOI'] = prior_stats_totals['TOI']

    # Concatenate with cumulative games
    cumulative_games = pd.concat([cumulative_games, prior_stats_final], axis = 1)

    # Calculate average TOI column
    # This gives us a final data set
    if(r_window):
        cumulative_games.loc[cumulative_games['game_num'] >= r_window, 'avg_TOI'] = cumulative_games['total_TOI'] / r_window
        cumulative_games.loc[cumulative_games['game_num'] < r_window, 'avg_TOI'] = cumulative_games['total_TOI'] / cumulative_games['n_completed_games']
    else:
        cumulative_games['avg_TOI'] = cumulative_games['total_TOI'] / cumulative_games['n_completed_games']

    cumulative_games.drop(columns = 'total_TOI', inplace = True)
        
    return cumulative_games

In [4]:
def create_goalie_cumulative_stats(indiv_games, r_window = None):
    # Make sure data is in correct order
    indiv_games = indiv_games.sort_values(['player_id', 'game_num'])
    
    # Create new data frames containing only certain columns from the old data frames
    cumulative_games = indiv_games.loc[:, ["player_id", "player_name", "age", "season", "game_num", "date", 
                                           "team", "opponent", "home_away_status", "result", "decision"]]
    
    # Assign the number of completed games as 1 less than game num
    cumulative_games['n_completed_games'] = cumulative_games['game_num'] - 1

    # For each numerical statistic, compute the sum of that statistic for all games completed by the player prior to the game that is about to start
    prior_stats_totals = indiv_games.apply(lambda x: indiv_games[(indiv_games.player_id == x.player_id) & 
                                                                    (indiv_games.game_num < x.game_num) & 
                                                                    (indiv_games.game_num >= x.game_num - r_window)][['GA', 'SA', 'SV', 'shutout', 'TOI']].agg(np.sum), axis=1)
    
    # Select all columns except shutout and TOI. We will compute the stat/60 min for the other columns, but leave these columns unchanged.
    cols = [col for col in prior_stats_totals.columns if col not in ['shutout', 'TOI']]

    # Divide to compute the per 60 minute statistics
    prior_stats_60 = prior_stats_totals[cols].div(prior_stats_totals['TOI'], axis = 0)
    prior_stats_60 = 60 * prior_stats_60
    prior_stats_60.columns = [col + '_60' for col in cols]

    # Add total TOI and total shutout back in
    prior_stats_final = prior_stats_60.copy()
    prior_stats_final['total_shutout'] = prior_stats_totals['shutout']
    prior_stats_final['total_TOI'] = prior_stats_totals['TOI']

    # Calculate the save percentage in the last 'r_window' games
    prior_stats_final['SV_perc'] = prior_stats_final['SV_60'] / prior_stats_final['SA_60']

    # Concatenate with cumulative games
    cumulative_games = pd.concat([cumulative_games, prior_stats_final], axis = 1)

    # Calculate average TOI column
    # This gives us a final data set
    if(r_window):
        cumulative_games.loc[cumulative_games['game_num'] >= r_window, 'avg_TOI'] = cumulative_games['total_TOI'] / r_window
        cumulative_games.loc[cumulative_games['game_num'] < r_window, 'avg_TOI'] = cumulative_games['total_TOI'] / cumulative_games['n_completed_games']
    else:
        cumulative_games['avg_TOI'] = cumulative_games['total_TOI'] / cumulative_games['n_completed_games']

    cumulative_games.drop(columns = 'total_TOI', inplace = True)
        
    return cumulative_games



In [5]:
# Use function to create final data frames for skaters and goalies
skaters_final_2021 = create_skater_cumulative_stats(skaters_2021, r_window = 20)
skaters_final_2022 = create_skater_cumulative_stats(skaters_2022, r_window = 20)
goalies_final_2021 = create_goalie_cumulative_stats(goalies_2021, r_window = 20)
goalies_final_2022 = create_goalie_cumulative_stats(goalies_2022, r_window = 20)

In [6]:
display(skaters_final_2022.head(3))
display(goalies_final_2022.head(3))

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,...,EVA_60,PPA_60,SHA_60,S_60,shifts_60,HIT_60,BLK_60,FOW_60,FOL_60,avg_TOI
0,/a/abruzni01,Nicholas Abruzzese,22,2022,1,2022-04-02,TOR,PHI,0,W,...,,,,,,,,,,
1,/a/abruzni01,Nicholas Abruzzese,22,2022,2,2022-04-04,TOR,TBL,0,W,...,0.0,0.0,0.0,0.0,99.082569,0.0,0.0,0.0,0.0,9.083333
2,/a/abruzni01,Nicholas Abruzzese,22,2022,3,2022-04-07,TOR,DAL,0,W,...,0.0,0.0,0.0,5.638215,87.392326,5.638215,2.819107,0.0,0.0,10.641667


Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,decision,n_completed_games,GA_60,SA_60,SV_60,total_shutout,SV_perc,avg_TOI
0,/a/allenja01,Jake Allen,31,2022,1,2021-10-13,MTL,TOR,0,L,L,0,,,,0.0,,
1,/a/allenja01,Jake Allen,31,2022,2,2021-10-16,MTL,NYR,1,L,L,1,2.043713,30.655691,28.611978,0.0,0.933333,58.716667
2,/a/allenja01,Jake Allen,31,2022,3,2021-10-19,MTL,SJS,1,L,L,2,2.045164,27.098424,25.053259,0.0,0.924528,58.675


# Additional processing for goalies

## Issue: On some days, a goalie played bad and was pulled from the game
This matters because once the 2nd goalie is put in, there are now 2 goalies that have played in the exact same game. Therefore, when we left join the goalie information to the skater information, we will be duplicating a lot of rows in the skater data frame (becuase there will be 2 rows in the goalie data frame that have the same date and team information as 1 row in the skater data. 

Extending/elaborating on this, we do not know which goalie the skaters scored on during this game. This poses a problem since we want to include the strength of the opposing teams goalie in the training data. If we don't know which goalie was in the game for the times that a player was on the ice, there is some innacuracy here.

There are 116 cases of a goalie getting pulled in the 2021 season.

Here is an example.

To address these situations, I would probably default to picking the goalie that let in the most goals during the game. If 2 goalies gave up the same number of goals, take the goalie that had more TOI?

In [7]:
#goalies_final_2021.groupby(["date", "team"])["player_id"].count()
pulled_goalies = goalies_final_2021.groupby(["date", "team"])["player_id"].nunique()
pulled_goalies[pulled_goalies >= 2]

date        team
2021-01-15  PIT     2
            STL     2
2021-01-16  SJS     2
2021-01-18  PHI     2
2021-01-19  BUF     2
                   ..
2021-05-06  EDM     2
            MTL     2
2021-05-08  NYR     2
2021-05-09  OTT     2
2021-05-10  NYI     2
Name: player_id, Length: 116, dtype: int64

In [69]:
display(goalies_2021.loc[(goalies_final_2021.team == "PHI") & (goalies_final_2021.date == "2021-01-18"), :])
goalies_to_keep_2021.loc[(goalies_final_2021.team == "PHI") & (goalies_final_2021.date == "2021-01-18"), :]

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,decision,n_completed_games,GA_60,SA_60,SV_60,total_shutout,SV_perc,avg_TOI
349,/e/elliobr01,Brian Elliott,35,2021,1,2021-01-18,PHI,BUF,1,L,,0,,,,0.0,,
585,/h/hartca01,Carter Hart,22,2021,3,2021-01-18,PHI,BUF,1,L,L,2,2.508711,33.616725,31.108014,0.0,0.925373,59.791667


Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,decision,GA,SA,SV,SV_perc,shutout,PIM,TOI
585,/h/hartca01,Carter Hart,22,2021,3,2021-01-18,PHI,BUF,1,L,L,4,22,18,0.818,0,2,28.15


## Choose goalie that gave up the most goals
If they gave up equal goals, select the one that had the most time on ice for the game.

In [107]:
# Make a copy of goalies data frame
goalies_to_keep_2021 = goalies_2021.loc[:, ['player_id', 'player_name', 'date', 'team', 'GA', 'TOI']]
goalies_to_keep_2021['keep_flag'] = 1

# Keep goalies that let in the most goals
ind_for_max_GA = goalies_to_keep_2021.groupby(['date', 'team'])['GA'].transform(max) == goalies_to_keep_2021['GA']
goalies_to_keep_2021 =  goalies_to_keep_2021[ind_for_max_GA]

# As a tiebreaker, keep goalies that had the highest time on ice
ind_for_max_TOI = goalies_to_keep_2021.groupby(['date', 'team'])['TOI'].transform(max) == goalies_to_keep_2021['TOI']
goalies_to_keep_2021 =  goalies_to_keep_2021[ind_for_max_TOI]

In [102]:
# See the number of rows that were dropped for the join
print(goalies_2021.shape)
print(goalies_to_keep_2021.shape)

(1852, 18)
(1736, 7)


In [103]:
# Sanity check
sum(goalies_to_keep_2021.groupby(["date", "team"])["player_id"].nunique() >= 2)

0

In [111]:
# Get rid  of GA and TOI columns to prepare for join
goalies_to_keep_2021.drop(columns = ['GA', 'TOI'],  inplace=True)

# Join to the cumulative goalie statistics data frame
goalies_final_2021 = goalies_final_2021.merge(goalies_to_keep_2021, how =  'left', on = ['player_id', 'player_name', 'date', 'team'])

# Only keep rows where keep_flag == 1. Drop the other rows
goalies_final_2021 = goalies_final_2021.loc[goalies_final_2021['keep_flag'] ==  1, :]

# Drop the keep flag column
goalies_final_2021.drop(columns='keep_flag', inplace=True)

In [90]:
# Make sure number of rows match the above again 
goalies_final_2021.shape

(1736, 21)

## Join the goalie data to the skater data for each skater/game combination

In [98]:
# See the initial number of rows that we should expect the final df to also be
skaters_final_2021.shape

(31226, 31)

In [99]:
pd.merge(skaters_final_2021, goalies_final_2021, how = 'left', left_on = ['date', 'team', 'opponent'], right_on = ['date', 'opponent', 'team']).columns

Index(['player_id_x', 'player_name_x', 'age_x', 'season_x', 'game_num_x',
       'date', 'team_x', 'opponent_x', 'home_away_status_x', 'result_x', 'G',
       'n_completed_games_x', 'G_60', 'A_60', 'P_60', 'rating_60', 'PIM_60',
       'EVG_60', 'PPG_60', 'SHG_60', 'GWG_60', 'EVA_60', 'PPA_60', 'SHA_60',
       'S_60', 'shifts_60', 'HIT_60', 'BLK_60', 'FOW_60', 'FOL_60',
       'avg_TOI_x', 'player_id_y', 'player_name_y', 'age_y', 'season_y',
       'game_num_y', 'team_y', 'opponent_y', 'home_away_status_y', 'result_y',
       'decision', 'n_completed_games_y', 'GA_60', 'SA_60', 'SV_60',
       'total_shutout', 'SV_perc', 'avg_TOI_y', 'keep_flag_x', 'keep_flag_y',
       'keep_flag'],
      dtype='object')