# Feature Engineering - Skater + Goalie Data

In [6]:
import pandas as pd
import numpy as np

## Read in data from hockey reference

In [7]:
# Per-game skater logs, one CSV per season
skaters_2021, skaters_2022 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/stats/skaters_2021.csv", "../data/stats/skaters_2022.csv")
)

# Per-game goalie logs, one CSV per season
goalies_2021, goalies_2022 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/stats/goalies_2021.csv", "../data/stats/goalies_2022.csv")
)

In [8]:
#skaters_2021.isnull().sum(axis = 0)
#skaters_2022.isnull().sum(axis = 0)
#goalies_2021.isnull().sum(axis = 0)
#goalies_2022.isnull().sum(axis = 0)

## Function for creating skaters final data
Should create all the cumulative stats for the skaters. We can include all of the same information like player ID, game number, team, etc. But for the features like shots or TOI, we will compute the cumulative sums/averages up to the start of each game and store them here. This way, we can keep separate the raw game by game data from the cumulative data.

In [9]:
def create_skater_cumulative_stats(indiv_games, r_window = None):
    """Build pre-game per-60-minute features for every skater game.

    For each row (one skater in one game), the stats of that player's earlier
    games are summed and converted to per-60-minute rates. The rate columns and
    ``avg_TOI`` therefore only use games with a smaller ``game_num`` than the
    row itself; the copied ``G`` column is the raw value of the row's own game.

    Parameters
    ----------
    indiv_games : pandas.DataFrame
        One row per skater per game. Must contain the identifier columns listed
        in ``id_cols`` and the numeric columns listed in ``stat_cols`` below.
        Missing stat values are counted as 0 when summing.
    r_window : int, optional
        Only earlier games with ``game_num >= game_num - r_window`` are summed.
        ``None`` (the default) sums every earlier game of the player.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``indiv_games``: the identifier columns,
        ``n_completed_games`` (``game_num - 1``), one ``<stat>_60`` column for
        every stat except TOI, and ``avg_TOI``. Rows with no earlier game get
        NaN rates because they divide 0 by 0.

    Raises
    ------
    ValueError
        If ``r_window`` is given but smaller than 1.
    """
    if r_window is not None and r_window < 1:
        raise ValueError('r_window must be at least 1 (or None to use every prior game).')

    id_cols = ["player_id", "player_name", "age", "season", "game_num", "date",
               "team", "opponent", "home_away_status", "result", "G"]
    stat_cols = ['G', 'A', 'P', 'rating', 'PIM', 'EVG', 'PPG', 'SHG', 'GWG',
                 'EVA', 'PPA', 'SHA', 'S', 'shifts', 'TOI', 'HIT', 'BLK', 'FOW', 'FOL']

    # Explicit copy so the new columns are never written onto a view of the input
    cumulative_games = indiv_games.loc[:, id_cols].copy()

    # Assign the number of completed games as 1 less than game num
    cumulative_games['n_completed_games'] = cumulative_games['game_num'] - 1

    # Sum every stat over the prior games in the window, one player at a time.
    # Within a player the games are ordered by game number, so a window sum is the
    # difference of two running totals instead of a scan of the whole data frame.
    game_num_all = indiv_games['game_num'].to_numpy()
    values_all = indiv_games[stat_cols].fillna(0).to_numpy(dtype = float)
    totals = np.zeros_like(values_all)
    for positions in indiv_games.groupby('player_id').indices.values():
        order = positions[np.argsort(game_num_all[positions], kind = 'stable')]
        game_nums = game_num_all[order]
        # running[k] holds the totals of the player's first k games (row 0 is all zeros)
        running = np.vstack([np.zeros((1, values_all.shape[1])), values_all[order].cumsum(axis = 0)])
        # Number of the player's games with a strictly smaller game number
        window_end = np.searchsorted(game_nums, game_nums, side = 'left')
        if r_window is None:
            window_start = np.zeros_like(window_end)
        else:
            # Number of the player's games that are too old to be inside the window
            window_start = np.searchsorted(game_nums, game_nums - r_window, side = 'left')
        totals[order] = running[window_end] - running[window_start]
    prior_stats_totals = pd.DataFrame(totals, index = indiv_games.index, columns = stat_cols)

    # Convert every stat except TOI into a per 60 minute rate
    rate_cols = [col for col in stat_cols if col != 'TOI']
    prior_stats_60 = 60 * prior_stats_totals[rate_cols].div(prior_stats_totals['TOI'], axis = 0)
    prior_stats_60.columns = [col + '_60' for col in rate_cols]

    # Concatenate with cumulative games
    cumulative_games = pd.concat([cumulative_games, prior_stats_60], axis = 1)

    # Average TOI over the games that were actually summed: all completed games,
    # capped at r_window. Capping (instead of switching to r_window as soon as
    # game_num == r_window) keeps the divisor right when only r_window - 1 games exist.
    games_in_window = cumulative_games['n_completed_games']
    if r_window is not None:
        games_in_window = games_in_window.clip(upper = r_window)
    cumulative_games['avg_TOI'] = prior_stats_totals['TOI'] / games_in_window

    return cumulative_games

In [10]:
def create_goalie_cumulative_stats(indiv_games, r_window = None):
    """Build pre-game per-60-minute features for every goalie game.

    For each row (one goalie in one game), the stats of that goalie's earlier
    games are summed. Goals against, shots against and saves become
    per-60-minute rates; shutouts stay a plain total. All derived columns only
    use games with a smaller ``game_num`` than the row itself.

    Parameters
    ----------
    indiv_games : pandas.DataFrame
        One row per goalie per game. Must contain the identifier columns listed
        in ``id_cols`` and the numeric columns listed in ``stat_cols`` below.
        Missing stat values are counted as 0 when summing.
    r_window : int, optional
        Only earlier games with ``game_num >= game_num - r_window`` are summed.
        ``None`` (the default) sums every earlier game of the goalie.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``indiv_games``: the identifier columns,
        ``n_completed_games`` (``game_num - 1``), ``GA_60``, ``SA_60``,
        ``SV_60``, ``total_shutout``, ``SV_perc`` and ``avg_TOI``. Rows with no
        earlier game get NaN rates because they divide 0 by 0.

    Raises
    ------
    ValueError
        If ``r_window`` is given but smaller than 1.
    """
    if r_window is not None and r_window < 1:
        raise ValueError('r_window must be at least 1 (or None to use every prior game).')

    id_cols = ["player_id", "player_name", "age", "season", "game_num", "date",
               "team", "opponent", "home_away_status", "result", "decision"]
    stat_cols = ['GA', 'SA', 'SV', 'shutout', 'TOI']

    # Explicit copy so the new columns are never written onto a view of the input
    cumulative_games = indiv_games.loc[:, id_cols].copy()

    # Assign the number of completed games as 1 less than game num
    cumulative_games['n_completed_games'] = cumulative_games['game_num'] - 1

    # Sum every stat over the prior games in the window, one goalie at a time.
    # Within a goalie the games are ordered by game number, so a window sum is the
    # difference of two running totals instead of a scan of the whole data frame.
    game_num_all = indiv_games['game_num'].to_numpy()
    values_all = indiv_games[stat_cols].fillna(0).to_numpy(dtype = float)
    totals = np.zeros_like(values_all)
    for positions in indiv_games.groupby('player_id').indices.values():
        order = positions[np.argsort(game_num_all[positions], kind = 'stable')]
        game_nums = game_num_all[order]
        # running[k] holds the totals of the goalie's first k games (row 0 is all zeros)
        running = np.vstack([np.zeros((1, values_all.shape[1])), values_all[order].cumsum(axis = 0)])
        # Number of the goalie's games with a strictly smaller game number
        window_end = np.searchsorted(game_nums, game_nums, side = 'left')
        if r_window is None:
            window_start = np.zeros_like(window_end)
        else:
            # Number of the goalie's games that are too old to be inside the window
            window_start = np.searchsorted(game_nums, game_nums - r_window, side = 'left')
        totals[order] = running[window_end] - running[window_start]
    prior_stats_totals = pd.DataFrame(totals, index = indiv_games.index, columns = stat_cols)

    # Convert every stat except shutout and TOI into a per 60 minute rate
    rate_cols = [col for col in stat_cols if col not in ['shutout', 'TOI']]
    prior_stats_final = 60 * prior_stats_totals[rate_cols].div(prior_stats_totals['TOI'], axis = 0)
    prior_stats_final.columns = [col + '_60' for col in rate_cols]

    # Shutouts are kept as a plain total over the window
    prior_stats_final['total_shutout'] = prior_stats_totals['shutout']

    # Save percentage over the window (the per-60 scaling cancels out in the ratio)
    prior_stats_final['SV_perc'] = prior_stats_final['SV_60'] / prior_stats_final['SA_60']

    # Concatenate with cumulative games
    cumulative_games = pd.concat([cumulative_games, prior_stats_final], axis = 1)

    # Average TOI over the games that were actually summed: all completed games,
    # capped at r_window. Capping (instead of switching to r_window as soon as
    # game_num == r_window) keeps the divisor right when only r_window - 1 games exist.
    games_in_window = cumulative_games['n_completed_games']
    if r_window is not None:
        games_in_window = games_in_window.clip(upper = r_window)
    cumulative_games['avg_TOI'] = prior_stats_totals['TOI'] / games_in_window

    return cumulative_games



In [11]:
# Build the rolling feature tables for both seasons, using each player's previous 15 games
skaters_final_2021, skaters_final_2022 = (
    create_skater_cumulative_stats(season_games, r_window = 15)
    for season_games in (skaters_2021, skaters_2022)
)
goalies_mult_per_game_2021, goalies_mult_per_game_2022 = (
    create_goalie_cumulative_stats(season_games, r_window = 15)
    for season_games in (goalies_2021, goalies_2022)
)

In [None]:
# Preview the first rows of the 2022 skater and goalie feature tables
display(
    skaters_final_2022.head(3),
    goalies_mult_per_game_2022.head(3),
)

# Additional processing for goalies

### Issue: On some days, multiple goalies played in the same game
These situations arise when a goalie gets pulled after playing badly or when a goalie gets injured during the game. This is an issue because when we left join the goalie information to the skater information, we will duplicate many rows in the skater data frame (e.g., there will be 2 rows in the goalie data frame that have the same date and team information as 1 row in the skater data).

Duplicating skater rows in this way would make the data inaccurate. As a result, we need to identify only one goalie per game to assign to each combination of skater/game. For new predictions (games that have yet to occur), the goalie will be whichever goalie is slated to start for that game. For old observations and situations described above, we have to choose how to reduce the number of goalies to only 1 per game.

The main thing we want to capture when training the model is which goalie a skater may have an advantage over. We want to know which goalies a particular skater is going to score a lot of goals against. Because of this, these special training observations should be assigned a goalie using the following criteria:
1. Choose the goalie that gave up the most goals during the game. This maximizes the probability that any skater on the opposing team scored their respective goal against this goalie.
2. If multiple goalies gave up the same number of goals in a particular game, choose the goalie that had the most time on ice for that game. We may be able to assume that the majority of a skater's interactions with a goalie were with the goalie that had the most TOI. Hopefully, this is more reflective of why they were or were not able to score a goal in the game.

There are not many observations where the second criterion should be needed. If the two criteria together still cannot narrow a game down to only 1 goalie, an error is thrown in the code.

An example of multiple goalies playing the same game is shown below in comments.

In [None]:
#mult_goalies = goalies_mult_per_game_2021.groupby(["date", "team"])["player_id"].nunique()
#mult_goalies[mult_goalies >= 2]
#display(goalies_2021.loc[(goalies_mult_per_game_2021.team == "PHI") & (goalies_mult_per_game_2021.date == "2021-01-18"), :])

## Create function to join goalie df to skater df
This function should also handle situations where multiple goalies played in one game.

In [12]:
def combine_skater_goalie(skater_cumulative, goalie, goalie_cumulative):
    """Attach one opposing goalie's pre-game features to every skater game.

    When several goalies of a team appear on the same date, exactly one is
    kept: the one with the most goals against, with ties broken by the most
    time on ice. The kept goalie's row from ``goalie_cumulative`` is then
    joined to each skater row whose ``opponent`` is the goalie's ``team`` (and
    whose ``team`` is the goalie's ``opponent``) on the same ``date``.

    Parameters
    ----------
    skater_cumulative : pandas.DataFrame
        Output of ``create_skater_cumulative_stats``.
    goalie : pandas.DataFrame
        Raw per-game goalie data; the columns ``player_id``, ``player_name``,
        ``date``, ``team``, ``GA`` and ``TOI`` are used to choose the goalie.
    goalie_cumulative : pandas.DataFrame
        Output of ``create_goalie_cumulative_stats`` for the same games.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``skater_cumulative``. Skater identifier columns are
        prefixed ``s_``, goalie identifier columns ``g_``; the rate columns keep
        their names. Skater rows without a matching goalie keep NaN in the
        goalie columns (left join).

    Raises
    ------
    Exception
        If a (date, team) still has more than one goalie after both criteria,
        if the kept goalies do not match the rows found in
        ``goalie_cumulative``, or if the join changes the number of skater rows.

    Notes
    -----
    Prints how many goalie rows were discarded.
    """
    game_keys = ['date', 'team']
    goalie_keys = ['player_id', 'player_name', 'date', 'team']

    # Work on the few columns needed to pick a goalie
    goalies_to_keep = goalie.loc[:, goalie_keys + ['GA', 'TOI']]

    # Keep goalies that let in the most goals
    has_max_GA = goalies_to_keep.groupby(game_keys)['GA'].transform('max') == goalies_to_keep['GA']
    goalies_to_keep = goalies_to_keep[has_max_GA]

    # As a tiebreaker, keep goalies that had the highest time on ice
    has_max_TOI = goalies_to_keep.groupby(game_keys)['TOI'].transform('max') == goalies_to_keep['TOI']
    goalies_to_keep = goalies_to_keep[has_max_TOI]

    # Check to make sure there is only 1 goalie per game after reducing via conditions above
    if (goalies_to_keep.groupby(game_keys)['player_id'].nunique() >= 2).any():
        raise Exception('At least one game has multiple goalies. Join to skaters will not work correctly.')

    # Only the identifying columns are needed to select rows of the cumulative data frame
    goalies_to_keep = goalies_to_keep[goalie_keys]

    # Keep only the cumulative rows that belong to a selected goalie
    goalies_final = pd.merge(goalie_cumulative, goalies_to_keep, how = 'inner', on = goalie_keys)

    # Check that the correct rows from the cumulative goalie information were selected
    if goalies_to_keep.shape[0] != goalies_final.shape[0]:
        raise Exception('Number of rows in goalie cumulative stats data frame does not match number expected based on individual game data frame.')

    # See the number of rows that were dropped before the join
    print('Number of rows removed after choosing 1 goalie per game', goalie.shape[0] - goalies_final.shape[0])

    # Append the cumulative goalie information to cumulative skater information.
    # The keys are crossed on purpose: a skater faces the goalie of the opposing team.
    combined_skater_goalie = pd.merge(skater_cumulative, goalies_final, how = 'left',
                                      left_on = ['date', 'team', 'opponent'],
                                      right_on = ['date', 'opponent', 'team'])

    # Ensure the join retained the exact same number of rows. For every skater/game combination, there should only be one corresponding opposing goalie
    if combined_skater_goalie.shape[0] != skater_cumulative.shape[0]:
        raise Exception('Join between goalies and skaters did not produce the same number of rows as original skater data frame.')

    # Select only the required columns (_x comes from the skater side, _y from the goalie side)
    cols_to_keep = ['player_id_x', 'player_name_x', 'age_x', 'season_x', 'game_num_x', 'date', 'team_x', 'opponent_x', 'home_away_status_x',
                    'result_x', 'G', 'n_completed_games_x', 'G_60', 'A_60', 'P_60', 'rating_60', 'PIM_60', 'EVG_60', 'PPG_60', 'SHG_60', 'GWG_60',
                    'EVA_60', 'PPA_60', 'SHA_60', 'S_60', 'shifts_60', 'HIT_60', 'BLK_60', 'FOW_60', 'FOL_60', 'avg_TOI_x',
                    'player_id_y', 'player_name_y', 'age_y', 'game_num_y', 'decision', 'n_completed_games_y',
                    'GA_60', 'SA_60', 'SV_60', 'total_shutout', 'SV_perc', 'avg_TOI_y']

    # Replace the merge suffixes with explicit skater (s_) and goalie (g_) prefixes.
    # Columns that exist on one side only (the rate columns, G, decision, date) keep their names.
    new_names = {
        'player_id_x': 's_player_id',
        'player_name_x': 's_name',
        'age_x': 's_age',
        'season_x': 'season',
        'game_num_x': 's_game_num',
        'team_x': 's_team',
        'opponent_x': 's_opponent',
        'home_away_status_x': 's_home_away_status',
        'result_x': 's_result',
        'n_completed_games_x': 's_n_completed_games',
        'avg_TOI_x': 's_avg_TOI',
        'player_id_y': 'g_player_id',
        'player_name_y': 'g_name',
        'age_y': 'g_age',
        'game_num_y': 'g_game_num',
        'n_completed_games_y': 'g_n_completed_games',
        'avg_TOI_y': 'g_avg_TOI',
    }

    return combined_skater_goalie.loc[:, cols_to_keep].rename(columns = new_names)

In [13]:
# Attach the opposing goalie's rolling stats to every skater game, season by season
player_game_combos_2021 = combine_skater_goalie(
    skater_cumulative = skaters_final_2021,
    goalie = goalies_2021,
    goalie_cumulative = goalies_mult_per_game_2021,
)
player_game_combos_2022 = combine_skater_goalie(
    skater_cumulative = skaters_final_2022,
    goalie = goalies_2022,
    goalie_cumulative = goalies_mult_per_game_2022,
)

Number of rows removed after choosing 1 goalie per game 116
Number of rows removed after choosing 1 goalie per game 194


In [14]:
# Spot check: every DET skater on 2022-03-10 should be paired with the same opposing goalie
player_game_combos_2022.loc[
    player_game_combos_2022['date'].eq('2022-03-10') & player_game_combos_2022['s_team'].eq('DET'),
    ['s_name', 's_game_num', 'date', 's_team', 's_opponent', 's_result', 'G', 'S_60',
     'g_name', 'g_game_num', 'decision', 'GA_60', 'SV_perc'],
]

Unnamed: 0,s_name,s_game_num,date,s_team,s_opponent,s_result,G,S_60,g_name,g_game_num,decision,GA_60,SV_perc
2885,Tyler Bertuzzi,50,2022-03-10,DET,MIN,L-SO,0,7.546753,Cam Talbot,36,W,3.487941,0.889151
12147,Adam Erne,57,2022-03-10,DET,MIN,L-SO,0,8.375571,Cam Talbot,36,W,3.487941,0.889151
12364,Robby Fabbri,56,2022-03-10,DET,MIN,L-SO,0,5.598408,Cam Talbot,36,W,3.487941,0.889151
14046,Sam Gagner,58,2022-03-10,DET,MIN,L-SO,0,7.930243,Cam Talbot,36,W,3.487941,0.889151
19039,Filip Hronek,54,2022-03-10,DET,MIN,L-SO,0,5.416342,Cam Talbot,36,W,3.487941,0.889151
24711,Dylan Larkin,53,2022-03-10,DET,MIN,L-SO,0,9.431285,Cam Talbot,36,W,3.487941,0.889151
25152,Nick Leddy,53,2022-03-10,DET,MIN,L-SO,0,2.867612,Cam Talbot,36,W,3.487941,0.889151
31016,Vladislav Namestnikov,56,2022-03-10,DET,MIN,L-SO,0,4.796447,Cam Talbot,36,W,3.487941,0.889151
32283,Jordan Oesterle,31,2022-03-10,DET,MIN,L-SO,1,4.280618,Cam Talbot,36,W,3.487941,0.889151
35882,Michael Rasmussen,56,2022-03-10,DET,MIN,L-SO,0,7.483759,Cam Talbot,36,W,3.487941,0.889151


### Potential sanity checks to include
1. Each group of date and team should have 18 rows (teams dress 18 skaters and 2 goalies per game)
2. Check to see how many rows are left when both s_game_num >= 20 and g_game_num >= ~10??

In [15]:
# Distribution of the number of skater rows per team and date (should never exceed 18)
display(
    player_game_combos_2021
    .groupby(['s_team', 'date'])['s_name']
    .count()
    .rename('s_name_count')
    .value_counts()
)
print()
display(
    player_game_combos_2022
    .groupby(['s_team', 'date'])['s_name']
    .count()
    .rename('s_name_count')
    .value_counts()
)


18    1717
17      17
16       1
15       1
Name: s_name_count, dtype: int64




18    2603
17      15
16       5
15       1
Name: s_name_count, dtype: int64

In [16]:
# Look at the 2022 team/date combinations that only have 15 skaters recorded
test = (
    player_game_combos_2022
    .groupby(['s_team', 'date'])['s_name']
    .agg(s_name_count = 'count')
    .reset_index()
    .loc[lambda counts: counts['s_name_count'] == 15, :]
)
player_game_combos_2022.merge(test, how = 'inner', on = ['s_team', 'date'])

Unnamed: 0,s_player_id,s_name,s_age,season,s_game_num,date,s_team,s_opponent,s_home_away_status,s_result,...,g_game_num,decision,g_n_completed_games,GA_60,SA_60,SV_60,total_shutout,SV_perc,g_avg_TOI,s_name_count
0,/c/carlslu01,Lucas Carlsson,24,2022,13,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
1,/d/denisgr01,Grigori Denisenko,21,2022,1,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
2,/d/duclaan01,Anthony Duclair,26,2022,21,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
3,/h/heponal01,Aleksi Heponiemi,22,2022,2,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
4,/h/hornqpa01,Patric Hornqvist,34,2022,29,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
5,/h/huberjo01,Jonathan Huberdeau,28,2022,29,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
6,/j/juoleol01,Olli Juolevi,23,2022,4,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
7,/k/kiersma01,Matt Kiersted,23,2022,5,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
8,/l/lundean01,Anton Lundell,20,2022,25,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15
9,/l/luostee01,Eetu Luostarinen,23,2022,26,2021-12-16,FLA,LAK,1,L,...,17,W,16,2.029202,30.307108,28.277906,2.0,0.933045,61.107778,15


In [18]:
# Rows that remain when both the skater and the goalie are past a minimum game number
s_games = 15
g_games = 10

# 2021 season
print(player_game_combos_2021.shape)
test2 = player_game_combos_2021[
    player_game_combos_2021['s_game_num'].gt(s_games) & player_game_combos_2021['g_game_num'].gt(g_games)
]
print(test2.shape)
print('Rows lost:', player_game_combos_2021.shape[0] - test2.shape[0])
print('Percent rows lost:', 100 * (player_game_combos_2021.shape[0] - test2.shape[0]) / player_game_combos_2021.shape[0])

print()

# 2022 season
print(player_game_combos_2022.shape)
test2 = player_game_combos_2022[
    player_game_combos_2022['s_game_num'].gt(s_games) & player_game_combos_2022['g_game_num'].gt(g_games)
]
print(test2.shape)
print('Rows lost:', player_game_combos_2022.shape[0] - test2.shape[0])
print('Percent rows lost:', 100 * (player_game_combos_2022.shape[0] - test2.shape[0]) / player_game_combos_2022.shape[0])


(31226, 43)
(15188, 43)
Rows lost: 16038
Percent rows lost: 51.36104528277717

(47204, 43)
(29041, 43)
Rows lost: 18163
Percent rows lost: 38.477671383781036


In [20]:
# (rows, columns) of the combined data frame for each season, one per line
print(player_game_combos_2021.shape, player_game_combos_2022.shape, sep = '\n')

(31226, 43)
(47204, 43)


Might want to move this to EDA.
Models could be
1. Early skater model (<= 20 games)
2. Early goalie model (<= 5-10 games)
3. Early skater and goalie model (both of the above)
4. Rest of season model (none of the above)

## Write these files to the data folder

In [21]:
# Save one combined per-60 / 15-game rolling file per season.
# NOTE(review): the row index is written as an unnamed first column — confirm downstream readers expect it.
player_game_combos_2021.to_csv(path_or_buf = '../data/stats/past_seasons/combined_per60_roll15_2021.csv')
player_game_combos_2022.to_csv(path_or_buf = '../data/stats/past_seasons/combined_per60_roll15_2022.csv')