# Feature Engineering - Skater + Goalie Data

In [1]:
import pandas as pd
import numpy as np

## Read in data from hockey reference

In [3]:
# Skater data
skaters_2021 = pd.read_csv("../data/stats/skaters_2021.csv")
skaters_2022 = pd.read_csv("../data/stats/skaters_2022.csv")

# Goalie data
goalies_2021 = pd.read_csv("../data/stats/goalies_2021.csv")
goalies_2022 = pd.read_csv("../data/stats/goalies_2022.csv")

In [4]:
#skaters_2021.isnull().sum(axis = 0)
#skaters_2022.isnull().sum(axis = 0)
#goalies_2021.isnull().sum(axis = 0)
#goalies_2022.isnull().sum(axis = 0)

## Function for creating skaters final data
Should create all the cumulative stats for the skaters.

### Setting up a place to store cumulative computed features
Ideally, this will basically be the data frame containing the rows/observations to train a model on. We can include all of the same information like player ID, game number, team, etc. But for the features like shots or TOI, we will compute the cumulative sums/averages up to the start of each game and store them here. This way, we can keep separate the raw game by game data from the cumulative data.

In [29]:
display(goalies_2021.sort_values(['player_id', 'game_num']).head(10))

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,decision,GA,SA,SV,SV_perc,shutout,PIM,TOI
0,/a/allenja01,Jake Allen,30,2021,1,2021-01-18,MTL,EDM,0,W,W,1,26,25,0.962,0,0,59.65
1,/a/allenja01,Jake Allen,30,2021,2,2021-01-21,MTL,VAN,0,W,W,3,17,14,0.824,0,0,59.766667
2,/a/allenja01,Jake Allen,30,2021,3,2021-01-30,MTL,CGY,1,L,L,1,32,31,0.969,0,0,58.85
3,/a/allenja01,Jake Allen,30,2021,4,2021-02-02,MTL,VAN,1,W,W,3,39,36,0.923,0,0,59.666667
4,/a/allenja01,Jake Allen,30,2021,5,2021-02-06,MTL,OTT,0,W,W,1,35,34,0.971,0,0,60.0
5,/a/allenja01,Jake Allen,30,2021,6,2021-02-11,MTL,EDM,1,L,L,3,31,28,0.903,0,0,60.0
6,/a/allenja01,Jake Allen,30,2021,7,2021-02-21,MTL,OTT,0,L-OT,O,3,39,36,0.923,0,0,63.5
7,/a/allenja01,Jake Allen,30,2021,8,2021-02-27,MTL,WPG,0,L-OT,O,2,21,19,0.905,0,0,60.6
8,/a/allenja01,Jake Allen,30,2021,9,2021-03-04,MTL,WPG,1,L-OT,O,4,27,23,0.852,0,0,64.033333
9,/a/allenja01,Jake Allen,30,2021,10,2021-03-11,MTL,CGY,0,L,L,2,29,27,0.931,0,0,59.383333


In [21]:
def create_cumulative_statistics(indiv_games, player_type, r_window = None):
    # Make sure data is in correct order
    indiv_games = indiv_games.sort_values(['player_id', 'game_num'])
    
    if player_type == "skater":
        # Create new data frames containing only certain columns from the old data frames
        cumulative_games = indiv_games.loc[:, ["player_id", "player_name", "age", "season", "game_num", "date", 
                                                        "team", "opponent", "home_away_status", "result", "G"]]
        
        # Assign the number of completed games as 1 less than game num
        cumulative_games['n_completed_games'] = cumulative_games['game_num'] - 1

        # For each numerical statistic, compute the sum of that statistic for all games completed by the player prior to the game that is about to start
        prior_stats_totals = indiv_games.apply(lambda x: indiv_games[(indiv_games.player_id == x.player_id) & 
                                                                     (indiv_games.game_num < x.game_num) & 
                                                                     (indiv_games.game_num >= x.game_num - r_window)][['G', 'A', 'P', 'rating', 'PIM', 'EVG', 'PPG', 'SHG', 'GWG', 'EVA', 'PPA', 'SHA', 'S', 'shifts', 'TOI', 'HIT', 'BLK', 'FOW', 'FOL']].agg(np.sum), axis=1)
        
        # Select all columns except TOI. We will compute the stat/60 min for the other columns, but leave the total TOI column unchanged.
        cols = [col for col in prior_stats_totals.columns if col != 'TOI']

        # Divide to compute the per 60 minute statistics
        prior_stats_60 = prior_stats_totals[cols].div(prior_stats_totals['TOI'], axis = 0)
        prior_stats_60 = 60 * prior_stats_60
        prior_stats_60.columns = [col + '_60' for col in cols]

        # Add total TOI back in
        prior_stats_final = prior_stats_60.copy()
        prior_stats_final['total_TOI'] = prior_stats_totals['TOI']

        # Concatenate with cumulative games
        cumulative_games = pd.concat([cumulative_games, prior_stats_final], axis = 1)

        # Calculate average TOI column
        # This gives us a final data set
        if(r_window):
            cumulative_games['avg_TOI'] = cumulative_games['total_TOI'] / r_window
        else:
            cumulative_games['avg_TOI'] = cumulative_games['total_TOI'] / cumulative_games['n_completed_games']
        
        
    elif player_type == "goalie":
        # Create new data frames containing only certain columns from the old data frames
        cumulative_games = indiv_games.loc[:, ["player_id", "player_name", "age", "season", "game_num", "date", 
                                                    "team", "opponent", "home_away_status", "result", "decision"]]
    
        # Compute Save Percentage = sum(Saves) / sum(Shots Against)
        cumulative_games[["SA", "SV"]] = indiv_games.groupby("player_id")[["SA", "SV"]].cumsum()
        cumulative_games["SV_perc"] = cumulative_games.SV / cumulative_games.SA
        cumulative_games = cumulative_games.drop(columns = ["SV", "SA"])
        
        # Compute goals against average?
        
    else:
        # If wrong player type, print warning message 
        print("The player type is not known. Should be one of either 1) skater 2) goalie.")
        return
    
    return cumulative_games

In [22]:
# Use function to create final data frames for skaters and goalies
skaters_final_2021 = create_cumulative_statistics(skaters_2021, player_type = "skater", r_window = 20)
#skaters_final_2022 = create_cumulative_statistics(skaters_2022, player_type = "skater", r_window = 20)
#goalies_final_2021 = create_cumulative_statistics(goalies_2021, player_type = "goalie", r_window = 20)
#goalies_final_2022 = create_cumulative_statistics(goalies_2022, player_type = "goalie", r_window = 20)

In [28]:
skaters_final_2021.head()

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,...,PPA_60,SHA_60,S_60,shifts_60,HIT_60,BLK_60,FOW_60,FOL_60,total_TOI,avg_TOI
0,/a/abramvi01,Vitaly Abramov,22,2021,1,2021-05-05,OTT,MTL,1,W,...,,,,,,,,,0.0,0.0
1,/a/abramvi01,Vitaly Abramov,23,2021,2,2021-05-08,OTT,WPG,0,W,...,0.0,0.0,0.0,74.740484,0.0,0.0,0.0,0.0,9.633333,0.481667
2,/a/acciano01,Noel Acciari,29,2021,1,2021-01-17,FLA,CHI,1,W,...,,,,,,,,,0.0,0.0
3,/a/acciano01,Noel Acciari,29,2021,2,2021-01-19,FLA,CHI,1,W,...,0.0,0.0,4.100228,86.104784,8.200456,4.100228,20.501139,41.002278,14.633333,0.731667
4,/a/acciano01,Noel Acciari,29,2021,3,2021-01-26,FLA,CBJ,0,W,...,0.0,0.0,2.006689,88.294314,10.033445,8.026756,20.06689,40.133779,29.9,1.495


## Issue: On some days, a goalie played bad and was pulled from the game
This matters because once the 2nd goalie is put in, there are now 2 goalies that have played in the exact same game. Therefore, when we left join the goalie information to the skater information, we will be duplicating a lot of rows in the skater data frame (becuase there will be 2 rows in the goalie data frame that have the same date and team information as 1 row in the skater data. 

Extending/elaborating on this, we do not know which goalie the skaters scored on during this game. This poses a problem since we want to include the strength of the opposing teams goalie in the training data. If we don't know which goalie was in the game for the times that a player was on the ice, there is some innacuracy here.

There are 116 cases of a goalie getting pulled in the 2021 season.

Here is an example.

To address these situations, I would probably default to picking the goalie that let in the most goals during the game. If 2 goalies gave up the same number of goals, take the goalie that had more TOI?

In [45]:
#goalies_final_2021.groupby(["date", "team"])["player_id"].count()
pulled_goalies = goalies_final_2021.groupby(["date", "team"])["player_id"].nunique()
pulled_goalies[pulled_goalies >= 2]

date        team
2021-01-15  PIT     2
            STL     2
2021-01-16  SJS     2
2021-01-18  PHI     2
2021-01-19  BUF     2
                   ..
2021-05-06  EDM     2
            MTL     2
2021-05-08  NYR     2
2021-05-09  OTT     2
2021-05-10  NYI     2
Name: player_id, Length: 116, dtype: int64

In [46]:
goalies_2021.loc[(goalies_2021.team == "PIT") & (goalies_2021.date == "2021-01-15"), :]

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,decision,GA,SA,SV,SV_perc,shutout,PIM,TOI
283,/d/desmica01,Casey DeSmith,29,2021,1,2021-01-15,PIT,PHI,0,L,,1,13,12,0.923,0,0,47.75
752,/j/jarrytr01,Tristan Jarry,25,2021,2,2021-01-15,PIT,PHI,0,L,L,3,6,3,0.5,0,0,11.5
