In [1]:
import json
import pandas as pd
import os

Create a function to convert all events of every game into a pandas dataframe.

For this milestone, you will want to include events of the type “shots” and “goals”. You can ignore missed shots or blocked shots for now. For each event, you will want to include as features (at minimum): game time/period information, game ID, team information (which team took the shot), indicator if its a shot or a goal, the on-ice coordinates, the shooter and goalie name (don’t worry about assists for now), shot type, if it was on an empty net, and whether or not a goal was at even strength, shorthanded, or on the power play.


In [2]:
def convert_single_play_data(raw_data):
    single_play_data_list = []

    # print(season)
    for single_play in raw_data['plays']:
      event_type =  single_play['typeDescKey']
      event_code =  single_play['typeCode']
      home_team_id = raw_data['homeTeam']['id']
      away_team_id = raw_data['awayTeam']['id']
      event_data = {
          'event_type': event_type,
          'gameID': raw_data['id'],
          'gameType': raw_data['gameType'],
          'home': raw_data['homeTeam']['name']['default'],
          'home_id': home_team_id,
          'away': raw_data['awayTeam']['name']['default'],
          'away_id': away_team_id,
          'season': raw_data['season']
      }
      # if "shot" in event_type or event_type == "goal":
      if event_code in [505, 506, 507, 508]: #goals and shots codes according to https://gitlab.com/dword4/nhlapi/-/issues/110
        # get the game time/period information
        event_data['game_period'] = single_play['period']
        # get the on-ice coordinates
        event_data['x_coordinate'] = single_play['details'].get('xCoord', None)
        event_data['y_coordinate'] = single_play['details'].get('yCoord', None)

        # get the shot type
        event_data['shot_type'] = single_play['details'].get('shotType',None)

        event_data['shooter_id'] = single_play['details'].get('scoringPlayerId', None)
        event_data['goalie_id'] = single_play['details'].get('goalieInNetId', None)
        event_data['event_team'] = "home" if single_play['details'].get("eventOwnerTeamId", None) == home_team_id else "away"

        if event_type == 'goal':
          event_data['is_goal'] = True
          event_data['scoring_team'] = "home" if single_play['details'].get("eventOwnerTeamId", None) == home_team_id else "away"
          # Get if goal was empty, 
          # if home team scoring we check if away goalie was on ice using 1st digit in situation code (if digit is 1 then not an empty net)
          # if away team scoring we check if home goalie was on ice using 4th digit in situation code (if digit is 1 then not an empty net)
          event_data['is_emptyNet'] = not int(single_play['situationCode'][0]) if event_data['scoring_team'] == "home" else not int(single_play['situationCode'][3]) 
        else:
          event_data['is_goal'] = False

        single_play_data_list.append(event_data)

    # Converting the list of event data into a Pandas DataFrame
    single_play_df = pd.DataFrame(single_play_data_list)
    return single_play_df

In [3]:
def process_game_json(file_path):
    with open(file_path, 'r') as file:
        raw_data = json.load(file)
    return convert_single_play_data(raw_data)

def concatenate_all_games_data(dataset_root_dir):
    all_games_data = []

    for root, dirs, files in os.walk(dataset_root_dir):
        print(f'Processing directory: {root}')  # Debugging information
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                # print('Processing file: {file_path}')
                game_data_df = process_game_json(file_path)
                all_games_data.append(game_data_df)

    # Concatenate all individual game data DataFrames into a single DataFrame
    all_games_df = pd.concat(all_games_data, ignore_index=True)

    return all_games_df

# Usage:
dataset_root_dir = 'new_data'
all_games_df = concatenate_all_games_data(dataset_root_dir)
all_games_df

Processing directory: new_data


Unnamed: 0,event_type,gameID,gameType,home,home_id,away,away_id,season,game_period,x_coordinate,y_coordinate,shot_type,shooter_id,goalie_id,event_team,is_goal,scoring_team,is_emptyNet
0,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,37,37,slap,,8475852.0,away,False,,
1,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,72,16,wrist,,8475852.0,away,False,,
2,blocked-shot,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,67,-4,,,,home,False,,
3,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,55,14,wrist,,8475852.0,away,False,,
4,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,-81,-12,wrist,,8480925.0,home,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324985,shot-on-goal,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,70,-32,wrist,,8480382.0,away,False,,
324986,blocked-shot,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,59,-7,,,,home,False,,
324987,blocked-shot,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,-76,4,,,,away,False,,
324988,shot-on-goal,2022020520,2,Avalanche,21,Canadiens,8,20222023,4,-81,3,wrist,,8480382.0,away,False,,


In [4]:
all_games_df.is_emptyNet.value_counts()

is_emptyNet
False    16980
True       983
Name: count, dtype: int64

In [5]:
all_games_df.to_csv('data/all_new_game_data.csv', index=False)

In [6]:
## Process Features
from feature_engineering import FeatureEngineering

In [7]:
feat_eng = FeatureEngineering(path_to_tidy_data='data/all_new_game_data.csv')

In [8]:
feat_eng.tranform()

In [9]:
df = feat_eng.df

In [10]:
df

Unnamed: 0,event_type,gameID,gameType,home,home_id,away,away_id,season,game_period,x_coordinate,...,is_goal,scoring_team,is_emptyNet,distance_to_positive_goal,distance_to_negative_goal,goal_coordinate,shot_distance_to_goal,shot_angle,goal_rate_dist,goal_rate_angle
0,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,37,...,0,,0,59.816386,126.530629,84,59.816386,38.211025,0.000924,0.001447
1,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,72,...,0,,0,20.000000,156.818366,84,20.000000,53.130102,0.002764,0.001040
2,blocked-shot,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,67,...,0,,0,17.464249,151.052971,84,17.464249,-13.240520,0.003165,-0.004174
3,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,55,...,0,,0,32.202484,139.703257,84,32.202484,25.769328,0.001716,0.002145
4,shot-on-goal,2022020671,2,Blackhawks,16,Avalanche,21,20222023,1,-81,...,0,,0,165.435788,12.369317,-84,12.369317,75.963757,0.004469,0.000728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324985,shot-on-goal,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,70,...,0,,0,34.928498,157.289542,84,34.928498,-66.370622,0.001582,-0.000833
324986,blocked-shot,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,59,...,0,,0,25.961510,143.171226,84,25.961510,-15.642246,0.002129,-0.003534
324987,blocked-shot,2022020520,2,Avalanche,21,Canadiens,8,20222023,3,-76,...,0,,0,160.049992,8.944272,-84,8.944272,-26.565051,0.006180,-0.002081
324988,shot-on-goal,2022020520,2,Avalanche,21,Canadiens,8,20222023,4,-81,...,0,,0,165.027270,4.242641,-84,4.242641,-45.000000,0.013028,-0.001228


In [11]:
df.to_csv('data/all_new_game_data_with_features.csv', index=False)