## Get the schedule and results of each NHL game in a particular season

In [38]:
import pandas as pd
import numpy as np
import json

In [39]:
# Read in the team name dictionary used for cleaning team names.
# JSON text is UTF-8 by specification, so pin the encoding rather than
# relying on the platform's locale default (which differs on Windows).
with open('../data/team_name_dictionary.txt', 'r', encoding='utf-8') as f:
    # Parse the JSON content of the file into a Python dictionary
    team_name_dict = json.load(f)

In [40]:
# User input: the season to download
season = 2023

# Schedule/results page on hockey-reference for that season
base_url = 'https://www.hockey-reference.com/leagues/'
url = f'{base_url}NHL_{season}_games.html'

In [41]:
# Download the season schedule from hockey reference.
# read_html returns a list of every table matching the given attributes;
# the schedule is taken to be the first match.
table_attrs = {'class': 'stats_table', 'id': 'games'}
matching_tables = pd.read_html(url, attrs=table_attrs)
season_results = matching_tables[0]

In [42]:
# Clean up the schedule
# 1. Discard the columns that are not used further on.
unused_columns = ['Att.', 'LOG', 'Notes']
season_results.drop(columns=unused_columns, inplace=True)

# 2. Give the remaining columns short, consistent names
#    (positional rename: assumes exactly six columns remain, in this order).
clean_names = ['date', 'away', 'away_G', 'home', 'home_G', 'OT_flag']
season_results.columns = clean_names

# 3. Lower-case each team name and substitute it using team_name_dict.
#    Names without an entry in the dictionary are left as the lower-cased text.
for side in ('away', 'home'):
    lowered = season_results[side].str.lower()
    season_results[side] = lowered.replace(team_name_dict)

In [43]:
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_flag
0,2022-10-07,SJS,1,NSH,4,
1,2022-10-08,NSH,3,SJS,2,
2,2022-10-11,VGK,4,LAK,3,
3,2022-10-11,TBL,1,NYR,3,
4,2022-10-12,SEA,4,ANA,5,OT
...,...,...,...,...,...,...
1307,2023-04-13,VGK,3,SEA,1,
1308,2023-04-13,DET,0,TBL,5,
1309,2023-04-13,NJD,5,WSH,4,OT
1310,2023-04-14,BUF,5,CBJ,2,


In [44]:
# Give every row (game) of the data frame a game ID.
# Game ID = two-digit season followed by the 1-based game number.
# Ex: 231 = season 23, game 1... 231310 = season 23, game 1310
# NOTE(review): IDs are unique within a season, but because the game number
# is not zero-padded they can collide across seasons
# (e.g. season 2002 game 31 and season 2023 game 1 both give 231).
season_prefix = str(season % 100)
n_games = len(season_results)
season_results['game_id'] = [
    int(f'{season_prefix}{game_number}')
    for game_number in range(1, n_games + 1)
]

In [45]:
# Sanity check: every game should have a winner (one team with more goals),
# so selecting the rows where both teams have equal goals should show nothing.
is_tied = season_results['away_G'] == season_results['home_G']
season_results[is_tied]

Unnamed: 0,date,away,away_G,home,home_G,OT_flag,game_id


In [46]:
# Reshape so that there is 1 row per team/game instead of 1 row per game.
# Each side (home, away) is melted separately: its team column becomes 'team'
# and its goals column becomes 'G'.
melted_by_side = {}
for side in ('home', 'away'):
    melted = pd.melt(
        season_results,
        id_vars=['date', 'game_id', side, 'OT_flag'],
        value_vars=[side + '_G'],
        value_name='G',
    )
    melted = melted.drop(columns='variable')
    melted_by_side[side] = melted.rename(columns={side: 'team'})

home_melt = melted_by_side['home']
away_melt = melted_by_side['away']

# Stack home rows on top of away rows and fix the column order
output_columns = ['date', 'game_id', 'team', 'G', 'OT_flag']
combined_melt = pd.concat([home_melt, away_melt], axis=0)[output_columns]
# Spot-check a single game with, e.g.:
#combined_melt[combined_melt['game_id'] == 23401]

In [47]:
# Add a column flagging the winner of each game:
# a team wins (1) when its goals equal the highest goal total recorded for
# that game_id, otherwise it loses (0). A tied game would flag both teams.
top_score_in_game = combined_melt.groupby('game_id')['G'].transform('max')
scored_the_most = combined_melt['G'].eq(top_score_in_game)
combined_melt['win_flag'] = scored_the_most.astype(int)
combined_melt.head()

Unnamed: 0,date,game_id,team,G,OT_flag,win_flag
0,2022-10-07,231,NSH,4,,1
1,2022-10-08,232,SJS,2,,0
2,2022-10-11,233,LAK,3,,0
3,2022-10-11,234,NYR,3,,1
4,2022-10-12,235,ANA,5,OT,1


In [48]:
# Write the per-team schedule/results to csv (with a header row, without the
# data frame index). The file is named after the season.
output_path = f'../data/season_scores/scores_{season}'
combined_melt.to_csv(output_path, header=True, index=False)