## Get the schedule and results of each NHL game in a particular season

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read in the team name dictionary used for cleaning team names.
# The file holds JSON, so decode it explicitly as UTF-8 rather than relying on
# the platform's default locale encoding (which differs between systems).
with open('../data/team_name_dictionary.txt', 'r', encoding='utf-8') as f:
    # Parse the JSON content into a Python dictionary
    team_name_dict = json.load(f)

In [14]:
# User input: the season whose games should be pulled
season = 2021
# Hockey Reference page that lists the games for that season
url = f'https://www.hockey-reference.com/leagues/NHL_{season}_games.html'

In [15]:
# Download the schedule table for the chosen season from Hockey Reference.
# read_html returns a list of every table matching the given attributes;
# the first match is used as the schedule.
matching_tables = pd.read_html(url, attrs={'class': 'stats_table', 'id': 'games'})
season_results = matching_tables[0]

In [16]:
# Tidy the raw schedule: discard the unused columns, assign short column
# names, and translate team names through the lookup dictionary
season_results.drop(columns=['Att.', 'LOG', 'Notes'], inplace=True)
season_results.columns = ['date', 'away', 'away_G', 'home', 'home_G', 'OT_flag']
for side in ('away', 'home'):
    # Lowercase before the lookup (presumably the dictionary keys are
    # lowercase team names -- confirm against the dictionary file)
    season_results[side] = season_results[side].str.lower().replace(team_name_dict)

In [17]:
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_flag
0,2021-01-13,STL,4,COL,1,
1,2021-01-13,VAN,5,EDM,3,
2,2021-01-13,PIT,3,PHI,6,
3,2021-01-13,CHI,1,TBL,5,
4,2021-01-13,MTL,4,TOR,5,OT
...,...,...,...,...,...,...
863,2021-05-14,TOR,2,WPG,4,
864,2021-05-15,VAN,4,EDM,1,
865,2021-05-16,CGY,6,VAN,5,OT
866,2021-05-18,CGY,2,VAN,4,


In [18]:
# Give every game (row) an integer ID formed by concatenating the last two
# digits of the season with the game's 1-based position in the schedule
# Ex: 231 = season 23, game 1... 231310 = season 23, game 1310
# NOTE(review): the game number is not zero-padded, so IDs are unique within
# a season but can coincide across seasons (e.g. season 2021 game 1 and
# season 2002 game 11 both give 211) -- confirm whether that matters downstream.
season_prefix = str(season % 100)
season_results['game_id'] = [
    int(season_prefix + str(game_number))
    for game_number in range(1, len(season_results) + 1)
]

In [19]:
# Sanity check: show any games where both teams have the same goal total.
# An empty result means every game has a winner.
tied_games = season_results['away_G'] == season_results['home_G']
season_results.loc[tied_games, :]

Unnamed: 0,date,away,away_G,home,home_G,OT_flag,game_id


In [20]:
# Reshape to one row per team per game: build a home-team frame and an
# away-team frame with identical columns, then stack them
home_melt = (
    season_results
    .melt(id_vars=['date', 'game_id', 'home', 'OT_flag'], value_vars=['home_G'], value_name='G')
    .drop(columns='variable')
    .rename(columns={'home': 'team'})
)
away_melt = (
    season_results
    .melt(id_vars=['date', 'game_id', 'away', 'OT_flag'], value_vars=['away_G'], value_name='G')
    .drop(columns='variable')
    .rename(columns={'away': 'team'})
)

# Stack home rows on top of away rows and fix the column order
final_columns = ['date', 'game_id', 'team', 'G', 'OT_flag']
combined_melt = pd.concat([home_melt, away_melt], axis=0)[final_columns]
# Spot-check a single game if needed:
#combined_melt[combined_melt['game_id'] == 23401]

In [21]:
# Flag the winner of each game: a team's row gets 1 when its goals equal the
# highest goal total recorded for that game_id, otherwise 0
top_score = combined_melt.groupby('game_id')['G'].transform('max')
combined_melt['win_flag'] = (combined_melt['G'] == top_score).astype(int)
combined_melt.head()

Unnamed: 0,date,game_id,team,G,OT_flag,win_flag
0,2021-01-13,211,COL,1,,0
1,2021-01-13,212,EDM,3,,0
2,2021-01-13,213,PHI,6,,1
3,2021-01-13,214,TBL,5,,1
4,2021-01-13,215,TOR,5,OT,1


In [22]:
# Check row count: the home and away frames were stacked, so the result
# should hold two rows per game (one per team)
combined_melt.shape

(1736, 6)

In [23]:
# Save the per-team results for this season as a CSV (header row, no index)
output_path = f'../data/season_scores/scores_{season}.csv'
combined_melt.to_csv(output_path, index=False, header=True)