## Get the schedule and results of each NHL game in a particular season

In [80]:
import pandas as pd
import numpy as np
import json

In [81]:
# Read in team name dictionary for cleaning
# The file holds a JSON object despite its .txt extension. Decode it as UTF-8
# explicitly so the result does not depend on the platform's locale encoding.
with open('../data/team_name_dictionary.txt', 'r', encoding='utf-8') as f:
    # Load the dictionary from the file
    team_name_dict = json.load(f)

In [104]:
# User input
# `season` is the only value to edit; it is inserted into the address of the
# hockey-reference games page that the next cell downloads.
season = 2023
url = f'https://www.hockey-reference.com/leagues/NHL_{season}_games.html'

In [105]:
# Get the schedule from hockey reference for the given season
# read_html returns a list of every table matching the given attributes;
# the schedule is taken as the first match.
table_attrs = {'class': 'stats_table', 'id': 'games'}
matching_tables = pd.read_html(url, attrs=table_attrs)
season_results = matching_tables[0]

In [106]:
# Clean up the schedule
# Remove the columns that are not used, then give the six remaining columns short names.
season_results.drop(['Att.', 'LOG', 'Notes'], axis=1, inplace=True)
season_results.columns = ['date', 'away', 'away_G',  'home', 'home_G', 'OT_status']

# Lower-case each team name and translate it through the team name dictionary.
for team_col in ('away', 'home'):
    lowered_names = season_results[team_col].str.lower()
    season_results[team_col] = lowered_names.replace(team_name_dict)

In [107]:
# Display the cleaned schedule to confirm the column names and team codes
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_status
0,2022-10-07,SJS,1,NSH,4,
1,2022-10-08,NSH,3,SJS,2,
2,2022-10-11,VGK,4,LAK,3,
3,2022-10-11,TBL,1,NYR,3,
4,2022-10-12,SEA,4,ANA,5,OT
...,...,...,...,...,...,...
1307,2023-04-13,VGK,3,SEA,1,
1308,2023-04-13,DET,0,TBL,5,
1309,2023-04-13,NJD,5,WSH,4,OT
1310,2023-04-14,BUF,5,CBJ,2,


In [108]:
# Assign a game ID column to each row of the data frame
# Game ID = yymmdd_homeaway
season_results['date'] = pd.to_datetime(season_results['date'])

# Build the ID with vectorised string operations rather than a row-wise apply:
# this avoids a Python-level call per row and also works on an empty frame.
# NOTE: a row with a missing date, home or away value gets a missing game_id
# instead of raising or producing text such as 'nan'.
season_results['game_id'] = (
    season_results['date'].dt.strftime('%y%m%d')
    + '_'
    + season_results['home']
    + season_results['away']
)

In [109]:
# Every game should have a winner (one team with more goals)
# Show any rows where both teams have the same goal total; an empty frame is the expected result.
tied_games = season_results['away_G'].eq(season_results['home_G'])
season_results[tied_games]

Unnamed: 0,date,away,away_G,home,home_G,OT_status,game_id


In [110]:
# Melt df so that there is 1 row per team/game
def _melt_side(team_col, goals_col, location):
    """Return one row per game for a single side (home or away).

    The result has the columns date, game_id, team, OT_status, G and location.
    """
    side = pd.melt(
        season_results,
        id_vars=['date', 'game_id', team_col, 'OT_status'],
        value_vars=[goals_col],
        value_name='G',
    )
    # melt adds a 'variable' column holding the source column name; it is not needed.
    side = side.drop(columns='variable').rename(columns={team_col: 'team'})
    # Add the column location
    side['location'] = location
    return side


home_melt = _melt_side('home', 'home_G', 'H')
away_melt = _melt_side('away', 'away_G', 'A')

# Combine the melted df's
combined_melt = pd.concat([home_melt, away_melt], axis=0)

In [111]:
# Add a column for the winner of each game
# A row is flagged 1 when its goals equal the highest goal total within its game_id.
top_score = combined_melt.groupby('game_id')['G'].transform('max')
combined_melt['win_flag'] = combined_melt['G'].eq(top_score).astype(int)

# Add season column
combined_melt['season'] = season

# Get correct column order
ordered_cols = ['team', 'game_id', 'date', 'season', 'location', 'G', 'OT_status', 'win_flag']
combined_melt = combined_melt[ordered_cols]

# See results
combined_melt.head()

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
0,NSH,221007_NSHSJS,2022-10-07,2023,H,4,,1
1,SJS,221008_SJSNSH,2022-10-08,2023,H,2,,0
2,LAK,221011_LAKVGK,2022-10-11,2023,H,3,,0
3,NYR,221011_NYRTBL,2022-10-11,2023,H,3,,1
4,ANA,221012_ANASEA,2022-10-12,2023,H,5,OT,1


In [112]:
# Test of game ID
# Look up one game by its ID; a home row and an away row should both come back.
is_sample_game = combined_melt['game_id'] == '221012_ANASEA'
combined_melt[is_sample_game]

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
4,ANA,221012_ANASEA,2022-10-12,2023,H,5,OT,1
4,SEA,221012_ANASEA,2022-10-12,2023,A,4,OT,0


In [113]:
# Check row count
# shape is (rows, columns) of the combined home + away frame
combined_melt.shape

(2624, 8)

In [114]:
# Write schedule to csv
# One file per season, written with a header row and without the index column.
output_path = f'../data/season_scores/scores_{season}.csv'
combined_melt.to_csv(output_path, header=True, index=False)