## Get the schedule and results of each NHL game in a particular season
If you run this file a few times during the season and once more at the end of the season, it will update the schedule and fill in the results of any games that had not yet been played. As a result, there is no reason to store daily results in a CSV. In the production file, I will try to set up a scrape that inserts directly into the MySQL database. If that has trouble for whatever reason, this file can be used to save to CSV instead.

In [1]:
import pandas as pd
import numpy as np
import json

In [3]:
# Read in the team name dictionary used for cleaning the scraped schedule.
# It is applied with Series.replace() to the lower-cased team names.
# The encoding is pinned explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform's locale encoding.
with open('../../data/team_name_dictionary.txt', 'r', encoding='utf-8') as f:
    # Parse the JSON content into a Python dictionary
    team_name_dict = json.load(f)

In [4]:
# User input: the season whose schedule and results should be scraped
season = 2024
# Hockey Reference schedule/results page for the chosen season
url = f'https://www.hockey-reference.com/leagues/NHL_{season}_games.html'

In [5]:
# Scrape the given season's schedule from Hockey Reference.
# read_html returns a list of tables matching the attributes; keep the first.
season_results = pd.read_html(
    url,
    attrs={'class': 'stats_table', 'id': 'games'},
)[0]

In [6]:
# Tidy the scraped table: remove the unused columns and give the rest short names
season_results = season_results.drop(columns=['Att.', 'LOG', 'Notes'])
season_results.columns = ['date', 'away', 'away_G',  'home', 'home_G', 'OT_status']

# Normalise both team columns: lower-case the scraped names, then substitute
# them using the team name dictionary
season_results[['away', 'home']] = season_results[['away', 'home']].apply(
    lambda names: names.str.lower().replace(team_name_dict)
)

In [7]:
# Preview the cleaned schedule (one row per game)
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_status
0,2023-10-10,CHI,4.0,PIT,2.0,
1,2023-10-10,NSH,3.0,TBL,5.0,
2,2023-10-10,SEA,1.0,VGK,4.0,
3,2023-10-11,CHI,1.0,BOS,3.0,
4,2023-10-11,OTT,3.0,CAR,5.0,
...,...,...,...,...,...,...
1307,2024-04-18,EDM,,COL,,
1308,2024-04-18,CHI,,LAK,,
1309,2024-04-18,SEA,,MIN,,
1310,2024-04-18,ANA,,VGK,,


In [8]:
# Assign a game ID column to each row of the data frame
# Game ID = yymmdd_homeaway (two-digit year, month, day, then home and away team)
season_results['date'] = pd.to_datetime(season_results['date'])
# Built with vectorised string operations instead of a row-wise apply, so the
# whole column is assembled in one pass. A missing date or team yields a
# missing game_id rather than the text "nan" or an exception.
season_results['game_id'] = (
    season_results['date'].dt.strftime('%y%m%d')
    + '_'
    + season_results['home']
    + season_results['away']
)

In [9]:
# Sanity check: every game needs a winner, so no row should have equal goal
# totals for both teams. This filter is expected to come back empty.
season_results[season_results['away_G'].eq(season_results['home_G'])]

Unnamed: 0,date,away,away_G,home,home_G,OT_status,game_id


In [40]:
# Reshape to long format so that there is 1 row per team/game.
# Each half keeps the game-level columns, exposes its side's team and goals
# under the shared names 'team' and 'G', and is tagged with its location.
home_melt = (
    season_results[['date', 'game_id', 'home', 'OT_status', 'home_G']]
    .rename(columns={'home': 'team', 'home_G': 'G'})
    .reset_index(drop=True)
    .assign(location='H')
)
away_melt = (
    season_results[['date', 'game_id', 'away', 'OT_status', 'away_G']]
    .rename(columns={'away': 'team', 'away_G': 'G'})
    .reset_index(drop=True)
    .assign(location='A')
)

# Stack the home rows on top of the away rows
combined_melt = pd.concat([home_melt, away_melt])

In [52]:
# Add a column for the winner of each game: a team wins when its goal total
# equals the highest goal total recorded for its game_id (1 = win, 0 = loss).
# The flag is built directly as a nullable integer and blanked with where() for
# rows where G is missing (game has not been played yet). This avoids assigning
# pd.NA into a plain bool column, an upcasting assignment pandas has deprecated.
combined_melt['win_flag'] = (
    combined_melt['G']
    .eq(combined_melt.groupby('game_id')['G'].transform('max'))
    .astype(pd.Int64Dtype())
    .where(combined_melt['G'].notna())
)
# Convert goals to a nullable int as well, so unplayed games stay null
combined_melt['G'] = combined_melt['G'].astype(pd.Int64Dtype())

# Add season column
combined_melt['season'] = season

# Get correct column order
combined_melt = combined_melt[['team', 'game_id', 'date', 'season', 'location', 'G', 'OT_status', 'win_flag']]

# See results
combined_melt.head(5)

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
0,PIT,231010_PITCHI,2023-10-10,2024,H,2,,0
1,TBL,231010_TBLNSH,2023-10-10,2024,H,5,,1
2,VGK,231010_VGKSEA,2023-10-10,2024,H,4,,1
3,BOS,231011_BOSCHI,2023-10-11,2024,H,3,,1
4,CAR,231011_CAROTT,2023-10-11,2024,H,5,,1


In [53]:
# Spot-check one game ID: both the home and the away row should come back
combined_melt[combined_melt['game_id'].eq('231130_CGYDAL')]

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
343,CGY,231130_CGYDAL,2023-11-30,2024,H,4,OT,1
343,DAL,231130_CGYDAL,2023-11-30,2024,A,3,OT,0


In [47]:
# Spot-check a second game ID to see how its home and away rows look
combined_melt[combined_melt['game_id'].eq('240308_ARIDET')]

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
1005,ARI,240308_ARIDET,2024-03-08,2024,H,,,
1005,DET,240308_ARIDET,2024-03-08,2024,A,,,


In [54]:
# Check row count: the long table holds two rows per scheduled game
# (one for the home team, one for the away team)
combined_melt.shape

(2624, 8)

In [55]:
# Save the long-format schedule for this season as a CSV, keeping the header
# row and leaving out the data frame index
combined_melt.to_csv(
    f'../../data/historic_batch/season_schedules/schedule_{season}.csv',
    index=False,
    header=True,
)