## Get the schedule and results of each NHL game in a particular season
If you run this file a few times during the middle of a season and once at the end of the season, it will update and fill in all the games that had not yet been played. As a result, there is no reason to store daily results in a CSV. In the production file, I will try to set up a scrape that inserts directly into the MySQL database. If that runs into trouble for whatever reason, this file can be used to save to CSV instead.

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load the team-name lookup used when cleaning the schedule.
# The file holds a JSON object even though it carries a .txt extension.
with open('/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/team_name_dictionary.txt') as f:
    team_name_dict = json.loads(f.read())

In [14]:
# User input: the season to pull.
season = 2018
# Schedule/results page for that season on hockey-reference
url = f'https://www.hockey-reference.com/leagues/NHL_{season}_games.html'

In [15]:
# Scrape the season's schedule from hockey reference.
# read_html returns a list of every table matching the given HTML attributes;
# the first match is taken as the schedule.
games_tables = pd.read_html(
    url,
    attrs={'class': 'stats_table', 'id': 'games'},
)
season_results = games_tables[0]

In [16]:
# Tidy the scraped schedule
# 1) Discard the columns that are not needed
season_results.drop(['Att.', 'LOG', 'Notes'], axis=1, inplace=True)
# 2) Rename the remaining columns positionally (assumes exactly these six
#    columns remain, in this order)
season_results.columns = ['date', 'away', 'away_G',  'home', 'home_G', 'OT_status']
# 3) Lower-case each team name and swap it for its entry in team_name_dict;
#    names without an entry are left as the lower-cased original
for side in ('away', 'home'):
    season_results[side] = season_results[side].str.lower().replace(team_name_dict)

In [17]:
# Display the cleaned schedule (one row per game at this point)
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_status
0,2017-10-04,CGY,0,EDM,3,
1,2017-10-04,STL,5,PIT,4,OT
2,2017-10-04,PHI,5,SJS,3,
3,2017-10-04,TOR,7,WPG,2,
4,2017-10-05,ARI,4,ANA,5,
...,...,...,...,...,...,...
1266,2018-04-07,MIN,6,SJS,3,
1267,2018-04-07,MTL,2,TOR,4,
1268,2018-04-07,CHI,1,WPG,4,
1269,2018-04-07,NJD,3,WSH,5,


In [18]:
# Assign a game ID column to each row of the data frame
# Game ID = yymmdd_homeaway, e.g. 171005_ANAARI for ARI at ANA on 2017-10-05
season_results['date'] = pd.to_datetime(season_results['date'])
# Build the ID with vectorized column operations rather than a row-by-row
# apply: format the whole date column at once, then concatenate the home and
# away team columns onto it.
season_results['game_id'] = (
    season_results['date'].dt.strftime('%y%m%d')
    + '_'
    + season_results['home']
    + season_results['away']
)

In [19]:
# Sanity check: no game should end level, so this should display zero rows.
# (Rows whose goal counts are missing never compare equal and are not shown.)
season_results[season_results['home_G'] == season_results['away_G']]

Unnamed: 0,date,away,away_G,home,home_G,OT_status,game_id


In [20]:
# Reshape from one row per game to one row per team per game.

# Home side: keep the game-level columns, turn home_G into a generic G column
home_melt = pd.melt(
    season_results,
    id_vars=['date', 'game_id',  'home', 'OT_status'],
    value_vars=['home_G'],
    value_name='G',
)
home_melt = home_melt.rename(columns={'home':'team'}).drop(columns='variable')
home_melt['location'] = 'H'

# Away side: same reshaping, using the away team and away_G
away_melt = pd.melt(
    season_results,
    id_vars=['date', 'game_id', 'away', 'OT_status'],
    value_vars=['away_G'],
    value_name='G',
)
away_melt = away_melt.rename(columns={'away':'team'}).drop(columns='variable')
away_melt['location'] = 'A'

# Stack home rows on top of away rows (each half keeps its own 0..n-1 index,
# so every index label appears twice in the result)
combined_melt = pd.concat([home_melt, away_melt])

In [21]:
# Add a column for the winner of each game: a row wins when its goal total
# equals the highest goal total recorded under the same game_id
# (1 = won, 0 = lost)
goals = combined_melt['G']
game_high = combined_melt.groupby('game_id')['G'].transform('max')

# Reset the flag to null if G is missing (game has not been played yet).
# The flag is cast to the nullable integer dtype *before* those rows are
# blanked, so pd.NA is never written into a plain boolean column (pandas
# treats that as a deprecated incompatible-dtype assignment).
# A positional numpy mask is used so no index alignment is involved.
not_played = goals.isna().to_numpy()
combined_melt['win_flag'] = goals.eq(game_high).astype(pd.Int64Dtype()).mask(not_played)

# Store goals as nullable integers too, so unplayed games stay null
combined_melt['G'] = goals.astype(pd.Int64Dtype())

# Add season column
combined_melt['season'] = season

# Get correct column order
combined_melt = combined_melt[['team', 'game_id', 'date', 'season', 'location', 'G', 'OT_status', 'win_flag']]

# See results
combined_melt.head(5)

  combined_melt.loc[combined_melt['G'].isna(), 'win_flag'] = pd.NA


Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
0,EDM,171004_EDMCGY,2017-10-04,2018,H,3,,1
1,PIT,171004_PITSTL,2017-10-04,2018,H,4,OT,0
2,SJS,171004_SJSPHI,2017-10-04,2018,H,3,,0
3,WPG,171004_WPGTOR,2017-10-04,2018,H,2,,0
4,ANA,171005_ANAARI,2017-10-05,2018,H,5,,1


In [22]:
# Spot-check a game ID: expect exactly two rows, one for the home team and
# one for the away team
combined_melt[combined_melt['game_id'] == '171005_ANAARI']

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
4,ANA,171005_ANAARI,2017-10-05,2018,H,5,,1
4,ARI,171005_ANAARI,2017-10-05,2018,A,4,,0


In [47]:
# Spot-check a game ID for a game that had not been played when the cell ran
# (G and win_flag are expected to be null).
# NOTE: this ID is from a run with season = 2024; with season = 2018 it is
# not expected to match any rows.
combined_melt[combined_melt['game_id'] == '240308_ARIDET']

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
1005,ARI,240308_ARIDET,2024-03-08,2024,H,,,
1005,DET,240308_ARIDET,2024-03-08,2024,A,,,


In [23]:
# Check row count: the melt above yields one row per team per game, so the
# row count should be twice the number of games in the schedule
combined_melt.shape

(2542, 8)

In [24]:
# Save the per-team schedule for this season as a CSV
# (header row included, index column omitted)
combined_melt.to_csv(
    f'/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/historic_batch/season_schedules/schedule_{season}.csv',
    header=True,
    index=False,
)