## Get the schedule and results of each NHL game in a particular season
If you run this file a few times during the middle of the season and once at the end of a season, it will update and fill all the games that have not happened yet. As a result, no reason to store daily results in a CSV. In the production file, I will try to set up a scrape that directly inserts into the MySQL database. If for whatever reason that has trouble, can use this file to save to CSV instead.

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read in team name dictionary for cleaning
with open('/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/team_name_dictionary.txt', 'r') as f:
    # Load the dictionary from the file
    team_name_dict = json.load(f)

In [15]:
# User input
season = 2016
url = 'https://www.hockey-reference.com/leagues/NHL_' + str(season) + '_games.html'

In [16]:
# Get the schedule from hockey reference for the given season
season_results = pd.read_html(url, attrs={'class':'stats_table', 'id':'games'})[0]

In [17]:
# Clean up the schedule
season_results.drop(columns=['Att.', 'LOG', 'Notes'], inplace = True)
season_results.columns = ['date', 'away', 'away_G',  'home', 'home_G', 'OT_status']
season_results['away'] = season_results['away'].str.lower().replace(team_name_dict)
season_results['home'] = season_results['home'].str.lower().replace(team_name_dict)

In [18]:
season_results

Unnamed: 0,date,away,away_G,home,home_G,OT_status
0,2015-10-07,VAN,5,CGY,1,
1,2015-10-07,NYR,3,CHI,2,
2,2015-10-07,SJS,5,LAK,1,
3,2015-10-07,MTL,3,TOR,1,
4,2015-10-08,WPG,6,BOS,2,
...,...,...,...,...,...,...
1225,2016-04-09,ARI,0,SJS,1,
1226,2016-04-09,WSH,5,STL,1,
1227,2016-04-09,EDM,3,VAN,4,SO
1228,2016-04-10,PHI,5,NYI,2,


In [19]:
# Assign a game ID column to each row of the data frame
# Game ID = yymmdd_homeaway
season_results['date'] = pd.to_datetime(season_results['date'])
season_results['game_id'] = season_results.apply(lambda row: f"{row['date'].strftime('%y%m%d')}_{row['home']}{row['away']}", axis=1)

In [20]:
# Every game should have a winner (one team with more goals)
season_results.loc[season_results['away_G'] == season_results['home_G'],:]

Unnamed: 0,date,away,away_G,home,home_G,OT_status,game_id


In [21]:
# Melt df so that there is 1 row per team/game
home_melt = pd.melt(season_results, id_vars=['date', 'game_id',  'home', 'OT_status'], value_vars=['home_G'], value_name='G').drop(columns='variable').rename(columns={'home':'team'})
away_melt = pd.melt(season_results, id_vars=['date', 'game_id', 'away', 'OT_status'], value_vars=['away_G'], value_name='G').drop(columns='variable').rename(columns={'away':'team'})

# Add the column location to each
home_melt['location'] = 'H'
away_melt['location'] = 'A'

# Combine the melted df's
combined_melt = pd.concat([home_melt, away_melt], axis=0)

In [22]:
# Add a column for the winner of each game
combined_melt['win_flag'] = combined_melt['G'].eq(combined_melt.groupby('game_id')['G'].transform('max')) #.astype(int)
# Reset some values to null is G is missing (game has not been played yet)
combined_melt.loc[combined_melt['G'].isna(), 'win_flag'] = pd.NA
# Convert columns to int
combined_melt[['G', 'win_flag']] = combined_melt[['G', 'win_flag']].astype(pd.Int64Dtype())

# Add season column
combined_melt['season'] = season

# Get correct column order
combined_melt = combined_melt[['team', 'game_id', 'date', 'season', 'location', 'G', 'OT_status', 'win_flag']]

# See results
combined_melt.head(5)

  combined_melt.loc[combined_melt['G'].isna(), 'win_flag'] = pd.NA


Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
0,CGY,151007_CGYVAN,2015-10-07,2016,H,1,,0
1,CHI,151007_CHINYR,2015-10-07,2016,H,2,,0
2,LAK,151007_LAKSJS,2015-10-07,2016,H,1,,0
3,TOR,151007_TORMTL,2015-10-07,2016,H,1,,0
4,BOS,151008_BOSWPG,2015-10-08,2016,H,2,,0


In [23]:
# Test of game ID
combined_melt.loc[combined_melt['game_id'] == '151007_TORMTL']

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag
3,TOR,151007_TORMTL,2015-10-07,2016,H,1,,0
3,MTL,151007_TORMTL,2015-10-07,2016,A,3,,1


In [26]:
# Test of game ID
combined_melt.loc[combined_melt['game_id'] == '161012_CHISTL']

Unnamed: 0,team,game_id,date,season,location,G,OT_status,win_flag


In [24]:
# Check row count
combined_melt.shape

(2460, 8)

In [25]:
# Write schedule to csv
combined_melt.to_csv('/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/historic_batch/season_schedules/schedule_' + str(season) + '.csv', header=True, index=False)