# Scraping stats from hockeyreference.com


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import numpy as np
import json
import time

## User input

In [25]:
# Set the season. Use the 2nd year involved in the season. 
# For example, if the 2021-2022 season is desired, set season = '2022'
season = '2016'
#path_to_skaters = '/Volumes/LUNANI/sports_betting_csv_backup/game_stats/skater_games.csv'
#path_to_goalies = '/Volumes/LUNANI/sports_betting_csv_backup/game_stats/goalie_games.csv'
path_to_skaters = '/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/historic_batch/stats/skater_2.csv'
path_to_goalies = '/Users/bryanmichalek/Documents/GitHub_Personal/sports_betting_data/data/historic_batch/stats/goalie_2.csv'

## Read in dict to convert team name to 3 letter code

In [26]:
# Read in team name dictionary for cleaning
with open('../../../sports_betting_data/data/team_name_dictionary.txt', 'r') as f:
    # Load the dictionary from the file
    team_name_dict = json.load(f)

## Define function to clean player names

In [27]:
def clean_name(name):
    """Clean player names with special characters and punctuation to standardize across data sources"""
    # Set up replace dictionary
    replace_dict = {
        # Special characters
        #'Å':'A',
        'å':'a',
        'ä':'a',
        'á':'a',
        #'Č':'C',
        'č':'c',
        #'É':'E',
        'é':'e',
        'ë':'e',
        'è':'e',
        'ě':'e',
        'í':'i',
        'ļ':'l',
        'ň':'n',
        'ö':'o',
        'ø':'o',
        'ř':'r',
        #'Š':'S',
        'š':'s',
        'ü':'u',
        'ž':'z',

        # Other punctuation
        '.':'',
        '-':' ',
        "'":''
    }
    # Strip white space
    name = name.strip()
    # Lowercase
    name = name.lower()
    # Replace characters, punctuation, phrases
    for k, v in replace_dict.items():
        name = name.replace(k, v)

    # Return cleaned name
    return name

## Define function to web scrape season game logs given player_id

In [28]:
def get_game_log(player_id, season):
    try:
        # Convert Ex: allenja01 to a/allenja01 for the URL
        url_player_id = f'{str(player_id)[0]}/{str(player_id)}'

        # Create template URL. We will fill in the 2 blanks (___ and ...) with the correct information for each player
        url = f"https://www.hockey-reference.com/players/{url_player_id}/gamelog/{str(season)}"

        # Get table from hockey reference using pandas
        game_log = pd.read_html(url, attrs = {'class': 'row_summable'})[0]

        # Add the player_id column
        game_log['player_id'] = player_id
        
        # Add the season column
        game_log['season'] = season

        return game_log
    
    except:
        # If we cant find games for a particular player, alert us and save the player_id off to the side. 
        # Then continue on wih the rest.
        print(player_id, "was not found among season", season, "players on HockeyReference.com")
        return 

## Function to get a list of all the player IDs for the season
### This should also create a link between player ID and player name
Now, we want to do this for all players from the user specified season that have played at least 1 game during that season.
It would probably be most accurate to get this list from HockeyReference since that is where we are scraping the player stats.

The url containing these players has the following form and can be modified based on what season the user is interested in. 
* https://www.hockey-reference.com/leagues/NHL_2022_skaters.html
* https://www.hockey-reference.com/leagues/NHL_2022_goalies.html

We will grab the player_id using the href. The names can be retrieved from the same tag (using the text instead of href). The names will be left joined later to the final data frame.

In [29]:
def create_name_to_id_link(season, player_type):
    # Scrape the list of players.
    # Create the url
    season = str(season)
    if player_type == 's':
        names_url = 'https://www.hockey-reference.com/leagues/NHL_' + season + '_skaters.html'
    elif player_type == 'g':
        names_url = 'https://www.hockey-reference.com/leagues/NHL_' + season + '_goalies.html'
    else:
        raise ValueError('Invalid player type. Use "s" or "g".')
    
    # Grab the html code 
    response = requests.get(names_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    names_html_list = soup.find_all('td', attrs = {'data-stat':'player'})

    # Here is the list of player names. We can use this to create a dictionary of {'player_id':'player_names'}.
    names = [player_name.text for player_name in names_html_list]
    names = [clean_name(name) for name in names]
    print('Number of names scraped:', len(names))

    # Grab player ID using the href in <a> tag. Then we don't have to construct them ourselves.
    href_strings = [str(names_html_list[index].a) for index in range(len(names_html_list))]
    ids = [re.search('players/\w/(\S+[0-9]+).html', href).group(1) for href in href_strings] # This can also handle names like 'J.T. Compher'

    # This works for getting all of the player IDs in the list
    print('Number of player IDs scraped:', len(ids))

    # Here is the set of unique ones we will use later to iterate through
    unique_ids = list(set(ids))
    unique_ids.sort()
    print('Number of unique player IDs', len(unique_ids), '\n')

    # Create a data frame of player ids and player names. We will use this to join to the final data frame to 
    # add the player names as another column.
    ids_names = pd.DataFrame({'player_id' : ids, 'player_name' : names})
    ids_names = ids_names.drop_duplicates()

    # Return the df/dictionary that links player ID to player name
    return ids_names
        

## Get the game log for each player ID

In [30]:
# Get the names and player ID's of all skaters that played during the season
skater_ids_names = create_name_to_id_link(season = season, player_type = 's')
display(skater_ids_names.head())

Number of names scraped: 1042
Number of player IDs scraped: 1042
Number of unique player IDs 898 



Unnamed: 0,player_id,player_name
0,abdelju01,justin abdelkader
1,acciano01,noel acciari
2,agostke01,kenny agostino
3,agozzan01,andrew agozzino
4,alzneka01,karl alzner


In [31]:
# Initalize empty df to store all player game logs combined
skater_games = pd.DataFrame()
unique_skater_ids = skater_ids_names['player_id'].unique()

start_time = time.time()

for i, id in enumerate(unique_skater_ids):
    # Wait some time between each request
    time.sleep(4)
    
    # Get game log for the id
    skater_games = pd.concat([skater_games, get_game_log(id, season = season)], axis = 0)

    # Print our progress as we iterate through all the players
    if i in [25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
        print('Num players:', len(skater_games['player_id'].unique()), '- Num rows in df:', len(skater_games.index))

end_time = time.time()
print(f"Elapsed time: {end_time - start_time}")

Num players: 26 - Num rows in df: 1275
Num players: 51 - Num rows in df: 2411
Num players: 101 - Num rows in df: 5138
Num players: 201 - Num rows in df: 10432
Num players: 301 - Num rows in df: 15470
Num players: 401 - Num rows in df: 20981
Num players: 501 - Num rows in df: 26401
Num players: 601 - Num rows in df: 31077
Num players: 701 - Num rows in df: 35922
Num players: 801 - Num rows in df: 41217
Elapsed time: 4690.847387075424


## Get the game log for each goalie ID

In [32]:
# Get the names and player ID's of all goalies that played during the season
goalie_ids_names = create_name_to_id_link(season = season, player_type = 'g')
display(goalie_ids_names.head())

Number of names scraped: 96
Number of player IDs scraped: 96
Number of unique player IDs 92 



Unnamed: 0,player_id,player_name
0,allenja01,jake allen
1,anderfr01,frederik andersen
2,andercr01,craig anderson
3,bachmri01,richard bachman
4,backsni01,niklas backstrom


In [33]:
# Initalize empty df to store all player game logs combined
goalie_games = pd.DataFrame()
unique_goalie_ids = goalie_ids_names['player_id'].unique()

start_time = time.time()

for i, id in enumerate(unique_goalie_ids):
    # Wait some time between each request
    time.sleep(4)
    
    # Get game log for the id
    goalie_games = pd.concat([goalie_games, get_game_log(id, season = season)], axis = 0)

    # Print our progress as we iterate through all the players
    if i in [10, 25, 50, 75, 100, 125]:
        print('Num players:', len(goalie_games['player_id'].unique()), '- Num rows in df:', len(goalie_games.index))

end_time = time.time()
print(f"Elapsed time: {end_time - start_time}")

Num players: 11 - Num rows in df: 324
Num players: 26 - Num rows in df: 719
Num players: 51 - Num rows in df: 1430
Num players: 76 - Num rows in df: 2354
Elapsed time: 440.8437309265137


## Clean the data frame of skater statistics

In [None]:
# Rename columns
skater_games.columns = ['rank', 'date', 'game_num', 'age', 'team', 'home_away_status', 'opponent', 'result', 
                        'G', 'A', 'P', 'rating', 'PIM', 'EVG', 'PPG', 'SHG', 'GWG', 'EVA', 'PPA', 'SHA', 'S', 'S_perc',
                        'shifts', 'TOI', 'HIT', 'BLK', 'FOW', 'FOL', 'FOW_perc', 'player_id', 'season']

# Remove these weird rows... They are not actual data (just repeated column names)
skater_games = skater_games.loc[skater_games[('rank')] != 'Rk']

# Left join the player_name column to the data frame
skater_games = skater_games.merge(skater_ids_names, on = 'player_id', how = 'left')

# Reorder the columns 
skater_games = skater_games[['rank', 'player_id', 'player_name', 'age', 'season', 'game_num', 'date', 'team', 
                            'opponent', 'home_away_status', 'result', 'G', 'A', 'P', 'rating', 'PIM', 'EVG', 
                             'PPG', 'SHG', 'GWG', 'EVA', 'PPA', 'SHA', 'S', 'S_perc', 'shifts', 'TOI', 'HIT', 
                             'BLK', 'FOW', 'FOL', 'FOW_perc']]

# Clean home_away_status column. 1 means the player played a home game. 0 means the game was on the road.
skater_games = skater_games.replace({'home_away_status': {'@':0, np.nan:1}})

# Fix age strings to just reflect age in years. We probably don't care about days since last birthday.
skater_games['age'] = skater_games.age.str.extract(pat = '(\d+)')

# Convert time on ice (TOI) to minutes using decimal. For example 22:40 should become 22.666667.
skater_games[['min', 'sec']] = skater_games.TOI.str.split(':', expand = True)
skater_games['TOI'] = skater_games['min'].astype('float') + skater_games['sec'].astype('float') / 60

# Now drop columns we don't need.
skater_games = skater_games.drop(columns = ['rank', 'min', 'sec'])

# Convert team to 3 letter code
skater_games['team'] = skater_games['team'].str.lower().replace(team_name_dict)
skater_games['opponent'] = skater_games['opponent'].str.lower().replace(team_name_dict)

# Clean player names (technically this should already be done by cleaning the names in the dictionary mapping ID to name)
#skater_games.loc[:, 'player_name'] = [clean_name(name) for name in skater_games.loc[:, 'player_name']]

# Convert columns to correct types. The proper types are listed below.
    # Date --> [date]
    # Integer --> [game_num, age, home_away_status, G, A, P, rating, PIM, EVG, PPG, SHG, GWG, EVA, PPA, 
                # SHA, S, shifts, HIT, BLK, FOW, FOL, season]
    # Float --> [S_per, TOI, FOW_perc]
    # String --> [player_id, team, opponent, result]
skater_games = skater_games.astype({'date':'datetime64[ns]','game_num':'int', 'age':'int',
                                              'team':'str', 'home_away_status':'int', 'opponent':'str', 
                                              'result':'str', 'G':'int', 'A':'int', 'P':'int', 'rating':'int', 
                                              'PIM':'int', 'EVG':'int', 'PPG':'int', 'SHG':'int', 'GWG':'int', 
                                              'EVA':'int', 'PPA':'int', 'SHA':'int', 'S':'int', 'S_perc':'float',
                                              'shifts':'int', 'TOI':'float', 'HIT':'int', 'BLK':'int', 'FOW':'int', 
                                              'FOL':'int', 'FOW_perc':'float', 'player_id':'str', 'season':'int', 
                                              'player_name':'str'})

# Keep only date part
skater_games['date'] = skater_games['date'].dt.date


In [35]:
skater_games.loc[skater_games['BLK'].isna(), :]

Unnamed: 0,player_id,player_name,age,season,game_num,date,team,opponent,home_away_status,result,...,SHA,S,S_perc,shifts,TOI,HIT,BLK,FOW,FOL,FOW_perc
220,anderjo03,joakim andersson,26,2016,15,2015-12-11,DET,NJD,0.0,L-OT,...,0,0,,13,8.683333,0,,,,
227,anderjo03,joakim andersson,26,2016,22,2016-01-02,DET,BUF,0.0,W,...,0,0,,12,7.816667,0,,,,
1283,bartkma01,matt bartkowski,27,2016,58,2016-02-21,VAN,COL,1.0,W,...,0,0,,26,19.666667,0,,,,
1304,bartkma01,matt bartkowski,27,2016,79,2016-04-07,VAN,CGY,0.0,L,...,0,0,,23,18.633333,0,,,,
1326,bassco01,cody bass,29,2016,11,2016-02-04,NSH,PHI,1.0,L,...,0,0,,12,5.583333,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41945,vermijo01,joel vermin,23,2016,6,2015-12-14,TBL,CBJ,0.0,W,...,0,0,,1,1.633333,0,,,,
43124,wiercpa01,patrick wiercioch,25,2016,7,2015-10-22,OTT,NJD,1.0,L-SO,...,0,0,,17,14.500000,0,,,,
43602,woodsbr01,brendan woods,23,2016,2,2016-03-10,CAR,BOS,0.0,W,...,0,0,,14,8.166667,0,,,,
43769,zadorni01,nikita zadorov,20,2016,10,2015-10-30,COL,CAR,0.0,L,...,0,0,,17,14.266667,0,,,,


## Clean data frame for goalies

In [36]:
# Rename columns
goalie_games.columns = ['rank', 'date', 'game_num', 'age', 'team', 'home_away_status', 'opponent', 'result', 
                        'decision', 'GA', 'SA', 'SV', 'SV_perc', 'shutout', 'PIM', 'TOI', 'player_id', 'season']

# Remove these weird rows... They are not actual data (just repeated column names)
goalie_games = goalie_games.loc[goalie_games[('rank')] != 'Rk']

# Left join the player_name column to the data frame
goalie_games = goalie_games.merge(goalie_ids_names, on = 'player_id', how = 'left')

# Reorder the columns 
goalie_games = goalie_games[['rank', 'player_id', 'player_name', 'age', 'season', 'game_num', 'date', 'team', 
                            'opponent', 'home_away_status', 'result', 'decision', 'GA', 'SA', 'SV', 'SV_perc', 
                             'shutout', 'PIM', 'TOI']]

# Clean home_away_status column. 1 means the player played a home game. 0 means the game was on the road.
goalie_games = goalie_games.replace({'home_away_status': {'@':0, np.nan:1}})

# Fix age strings to just reflect age in years. We probably don't care about days since last birthday.
goalie_games['age'] = goalie_games.age.str.extract(pat = '(\d+)')

# Convert time on ice (TOI) to minutes using decimal. For example 22:40 should become 22.666667.
goalie_games[['min', 'sec']] = goalie_games.TOI.str.split(':', expand = True)
goalie_games['TOI'] = goalie_games['min'].astype('float') + goalie_games['sec'].astype('float') / 60

# Now drop columns we don't need.
goalie_games = goalie_games.drop(columns = ['rank', 'min', 'sec'])

# Convert team to 3 letter code
goalie_games['team'] = goalie_games['team'].str.lower().replace(team_name_dict)
goalie_games['opponent'] = goalie_games['opponent'].str.lower().replace(team_name_dict)

# Clean player names (technically this should already be done by cleaning the names in the dictionary mapping ID to name)
#goalie_games.loc[:, 'player_name'] = [clean_name(name) for name in goalie_games.loc[:, 'player_name']]

# Convert columns to correct types. The proper types are listed below.
    # Date --> [date]
    # Integer --> [age, season, game_num, home_away_status, GA, SA, SV, shutout, PIM]
    # Float --> [SV_perc, TOI]
    # String --> [player_id, player_name, team, opponent, result, decision]
goalie_games = goalie_games.astype({'date':'datetime64[ns]', 'age':'int', 'season':'int', 'game_num':'int', 
                                    'home_away_status':'int', 'GA':'int', 'SA':'int', 'SV':'int', 
                                    'shutout':'int', 'PIM':'int', 'SV_perc':'float', 'TOI':'float', 
                                    'player_id':'str', 'player_name':'str', 'team':'str', 'opponent':'str', 
                                    'result':'str', 'decision':'str'})

# Keep only date part
goalie_games['date'] = goalie_games['date'].dt.date

  goalie_games = goalie_games.replace({'home_away_status': {'@':0, np.nan:1}})


## Sanity checks for skaters

In [37]:
# Number of unique goalies
print(len(skater_games.player_id.unique()))

# All players should have between 1 and 82 games played (UNLESS THEY WERE TRADED AND NEVER MISSED A GAME)
print(sum(skater_games.groupby(['player_id'])['player_id'].count() <= 0))
print(sum(skater_games.groupby(['player_id'])['player_id'].count() > 82))

898
0
2


## Sanity checks for goalies

In [38]:
# Number of unique goalies
print(len(goalie_games.player_id.unique()))

# All goalies should have between 1 and 82 games played 
print(sum(goalie_games.groupby(['player_id'])['player_id'].count() <= 0))
print(sum(goalie_games.groupby(['player_id'])['player_id'].count() > 82))

92
0
0


In [39]:
# Goals against should equal shots against minus saves
print(sum(goalie_games.GA != goalie_games.SA - goalie_games.SV))

0


## Update the csv file

In [40]:
try:
    old_skater = pd.read_csv(path_to_skaters)
    new_skater = pd.concat([old_skater, skater_games], axis=0).reset_index(drop=True)
    new_skater.to_csv(path_to_skaters, header=True, index=False)
    print(f'Skater game CSV successfully updated.\nNumber of rows added: {len(skater_games)}\nNew total rows: {len(new_skater)}\n')
except:
    print('Skater game CSV was not updated.\n')
    raise

Skater game CSV successfully updated.
Number of rows added: 44278
New total rows: 88552



In [41]:
try:
    old_goalie = pd.read_csv(path_to_goalies)
    new_goalie = pd.concat([old_goalie, goalie_games], axis=0).reset_index(drop=True)
    new_goalie.to_csv(path_to_goalies, header=True, index=False)
    print(f'Goalie game CSV successfully updated.\nNumber of rows added: {len(goalie_games)}\nNew total rows: {len(new_goalie)}\n')
except:
    print('Goalie game CSV was not updated.\n')
    raise

Goalie game CSV successfully updated.
Number of rows added: 2644
New total rows: 5301

