# Scraping stats from hockey-reference.com


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import regex as re
import time
import sched
import os

## User input

In [13]:
# Season to scrape, identified by the second calendar year involved in the season.
# For example, if the 2021-2022 season is desired, set season = '2022'.
# By that convention, the value '2021' below selects the 2020-2021 season.
season = '2021'

## Define a function to scrape a player's game logs for the selected season, given a player_id

In [14]:
# Works for both skaters and goalies: player_type selects which running data frame is updated.
def get_game_log(player_id, player_type):
    """Scrape one player's game log for the global ``season`` and store it.

    The game log table is read from
    ``https://www.hockey-reference.com/players<player_id>/gamelog/<season>``
    and appended to the module-level data frame for the given player type.

    Parameters
    ----------
    player_id : str
        Player path fragment as scraped from the player links, e.g.
        '/a/abruzni01'. It is inserted verbatim into the URL.
    player_type : str
        "skater" (updates ``skater_games`` / ``skater_ids_that_go_wrong``) or
        "goalie" (updates ``goalie_games`` / ``goalie_ids_that_go_wrong``).
        Any other value prints a message and does nothing else.

    Returns
    -------
    None
        All results are communicated through the module-level globals.
    """
    global skater_games, goalie_games

    # Reject unknown player types up front so we never hit the website for nothing.
    if player_type not in ("skater", "goalie"):
        print(player_type, "is not a known player type.")
        return

    url = f'https://www.hockey-reference.com/players{player_id}/gamelog/{season}'

    try:
        # Only the download/parse step is guarded, so a bug elsewhere is not
        # misreported as a missing player.
        game_log = pd.read_html(url, attrs={'class': 'row_summable'})[0]
    except Exception as err:
        # Best effort: if we can't get games for a particular player, alert us, save the
        # player_id off to the side and continue with the rest. Catching Exception (rather
        # than using a bare except) still lets Ctrl-C interrupt a long scraping run.
        failed_ids = skater_ids_that_go_wrong if player_type == "skater" else goalie_ids_that_go_wrong
        print(player_id, "was not found among season", season, "players on HockeyReference.com",
              f"({type(err).__name__}: {err})")
        failed_ids.append(player_id)
        return

    # Tag every row with the player and season it belongs to.
    game_log['player_id'] = player_id
    game_log['season'] = season

    # Update the running df containing all the players.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    if player_type == "skater":
        skater_games = pd.concat([skater_games, game_log])
        all_games = skater_games
    else:
        goalie_games = pd.concat([goalie_games, game_log])
        all_games = goalie_games

    # Print our progress as we iterate through all the players
    print('Num players:', len(all_games.player_id.unique()), '- Num rows in df:', len(all_games.index))
    return

## Now let's get a list of all the player IDs for the season
We want to do this for all players who played at least 1 game during the user-specified season.
The most accurate source for this list is probably Hockey-Reference itself, since that is where we are scraping the player stats.

The url containing these players has the following form and can be modified based on what season the user is interested in. 
* https://www.hockey-reference.com/leagues/NHL_2022_skaters.html
* https://www.hockey-reference.com/leagues/NHL_2022_goalies.html

We will grab the player_id using the href. The names can be retrieved from the same tag (using the text instead of href). The names will be left joined later to the final data frame.

In [4]:
# Scrape the list of skaters for the chosen season.
# Build the URL of the season's skater page by filling the season into the template.
skater_names_template_url = 'https://www.hockey-reference.com/leagues/NHL_..._skaters.html'
skater_names_url = skater_names_template_url.replace('...', str(season))

# Download the page. The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails loudly on an HTTP error (e.g. rate limiting) instead of silently
# parsing an error page into an empty player list.
response = requests.get(skater_names_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Table cells marked data-stat="player"; the code below reads the player's name from each
# cell's text and the player's ID from the <a> link inside it.
skater_names_html_list = soup.find_all('td', attrs={'data-stat': 'player'})

# Here is the list of player names. It is paired with the player IDs later to build
# a player_id -> player_name lookup table.
skater_names = [player_cell.text for player_cell in skater_names_html_list]

# Note: names are not guaranteed to be unique (ex: there are 2 Sebastian Aho's),
# so players are identified by player ID rather than by name.

In [6]:
# Grab each player ID from the <a> tag inside the name cell, so we don't have to construct
# the IDs ourselves. The tag is rendered to a string and the '/<letter>/<id>' part is matched.
# The pattern is a raw string: '\w' and '\S' are regex escapes, not Python string escapes.
player_id_pattern = re.compile(r'/\w/\S+[0-9]+')
href_strings = [str(player_cell.a) for player_cell in skater_names_html_list]
skater_ids = [player_id_pattern.search(href).group(0) for href in href_strings]  # This can also handle names like 'J.T. Compher'

# One ID per table row, so a player can appear more than once here
print('Number of player IDs scraped:', len(skater_ids))

# Sorted list of unique IDs; this is what we iterate through later
unique_skater_ids = sorted(set(skater_ids))
print('Number of unique player IDs', len(unique_skater_ids), '\n')

# Create a data frame of player ids and player names. We will use this to join to the final data frame to
# add the player names as another column. Duplicate (id, name) rows are dropped.
skater_ids_names = pd.DataFrame({'player_id': skater_ids, 'player_name': skater_names})
skater_ids_names = skater_ids_names.drop_duplicates()
display(skater_ids_names.head())

Number of player IDs scraped: 1162
Number of unique player IDs 1004 



Unnamed: 0,player_id,player_name
0,/a/abruzni01,Nicholas Abruzzese
1,/a/acciano01,Noel Acciari
2,/a/addisca01,Calen Addison
3,/a/agozzan01,Andrew Agozzino
4,/a/ahcanja01,Jack Ahcan


## Get goalie player IDs

In [15]:
# Scrape the list of goalies for the chosen season.
# Build the URL of the season's goalie page by filling the season into the template.
goalie_names_template_url = 'https://www.hockey-reference.com/leagues/NHL_..._goalies.html'
goalie_names_url = goalie_names_template_url.replace('...', str(season))

# Download the page. The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails loudly on an HTTP error (e.g. rate limiting) instead of silently
# parsing an error page into an empty player list.
response = requests.get(goalie_names_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Table cells marked data-stat="player"; the code below reads the goalie's name from each
# cell's text and the goalie's ID from the <a> link inside it.
goalie_names_html_list = soup.find_all('td', attrs={'data-stat': 'player'})

# Here is the list of goalie names. It is paired with the player IDs later to build
# a player_id -> player_name lookup table.
goalie_names = [player_cell.text for player_cell in goalie_names_html_list]

# Note: names are not guaranteed to be unique, so goalies are identified by
# player ID rather than by name.

In [16]:
# Grab each goalie ID from the <a> tag inside the name cell, so we don't have to construct
# the IDs ourselves. The tag is rendered to a string and the '/<letter>/<id>' part is matched.
# The pattern is a raw string: '\w' and '\S' are regex escapes, not Python string escapes.
player_id_pattern = re.compile(r'/\w/\S+[0-9]+')
href_strings = [str(player_cell.a) for player_cell in goalie_names_html_list]
goalie_ids = [player_id_pattern.search(href).group(0) for href in href_strings]  # This can also handle names like 'J.T. Compher'

# One ID per table row, so a goalie can appear more than once here
print('Number of player IDs scraped:', len(goalie_ids))

# Sorted list of unique IDs; this is what we iterate through later
unique_goalie_ids = sorted(set(goalie_ids))
print('Number of unique player IDs', len(unique_goalie_ids), '\n')

# Create a data frame of player ids and player names. We will use this to join to the final data frame to
# add the player names as another column. Duplicate (id, name) rows are dropped.
goalie_ids_names = pd.DataFrame({'player_id': goalie_ids, 'player_name': goalie_names})
goalie_ids_names = goalie_ids_names.drop_duplicates()
display(goalie_ids_names.head())


Number of player IDs scraped: 104
Number of unique player IDs 98 



Unnamed: 0,player_id,player_name
0,/a/allenja01,Jake Allen
1,/a/anderfr01,Frederik Andersen
2,/a/andercr01,Craig Anderson
3,/b/bernijo01,Jonathan Bernier
4,/b/binnijo01,Jordan Binnington


## Apply function on all skater IDs

In [21]:
# Player IDs whose game log could NOT be scraped are collected here by get_game_log
skater_ids_that_go_wrong = []
# Running data frame that get_game_log extends with each skater's game log
skater_games = pd.DataFrame()

# Scheduler used to delay every get_game_log call by 3 seconds
scheduler = sched.scheduler(time.time, time.sleep)

# Scrape one skater at a time: queue a single delayed call, then block until it has run.
for id_code in unique_skater_ids:
    scheduler.enter(delay=3, priority=1, action=get_game_log, argument=(id_code, "skater"))
    scheduler.run()

Num players: 1 - Num rows in df: 9
Num players: 2 - Num rows in df: 29
Num players: 3 - Num rows in df: 44
Num players: 4 - Num rows in df: 45
Num players: 5 - Num rows in df: 51
Num players: 6 - Num rows in df: 133
Num players: 7 - Num rows in df: 170
Num players: 8 - Num rows in df: 171
Num players: 9 - Num rows in df: 172
Num players: 10 - Num rows in df: 230
Num players: 11 - Num rows in df: 237
Num players: 12 - Num rows in df: 309
Num players: 13 - Num rows in df: 314
Num players: 14 - Num rows in df: 334
Num players: 15 - Num rows in df: 393
Num players: 16 - Num rows in df: 479
Num players: 17 - Num rows in df: 485
Num players: 18 - Num rows in df: 489
Num players: 19 - Num rows in df: 560
Num players: 20 - Num rows in df: 568
Num players: 21 - Num rows in df: 630
Num players: 22 - Num rows in df: 699
Num players: 23 - Num rows in df: 782
Num players: 24 - Num rows in df: 854
Num players: 25 - Num rows in df: 883
Num players: 26 - Num rows in df: 959
Num players: 27 - Num rows 

Num players: 210 - Num rows in df: 9824
Num players: 211 - Num rows in df: 9909
Num players: 212 - Num rows in df: 9945
Num players: 213 - Num rows in df: 10010
Num players: 214 - Num rows in df: 10092
Num players: 215 - Num rows in df: 10127
Num players: 216 - Num rows in df: 10210
Num players: 217 - Num rows in df: 10285
Num players: 218 - Num rows in df: 10362
Num players: 219 - Num rows in df: 10440
Num players: 220 - Num rows in df: 10442
Num players: 221 - Num rows in df: 10482
Num players: 222 - Num rows in df: 10549
Num players: 223 - Num rows in df: 10572
Num players: 224 - Num rows in df: 10655
Num players: 225 - Num rows in df: 10657
Num players: 226 - Num rows in df: 10668
Num players: 227 - Num rows in df: 10703
Num players: 228 - Num rows in df: 10705
Num players: 229 - Num rows in df: 10790
Num players: 230 - Num rows in df: 10872
Num players: 231 - Num rows in df: 10957
Num players: 232 - Num rows in df: 11038
Num players: 233 - Num rows in df: 11115
Num players: 234 - 

Num players: 410 - Num rows in df: 20025
Num players: 411 - Num rows in df: 20077
Num players: 412 - Num rows in df: 20156
Num players: 413 - Num rows in df: 20216
Num players: 414 - Num rows in df: 20232
Num players: 415 - Num rows in df: 20311
Num players: 416 - Num rows in df: 20393
Num players: 417 - Num rows in df: 20397
Num players: 418 - Num rows in df: 20414
Num players: 419 - Num rows in df: 20433
Num players: 420 - Num rows in df: 20503
Num players: 421 - Num rows in df: 20572
Num players: 422 - Num rows in df: 20583
Num players: 423 - Num rows in df: 20654
Num players: 424 - Num rows in df: 20666
Num players: 425 - Num rows in df: 20751
Num players: 426 - Num rows in df: 20764
Num players: 427 - Num rows in df: 20825
Num players: 428 - Num rows in df: 20904
Num players: 429 - Num rows in df: 20905
Num players: 430 - Num rows in df: 20977
Num players: 431 - Num rows in df: 21059
Num players: 432 - Num rows in df: 21133
Num players: 433 - Num rows in df: 21213
Num players: 434

Num players: 610 - Num rows in df: 29964
Num players: 611 - Num rows in df: 30038
Num players: 612 - Num rows in df: 30105
Num players: 613 - Num rows in df: 30150
Num players: 614 - Num rows in df: 30161
Num players: 615 - Num rows in df: 30241
Num players: 616 - Num rows in df: 30315
Num players: 617 - Num rows in df: 30386
Num players: 618 - Num rows in df: 30458
Num players: 619 - Num rows in df: 30467
Num players: 620 - Num rows in df: 30487
Num players: 621 - Num rows in df: 30533
Num players: 622 - Num rows in df: 30613
Num players: 623 - Num rows in df: 30665
Num players: 624 - Num rows in df: 30751
Num players: 625 - Num rows in df: 30760
Num players: 626 - Num rows in df: 30800
Num players: 627 - Num rows in df: 30802
Num players: 628 - Num rows in df: 30874
Num players: 629 - Num rows in df: 30912
Num players: 630 - Num rows in df: 30917
Num players: 631 - Num rows in df: 30930
Num players: 632 - Num rows in df: 30999
Num players: 633 - Num rows in df: 31054
Num players: 634

Num players: 808 - Num rows in df: 39341
Num players: 809 - Num rows in df: 39342
Num players: 810 - Num rows in df: 39386
Num players: 811 - Num rows in df: 39387
Num players: 812 - Num rows in df: 39440
Num players: 813 - Num rows in df: 39523
Num players: 814 - Num rows in df: 39606
Num players: 815 - Num rows in df: 39671
Num players: 816 - Num rows in df: 39744
Num players: 817 - Num rows in df: 39780
Num players: 818 - Num rows in df: 39850
Num players: 819 - Num rows in df: 39915
Num players: 820 - Num rows in df: 39984
Num players: 821 - Num rows in df: 40050
Num players: 822 - Num rows in df: 40130
Num players: 823 - Num rows in df: 40175
Num players: 824 - Num rows in df: 40200
Num players: 825 - Num rows in df: 40277
Num players: 826 - Num rows in df: 40315
Num players: 827 - Num rows in df: 40318
Num players: 828 - Num rows in df: 40363
Num players: 829 - Num rows in df: 40448
Num players: 830 - Num rows in df: 40534
Num players: 831 - Num rows in df: 40567
Num players: 832

## Apply function on all goalie IDs

In [17]:
# Player IDs whose game log could NOT be scraped are collected here by get_game_log
goalie_ids_that_go_wrong = []
# Running data frame that get_game_log extends with each goalie's game log
goalie_games = pd.DataFrame()

# Scheduler used to delay every get_game_log call by 3 seconds
scheduler = sched.scheduler(time.time, time.sleep)

# Scrape one goalie at a time: queue a single delayed call, then block until it has run.
for id_code in unique_goalie_ids:
    scheduler.enter(delay=3, priority=1, action=get_game_log, argument=(id_code, "goalie"))
    scheduler.run()

Num players: 1 - Num rows in df: 30
Num players: 2 - Num rows in df: 34
Num players: 3 - Num rows in df: 59
Num players: 4 - Num rows in df: 84
Num players: 5 - Num rows in df: 128
Num players: 6 - Num rows in df: 164
Num players: 7 - Num rows in df: 196
Num players: 8 - Num rows in df: 210
Num players: 9 - Num rows in df: 233
Num players: 10 - Num rows in df: 234
Num players: 11 - Num rows in df: 242
Num players: 12 - Num rows in df: 243
Num players: 13 - Num rows in df: 249
Num players: 14 - Num rows in df: 256
Num players: 15 - Num rows in df: 292
Num players: 16 - Num rows in df: 312
Num players: 17 - Num rows in df: 313
Num players: 18 - Num rows in df: 337
Num players: 19 - Num rows in df: 360
Num players: 20 - Num rows in df: 391
Num players: 21 - Num rows in df: 428
Num players: 22 - Num rows in df: 436
Num players: 23 - Num rows in df: 455
Num players: 24 - Num rows in df: 457
Num players: 25 - Num rows in df: 493
Num players: 26 - Num rows in df: 528
Num players: 27 - Num row

## Clean the data frame of skater statistics

In [22]:
# Rename columns. The scraped table's columns are replaced positionally, followed by the
# two columns added in get_game_log (player_id, season).
skater_games.columns = ['rank', 'date', 'game_num', 'age', 'team', 'home_away_status', 'opponent', 'result',
                        'G', 'A', 'P', 'rating', 'PIM', 'EVG', 'PPG', 'SHG', 'GWG', 'EVA', 'PPA', 'SHA', 'S', 'S_perc',
                        'shifts', 'TOI', 'HIT', 'BLK', 'FOW', 'FOL', 'FOW_perc', 'player_id', 'season']

# Remove the repeated header rows. They are not actual data (just repeated column names).
skater_games = skater_games.loc[skater_games['rank'] != 'Rk']

# Left join the player_name column to the data frame
skater_games = skater_games.merge(skater_ids_names, on='player_id', how='left')

# Reorder the columns
skater_games = skater_games[['rank', 'player_id', 'player_name', 'age', 'season', 'game_num', 'date', 'team',
                             'opponent', 'home_away_status', 'result', 'G', 'A', 'P', 'rating', 'PIM', 'EVG',
                             'PPG', 'SHG', 'GWG', 'EVA', 'PPA', 'SHA', 'S', 'S_perc', 'shifts', 'TOI', 'HIT',
                             'BLK', 'FOW', 'FOL', 'FOW_perc']]

# Clean home_away_status column. 1 means the player played a home game. 0 means the game was on the road.
skater_games = skater_games.replace({'home_away_status': {'@': 0, None: 1}})

# Fix age strings to just reflect age in years. We probably don't care about days since last birthday.
# Raw string so '\d' is a regex escape rather than an invalid Python string escape;
# expand=False returns a Series, which is what a single-column assignment needs.
skater_games['age'] = skater_games['age'].str.extract(r'(\d+)', expand=False)

# Convert time on ice (TOI) to minutes using decimal. For example 22:40 should become 22.666667.
skater_games[['min', 'sec']] = skater_games['TOI'].str.split(':', expand=True)
skater_games['TOI'] = skater_games['min'].astype('float') + skater_games['sec'].astype('float') / 60

# Now drop columns we don't need ('min' and 'sec' were only helpers for the TOI conversion).
skater_games = skater_games.drop(columns=['rank', 'min', 'sec'])

# Convert columns to correct types. The proper types are listed below.
    # Date --> [date]
    # Integer --> [game_num, age, home_away_status, G, A, P, rating, PIM, EVG, PPG, SHG, GWG, EVA, PPA,
                # SHA, S, shifts, HIT, BLK, FOW, FOL, season]
    # Float --> [S_perc, TOI, FOW_perc]
    # String --> [player_id, player_name, team, opponent, result]
skater_games = skater_games.astype({'date': 'datetime64[ns]', 'game_num': 'int', 'age': 'int',
                                    'team': 'str', 'home_away_status': 'int', 'opponent': 'str',
                                    'result': 'str', 'G': 'int', 'A': 'int', 'P': 'int', 'rating': 'int',
                                    'PIM': 'int', 'EVG': 'int', 'PPG': 'int', 'SHG': 'int', 'GWG': 'int',
                                    'EVA': 'int', 'PPA': 'int', 'SHA': 'int', 'S': 'int', 'S_perc': 'float',
                                    'shifts': 'int', 'TOI': 'float', 'HIT': 'int', 'BLK': 'int', 'FOW': 'int',
                                    'FOL': 'int', 'FOW_perc': 'float', 'player_id': 'str', 'season': 'int',
                                    'player_name': 'str'})



## Clean data frame for goalies

In [18]:
# Rename columns. The scraped table's columns are replaced positionally, followed by the
# two columns added in get_game_log (player_id, season).
goalie_games.columns = ['rank', 'date', 'game_num', 'age', 'team', 'home_away_status', 'opponent', 'result',
                        'decision', 'GA', 'SA', 'SV', 'SV_perc', 'shutout', 'PIM', 'TOI', 'player_id', 'season']

# Remove the repeated header rows. They are not actual data (just repeated column names).
goalie_games = goalie_games.loc[goalie_games['rank'] != 'Rk']

# Left join the player_name column to the data frame
goalie_games = goalie_games.merge(goalie_ids_names, on='player_id', how='left')

# Reorder the columns
goalie_games = goalie_games[['rank', 'player_id', 'player_name', 'age', 'season', 'game_num', 'date', 'team',
                             'opponent', 'home_away_status', 'result', 'decision', 'GA', 'SA', 'SV', 'SV_perc',
                             'shutout', 'PIM', 'TOI']]

# Clean home_away_status column. 1 means the player played a home game. 0 means the game was on the road.
goalie_games = goalie_games.replace({'home_away_status': {'@': 0, None: 1}})

# Fix age strings to just reflect age in years. We probably don't care about days since last birthday.
# Raw string so '\d' is a regex escape rather than an invalid Python string escape;
# expand=False returns a Series, which is what a single-column assignment needs.
goalie_games['age'] = goalie_games['age'].str.extract(r'(\d+)', expand=False)

# Convert time on ice (TOI) to minutes using decimal. For example 22:40 should become 22.666667.
goalie_games[['min', 'sec']] = goalie_games['TOI'].str.split(':', expand=True)
goalie_games['TOI'] = goalie_games['min'].astype('float') + goalie_games['sec'].astype('float') / 60

# Now drop columns we don't need ('min' and 'sec' were only helpers for the TOI conversion).
goalie_games = goalie_games.drop(columns=['rank', 'min', 'sec'])

# Convert columns to correct types. The proper types are listed below.
    # Date --> [date]
    # Integer --> [age, season, game_num, home_away_status, GA, SA, SV, shutout, PIM]
    # Float --> [SV_perc, TOI]
    # String --> [player_id, player_name, team, opponent, result, decision]
goalie_games = goalie_games.astype({'date': 'datetime64[ns]', 'age': 'int', 'season': 'int', 'game_num': 'int',
                                    'home_away_status': 'int', 'GA': 'int', 'SA': 'int', 'SV': 'int',
                                    'shutout': 'int', 'PIM': 'int', 'SV_perc': 'float', 'TOI': 'float',
                                    'player_id': 'str', 'player_name': 'str', 'team': 'str', 'opponent': 'str',
                                    'result': 'str', 'decision': 'str'})

## Sanity checks for skaters

In [19]:
# Show the skater IDs whose game logs could not be scraped (get_game_log appends them to this list).
# For the one player expected here, HockeyReference does not show his games (even though he played
# 4 games this season). Nothing we can do about this. He isn't a major player anyway.
print(skater_ids_that_go_wrong)

NameError: name 'skater_ids_that_go_wrong' is not defined

In [24]:
# Should print 1003: 1004 unique skater IDs were scraped (the 2 Sebastian Aho's share a name but
# each count separately), minus the one player above whose game log could not be scraped.
print(len(skater_games.player_id.unique()))

1003


In [25]:
# Sanity check on games played: every skater should have between 1 and 82 rows (games).
# Count the rows per player once, then print how many players fall outside each bound;
# both printed values should be 0.
skater_game_counts = skater_games.groupby(['player_id'])['player_id'].count()
print(sum(skater_game_counts <= 0))
print(sum(skater_game_counts > 82))

0
0


## Sanity checks for goalies

In [20]:
# Which goalie IDs could we not find data for? get_game_log appends every failed ID to this list,
# so an empty list means no goalie was recorded as failed.
print(goalie_ids_that_go_wrong)

[]


In [21]:
# Number of unique goalie IDs present in the cleaned goalie data frame
print(len(goalie_games.player_id.unique()))

98


In [22]:
# Sanity check on games played: every goalie should have between 1 and 82 rows (games).
# Count the rows per goalie once, then print how many goalies fall outside each bound;
# both printed values should be 0.
goalie_game_counts = goalie_games.groupby(['player_id'])['player_id'].count()
print(sum(goalie_game_counts <= 0))
print(sum(goalie_game_counts > 82))

0
0


In [26]:
# Consistency check: in every game, goals against should equal shots against minus saves.
# Print the number of rows that violate this; it should be 0.
expected_goals_against = goalie_games.SA - goalie_games.SV
print(sum(goalie_games.GA != expected_goals_against))

0


## Write the final data frame to a .csv file

In [23]:
# Write both cleaned data frames to CSV, one file per player type, named after the season.
# Paths are relative to the current working directory.
output_directory = '../data/stats'

# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(output_directory, exist_ok=True)

directory_to_save_s = f'{output_directory}/skaters_{season}.csv'
skater_games.to_csv(directory_to_save_s, index=False)

directory_to_save_g = f'{output_directory}/goalies_{season}.csv'
goalie_games.to_csv(directory_to_save_g, index=False)