#### Import necessary packages ####
Use Pandas for reading HTML packages and CSV for saving data. Use time to limit requests per minute.

In [1]:
# Import packages
import pandas as pd
import csv
import time
import html5lib
import requests
import configparser

#### Create a dictionary of NHL team names and their standard abbreviations ####
This will be useful later when the HTML tables use inconsistent naming. Make sure the NHL hasn't added or relocated any franchises since 2023 or this dictionary will be outdated! 

In [2]:
# Dictionary of names and abbreviations
abbr = {
    'Anaheim Ducks': 'ANA',
    'Arizona Coyotes': 'ARI',
    'Atlanta Thrashers': 'ATL',
    'Boston Bruins': 'BOS',
    'Buffalo Sabres': 'BUF',
    'Calgary Flames': 'CGY',
    'Carolina Hurricanes': 'CAR',
    'Chicago Blackhawks': 'CHI',
    'Colorado Avalanche': 'COL',
    'Columbus Blue Jackets': 'CBJ',
    'Dallas Stars': 'DAL',
    'Detroit Red Wings': 'DET',
    'Edmonton Oilers': 'EDM',
    'Florida Panthers': 'FLA',
    'Los Angeles Kings': 'LAK',
    'Mighty Ducks of Anaheim': 'MDA',
    'Minnesota Wild': 'MIN',
    'Montreal Canadiens': 'MTL',
    'Nashville Predators': 'NSH',
    'New Jersey Devils': 'NJD',
    'New York Islanders': 'NYI',
    'New York Rangers': 'NYR',
    'Ottawa Senators': 'OTT',
    'Philadelphia Flyers': 'PHI',
    'Phoenix Coyotes': 'PHX',
    'Pittsburgh Penguins': 'PIT',
    'San Jose Sharks': 'SJS',
    'Seattle Kraken': 'SEA',
    'St. Louis Blues': 'STL',
    'Tampa Bay Lightning': 'TBL',
    'Toronto Maple Leafs': 'TOR',
    'Vancouver Canucks': 'VAN',
    'Vegas Golden Knights': 'VEG',
    'Washington Capitals': 'WSH',
    'Winnipeg Jets': 'WPG'
}

#### Read the config file ####
The data source site doesn't want to be used for model training, so I can't say where the data came from.

In [3]:
config = configparser.ConfigParser()
config.read('config.ini');

#### Create a list of NHL team abbreviations ####
Make sure this list is the same length as our hard-coded dictionary of abbreviations. If not, the NHL may have added (or removed) a franchise, so update the dictionary accordingly! 

In [4]:
# Generate list of team abbreviations
def create_teams():
    ''' (none) -> list of str
    Return a list of current NHL teams (and some outdated ones).
    '''
    homeurl = config['HR']['home']
    dfs = pd.read_html(homeurl)

    east = dfs[0]['Eastern'].values.tolist()
    east.remove('Atlantic')
    east.remove('Metropolitan')
    east_teams = [item.replace('*', '') for item in east]

    west = dfs[1]['Western'].values.tolist()
    west.remove('Pacific')
    west.remove('Central')
    west_teams = [item.replace('*', '') for item in west]
    teams = east_teams + west_teams

    # Manually add Atlanta and Mighty Ducks of Anaheim so we can access those URLs
    teams.extend(['ATL', 'MDA', 'PHX'])
    return teams

In [5]:
teams = create_teams()
len(teams) == len(abbr)

True

#### Create a list of strings representing each season since the start of the salary cap era #### 
This list is hardcoded to end with 2023, which represents the 2022-2023 season.

In [6]:
# Post-lockout seasons list as strings
seasons = list(range(2006, 2024))

#### Write function to create standings_df DataFrame for given season ####
Use Pandas to read the HTML table of the season. Combine East/West standings_df and eliminate division labels. Clean up columns and team names, adding a column for playoff status. 

In [7]:
def create_standings_df(season):
    ''' (int) -> DataFrame
    
    Return a DataFrame containing the NHL standings for the season *ending* in the specified year.
    Use this function to generate the DataFrame that function create_team_df accepts as an argument.
    
    >>> create_standings_df('2019')
    '''
    seasonurlprefix = config['HR']['season']
    seasonurl = f'{seasonurlprefix}{str(season)}.html'
    
    try: 
        response = requests.get(seasonurl)
        response.raise_for_status()
        
        dfs = pd.read_html(seasonurl)
        standings_df = pd.concat([dfs[0], dfs[1]]).reset_index(drop = True)
        standings_df.rename(columns={'Unnamed: 0': 'Team'}, inplace = True)

        standings_df = standings_df[standings_df['Team'].str.contains('Division')==False]
        standings_df['Playoffs'] = standings_df['Team'].str.contains('\*')
        standings_df['Team'] = standings_df['Team'].str.replace('\*', '', regex = True)
        standings_df['Team'] = standings_df['Team'].map(abbr)

        cols = ['Team', 'PTS', 'GP', 'W', 'L', 'OL', 'PTS%', 'GF', 'GA', 'Playoffs']
        standings_df = standings_df[cols]

        standings_df = standings_df.astype({'GP': 'int', 'W': 'int', 'L': 'int', 'OL': 'int', 'PTS': 'int', 'PTS%': 'float', 'GF': 'int', 'GA': 'int'})
        standings_df = standings_df.sort_values('PTS', ascending = False).reset_index(drop = True)
    
    except:
        print(f'Error {seasonurl}')
        
    return standings_df

#### Write function to create team_df DataFrame for a given team in the given season ####
Use Pandas to read the HTML table of the team during the specified season. Clean up multi-level columns. Eliminate columns that aren't available for all years. Add columns for team and playoff status.

In [8]:
def create_team_df(standings_df, season, team):
    ''' (DataFrame, int, string) -> DataFrame
    
    Return a DataFrame containing the NHL player statistics for the specified team during the season *ending* in the specified year.
    Use function 'create_standings_df' to generate standings_df.
    
    >>> create_team_df(standings_df, '2016', 'PIT')
    '''
    teamurlprefix = config['HR']['team']
    teamurl = f'{teamurlprefix}{team}/{str(season)}.html'
    
    try:
        response = requests.get(teamurl)
        response.raise_for_status()
        
        dfs = pd.read_html(teamurl)
        # The Vegas 2023 season page has an additional HTML table, so read table 4 from that page
        tableno = 3 + (team == 'VEG' and season == '2023')
        team_df = dfs[tableno]
        team_df.columns = team_df.columns.droplevel(0)
        cols = ['Player', 'Age', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'S', 'S%', 'OPS',
       'DPS', 'PS']
        team_df = team_df[cols]
        gf = standings_df.loc[standings_df['Team'] == team, 'GF'].iloc[0]
        playoffs = standings_df.loc[standings_df['Team'] == team, 'Playoffs'].iloc[0]
        team_df = team_df.assign(Team = team, Playoffs = playoffs, Season = season)
        team_df = team_df.dropna(subset = ['Age']).fillna(0)
        
    except requests.exceptions.RequestException as e:
        print(f'Error {teamurl}')
    return team_df

#### Write function to create a DataFrame for each season ####
Source blocks users sending more than 20 requests per minute, so wait 5 seconds between requests.

In [9]:
def create_season_df(season, teams):
    ''' (int, list of strings) -> DataFrame
    
    Returns a DataFrame containing all player statistics from each NHL team in the
    "teams" list for the season *ending* in the given year.
    
    >>> create_season_df('2016', teams)
    '''
    standings_df = create_standings_df(season)
    season_df = pd.DataFrame()
    
    for team in teams:
        team_df = create_team_df(standings_df, season, team)
        season_df = pd.concat([season_df, team_df])
        time.sleep(5) 
        
    return season_df

#### Write function to create a DataFrame for a range of seasons ####
Call create_season_df for each season, passing lists based on which teams were in the league that year.

In [10]:
def scrape_data(seasons, teams):
    ''' (list int, list of strings) -> DataFrame
        Returns a DataFrame containing all player statistics from each NHL team in the "teams" list
        for the all seasons in the "seasons" list.
    
    >>> scrape_data(seasons, teams)
    '''
    NHL_data = pd.DataFrame()
    
    # Mighty Ducks of Anaheim are abbreviated as "MDA"
    teams2006 = list(set(teams) - set(['WPG', 'SEA', 'VEG', 'ANA', 'ARI']))
    # Ducks rebrand to Anaheim Ducks, abbreviated "ANA"
    teams2007 = list(set(teams) - set(['WPG', 'SEA', 'VEG', 'MDA', 'ARI']))
    # Atlanta Thrashers become Winnipeg Jets
    teams2012 = list(set(teams) - set(['ATL', 'SEA', 'VEG', 'MDA', 'ARI']))
    # Phoenix Coyotes rebrand to Arizona Coyotes
    teams2015 = list(set(teams) - set(['ATL', 'SEA', 'VEG', 'MDA', 'PHX']))
    # Vegas Golden Knights enter the league
    teams2018 = list(set(teams) - set(['ATL', 'SEA', 'MDA', 'PHX']))
    # Seattle Kraken enter the league
    teams2022 = list(set(teams) - set(['ATL', 'MDA', 'PHX']))
    
    for season in seasons:
        if season == 2006:
            season_df = create_season_df(season, teams2006)
        elif season < 2012:
            season_df = create_season_df(season, teams2007)
        elif season >= 2012 and season < 2015:
            season_df = create_season_df(season, teams2012)
        elif season >= 2015 and season < 2018:
            season_df = create_season_df(season, teams2015)
        elif season >= 2018 and season < 2022:
            season_df = create_season_df(season, teams2018)
        else:
            season_df = create_season_df(season, teams2022)
            
        NHL_data = pd.concat([NHL_data, season_df])
    return NHL_data

#### Scrape data into a DataFrame ####
Make sure data pass the eye test

In [11]:
start = time.time()
nhl_data = scrape_data(seasons, teams)
end = time.time()
elapsed_time = end - start
print('Execution time:', time.strftime('%M:%S', time.gmtime(elapsed_time)))

Execution time: 48:45


In [12]:
nhl_data.shape

(19183, 17)

In [13]:
nhl_data.describe()

Unnamed: 0,Age,GP,G,A,PTS,+/-,PIM,S,S%,OPS,DPS,PS,Season
count,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0,19183.0
mean,26.750925,42.484909,6.328885,10.820049,17.148934,-0.317886,23.941354,67.657457,6.872038,1.034192,1.004822,2.435088,2014.64651
std,4.466326,28.268683,8.551189,12.837838,20.322113,8.97512,27.793149,69.338058,7.809564,1.87744,1.224246,2.936748,5.234457
min,18.0,1.0,0.0,0.0,0.0,-47.0,0.0,0.0,0.0,-2.3,-1.0,-1.9,2006.0
25%,23.0,15.0,0.0,1.0,1.0,-4.0,4.0,8.0,0.0,0.0,0.1,0.2,2010.0
50%,26.0,44.0,3.0,6.0,9.0,0.0,16.0,46.0,6.0,0.2,0.6,1.3,2015.0
75%,30.0,70.0,9.0,17.0,26.0,3.0,35.0,109.0,10.8,1.5,1.5,3.9,2019.0
max,48.0,82.0,65.0,92.0,153.0,64.0,324.0,528.0,100.0,15.8,8.1,18.5,2023.0


In [14]:
nhl_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19183 entries, 0 to 31
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    19183 non-null  object 
 1   Age       19183 non-null  float64
 2   Pos       19183 non-null  object 
 3   GP        19183 non-null  int64  
 4   G         19183 non-null  int64  
 5   A         19183 non-null  int64  
 6   PTS       19183 non-null  int64  
 7   +/-       19183 non-null  float64
 8   PIM       19183 non-null  int64  
 9   S         19183 non-null  int64  
 10  S%        19183 non-null  float64
 11  OPS       19183 non-null  float64
 12  DPS       19183 non-null  float64
 13  PS        19183 non-null  float64
 14  Team      19183 non-null  object 
 15  Playoffs  19183 non-null  bool   
 16  Season    19183 non-null  int64  
dtypes: bool(1), float64(6), int64(7), object(3)
memory usage: 2.5+ MB


In [15]:
nhl_data.Team.nunique()

35

In [16]:
nhl_data.head()

Unnamed: 0,Player,Age,Pos,GP,G,A,PTS,+/-,PIM,S,S%,OPS,DPS,PS,Team,Playoffs,Season
0,Brian Gionta,27.0,RW,82,48,41,89,18.0,46,291,16.5,8.5,2.9,11.4,NJD,True,2006
1,Scott Gomez,26.0,C,82,33,51,84,8.0,42,244,13.5,6.9,2.4,9.2,NJD,True,2006
2,Jamie Langenbrunner,30.0,RW,80,19,34,53,-1.0,74,243,7.8,2.6,2.0,4.6,NJD,True,2006
3,Brian Rafalski,32.0,D,82,6,43,49,0.0,36,126,4.8,2.9,5.7,8.5,NJD,True,2006
4,Patrik Eliáš,29.0,LW,38,16,29,45,11.0,20,142,11.3,3.9,1.4,5.2,NJD,True,2006


#### Save to CSV ####
That code takes ages to run because of the request limiter. Export the data so there's no need to run that again.

In [17]:
nhl_data.to_csv('nhldata.csv', index = False)

#### Next Steps ####
- Remove asterisks next to Hall of Fame players
- Consolidate teams whose names changed (MDA/ANA, PHX/ARI, ATL/WPG)
- EDA