In [1]:
# Import packages
import configparser
import csv
import time
from io import StringIO

import html5lib
import pandas as pd
import requests

In [2]:
# Lookup table: full team name -> abbreviation.
# Built from (name, abbreviation) pairs; insertion order follows the listing below.
abbr = dict([
    ('Anaheim Ducks', 'ANA'),
    ('Arizona Coyotes', 'ARI'),
    ('Atlanta Thrashers', 'ATL'),
    ('Boston Bruins', 'BOS'),
    ('Buffalo Sabres', 'BUF'),
    ('Calgary Flames', 'CGY'),
    ('Carolina Hurricanes', 'CAR'),
    ('Chicago Blackhawks', 'CHI'),
    ('Colorado Avalanche', 'COL'),
    ('Columbus Blue Jackets', 'CBJ'),
    ('Dallas Stars', 'DAL'),
    ('Detroit Red Wings', 'DET'),
    ('Edmonton Oilers', 'EDM'),
    ('Florida Panthers', 'FLA'),
    ('Los Angeles Kings', 'LAK'),
    ('Mighty Ducks of Anaheim', 'MDA'),
    ('Minnesota Wild', 'MIN'),
    ('Montreal Canadiens', 'MTL'),
    ('Nashville Predators', 'NSH'),
    ('New Jersey Devils', 'NJD'),
    ('New York Islanders', 'NYI'),
    ('New York Rangers', 'NYR'),
    ('Ottawa Senators', 'OTT'),
    ('Philadelphia Flyers', 'PHI'),
    ('Phoenix Coyotes', 'PHX'),
    ('Pittsburgh Penguins', 'PIT'),
    ('San Jose Sharks', 'SJS'),
    ('Seattle Kraken', 'SEA'),
    ('St. Louis Blues', 'STL'),
    ('Tampa Bay Lightning', 'TBL'),
    ('Toronto Maple Leafs', 'TOR'),
    ('Vancouver Canucks', 'VAN'),
    ('Vegas Golden Knights', 'VEG'),
    ('Washington Capitals', 'WSH'),
    ('Winnipeg Jets', 'WPG'),
])

In [3]:
# Load scraper settings; the functions below read the 'home', 'season' and 'team'
# entries of the [HR] section.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so check the result here instead of failing later with
# an opaque KeyError: 'HR' inside one of the scraping functions.
if not config.read('config.ini') or not config.has_section('HR'):
    print("Warning: could not load an [HR] section from 'config.ini'; "
          "the scraping functions will fail without it")

In [4]:
# Build the list of team abbreviations to scrape
def create_teams():
    ''' (none) -> list of str
    Return the team entries listed in the two conference tables of the home page
    configured under config['HR']['home'], followed by 'ATL', 'MDA' and 'PHX'.

    Division header rows are removed and any '*' is stripped from each entry.
    '''
    tables = pd.read_html(config['HR']['home'])

    # (table position, conference column, division header rows mixed in with the teams)
    layout = (
        (0, 'Eastern', ('Atlantic', 'Metropolitan')),
        (1, 'Western', ('Pacific', 'Central')),
    )

    teams = []
    for position, conference, divisions in layout:
        entries = tables[position][conference].values.tolist()
        for division in divisions:
            entries.remove(division)
        for entry in entries:
            teams.append(entry.replace('*', ''))

    # Manually add Atlanta, Mighty Ducks of Anaheim and Phoenix so we can access those URLs
    teams += ['ATL', 'MDA', 'PHX']
    return teams

In [5]:
# Build the team list once, then confirm it is the same length as the abbr lookup
teams = create_teams()
len(abbr) == len(teams)

True

In [6]:
# Seasons to scrape, given as integers: each entry is the year in which the season *ends*
seasons = [2024]

In [8]:
def create_standings_df(season):
    ''' (int) -> DataFrame

    Return a DataFrame containing the NHL standings for the season *ending* in the specified year.
    Use this function to generate the DataFrame that function create_team_df accepts as an argument.

    Rows come from the first two tables on the season page with the division header
    rows removed, sorted by PTS (highest first). Columns are Team (abbreviation
    looked up in abbr), PTS, GP, W, L, OL, PTS%, GF, GA and Playoffs (True when the
    team name carried a '*' in the source table).

    Raises requests.exceptions.RequestException if the page cannot be downloaded.

    >>> create_standings_df(2019)
    '''
    seasonurlprefix = config['HR']['season']
    seasonurl = f'{seasonurlprefix}{season}.html'

    try:
        response = requests.get(seasonurl, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Report which page failed, then let the real error propagate instead of
        # falling through to a return of a variable that was never assigned
        print(f'Error {seasonurl}')
        raise

    # Parse the HTML that was already downloaded rather than requesting the page a second time
    dfs = pd.read_html(StringIO(response.text))
    standings_df = pd.concat([dfs[0], dfs[1]]).reset_index(drop=True)
    standings_df = standings_df.rename(columns={'Unnamed: 0': 'Team'})

    # Drop the division header rows (na=True also drops rows without a team name);
    # copy so the column assignments below do not write into a slice
    is_header = standings_df['Team'].str.contains('Division', na=True)
    standings_df = standings_df[~is_header].copy()

    # '*' is matched literally (regex=False), so no escaping is needed
    standings_df['Playoffs'] = standings_df['Team'].str.contains('*', regex=False)
    standings_df['Team'] = standings_df['Team'].str.replace('*', '', regex=False).map(abbr)

    cols = ['Team', 'PTS', 'GP', 'W', 'L', 'OL', 'PTS%', 'GF', 'GA', 'Playoffs']
    standings_df = standings_df[cols].astype({
        'GP': 'int', 'W': 'int', 'L': 'int', 'OL': 'int',
        'PTS': 'int', 'PTS%': 'float', 'GF': 'int', 'GA': 'int',
    })
    return standings_df.sort_values('PTS', ascending=False).reset_index(drop=True)

In [67]:
def create_team_df(standings_df, season, team):
    ''' (DataFrame, int, string) -> DataFrame

    Return a DataFrame containing the NHL player statistics for the specified team during the season *ending* in the specified year.
    Use function 'create_standings_df' to generate standings_df.

    The result keeps the columns Player, Age, Pos, GP, G, A, PTS, +/-, PIM, S, S%,
    OPS, DPS and PS, and adds Team, Playoffs (copied from the team's row in
    standings_df) and Season. Rows without an Age are dropped and remaining
    missing values are filled with 0.

    Raises requests.exceptions.RequestException if the page cannot be downloaded,
    and IndexError if team has no row in standings_df.

    >>> create_team_df(standings_df, 2016, 'PIT')
    '''
    teamurlprefix = config['HR']['team']
    teamurl = f'{teamurlprefix}{team}/{season}.html'

    try:
        response = requests.get(teamurl, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Report which page failed, then let the real error propagate instead of
        # falling through to a return of a variable that was never assigned
        print(f'Error {teamurl}')
        raise

    # Parse the HTML that was already downloaded rather than requesting the page a second time
    dfs = pd.read_html(StringIO(response.text))

    # The player table is read from position 4, except for Boston and the
    # New York Rangers where it is read from position 3 (their 2024 pages have a
    # different number of tables).
    # NOTE(review): positional lookup is fragile - confirm the index when scraping other seasons.
    three_list = ['BOS', 'NYR']
    tableno = 3 if team in three_list else 4

    team_df = dfs[tableno]
    # The table has two header rows; keep only the lower one
    team_df.columns = team_df.columns.droplevel(0)

    cols = ['Player', 'Age', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'S', 'S%', 'OPS',
            'DPS', 'PS']
    playoffs = standings_df.loc[standings_df['Team'] == team, 'Playoffs'].iloc[0]

    team_df = team_df[cols].assign(Team=team, Playoffs=playoffs, Season=season)
    return team_df.dropna(subset=['Age']).fillna(0)

In [9]:
def create_season_df(season, teams, delay=5):
    ''' (int, list of strings[, number]) -> DataFrame

    Returns a DataFrame containing all player statistics from each NHL team in the
    "teams" list for the season *ending* in the given year.

    The standings are fetched once with create_standings_df, then create_team_df is
    called for every team in order. "delay" is the number of seconds to pause after
    each team request (default 5). An empty "teams" list gives an empty DataFrame.

    >>> create_season_df(2016, teams)
    '''
    standings_df = create_standings_df(season)

    # Collect the per-team frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows every time
    team_frames = []
    for team in teams:
        team_frames.append(create_team_df(standings_df, season, team))
        time.sleep(delay)  # pause between page requests

    if not team_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(team_frames)

In [12]:
def scrape_data(seasons, teams, exclude=('ATL', 'MDA', 'PHX')):
    ''' (list of int, list of strings[, iterable of strings]) -> DataFrame

    Returns a DataFrame containing all player statistics from each NHL team in the "teams" list
    for all seasons in the "seasons" list.

    Teams named in "exclude" are skipped; the default is the three abbreviations
    that create_teams appends by hand. Duplicate team entries are scraped once.
    An empty "seasons" list gives an empty DataFrame.

    >>> scrape_data(seasons, teams)
    '''
    skipped = set(exclude)

    # dict.fromkeys drops duplicates but, unlike a set, keeps the caller's order,
    # so the rows come out in the same order on every run
    active_teams = [team for team in dict.fromkeys(teams) if team not in skipped]

    season_frames = [create_season_df(season, active_teams) for season in seasons]
    if not season_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(season_frames)

In [68]:
# Run the full scrape and report how long it took
start = time.time()
nhl_data = scrape_data(seasons, teams)
end = time.time()
elapsed_time = end - start

# Format minutes:seconds by hand: strftime('%M:%S') wraps back to 00:00 once a
# run passes one hour, which would under-report long multi-season scrapes
print('Execution time:', '{:02d}:{:02d}'.format(*divmod(int(elapsed_time), 60)))

Execution time: 02:59


In [70]:
# Save the scraped player statistics to CSV without the index column.
# NOTE(review): the file name is fixed at '2024data.csv' and does not follow the
# `seasons` list - confirm it before scraping other seasons.
nhl_data.to_csv('2024data.csv', index = False)