### Scraping Standings ###
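I originally dropped all of the standings data except playoff status. Now that I'm predicting points percentage rather than a boolean playoff flag, I need that data back, so this notebook re-scrapes the full standings (points, record, and goals for/against) for every post-lockout season.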

In [1]:
# Import packages
import configparser
import csv
import time
from io import StringIO

import html5lib
import pandas as pd
import requests

In [2]:
# Map each full team name to its team abbreviation. create_standings_df uses
# this mapping to convert the scraped 'Team' column into abbreviations; a name
# that is missing here becomes NaN in that column.
# The mapping includes three legacy names (Atlanta Thrashers, Mighty Ducks of
# Anaheim, Phoenix Coyotes) whose abbreviations are later folded into their
# successor teams via team_map.
abbr = {
    'Anaheim Ducks': 'ANA',
    'Arizona Coyotes': 'ARI',
    'Atlanta Thrashers': 'ATL',  # legacy: remapped to WPG via team_map
    'Boston Bruins': 'BOS',
    'Buffalo Sabres': 'BUF',
    'Calgary Flames': 'CGY',
    'Carolina Hurricanes': 'CAR',
    'Chicago Blackhawks': 'CHI',
    'Colorado Avalanche': 'COL',
    'Columbus Blue Jackets': 'CBJ',
    'Dallas Stars': 'DAL',
    'Detroit Red Wings': 'DET',
    'Edmonton Oilers': 'EDM',
    'Florida Panthers': 'FLA',
    'Los Angeles Kings': 'LAK',
    'Mighty Ducks of Anaheim': 'MDA',  # legacy: remapped to ANA via team_map
    'Minnesota Wild': 'MIN',
    'Montreal Canadiens': 'MTL',
    'Nashville Predators': 'NSH',
    'New Jersey Devils': 'NJD',
    'New York Islanders': 'NYI',
    'New York Rangers': 'NYR',
    'Ottawa Senators': 'OTT',
    'Philadelphia Flyers': 'PHI',
    'Phoenix Coyotes': 'PHX',  # legacy: remapped to ARI via team_map
    'Pittsburgh Penguins': 'PIT',
    'San Jose Sharks': 'SJS',
    'Seattle Kraken': 'SEA',
    'St. Louis Blues': 'STL',
    'Tampa Bay Lightning': 'TBL',
    'Toronto Maple Leafs': 'TOR',
    'Vancouver Canucks': 'VAN',
    'Vegas Golden Knights': 'VEG',
    'Washington Capitals': 'WSH',
    'Winnipeg Jets': 'WPG'
}

In [3]:
# Load scraper settings from config.ini. The [HR] section must provide the
# 'home' and 'season' values that create_teams and create_standings_df read.
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a
# missing config.ini only surfaces later as a KeyError on config['HR'] --
# consider checking read()'s return value here.
# The trailing semicolon keeps the notebook from echoing read()'s return value.
config = configparser.ConfigParser()
config.read('config.ini');

In [4]:
# Build the team list: scraped conference entries plus three legacy abbreviations
def create_teams():
    ''' (none) -> list of str
    Return a list of current NHL teams (and some outdated ones).

    The current teams are read from the 'Eastern' and 'Western' columns of the
    first two tables on the home page configured in config['HR']['home'].
    '''
    tables = pd.read_html(config['HR']['home'])

    # (table position, conference column, division header rows to discard)
    conferences = (
        (0, 'Eastern', ('Atlantic', 'Metropolitan')),
        (1, 'Western', ('Pacific', 'Central')),
    )

    teams = []
    for position, conference, divisions in conferences:
        entries = tables[position][conference].values.tolist()
        # Division names sit in the same column as the teams; drop each header
        # once (list.remove raises ValueError if a header is missing).
        for division in divisions:
            entries.remove(division)
        # Strip the asterisk markers attached to some entries
        teams.extend(entry.replace('*', '') for entry in entries)

    # Manually add Atlanta, Mighty Ducks of Anaheim and Phoenix so we can access those URLs
    teams.extend(['ATL', 'MDA', 'PHX'])
    return teams

In [5]:
# Build the team list, then sanity-check that it has as many entries as abbr.
# Only the two lengths are compared, not the entries themselves; the cell
# displays True when they match.
teams = create_teams()
len(teams) == len(abbr)

True

In [6]:
# Post-lockout seasons as ints, each identified by the year in which it ends
FIRST_SEASON = 2006
LAST_SEASON = 2023
seasons = list(range(FIRST_SEASON, LAST_SEASON + 1))

In [20]:
def create_standings_df(season):
    ''' (int) -> DataFrame

    Return a DataFrame containing the NHL standings for the season *ending* in the specified year.
    Use this function to generate the DataFrame that function create_team_df accepts as an argument.

    The result has one row per team, sorted by points percentage (best first),
    with columns Team (abbreviation looked up in abbr), PTS, GP, W, L, OL, PTS%,
    GF, GA, Playoffs (bool) and Season.

    If the page cannot be downloaded or does not have the expected layout, an
    error message is printed and an empty DataFrame is returned, so a caller
    that concatenates several seasons can carry on with the remaining ones.

    >>> create_standings_df(2019)
    '''
    seasonurlprefix = config['HR']['season']
    seasonurl = f'{seasonurlprefix}{season}.html'

    try:
        # Download the page once and parse that response, rather than letting
        # pd.read_html fetch the same URL a second time.
        response = requests.get(seasonurl, timeout=30)
        response.raise_for_status()
        dfs = pd.read_html(StringIO(response.text))

        # Stack the first two tables on the page into a single standings table
        standings_df = (pd.concat([dfs[0], dfs[1]])
                        .reset_index(drop=True)
                        .rename(columns={'Unnamed: 0': 'Team'}))

        # Division header rows are mixed in with the teams; drop them. Rows
        # without a team name are dropped as well (na=True).
        is_division_row = standings_df['Team'].str.contains('Division', na=True)
        standings_df = standings_df[~is_division_row].copy()

        # An asterisk in the team name is recorded as the playoff flag, then
        # removed so the name can be looked up in abbr.
        standings_df['Playoffs'] = standings_df['Team'].str.contains('*', regex=False)
        standings_df['Team'] = (standings_df['Team']
                                .str.replace('*', '', regex=False)
                                .map(abbr))

        cols = ['Team', 'PTS', 'GP', 'W', 'L', 'OL', 'PTS%', 'GF', 'GA', 'Playoffs']
        standings_df = (standings_df[cols]
                        .astype({'GP': 'int', 'W': 'int', 'L': 'int', 'OL': 'int', 'PTS': 'int',
                                 'PTS%': 'float', 'GF': 'int', 'GA': 'int'})
                        .sort_values('PTS%', ascending=False)
                        .reset_index(drop=True)
                        .assign(Season=season)
                       )

    except (requests.RequestException, ValueError, KeyError, IndexError) as err:
        # Download failures, pages without tables, and unexpected table layouts
        # all end up here. Report which URL failed and why, and hand back an
        # empty frame instead of failing on an unbound local variable.
        print(f'Error {seasonurl}: {err}')
        return pd.DataFrame()

    return standings_df

In [17]:
def create_points_pct_df(seasons, delay=5):
    ''' (list of int[, number]) -> DataFrame

    Returns a DataFrame containing all team points percentages for all seasons in the list "seasons".

    The per-season frames from create_standings_df are stacked in the order
    given. Each season keeps its own 0-based row index, so index labels repeat
    across seasons. An empty "seasons" yields an empty DataFrame.

    delay is the number of seconds to wait between consecutive requests so the
    site is not hit in rapid succession; there is no wait before the first
    season or after the last one.

    >>> create_points_pct_df(seasons)
    '''
    season_frames = []

    for position, season in enumerate(seasons):
        if position > 0:
            time.sleep(delay)
        standings_df = create_standings_df(season)
        # Skip seasons that produced no rows so they cannot affect the result
        if not standings_df.empty:
            season_frames.append(standings_df)

    if not season_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    return pd.concat(season_frames)

In [22]:
# Scrape every season and report how long the whole run took
start = time.time()
points_pct_df = create_points_pct_df(seasons)
end = time.time()

# Format the elapsed wall-clock seconds as minutes:seconds
elapsed_time = end - start
minutes_seconds = time.strftime('%M:%S', time.gmtime(elapsed_time))
print('Execution time:', minutes_seconds)

Execution time: 01:35


In [23]:
# Preview the combined standings; index labels restart for each season
points_pct_df

Unnamed: 0,Team,PTS,GP,W,L,OL,PTS%,GF,GA,Playoffs,Season
0,DET,124,82,58,16,8,0.756,305,209,True,2006
1,OTT,113,82,52,21,9,0.689,314,211,True,2006
2,CAR,112,82,52,22,8,0.683,294,260,True,2006
3,DAL,112,82,53,23,6,0.683,265,218,True,2006
4,BUF,110,82,52,24,6,0.671,281,239,True,2006
...,...,...,...,...,...,...,...,...,...,...,...
27,MTL,68,82,31,45,6,0.415,232,307,False,2023
28,SJS,60,82,22,44,16,0.366,234,321,False,2023
29,CHI,59,82,26,49,7,0.360,204,301,False,2023
30,CBJ,59,82,25,48,9,0.360,214,330,False,2023


In [26]:
# Teams whose names changed during the target timeframe: fold each old
# abbreviation into the abbreviation it is standardized to
team_map = {'ATL': 'WPG', 'PHX': 'ARI', 'MDA': 'ANA'}
points_pct_df = points_pct_df.assign(Team=points_pct_df['Team'].replace(team_map))

In [27]:
# Save the combined standings; the row index is left out of the file
points_pct_df.to_csv('standings.csv', index=False)