In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Team abbreviation -> team name.
# Keys are the codes interpolated into the hockey-reference.com URL by
# get_nhl_team_data; values become the 'Team' column and the per-team file
# names. Both 'ARI' and 'PHX' are listed; get_NHL_data picks the one that
# applies to the requested season. Insertion order is the fetch order.
NHL_team_abbreviation_dict = {
    'ANA': 'Anaheim_Ducks',
    'ARI': 'Arizona_Coyotes',
    'BOS': 'Boston_Bruins',
    'BUF': 'Buffalo_Sabres',
    'CGY': 'Calgary_Flames',
    'CAR': 'Carolina_Hurricanes',
    'CHI': 'Chicago_Blackhawks',
    'COL': 'Colorado_Avalanche',
    'CBJ': 'Columbus_Blue_Jackets',
    'DAL': 'Dallas_Stars',
    'DET': 'Detroit_Red_Wings',
    'EDM': 'Edmonton_Oilers',
    'FLA': 'Florida_Panthers',
    'LAK': 'Los_Angeles_Kings',
    'MIN': 'Minnesota_Wild',
    'MTL': 'Montreal_Canadiens',
    'NSH': 'Nashville_Predators',
    'NJD': 'New_Jersey_Devils',
    'NYI': 'New_York_Islanders',
    'NYR': 'New_York_Rangers',
    'OTT': 'Ottawa_Senators',
    'PHI': 'Philadelphia_Flyers',
    'PHX': 'Phoenix_Coyotes',
    'PIT': 'Pittsburgh_Penguins',
    'SJS': 'San_Jose_Sharks',
    'STL': 'St_Louis_Blues',
    'TBL': 'Tampa_Bay_Lightning',
    'TOR': 'Toronto_Maple_Leafs',
    'VAN': 'Vancouver_Canucks',
    'VEG': 'Vegas_Golden_Knights',
    'WSH': 'Washington_Capitals',
    'WPG': 'Winnipeg_Jets',
}

In [3]:
def get_nhl_team_data(team, season_end):
    '''
        Download one team's game table for one season from
        hockey-reference.com and return it as a cleaned DataFrame.

        team: team abbreviation name (e.g. 'VAN' for 'Vancouver Canucks');
              it is placed in the URL and must also be a key of
              NHL_team_abbreviation_dict;
        season_end: year in which season ended (e.g. 2021 for 2020-2021 season)

        Returns a DataFrame with a fresh 0-based index and the columns
        GP, Date, Team, Opponent, GF, GA, W, L, OL, Points, where
        Points = 2*W + OL and Team is the name mapped from `team`.

        Raises KeyError if `team` is not in NHL_team_abbreviation_dict
        (checked after the download, as before); errors from
        pd.read_html (network, missing page) propagate unchanged.
    '''
    # 1. Download data from url;
    #    first table is the one that we want, hence the [0];
    url = f'https://www.hockey-reference.com/teams/{team}/{season_end}_games.html'
    games = pd.read_html(url)[0]

    # 2. Select relevant columns and remove the header rows that are
    #    repeated inside the table body (their GP cell reads 'GP').
    #    drop=True keeps the old index from leaking in as an 'index' column.
    games = games[['GP', 'Date', 'Opponent', 'GF', 'GA', 'W', 'L', 'OL']]
    repeated_headers = games.index[games['GP'] == 'GP']
    games = games.drop(repeated_headers).reset_index(drop=True)

    # 3. Convert columns to their right type; the date column is parsed in
    #    one vectorised call rather than element by element.
    numeric_cols = ['GP', 'GF', 'GA', 'W', 'L', 'OL']
    games[numeric_cols] = games[numeric_cols].apply(pd.to_numeric)
    games['Date'] = pd.to_datetime(games['Date'])

    # 4. Compute team points based on W-L-OL and add team column
    games['Points'] = 2 * games['W'] + games['OL']
    games['Team'] = NHL_team_abbreviation_dict[team]

    # 5. Reorganize columns to make sense
    return games[['GP', 'Date', 'Team', 'Opponent', 'GF', 'GA', 'W', 'L', 'OL', 'Points']]

In [4]:
def get_NHL_data(year, team_abb_dict):
    '''
        Fetch the game table of every applicable team for one season and
        stack them into a single DataFrame.

        year: year in which season ended (e.g. 2021 for 2020-2021 season);
        team_abb_dict: dict whose keys are team abbreviations; they are
                       visited in the dict's iteration order

        Returns pd.concat of the frames returned by get_nhl_team_data;
        the index is not reset, so each team's rows keep their own labels.
    '''
    def _not_applicable(abbr):
        # Abbreviations that do not apply to the requested season:
        #  - Arizona Coyotes was called Phoenix Coyotes previous to the
        #    2014-2015 season, so only one of ARI / PHX is used;
        #  - Vegas Golden Knights first season was in 2017-2018;
        #  - Winnipeg Jets first season was in 2011-2012.
        return (
            (year < 2015 and abbr == 'ARI')
            or (year > 2014 and abbr == 'PHX')
            or (year < 2018 and abbr == 'VEG')
            or (year < 2012 and abbr == 'WPG')
        )

    frames = []
    for abbr in team_abb_dict.keys():
        if _not_applicable(abbr):
            continue

        print(f"Fetching data for: {abbr} ({year-1}-{year})...")
        frames.append(get_nhl_team_data(abbr, year))

    return pd.concat(frames)

In [5]:
def find_rank(pts, gp, all_df):
    '''
        Rank a points total among all rows of all_df with the same GP.

        pts: points total to look up;
        gp: games-played value used to select rows of all_df;
        all_df: DataFrame with 'GP' and 'Points' columns

        Returns the 0-based position of pts in the distinct 'Points'
        values of the selected rows, sorted from highest to lowest
        (equal totals therefore share a rank).
        Raises ValueError if pts is not among those values.
    '''
    same_gp_rows = all_df[all_df.GP == gp]
    distinct_points = set(same_gp_rows['Points'])
    highest_first = sorted(distinct_points, reverse=True)
    return highest_first.index(pts)
    
def fill_ranks(team_df, all_df):
    '''
        Add a 'Ranking' value to every row of team_df, in place.

        team_df: one team's rows, with 'GP' and 'Points' columns;
                 it is modified directly and nothing is returned;
        all_df: DataFrame with every team's rows, passed to find_rank

        For each row, the rank is find_rank(Points, GP, all_df), i.e. the
        row's points total compared with all teams at the same GP.
    '''
    for idx in team_df.index:
        # Take games played from the row itself rather than deriving it
        # from the index label (idx + 1), which is only correct when the
        # index happens to run 0..n-1 in game order.
        team_df.loc[idx, 'Ranking'] = find_rank(
            team_df.loc[idx, 'Points'],
            team_df.loc[idx, 'GP'],
            all_df,
        )
  

In [13]:
def build_and_save_NHL_data(season_end, team_abbv_dict):
    '''
        Fetch a season's data, rank every team at every games-played
        value and write the results to disk.

        season_end: year in which season ended (e.g. 2021 for 2020-2021 season);
        team_abbv_dict: dict with mapping from abbreviation to team name

        Files written (CSV content, no file extension is added), relative
        to the current working directory:
          ../data/NHL/nhl_<season_end>/teams/<Team>   one file per team
          ../data/NHL/nhl_<season_end>/all_<season_end>   all teams together
        Folders are created if missing. Returns nothing.
    '''

    # 1. Get raw data from each team and group it into one dataframe
    raw_data = get_NHL_data(season_end, team_abbv_dict)

    # 2. Build paths to save files; creating the nested teams folder with
    #    parents=True also creates the season folder above it.
    data_folder = Path('..') / 'data' / 'NHL' / f'nhl_{season_end}'
    teams_folder = data_folder / 'teams'
    teams_folder.mkdir(parents=True, exist_ok=True)

    # 3. From dataframe with all teams, get ranking of each team
    #    in each GP of season, update team dataframe and save each
    #    team dataframe
    ranked_frames = []
    for team, team_df in raw_data.groupby('Team'):
        # fill_ranks writes into the frame it is given, so hand it an
        # independent copy instead of a slice of raw_data.
        team_df = team_df.copy()
        fill_ranks(team_df, raw_data)
        team_df.to_csv(teams_folder / team, index=False)
        ranked_frames.append(team_df)

    # 4. Get dataframe with all teams updated dataframes with ranking
    #    and save it
    all_ranked = pd.concat(ranked_frames)
    all_ranked.to_csv(data_folder / f'all_{season_end}', index=False)

In [17]:
# Fetch, rank and save the 2020-2021 season for every team in the mapping.
build_and_save_NHL_data(
    season_end=2021,
    team_abbv_dict=NHL_team_abbreviation_dict,
)

Fetching data for: ANA (2020-2021)...
Fetching data for: ARI (2020-2021)...
Fetching data for: BOS (2020-2021)...
Fetching data for: BUF (2020-2021)...
Fetching data for: CGY (2020-2021)...
Fetching data for: CAR (2020-2021)...
Fetching data for: CHI (2020-2021)...
Fetching data for: COL (2020-2021)...
Fetching data for: CBJ (2020-2021)...
Fetching data for: DAL (2020-2021)...
Fetching data for: DET (2020-2021)...
Fetching data for: EDM (2020-2021)...
Fetching data for: FLA (2020-2021)...
Fetching data for: LAK (2020-2021)...
Fetching data for: MIN (2020-2021)...
Fetching data for: MTL (2020-2021)...
Fetching data for: NSH (2020-2021)...
Fetching data for: NJD (2020-2021)...
Fetching data for: NYI (2020-2021)...
Fetching data for: NYR (2020-2021)...
Fetching data for: OTT (2020-2021)...
Fetching data for: PHI (2020-2021)...
Fetching data for: PIT (2020-2021)...
Fetching data for: SJS (2020-2021)...
Fetching data for: STL (2020-2021)...
Fetching data for: TBL (2020-2021)...
Fetching dat