In [None]:
# TODO:
#     Add full comments/descriptions throughout.
#     Add rebound information, and possibly other information, from: http://blog.war-on-ice.com/index.html
#     The above blog also describes a way to reduce venue bias in location data.

In [12]:
import requests
import pickle
import json
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import numpy as np
import logging
logging.basicConfig(filename='logs.log', level=logging.INFO)

In [2]:
# We'll begin by examining the 2018-19 season. This season is chosen because it is the most recent non-COVID season and
# because shot information in the 2019-20 season was incorrect.
# For details, see https://www.ontheforecheck.com/2019/10/15/20915205/nhl-shot-location-play-by-play-data-has-changed-pbp-shotmaps-analytics-expected-goals-model
# Root of the NHL stats API; schedule and live feed paths are appended to it.
api_root_url = 'https://statsapi.web.nhl.com'
# Cache layout: season-level files are written under data_folder, per-game files under game_report_folder.
data_folder = 'data/'
game_report_folder = '{}games/'.format(data_folder)
# Seasons to process, written as the two calendar years run together.
season_list = ['20182019']

In [3]:
def get_schedule_file(season):
    '''
    Returns the path object for the cached schedule file of a season.
    The path is built from the current working directory; the file itself is not guaranteed to exist.
    
    Parameters:
        season (str) - The season for the schedule. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Path object giving the absolute path to where the file should be located.
    '''
    schedule_name = data_folder + 'schedule_' + season + '.json'
    return Path.cwd().joinpath(schedule_name)

def read_game_feed_links(season):
    '''
    Reads the cached list of game live feed links for the requested season, if it exists.
    
    Parameters:
        season (str) - The season for the schedule. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        The decoded JSON content of the schedule file if it exists.
        Returns None otherwise.
    '''
    schedule_file = get_schedule_file(season)
    if not schedule_file.exists():
        return None
    logging.info('Found ' + season + ' schedule. Reading.')
    with schedule_file.open('r') as infile:
        return json.load(infile)
    
def extract_season_game_feed_links(season):
    '''
    Downloads the schedule for a season from the API and collects the link of every game in it.
    
    Parameters:
        season (str) - The season for the schedule. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        List of the 'link' values of the games, in the order the schedule lists its dates and games.
    '''
    logging.info('Did not find ' + season + ' schedule. Downloading.')
    schedule_url = api_root_url + '/api/v1/schedule?' + 'season=' + season
    schedule = requests.get(schedule_url).json()
    links = []
    for game_date in schedule['dates']:
        for game in game_date['games']:
            links.append(game['link'])
    return links

def get_game_feed_links(season, refresh = False):
    '''
    Obtains the list of game live feed links for a season, using the cached schedule file when possible.
    
    Parameters:
        season (str) - The season for the schedule. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, ignores any cached schedule, downloads it again and overwrites the file.
        
    Returns:
        List of live feed links for the games of the season.
    '''
    cached_links = None if refresh else read_game_feed_links(season)
    if cached_links is not None:
        return cached_links

    game_feed_links = extract_season_game_feed_links(season)
    file = get_schedule_file(season)
    # The data folder may not exist yet (e.g. on a fresh checkout); create it so the write below cannot fail on a
    # missing directory. Opening with 'w' creates the file, so no separate touch() is needed.
    file.parent.mkdir(parents=True, exist_ok=True)
    with file.open('w') as outfile:
        json.dump(game_feed_links, outfile)
    return game_feed_links

In [4]:
# Maps the event names used by the live feed to the short event codes used in the html play-by-play reports.
# Events mapped to None have no corresponding code.
event_translation = dict([
    ('Game Scheduled', None),
    ('Period Ready', None),
    ('Period Start', 'PSTR'),
    ('Faceoff', 'FAC'),
    ('Giveaway', 'GIVE'),
    ('Shot', 'SHOT'),
    ('Stoppage', 'STOP'),
    ('Takeaway', 'TAKE'),
    ('Hit', 'HIT'),
    ('Missed Shot', 'MISS'),
    ('Penalty', 'PENL'),
    ('Blocked Shot', 'BLOCK'),
    ('Goal', 'GOAL'),
    ('Period End', 'PEND'),
    ('Period Official', None),
    ('Shootout Complete', 'SOC'),
    ('Game End', 'GEND'),
    ('Game Official', 'GOFF'),
    ('Official Challenge', 'CHL'),
    ('Early Intermission Start', 'EISTR'),
    ('Early Intermission End', 'EIEND'),
    ('Emergency Goaltender', 'EGT'),
])

def construct_game_live_feed_frame(live_feed_link):
    '''
    Constructs a Pandas data frame from the live feed data for the requested game.
    
    Game-level metadata is broadcast to every row; each row corresponds to one entry of the feed's 'allPlays' list.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        Pandas data frame representing the game if the live feed request returns status 200.
        Returns None otherwise.    
    '''    
    api_request = requests.get(api_root_url + live_feed_link)
    if api_request.status_code != 200:
        logging.info('WARNING: Failure reading live feed ' + live_feed_link 
                     + ' (status: ' + str(api_request.status_code) +')')
        return None

    logging.info('Success reading live feed ' + live_feed_link)
    # Decode the body only once the request is known to have succeeded, so that an error response whose body is not
    # valid JSON takes the failure path above instead of raising here.
    feed = api_request.json()

    # Shorthand for the parts of the feed that are read repeatedly below.
    game_data = feed['gameData']
    game = game_data['game']
    away = game_data['teams']['away']
    home = game_data['teams']['home']
    venue = game_data['venue']
    game_datetime = game_data['datetime']
    plays = feed['liveData']['plays']['allPlays']

    # Extract information from the feed to build the frame.
    frame = pd.DataFrame({
        # Metadata for the game. This should be broadcasted to all rows for the game.
        'game_id': str(game['pk']),
        'season': str(game['season']),
        # Provides a code for type of game, 'PR' for pre-season/exhibition, 'R' for regular season, 'P' for playoffs,
        # and 'A' for all-star.
        'type': game['type'],
        # Team information. Non-NHL teams may not have triCodes, so their team name is used as a substitute.
        'away_id': away['id'],
        'away_code': away['triCode'] if 'triCode' in away else away['teamName'],
        'home_id': home['id'],
        'home_code': home['triCode'] if 'triCode' in home else home['teamName'],
        # Venue information.
        # venue_id is missing for many games, so track venue as well.
        'venue': venue['name'],
        'venue_id': int(venue['id']) if 'id' in venue else None,
        # Start and end times for the game
        'game_time': game_datetime['dateTime'],
        'game_end_time': game_datetime['endDateTime'] if 'endDateTime' in game_datetime else None,
        # Game state, to guarantee all considered games are complete.
        'game_state': game_data['status']['abstractGameState'],

        # All remaining fields correspond to individual events.
        # Live feed indexing of the event
        'event_idx': [int(play['about']['eventIdx']) for play in plays],
        'event_id': [int(play['about']['eventId']) for play in plays],
        # Time of the event.
        'period': [int(play['about']['period']) for play in plays],
        'period_ord': [play['about']['ordinalNum'] for play in plays],
        'period_type': [play['about']['periodType'] for play in plays],
        'time_elapsed': [play['about']['periodTime'] for play in plays],
        'time_remaining': [play['about']['periodTimeRemaining'] for play in plays],
        'event_time': [play['about']['dateTime'] for play in plays],
        # Game score for event
        'score_away': [play['about']['goals']['away'] for play in plays],
        'score_home': [play['about']['goals']['home'] for play in plays],
        # Event descriptions
        'event': [play['result']['event'] for play in plays],
        'event_code': [play['result']['eventCode'] for play in plays],
        'event_type_id': [play['result']['eventTypeId'] for play in plays],
        'event_desc': [play['result']['description'] for play in plays],
        # Track which team the event was assigned to. This allows matching with the away_id and home_id to determine
        # skater state at the time of the event.
        'event_team_id': [play['team']['id'] if 'team' in play else None for play in plays],
        'event_team_code': [play['team']['triCode'] if ('team' in play and 'triCode' in play['team']) else None
                            for play in plays],
        # Event coordinates. Note: blocked shots are marked at the location of the block, not the shot.
        'event_coord_x': [float(play['coordinates']['x']) if 'x' in play['coordinates'] else None for play in plays],
        'event_coord_y': [float(play['coordinates']['y']) if 'y' in play['coordinates'] else None for play in plays],
        # Contains shot type for shots and penalty information for penalties
        'secondary_type': [play['result']['secondaryType'] if 'secondaryType' in play['result'] else None
                           for play in plays]
        # Not extracted yet: result fields such as "penaltySeverity" and "penaltyMinutes".
    })
    # The play-by-play in the html reports codes plays differently. Translate the events that we can. All remaining events
    # have no corresponding entry.
    frame['event_shortcode'] = frame['event'].replace(event_translation)
    return frame

def get_game_live_feed_frame_file(live_feed_link):
    '''
    Returns the path object for the game live feed data frame.
    This only computes where the file belongs; it makes no guarantee that the file actually exists.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.
        
    Returns:
        Path object giving the absolute path to where the file should be located.
    '''
    # Uses live feed formatting for the game id for compactness: the first four digits of the season followed by the
    # six-character game id, e.g. '2018020240'.
    game_id = (extract_season_from_live_feed_link(live_feed_link)[:4]
               + extract_game_id_from_live_feed_link(live_feed_link))
    relative_path = game_report_folder + 'livefeed_' + game_id + '.pkl'
    return Path.cwd().joinpath(relative_path)

def read_game_live_feed_frame(live_feed_link):
    '''
    Loads the pickled live feed data frame of a single game from disk when it is present.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        Pandas data frame representing the game if the file exists.
        Returns None otherwise.
    '''
    frame_file = get_game_live_feed_frame_file(live_feed_link)
    if not frame_file.exists():
        return None
    logging.info('Found ' + live_feed_link + ' live feeds. Reading.')
    return pd.read_pickle(str(frame_file))
    
def get_game_live_feed_frame(live_feed_link, refresh = False):
    '''
    Obtains a Pandas data frame corresponding to the live feed for the given link.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
            for the game in the 2018-2019 season with id 020240.  
        refresh (bool, optional) - If true, regenerates from scratch and overwrites any existing game frame file.
        
    Returns:
        Pandas data frame representing the game.
        Returns None if the frame is not cached and could not be constructed.
    '''
    # The cached frame is only consulted when no refresh was requested.
    read_from_file = read_game_live_feed_frame(live_feed_link) if not refresh else None
    if read_from_file is not None:
        return read_from_file

    game_frame = construct_game_live_feed_frame(live_feed_link)
    if game_frame is not None:
        file = get_game_live_feed_frame_file(live_feed_link)
        # Make sure the per-game folder exists before writing the cache file.
        file.parent.mkdir(parents=True, exist_ok=True)
        game_frame.to_pickle(str(file))
    return game_frame
    
def get_season_live_feed_frame_file(season):
    '''
    Returns the path object for the season live feed data frame.
    This only computes where the file belongs; it makes no guarantee that the file actually exists.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Path object giving the absolute path to where the file should be located.
    '''    
    frame_name = data_folder + 'livefeed_' + season + '.pkl'
    return Path.cwd().joinpath(frame_name)
    
def construct_season_live_feed_frame(season, refresh = False, refresh_games = None):
    '''
    Builds the season live feed frame by obtaining the frame of every game in the schedule and concatenating them.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, regenerates from scratch. This option will overwrite any existing single game 
            cached data.
        refresh_games (list, optional) - If this exists, regenerates the frame by re-reading any game in this list. 
            Games not in this list are read from cached data, if present. Games should be of the form given by live feed links
            Example: '/api/v1/game/2018020240/feed/live' for the game in the 2018-2019 season with id 020240.    
            
    Returns:
        Pandas data frame representing the season.
    '''
    logging.info('Did not find ' + season + ' live feed. Constructing.')
    live_feed_frames = []
    for feed_url in get_game_feed_links(season):
        # A game is refreshed when refresh is set or when its link appears in refresh_games.
        if refresh_games is not None:
            refresh_this_game = refresh | (feed_url in refresh_games)
        else:
            refresh_this_game = refresh
        live_feed_frames.append(get_game_live_feed_frame(feed_url, refresh_this_game))

    return pd.concat(live_feed_frames)

        
def read_season_live_feed_frame(season):
    '''
    Loads the pickled live feed data frame of a whole season from disk when it is present.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Pandas data frame representing the season if the file exists.
        Returns None otherwise.
    '''
    frame_file = get_season_live_feed_frame_file(season)
    if not frame_file.exists():
        return None
    logging.info('Found ' + season + ' season live feed. Reading.')
    return pd.read_pickle(str(frame_file))
    
def get_season_live_feed_frame(season, refresh = False, refresh_games=None):
    '''
    Obtains a Pandas data frame constructed from all of the live feed reports for a season.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, regenerates the frame from scratch. This option will overwrite any existing
            single game cached data.
        refresh_games (list, optional) - If this exists, regenerates the frame by re-reading any game in this list. 
            Games not in this list are read from cached data, if present. Games should be of the form given by live feed links
            Example: '/api/v1/game/2018020240/feed/live' for the game in the 2018-2019 season with id 020240.
            
    Returns:
        Pandas data frame representing the season.
    '''    
    # Determine if any parameters have changed from the defaults. If they haven't, try to read the frame from disk. If they 
    # have changed, the frame must be rebuilt.
    refresh_any = refresh or (refresh_games is not None)
    # If the frame doesn't exist on disk, this will be None.
    read_from_file = read_season_live_feed_frame(season) if not refresh_any else None
    if read_from_file is not None:
        return read_from_file

    # Both options are forwarded explicitly so that refresh_games cannot land in the refresh slot.
    live_feed_frame = construct_season_live_feed_frame(season, refresh, refresh_games)
    live_feed_frame.to_pickle(str(get_season_live_feed_frame_file(season)))
    return live_feed_frame

In [5]:
def extract_season_from_live_feed_link(live_feed_link):
    '''
    Derives the season id from a live feed link.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        A string representing the season. Example: '20182019' for the game in the 2018-2019 season with id 020240.    
    '''    
    # Characters 13-16 of the link are the first four digits of the game id portion, which give the starting year of
    # the season. The season string is that year followed by the next year.
    start_year = live_feed_link[13:17]
    return start_year + str(int(start_year) + 1)

def extract_game_id_from_live_feed_link(live_feed_link):
    '''
    Pulls the six-character game id out of a live feed link.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        A string representing the game id. Example: '020240' for the game in the 2018-2019 season with id 020240.    
    '''
    # The game id occupies the six characters that follow the four-digit year in the game id portion of the link.
    game_id_span = slice(17, 23)
    return live_feed_link[game_id_span]

def get_html_report_url(live_feed_link):
    '''
    Converts the live feed link into the url for the corresponding html report.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        A string representing the url of the html report. 
            Example: 'http://www.nhl.com/scores/htmlreports/20182019/PL020240.HTM' for the game in the 2018-2019 season 
            with id 020240.
    '''
    # The url uses the full eight-digit season string and the six-character game id.
    season = extract_season_from_live_feed_link(live_feed_link)
    game = extract_game_id_from_live_feed_link(live_feed_link)
    return 'http://www.nhl.com/scores/htmlreports/' + season + '/PL' + game + '.HTM'

def get_game_html_report(live_feed_link):
    '''
    Downloads the html report for the requested game.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        Content of the html report for the game if the request returns status 200.
        Returns None otherwise.      
    '''
    html_report_url = get_html_report_url(live_feed_link)
    response = requests.get(html_report_url)
    status = response.status_code
    if status != 200:
        logging.info('WARNING: Failure reading html report ' + html_report_url + ' (status: ' + str(status) +')')
        return None
    logging.info('Success reading html report ' + html_report_url)
    return response.content

def construct_game_html_report(live_feed_link):
    '''
    Constructs a Pandas data frame from the html report data for the requested game.
    
    Each row of the frame corresponds to one 'evenColor' table row of the report. Besides the parsed columns, the frame
    carries derived counts of the players on the ice per position for the visiting ('vis_') and home ('hom_') teams.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        Pandas data frame representing the game if the html report exists.
        Returns None otherwise.    
    '''
    report = get_game_html_report(live_feed_link)
    if report is None:
        return None

    game_soup = BeautifulSoup(report, 'html.parser')
    # Play-by-play rows are all the same class
    event_rows = game_soup.find_all('tr', class_='evenColor')
    # Row children are
    #   1: Index
    #   3: Period (In regular season, OT is 4 and SO is 5)
    #   5: Strength (Even strength = EV, Power play = PP, Short-handed = SH)
    #   7: Time elapse / Time remaining
    #   9: Event type
    #  11: Event detailed description
    #  13: Visiting players on ice / jersey numbers and positions
    #  15: Home players on ice / jersey numbers and positions     
    # Materialize the children of every row once instead of once per column.
    rows = [list(row.children) for row in event_rows]
    # Children of the time cell: index 0 is the elapsed time and index 2 the remaining time.
    time_parts = [list(cells[7].children) for cells in rows]
    visitor_tables = [_on_ice_player_tables(cells[13]) for cells in rows]
    home_tables = [_on_ice_player_tables(cells[15]) for cells in rows]

    season = extract_season_from_live_feed_link(live_feed_link)
    frame = pd.DataFrame({
        'season': season,
        # Put game_id in a format that will match the live feed frame.
        'game_id': season[:4] + extract_game_id_from_live_feed_link(live_feed_link),
        'row_index': [int(cells[1].get_text()) for cells in rows],
        'period': [int(cells[3].get_text()) for cells in rows],
        'strengths': [cells[5].get_text().replace('\xa0','') for cells in rows],
        # The times are missing initial '0's, meaning they don't match with the live feeds without adjustment.
        'time_elapsed': [parts[0].rjust(5, '0') for parts in time_parts],
        'time_remaining': [parts[2].rjust(5, '0') for parts in time_parts],
        'event': [cells[9].get_text() for cells in rows],
        'desc': [cells[11].get_text().replace('\xa0','') for cells in rows],
        'voi_no': [_on_ice_numbers(tables) for tables in visitor_tables],
        'voi_pos': [_on_ice_positions(tables) for tables in visitor_tables],
        'hoi_no': [_on_ice_numbers(tables) for tables in home_tables],
        'hoi_pos': [_on_ice_positions(tables) for tables in home_tables]
    })
    # There are a number of derived fields that are more easily calculated up-front. These have to do with
    # how many players are on the ice for each team and what positions they play.
    _add_on_ice_counts(frame, 'voi_pos', 'vis')
    _add_on_ice_counts(frame, 'hoi_pos', 'hom')

    return frame

def _on_ice_player_tables(cell):
    # Returns the tables nested inside the first table of an on-ice cell, or None when the cell contains no table.
    roster_table = cell.find('table')
    if roster_table is None:
        return None
    return roster_table.find_all('table')

def _on_ice_numbers(player_tables):
    # Jersey numbers: the text of the first 'font' element of each player table. None when there are no tables.
    if player_tables is None:
        return None
    return [player.find('font').get_text() for player in player_tables]

def _on_ice_positions(player_tables):
    # Positions: the text of the second 'td' element of each player table. None when there are no tables.
    if player_tables is None:
        return None
    return [player.find_all('td')[1].get_text() for player in player_tables]

def _add_on_ice_counts(frame, positions_column, prefix):
    # Adds the per-position player counts for one team to the frame, in place, as columns named prefix + suffix.
    def count_of(code):
        return frame[positions_column].apply(lambda x: x.count(code) if x is not None else None)

    # There are a handful of players not assigned any particular forward position.
    frame[prefix + '_f'] = count_of('F')
    # Rest of the forwards (centers, left and right wings)
    frame[prefix + '_c'] = count_of('C')
    frame[prefix + '_l'] = count_of('L')
    frame[prefix + '_r'] = count_of('R')
    # All combined forwards
    frame[prefix + '_fwd'] = (frame[prefix + '_f'] + frame[prefix + '_c']
                              + frame[prefix + '_l'] + frame[prefix + '_r'])
    # Defense
    frame[prefix + '_d'] = count_of('D')
    # Total skaters (all players except goalies)
    frame[prefix + '_skr'] = frame[prefix + '_fwd'] + frame[prefix + '_d']
    # Goalies
    frame[prefix + '_g'] = count_of('G')
    # Total players on ice
    frame[prefix + '_plr'] = frame[prefix + '_skr'] + frame[prefix + '_g']
    
def get_game_html_report_frame_file(live_feed_link):
    '''
    Returns the path object for the game html report data frame.
    This only computes where the file belongs; it makes no guarantee that the file actually exists.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.
        
    Returns:
        Path object giving the absolute path to where the file should be located.
    '''
    # Uses live feed formatting for the game id for compactness: the first four digits of the season followed by the
    # six-character game id, e.g. '2018020240'.
    game_id = (extract_season_from_live_feed_link(live_feed_link)[:4]
               + extract_game_id_from_live_feed_link(live_feed_link))
    relative_path = game_report_folder + 'htmlreport_' + game_id + '.pkl'
    return Path.cwd().joinpath(relative_path)

def read_game_html_report_frame(live_feed_link):
    '''
    Loads the pickled html report data frame of a single game from disk when it is present.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240.  
        
    Returns:
        Pandas data frame representing the game if the file exists.
        Returns None otherwise.
    '''
    frame_file = get_game_html_report_frame_file(live_feed_link)
    if not frame_file.exists():
        return None
    logging.info('Found ' + live_feed_link + ' game html reports. Reading.')
    return pd.read_pickle(str(frame_file))
    
def get_game_html_report_frame(live_feed_link, refresh = False):
    '''
    Obtains a Pandas data frame corresponding to the HTML report for the given live feed link.
    
    Parameters:
        live_feed_link (str) - The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
            for the game in the 2018-2019 season with id 020240.  
        refresh (bool, optional) - If true, regenerates from scratch and overwrites any existing game frame file.
        
    Returns:
        Pandas data frame representing the game.
        Returns None if the frame is not cached and could not be constructed.
    '''
    # The cached frame is only consulted when no refresh was requested.
    read_from_file = read_game_html_report_frame(live_feed_link) if not refresh else None
    if read_from_file is not None:
        return read_from_file

    game_frame = construct_game_html_report(live_feed_link)
    if game_frame is not None:
        file = get_game_html_report_frame_file(live_feed_link)
        # Make sure the per-game folder exists before writing the cache file.
        file.parent.mkdir(parents=True, exist_ok=True)
        game_frame.to_pickle(str(file))
    return game_frame
    
def construct_season_html_report_frame(season, refresh = False, refresh_games = None):
    '''
    Builds the season html report frame by obtaining the frame of every game in the schedule and concatenating them.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, regenerates from scratch. This option will overwrite any existing single game 
            cached data.
        refresh_games (list, optional) - If this exists, regenerates the frame by re-reading any game in this list. 
            Games not in this list are read from cached data, if present. Games should be of the form given by live feed links
            Example: '/api/v1/game/2018020240/feed/live' for the game in the 2018-2019 season with id 020240.    
            
    Returns:
        Pandas data frame representing the season.
    '''
    logging.info('Did not find ' + season + ' season html report. Constructing.')
    html_report_frames = []
    for feed_url in get_game_feed_links(season):
        # A game is refreshed when refresh is set or when its link appears in refresh_games.
        if refresh_games is not None:
            refresh_this_game = refresh | (feed_url in refresh_games)
        else:
            refresh_this_game = refresh
        html_report_frames.append(get_game_html_report_frame(feed_url, refresh_this_game))

    # The concatenation implicitly uses the fact that any entries equal to None get silently dropped. This matters for the
    # handful of games without html reports.
    return pd.concat(html_report_frames)

def get_season_html_report_frame_file(season):
    '''
    Returns the path object for the season html report data frame.
    This only computes where the file belongs; it makes no guarantee that the file actually exists.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Path object giving the absolute path to where the file should be located.
    '''
    frame_name = data_folder + 'htmlreport_' + season + '.pkl'
    return Path.cwd().joinpath(frame_name)

def read_season_html_report_frame(season):
    '''
    Loads the pickled html report data frame of a whole season from disk when it is present.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Pandas data frame representing the season if the file exists.
        Returns None otherwise.
    '''
    frame_file = get_season_html_report_frame_file(season)
    if not frame_file.exists():
        return None
    logging.info('Found ' + season + ' season html report. Reading.')
    return pd.read_pickle(str(frame_file))
    
def get_season_html_report_frame(season, refresh = False, refresh_games = None):
    '''
    Obtains a Pandas data frame constructed from all the play-by-play html reports for a season.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, regenerates the frame from scratch. This option will overwrite any existing
            single game cached data.
        refresh_games (list, optional) - If this exists, regenerates the frame by re-reading any game in this list. 
            Games not in this list are read from cached data, if present. Games should be of the form given by live feed links
            Example: '/api/v1/game/2018020240/feed/live' for the game in the 2018-2019 season with id 020240.
            
    Returns:
        Pandas data frame representing the season.
    '''
    # The season file on disk is only usable when both options are at their defaults; any requested refresh forces
    # a rebuild.
    if not (refresh or (refresh_games is not None)):
        cached_frame = read_season_html_report_frame(season)
        if cached_frame is not None:
            return cached_frame

    # Either a refresh was requested or nothing is cached yet: build the frame and store it.
    html_report_frame = construct_season_html_report_frame(season, refresh, refresh_games)
    html_report_frame.to_pickle(str(get_season_html_report_frame_file(season)))
    return html_report_frame

In [6]:
def get_combined_frame_file(season):
    '''
    Builds the path object for the pickled combined (live feed + html report) data frame of a season.
    The path is computed, not looked up: nothing here checks that the file exists, so callers must test for 
    existence themselves.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Path object giving the absolute path (current working directory + data_folder + 'combined_<season>.pkl') 
        where the file should be located.
    '''
    file_name = 'combined_' + season + '.pkl'
    return Path.cwd() / (data_folder + file_name)

def filter_live_feed_frame(live_feed_df):
    '''
    Keeps only the shot-attempt rows of a live feed frame.
    
    Parameters:
        live_feed_df (DataFrame) - Live feed frame with an 'event_shortcode' column.
        
    Returns:
        The rows of live_feed_df whose 'event_shortcode' is 'SHOT', 'MISS', 'GOAL' or 'BLOCK'. The original index 
        labels are kept and the input frame is not modified.
    '''
    shot_attempt_codes = ['SHOT', 'MISS', 'GOAL', 'BLOCK']
    is_shot_attempt = live_feed_df['event_shortcode'].isin(shot_attempt_codes)
    return live_feed_df[is_shot_attempt]

def filter_html_report_frame(html_report_df):
    '''
    Keeps only the shot-attempt rows of an html report frame.
    
    Parameters:
        html_report_df (DataFrame) - Html report frame with an 'event' column.
        
    Returns:
        The rows of html_report_df whose 'event' is 'SHOT', 'MISS', 'GOAL' or 'BLOCK'. The original index labels 
        are kept and the input frame is not modified.
    '''
    wanted_events = ['SHOT', 'MISS', 'GOAL', 'BLOCK']
    keep_row = html_report_df['event'].isin(wanted_events)
    return html_report_df[keep_row]

def filter_combined_frame_game_types(combined_df):
    '''
    Eliminates preseason and all-star games from a combined frame.
    
    Parameters:
        combined_df (DataFrame) - Combined frame with a 'type' column holding the game type code.
        
    Returns:
        The rows of combined_df whose 'type' is neither 'PR' nor 'A'. Rows with a missing 'type' are kept.
    '''
    excluded_game_types = ['PR', 'A']
    is_excluded = combined_df['type'].isin(excluded_game_types)
    return combined_df[~is_excluded]

def filter_combined_frame_penalty_shots(combined_df):
    '''
    Marks penalty shots in a combined frame. Despite the name, no rows are removed.
    
    Parameters:
        combined_df (DataFrame) - Combined frame with 'hom_plr' and 'vis_plr' columns (presumably the number of 
            home / visiting players on the ice - confirm against the html report parser).
        
    Returns:
        A copy of combined_df with a boolean 'is_penalty_shot' column that is True wherever 'hom_plr' or 'vis_plr' 
        equals 1. The input frame is left unchanged.
    '''
    lone_home_player = combined_df['hom_plr'] == 1
    lone_visiting_player = combined_df['vis_plr'] == 1
    return combined_df.assign(is_penalty_shot=lone_home_player | lone_visiting_player)
    
def filter_combined_frame(combined_df):
    '''
    Applies the post-merge clean-up steps to a combined frame.
    
    Parameters:
        combined_df (DataFrame) - Combined (live feed + html report) frame.
        
    Returns:
        The result of running filter_combined_frame_game_types and then filter_combined_frame_penalty_shots on 
        combined_df.
    '''
    # Eliminate preseason and all-star games.
    regular_and_playoff_df = filter_combined_frame_game_types(combined_df)
    # Mark penalty shots. These should be marked by having only 1 player on the ice for a team.
    # Note: shootout events (period_ord == 'SO') are currently NOT eliminated here.
    marked_df = filter_combined_frame_penalty_shots(regular_and_playoff_df)
    return marked_df

def combine_frames(live_feed_df, html_report_df):
    '''
    Combines the live feed and html report frames into a single frame of shot-attempt events.
    
    Both inputs are first reduced by filter_live_feed_frame / filter_html_report_frame, then outer-merged on 
    season, game, event code, period and clock, and finally passed through filter_combined_frame.
    
    Parameters:
        live_feed_df (DataFrame) - Season live feed frame.
        html_report_df (DataFrame) - Season html report frame.
        
    Returns:
        The merged and filtered Pandas data frame. Non-key columns present in both inputs get the suffixes 
        '_livefeed' and '_html'.
    '''
    # The two sources name the event-code column differently; every other join key shares a name.
    live_feed_keys = ['season', 'game_id', 'event_shortcode', 'period', 'time_elapsed', 'time_remaining']
    html_report_keys = ['season', 'game_id', 'event', 'period', 'time_elapsed', 'time_remaining']

    live_feed_shots = filter_live_feed_frame(live_feed_df)
    html_report_shots = filter_html_report_frame(html_report_df)
    # Outer merge: an event found in only one of the two sources is kept.
    merged = live_feed_shots.merge(html_report_shots, how='outer', 
                                   left_on=live_feed_keys, right_on=html_report_keys, 
                                   suffixes=('_livefeed', '_html'))
    return filter_combined_frame(merged)

def read_combined_frame(season):
    '''
    Loads the pickled combined (live feed + play-by-play html report) Pandas data frame for a season, if one has 
    been saved.

    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        
    Returns:
        Pandas data frame representing the season if the pickle file exists.
        Returns None otherwise.    
    '''
    pickle_path = get_combined_frame_file(season)
    # Nothing cached yet: signal that to the caller instead of raising.
    if not pickle_path.exists():
        return None
    logging.info('Found ' + season + ' combined report. Reading.')
    return pd.read_pickle(str(pickle_path))
    
def get_combined_frame(season, refresh=False, refresh_games=None, refresh_feed=False, refresh_feed_games=None, 
                       refresh_html=False, refresh_html_games=None, refresh_combine=False):
    '''
    Obtains a Pandas data frame combining the live feed and the play-by-play html reports for a season.
    
    The pickled combined frame on disk is returned only when no refresh option of any kind is given. If any refresh 
    flag is true or any game list is supplied, the live feed and html report frames are obtained again (with the 
    relevant options forwarded), re-combined, and the result is pickled to the path given by get_combined_frame_file.
    
    Parameters:
        season (str) - The season for the frame. Example: '20182019' for the 2018-2019 season.
        refresh (bool, optional) - If true, regenerates the frame from scratch. This option will overwrite any existing data.
        refresh_games (list, optional) - If this exists, regenerates the frame by re-reading any game in this list. 
            Games not in this list are read from cached data, if present. Games should be of the form given by live feed links
            Example: '/api/v1/game/2018020240/feed/live' for the game in the 2018-2019 season with id 020240.
        refresh_feed (bool, optional), refresh_feed_games (list, optional) - If present, passed to get_season_live_feed_frame 
            as 'refresh' and 'refresh_games', respectively. Superseded by 'refresh' and 'refresh_games' if specified.
        refresh_html (bool, optional), refresh_html_games (list, optional) - If present, passed to 
            get_season_html_report_frame as 'refresh' and 'refresh_games', respectively. Superseded by 'refresh' and 
            'refresh_games' if specified.      
        refresh_combine (bool, optional) - Re-combine the frames without necessarily re-computing them.
        
    Returns:
        Pandas data frame representing the season.
    '''    
    # The season-wide game list, when given, takes precedence over the source-specific lists.
    feed_games = refresh_games if refresh_games is not None else refresh_feed_games
    html_games = refresh_games if refresh_games is not None else refresh_html_games

    # A supplied game list is a refresh request too (same rule as get_season_html_report_frame). Without this, 
    # passing only a game list would return the stale cached frame and silently ignore the list.
    refresh_requested = bool(refresh or refresh_feed or refresh_html or refresh_combine
                             or feed_games is not None or html_games is not None)

    if refresh_requested:
        logging.info('Refresh requested for ' + season + ' combined report. Constructing.')
    else:
        cached_df = read_combined_frame(season)
        if cached_df is not None:
            return cached_df
        logging.info('Did not find ' + season + ' combined report. Constructing.')

    feed_df = get_season_live_feed_frame(season, refresh or refresh_feed, feed_games)
    html_df = get_season_html_report_frame(season, refresh or refresh_html, html_games)
    combined_df = combine_frames(feed_df, html_df)
    # Cache the rebuilt frame so the next plain call can read it from disk.
    combined_df.to_pickle(str(get_combined_frame_file(season)))
    return combined_df

In [11]:
# refresh_combine=True skips the pickled combined frame and re-merges the season's live feed and html report frames
# (each obtained through its own getter with no refresh requested), then overwrites the combined pickle on disk.
df = get_combined_frame('20182019', refresh_combine=True)

In [13]:
# Column-by-column summary (non-null counts and dtypes) of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160058 entries, 10184 to 170424
Data columns (total 59 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   game_id          160058 non-null  object 
 1   season           160058 non-null  object 
 2   type             160032 non-null  object 
 3   away_id          160032 non-null  float64
 4   away_code        160032 non-null  object 
 5   home_id          160032 non-null  float64
 6   home_code        160032 non-null  object 
 7   venue            160032 non-null  object 
 8   venue_id         130908 non-null  object 
 9   game_time        160032 non-null  object 
 10  game_end_time    159919 non-null  object 
 11  game_state       160032 non-null  object 
 12  event_idx        160032 non-null  float64
 13  event_id         160032 non-null  float64
 14  period           160058 non-null  float64
 15  period_ord       160032 non-null  object 
 16  period_type      160032 non-null  