In [4]:
# %load read_play_by_play_data.py
"""
Created on Fri Nov 27 13:16:44 2020

@author: wodar
"""

import requests
import pickle
import json
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import numpy as np
from collections import Counter
import re
from statsmodels.distributions.empirical_distribution import ECDF
from scipy import stats
import logging
logging.basicConfig(filename='logs.log', level=logging.INFO, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Event location data was first added to the game feeds in the 2010-2011 season. Consequently, that will be the oldest season
# used in this project.
# The last season used will be the 2018-19 season. This season is chosen because it's the most-recent non-COVID season and because
# shot location information in the 2019-20 season was incorrect. For details, see https://www.ontheforecheck.com/2019/10/15/20915205/nhl-shot-location-play-by-play-data-has-changed-pbp-shotmaps-analytics-expected-goals-model

# Root of the NHL stats API and the layout of the local cache, relative to the working directory.
API_ROOT_URL = 'https://statsapi.web.nhl.com'
DATA_FOLDER = 'data/'
# Processed (pickled) per-game data frames.
GAME_FRAME_FOLDER = DATA_FOLDER + 'games/'
# Unprocessed downloads, split by source.
RAW_FOLDER = DATA_FOLDER + 'raw/'
RAW_LIVE_FEED_FOLDER = RAW_FOLDER + 'feeds/'
RAW_HTML_REPORT_FOLDER = RAW_FOLDER + 'html/'
# Season identifiers are the start year followed by the end year, e.g. '20102011'.
# NOTE(review): this list runs through '20192020', while the header comment in this file says the
# 2018-19 season is the last one used - confirm whether the final entry is intentional.
SEASON_LIST = [str(start_year) + str(start_year + 1) for start_year in range(2010, 2020)]

def get_schedule_local_path(season):
    '''
    Builds the path of the local cache file holding the schedule for a season.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.

    Returns
    -------
    pathlib.Path
        Path of the form <current working directory>/data/schedule_<season>.json. The file is not
        checked for existence, so the path can be used both for reading and for saving.

    '''
    file_name = 'schedule_' + season + '.json'
    return Path.cwd().joinpath(DATA_FOLDER + file_name)

def get_schedule_api_url(season):
    '''
    Returns the NHL API schedule endpoint URL for the requested season.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.

    Returns
    -------
    str
        URL of the schedule endpoint, restricted to regular season ('R') and playoff ('P') games.
        These are the only games for which statistics are officially counted.

    '''
    # At the time of writing, querying by season for 2016-17 returned a status code of 500.
    # The workaround is to query by calendar date range instead (2016-10-12 until 2017-06-11).
    if season == '20162017':
        return 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=2016-10-12&endDate=2017-06-11&gameType=R,P'

    # Every other season can be requested directly by its season identifier.
    return API_ROOT_URL + '/api/v1/schedule?' + 'season=' + season + '&gameType=R,P'
        
def extract_season_game_feed_links(season):
    '''
    Downloads the season schedule from the API and extracts the live feed links from it.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.

    Returns
    -------
    schedule_links : list of str
        List of links to live feeds of games for the season.
        Returns None if the API responds with a status code other than 200.

    '''
    # api_url is restricted to regular season and playoff games by get_schedule_api_url.
    api_url = get_schedule_api_url(season)
    api_request = requests.get(api_url)
    if api_request.status_code != 200:
        logging.error('Error downloading ' + season + ' schedule (Status: ' + str(api_request.status_code) + ')')
        return None

    # Keep the parsed payload under its own name. Rebinding `season` to the parsed dict made the
    # log line below concatenate str + dict and raise TypeError after every successful download.
    schedule = api_request.json()
    logging.info('Success downloading ' + season + ' schedule')
    # The json returned by the API provides a list of calendar dates under the key 'dates'. Each calendar date in
    # turn provides a list of games for that date, keyed by 'games'. Finally, each game provides the live feed link.
    schedule_links = [game['link']
                      for game_date in schedule['dates']
                      for game in game_date['games']]
    return schedule_links

def read_game_feed_links(season):
    '''
    Loads the locally cached live feed links for a season.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.

    Returns
    -------
    list of str
        The content of the local schedule file, if that file exists; None otherwise.
        An individual link has the form '/api/v1/game/2018020256/feed/live', where the substring
        '2018020256' represents the NHL's game ID for the game.
    '''
    schedule_file = get_schedule_local_path(season)
    # Nothing cached for this season yet.
    if not schedule_file.exists():
        return None

    logging.info('Reading ' + season + ' schedule.')
    with schedule_file.open('r') as infile:
        return json.load(infile)
    
def get_season_game_feed_links(season, refresh = False):
    '''
    Obtains the live feed links for games in the requested season.

    Parameters
    ----------
    season : str
        The season for the schedule. Example: '20182019' for the 2018-19 season.
    refresh : bool, optional
        If True, ignores the existence of any local files and re-downloads and processes
        the data from the API. This will result in overwriting any current saves. The default is False.

    Returns
    -------
    game_feed_links : list of str
        List of links to live feeds of games for the season. An individual link has the form
        '/api/v1/game/2018020256/feed/live', where the substring '2018020256' represents the NHL's game ID
        for the game.
        Returns None if no usable local copy exists and the download fails.

    '''
    # Use the local copy if it exists and no request to refresh/reconstruct the data was sent.
    cached_links = None if refresh else read_game_feed_links(season)
    if cached_links is not None:
        return cached_links

    game_feed_links = extract_season_game_feed_links(season)
    # Only persist a successful download. Writing the None of a failed request would store
    # 'null' and, on a refresh, destroy a previously good cache file.
    if game_feed_links is not None:
        live_feed_file = get_schedule_local_path(season)
        # Make sure that the folder exists.
        live_feed_file.parent.resolve().mkdir(parents=True, exist_ok=True)
        with live_feed_file.open('w') as outfile:
            json.dump(game_feed_links, outfile)
    return game_feed_links
    
def get_game_feed_links(seasons, refresh=False):
    '''
    Obtains game feed links for every season indicated in seasons.

    Parameters
    ----------
    seasons : str or list
        If a string, should be the season for the schedule. Example: '20182019' for the 2018-19 season.
        If a list, should be a list of strings in the format listed above. Example: ['20182019', '20152016']
        will obtain the information for the 2015-16 and 2018-19 seasons.
    refresh : bool, optional
        If True, ignores the existence of any local files and re-downloads and processes
        the data from the API. This will result in overwriting any current saves. The default is False.

    Returns
    -------
    List of str
        List of links to live feeds of games for the season(s). An individual link has the form
        '/api/v1/game/2018020256/feed/live', where the substring '2018020256' represents the NHL's game ID
        for the game. For a list of seasons the links are concatenated in the order the seasons are given;
        seasons whose links could not be obtained are skipped (with a logged warning).
    '''
    if isinstance(seasons, str):
        # Assume that a string argument refers to a single season, so just return that season.
        # refresh is forwarded; it was previously accepted but silently ignored.
        return get_season_game_feed_links(seasons, refresh)

    # Assume any other argument is an iterable of seasons.
    all_links = []
    for season in seasons:
        season_links = get_season_game_feed_links(season, refresh)
        if season_links is None:
            # A failed season yields None; iterating over it would raise TypeError
            # and discard the links of every other season.
            logging.warning('No game feed links available for ' + season + '; skipping.')
            continue
        all_links.extend(season_links)
    return all_links
 

# The live feed names events verbosely. This table maps each live feed event name to the
# short code given in the play-by-play HTML report; names mapped to None have no code here.
EVENT_TRANSLATION = dict((
    ('Game Scheduled', None),
    ('Period Ready', None),
    ('Period Start', 'PSTR'),
    ('Faceoff', 'FAC'),
    ('Giveaway', 'GIVE'),
    ('Shot', 'SHOT'),
    ('Stoppage', 'STOP'),
    ('Takeaway', 'TAKE'),
    ('Hit', 'HIT'),
    ('Missed Shot', 'MISS'),
    ('Penalty', 'PENL'),
    ('Blocked Shot', 'BLOCK'),
    ('Goal', 'GOAL'),
    ('Period End', 'PEND'),
    ('Period Official', None),
    ('Shootout Complete', 'SOC'),
    ('Game End', 'GEND'),
    ('Game Official', 'GOFF'),
    ('Official Challenge', 'CHL'),
    ('Early Intermission Start', 'EISTR'),
    ('Early Intermission End', 'EIEND'),
    ('Emergency Goaltender', 'EGT'),
))
# Translated codes that count as a shot: the live feed categories 'Shot', 'Blocked Shot',
# 'Goal' and 'Missed Shot'.
SHOT_EVENTS = ['SHOT', 'BLOCK', 'GOAL', 'MISS']
# Translated codes that count as a faceoff.
FACEOFF_EVENTS = ['FAC']

def extract_id_from_live_feed_link(live_feed_link):
    '''
    Extracts the ten-character game id from the live_feed_link.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information. An absolute URL containing the same '/game/<id>' segment is also accepted.

    Returns
    -------
    str
        The ten-character game id from the live_feed_link. Example: '/api/v1/game/2018020240/feed/live' will return
        '2018020240'

    '''
    # Anchor on the '/game/<10 digits>' segment instead of fixed character offsets, so that a link
    # carrying a scheme and host in front of the path still yields the right id.
    id_match = re.search(r'/game/(\d{10})(?!\d)', live_feed_link)
    if id_match is not None:
        return id_match.group(1)
    # Fall back to the positional slice for anything that does not match the expected shape.
    # '/api/v1/game/' is 13 characters long and the id is the 10 characters that follow it.
    return live_feed_link[13:23]
    
def get_live_feed_file(live_feed_link):
    '''
    Builds the path of the local cache file for a raw live feed.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    pathlib.Path
        Path of the form <current working directory>/data/raw/feeds/livefeed_<game id>.json. The file is
        not checked for existence, so the path can be used both for reading and for saving.

    '''
    file_name = 'livefeed_' + extract_id_from_live_feed_link(live_feed_link) + '.json'
    return Path.cwd().joinpath(RAW_LIVE_FEED_FOLDER + file_name)
 
def read_live_feed(live_feed_link):
    '''
    Loads the locally cached raw live feed for a link.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    dict
        The parsed json content of the local live feed file, if that file exists; None otherwise.

    '''
    cached_file = get_live_feed_file(live_feed_link)
    # Nothing cached for this game yet.
    if not cached_file.exists():
        return None

    logging.info('Reading raw feed ' + live_feed_link)
    with cached_file.open('r') as infile:
        return json.load(infile)

def download_live_feed(live_feed_link):
    '''
    Requests the live feed for the given link from the API.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    dict
        The parsed json body of the response when the status code is 200; None for any other
        status code (the failure is logged).

    '''
    response = requests.get(API_ROOT_URL + live_feed_link)

    if response.status_code != 200:
        logging.error('Error downloading raw feed ' + live_feed_link +' (Status: ' + str(response.status_code)+')')
        return None

    logging.info('Success downloading raw feed ' + live_feed_link)
    return response.json()

def get_live_feed(live_feed_link, refresh=False):
    '''
    Obtains the raw live feed for the link, from the local cache when possible.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.
    refresh : bool, optional
        If True, ignores the existence of any local files and re-downloads and processes
        the data from the API. This will result in overwriting any current saves. The default is False.

    Returns
    -------
    dict
        Dictionary with the full raw live feed for the link, or None when there is no usable
        local copy and the download fails.

    '''
    # A refresh request bypasses the local copy entirely.
    cached_feed = None if refresh else read_live_feed(live_feed_link)
    if cached_feed is not None:
        return cached_feed

    downloaded_feed = download_live_feed(live_feed_link)
    if downloaded_feed is None:
        # Nothing to save when the download failed.
        return None

    # Save the raw data for faster future processing, creating the folder if needed.
    target_file = get_live_feed_file(live_feed_link)
    target_file.parent.resolve().mkdir(parents=True, exist_ok=True)
    target_file.touch()
    with target_file.open('w') as outfile:
        json.dump(downloaded_feed, outfile)
    return downloaded_feed

def get_seconds_elapsed(period, time_str):
    '''
    Converts a (period, time within period) pair into seconds elapsed since the start of the game.

    Parameters
    ----------
    period : int
        The period of the event, counted from 1. Every period before it is assumed to have lasted
        20 minutes. A shorter overtime period is not a concern, because only the periods that
        precede the event are counted in full.
    time_str : str
        The time of the event within the period, given in the form 'mm:ss'. This should be the time counting up
        from the start of the period.

    Returns
    -------
    int
        The time of the event in seconds since the start of the game.

    '''
    minutes, seconds = (int(part) for part in time_str.split(':'))
    completed_periods = period - 1
    return completed_periods * 20 * 60 + minutes * 60 + seconds

def parse_live_feed(feed):
    '''
    Parses a game live feed into a pandas data frame with one row per play.

    Parameters
    ----------
    feed : dict
        Dictionary containing the live feed data for a game. The game metadata is read from
        feed['gameData'] and the plays from feed['liveData']['plays']['allPlays'].

    Returns
    -------
    Pandas DataFrame
        Data frame containing event data from the live feed. Game-level values (ids, teams, venue)
        are repeated on every row; event-level values are taken from each play.

    '''
    game_data = feed['gameData']
    game_info = game_data['game']
    teams = game_data['teams']
    venue = game_data['venue']
    plays = feed['liveData']['plays']['allPlays']

    def team_code(team):
        # Prefer the three-letter code and fall back to the team name when it is absent.
        return team['triCode'] if 'triCode' in team else team['teamName']

    def coordinate(play, axis):
        # Not every event carries a location.
        location = play['coordinates']
        return float(location[axis]) if axis in location else None

    columns = {}

    # Game metadata
    columns['game_id'] = str(game_info['pk'])
    columns['season'] = str(game_info['season'])
    columns['type'] = game_info['type']
    columns['game_time'] = game_data['datetime']['dateTime']
    columns['away_code'] = team_code(teams['away'])
    columns['home_code'] = team_code(teams['home'])
    # Venue information. Ideally, we'd just use venue_id, but it is missing for many games, so track venue as well.
    columns['venue'] = venue['name']
    columns['venue_id'] = int(venue['id']) if 'id' in venue else None

    # Use the event ordering used by the feed.
    columns['event_idx'] = [int(play['about']['eventIdx']) for play in plays]
    # Game time of the event. The period and time elapsed are sufficient, but combining them into
    # 'cum_time_elapsed' allows for more succinct determination of time between separate events.
    columns['period'] = [int(play['about']['period']) for play in plays]
    # While the ordinal is not crucial, it offers a readable way to determine when the period is a shootout.
    columns['period_ord'] = [play['about']['ordinalNum'] for play in plays]
    # Similarly allows easy distinguishing between regulation, overtime, and shootouts.
    columns['period_type'] = [play['about']['periodType'] for play in plays]
    columns['time_elapsed'] = [play['about']['periodTime'] for play in plays]
    # Number of seconds into the game of the event.
    columns['cum_time_elapsed'] = [
        get_seconds_elapsed(int(play['about']['period']), play['about']['periodTime'])
        for play in plays
    ]

    # Information about the actual event; names without a translation become None.
    columns['event'] = [EVENT_TRANSLATION.get(play['result']['event']) for play in plays]
    # Track the team corresponding to the event. This will matter for correction of venue bias.
    columns['event_team_code'] = [play.get('team', {}).get('triCode') for play in plays]
    # Determine whether the event is associated to the home team (None when no team is attached).
    columns['event_team_is_home'] = [
        (teams['home']['id'] == play['team']['id']) if 'team' in play else None
        for play in plays
    ]
    # Event coordinates. Note: blocked shots are marked at the location of the block, not the shot.
    columns['event_coord_x'] = [coordinate(play, 'x') for play in plays]
    columns['event_coord_y'] = [coordinate(play, 'y') for play in plays]
    # Contains shot type for shots and penalty information for penalties
    columns['secondary_type'] = [play['result'].get('secondaryType') for play in plays]

    return pd.DataFrame(columns)
    
def process_live_feed_frame(frame):
    '''
    Performs post-parsing processing of the live feed data frame: flags rebound shots and keeps
    only the shot events.

    Parameters
    ----------
    frame : Pandas DataFrame
        Data frame that has been generated by parsing the live feed for a game. The columns 'event',
        'cum_time_elapsed' and 'event_idx' are read. Note that the helper columns computed below are
        written onto this object before the result is copied, so the caller's frame is modified.

    Returns
    -------
    frame : Pandas DataFrame
        A copy of the input restricted to only events referring to shots, with the additional
        column 'is_rebound'. The helper columns and 'event_idx' are dropped.

    '''
    # The main purpose of further processing the frame is to classify shots as to whether they're rebounds or not. 
    # This project follows the convention described in http://blog.war-on-ice.com/annotated-glossary/ that a rebound is 
    # any shot taken within 3 seconds of the previous shot.
    # This is a bit tricky since a shot shouldn't count as a rebound if there was an intervening play stoppage.
    
    # Track whether the event is a shot
    frame['is_shot'] = frame['event'].isin(SHOT_EVENTS)
    # Same concept, but determine whether the event is a stoppage event. While many events stop play, all restarts
    # (except for penalty shots) are done by faceoff. Penalty shots will be dropped later, meaning they are not
    # a concern. As a result, faceoffs are used as proxies for stoppage events.
    frame['is_faceoff'] = frame['event'].isin(FACEOFF_EVENTS) 
        
    # Find the most recent shot event, not including the current event.
    # Count the number of shot events strictly before each row (the running total minus the row itself).
    frame['prev_shot_num'] = frame['is_shot'].cumsum() - frame['is_shot']
    # Get the times and indices of shot events. Group k holds the rows with exactly k earlier shots, i.e. the
    # rows after shot k up to and including shot k+1, so the group maximum is meant to pick out shot k+1.
    # NOTE(review): this assumes cum_time_elapsed and event_idx never decrease in row order - confirm.
    shot_times = frame.groupby('prev_shot_num')['cum_time_elapsed'].max()
    shot_indexes = frame.groupby('prev_shot_num')['event_idx'].max()
    # Look up group (count - 1) to get the latest earlier shot. Arbitrarily use -1 for events before the first shot.
    frame['prev_shot_time'] = [shot_times[shot_num - 1] if shot_num > 0 else -1 for shot_num in frame['prev_shot_num']]
    frame['prev_shot_idx'] = [shot_indexes[shot_num - 1] if shot_num > 0 else -1 for shot_num in frame['prev_shot_num']]
    
    # And the most recent faceoff event, found the same way. Only need the index, not the time, for faceoffs,
    # because the only concern with faceoffs is guaranteeing that there was no intervening faceoff/stoppage
    # between shots. The indexing maintains the order (and allows distinguishing between events occurring less
    # than a second apart).
    frame['prev_faceoff_num'] = frame['is_faceoff'].cumsum() - frame['is_faceoff']
    faceoff_indexes = frame.groupby('prev_faceoff_num')['event_idx'].max()
    frame['prev_faceoff_idx'] = [ faceoff_indexes[fo_num - 1] if fo_num > 0 else -1 for fo_num in frame['prev_faceoff_num'] ]
         
    # Now, rebounds are defined as any shot taken 3 seconds or less after the preceding shot so long as there has
    # been no intervening faceoff. When there is no earlier shot both indices can be -1, and the strict
    # comparison then keeps the row from being flagged.
    frame['is_rebound'] = (frame['cum_time_elapsed'] <= frame['prev_shot_time'] + 3) \
        & (frame['prev_shot_idx'] > frame['prev_faceoff_idx']) & (frame['is_shot'])

    # Limit to shots. The copy detaches the result from the input frame before columns are dropped in place.
    frame = frame[frame['is_shot']].copy()
    
    # Most of the columns generated above are no longer needed
    frame.drop(['is_shot', 'is_faceoff', 'prev_shot_num', 'prev_shot_time', 'prev_shot_idx', 'prev_faceoff_num', 
                'prev_faceoff_idx', 'event_idx'], axis=1, inplace=True)
    
   
    return frame
    
    
def construct_game_live_feed_frame(live_feed_link, refresh=False):
    '''
    Builds the data frame for a game from its raw live feed.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.
    refresh : bool, optional
        If True, ignores the existence of any local files and re-downloads and processes
        the data from the API. This will result in overwriting any current saves. The default is False.

    Returns
    -------
    frame : pandas DataFrame
        Data frame representing the game if the live feed could be obtained.
        Returns None otherwise.

    '''
    raw_feed = get_live_feed(live_feed_link, refresh)
    if raw_feed is None:
        return None

    # Two steps: parse the feed into a frame of the necessary data,
    # then process that frame to add the derived columns.
    parsed_frame = parse_live_feed(raw_feed)
    return process_live_feed_frame(parsed_frame)

def get_game_live_feed_frame_file(live_feed_link):
    '''
    Builds the path of the local file holding the pickled live feed data frame for a game.
    The path is returned whether or not the file actually exists.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    pathlib.Path
        Path of the form <current working directory>/data/games/livefeed_<game id>.pkl, usable both
        for reading an existing file and for saving a new one.

    '''
    # The file is named after the game id as it appears in the live feed link, for compactness.
    file_name = 'livefeed_' + extract_id_from_live_feed_link(live_feed_link) + '.pkl'
    return Path.cwd().joinpath(GAME_FRAME_FOLDER + file_name)

def read_game_live_feed_frame(live_feed_link):
    '''
    Loads the locally saved live feed data frame for the requested game.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    game_frame : Pandas DataFrame
        The unpickled data frame if the file is saved locally.
        Returns None otherwise.
    '''
    frame_path = get_game_live_feed_frame_file(live_feed_link)
    # No frame has been saved for this game yet.
    if not frame_path.exists():
        return None

    logging.info('Reading live feed data frame for ' + extract_id_from_live_feed_link(live_feed_link))
    return pd.read_pickle(str(frame_path))
    
def get_game_live_feed_frame(live_feed_link, refresh=False, refresh_frame=False):
    '''
    Obtains the Pandas data frame for a game's live feed, from the local save when possible.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.
    refresh : bool, optional
        If True, ignores the existence of any local files and re-downloads and processes
        the data from the API. This will result in overwriting any current saves. The default is False.
    refresh_frame : bool, optional
        Similar to refresh, but only refreshes the data frame. Any locally-saved raw data is kept. Ignored if
        refresh is True. The default is False.

    Returns
    -------
    Pandas data frame
        Data frame representing the live feed data, or None if it could not be read or constructed.

    '''
    # Either flag means the saved frame must not be used.
    skip_saved_frame = refresh | refresh_frame
    saved_frame = None if skip_saved_frame else read_game_live_feed_frame(live_feed_link)
    if saved_frame is not None:
        return saved_frame

    # Only `refresh` is passed on: it alone decides whether the raw feed is re-downloaded.
    game_frame = construct_game_live_feed_frame(live_feed_link, refresh)
    if game_frame is not None:
        # Make sure that the folder exists before saving the frame.
        target_file = get_game_live_feed_frame_file(live_feed_link)
        target_file.parent.resolve().mkdir(parents=True, exist_ok=True)
        game_frame.to_pickle(str(target_file))
    return game_frame

def get_game_html_report_frame_file(live_feed_link):
    '''
    Builds the path of the local file holding the pickled html report data frame for a game.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    pathlib.Path
        Path of the form <current working directory>/data/games/htmlreport_<game id>.pkl. The file is
        not checked for existence, so the path can be used both for reading and for saving.

    '''
    # The file is named after the game id as it appears in the live feed link, for compactness.
    file_name = 'htmlreport_' + extract_id_from_live_feed_link(live_feed_link) + '.pkl'
    return Path.cwd().joinpath(GAME_FRAME_FOLDER + file_name)

def extract_season_from_link(live_feed_link):
    '''
    Derives the season id from the live feed link.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    season : str
        Eight-character string identifying the season corresponding to the link. Example:
        '/api/v1/game/2018020240/feed/live' will return '20182019'

    '''
    # The game id starts with the four-digit year in which the season begins.
    start_year = extract_id_from_live_feed_link(live_feed_link)[:4]
    # The season id is that year followed by the next calendar year.
    return start_year + str(int(start_year) + 1)

def get_html_report_url(live_feed_link):
    '''
    Converts the live feed link into the url of the play-by-play html report for the same game.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    str
        URL for the html report on the NHL.com website, built from the eight-character season id and
        the six-character game number. Example:
        'http://www.nhl.com/scores/htmlreports/20182019/PL020240.HTM' for the game in the
        2018-2019 season with id 020240.

    '''
    season_id = extract_season_from_link(live_feed_link)
    # The game number is the last six characters of the id portion of the link.
    # For '/api/v1/game/2018020240/feed/live', it will be '020240'.
    game_number = extract_id_from_live_feed_link(live_feed_link)[-6:]
    report_location = season_id + '/PL' + game_number + '.HTM'
    return 'http://www.nhl.com/scores/htmlreports/' + report_location

def download_game_html_report(live_feed_link):
    '''
    Requests the html report for the game corresponding to the given link.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    BeautifulSoup
        The response body parsed with 'html.parser' when the status code is 200; None for any
        other status code (the failure is logged).

    '''
    html_report_url = get_html_report_url(live_feed_link)
    response = requests.get(html_report_url)
    if response.status_code != 200:
        logging.error('Failure reading html report ' + html_report_url + ' (status: ' + str(response.status_code) +')')
        return None

    logging.info('Success reading html report ' + html_report_url)
    return BeautifulSoup(response.content, 'html.parser')


def get_game_html_report_file(live_feed_link):
    '''
    Builds the path of the local cache file for a raw html report.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.

    Returns
    -------
    pathlib.Path
        Path of the form <current working directory>/data/raw/html/htmlreport_<game id>.pkl. The file is
        not checked for existence, so the path can be used both for reading and for saving.

    '''
    file_name = 'htmlreport_' + extract_id_from_live_feed_link(live_feed_link) + '.pkl'
    return Path.cwd().joinpath(RAW_HTML_REPORT_FOLDER + file_name)

def read_game_html_report(live_feed_link):
    '''
    Loads the locally cached html report for a game, if one has been saved.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.

    Returns
    -------
    BeautifulSoup
        The cached markup re-parsed with the 'lxml' backend, or None when no cache file exists.

    '''
    report_path = get_game_html_report_file(live_feed_link)
    if not report_path.exists():
        return None

    logging.info('Reading raw html report ' + live_feed_link)
    # The cache holds the pickled markup string, not the soup object itself.
    with report_path.open('rb') as infile:
        markup = pickle.load(infile)
    return BeautifulSoup(markup, 'lxml')

def get_game_html_report(live_feed_link, refresh = False):
    '''
    Returns the html report for a game, preferring the local cache over a download.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.
    refresh : bool, optional
        If True, the local cache is not consulted; the report is downloaded again and any
        existing cache file is overwritten. The default is False.

    Returns
    -------
    BeautifulSoup
        The html report for the game, or None if it is neither cached nor downloadable.

    '''
    if not refresh:
        cached = read_game_html_report(live_feed_link)
        if cached is not None:
            return cached

    downloaded = download_game_html_report(live_feed_link)
    if downloaded is not None:
        cache_file = get_game_html_report_file(live_feed_link)
        # Create the cache folder on first use.
        cache_file.parent.resolve().mkdir(parents=True, exist_ok=True)
        # Only the markup text is stored; it is re-parsed when read back.
        with cache_file.open('wb') as outfile:
            pickle.dump(str(downloaded), outfile)
    return downloaded


def parse_row_index(row):
    '''
    Reads the event number of a play-by-play row.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    int
        The text of the row's child at position 1, converted to an integer.

    '''
    cells = list(row.children)
    index_cell = cells[1]
    return int(index_cell.get_text())

def parse_row_period(row):
    '''
    Reads the period number of a play-by-play row.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    int
        The text of the row's child at position 3, converted to an integer.

    '''
    cells = list(row.children)
    period_cell = cells[3]
    return int(period_cell.get_text())

def parse_row_strength(row):
    '''
    Reads the strength field of a play-by-play row.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    str
        The text of the row's child at position 5 with non-breaking spaces turned into
        ordinary spaces. The report uses 'EV' for even strength, 'PP' for power-play and
        'SH' for short-handed.

    '''
    cells = list(row.children)
    raw_strength = cells[5].get_text()
    # Normalise non-breaking spaces so the value compares cleanly with other sources.
    return raw_strength.replace('\xa0',' ')

def parse_row_time(row, elapsed):
    '''
    Reads one of the two clock values of a play-by-play row.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.
    elapsed : bool
        If True, the time elapsed in the current period is returned. Otherwise the time
        remaining is returned.

    Returns
    -------
    str
        A time in the form 'mm:ss', left-padded with '0' to five characters so that it
        matches the format of the live feeds.

    '''
    time_cell = list(row.children)[7]
    time_parts = list(time_cell.children)
    # The elapsed time is the first child of the cell, the time remaining the third.
    which = 2 - 2 * elapsed
    return time_parts[which].rjust(5, '0')

def parse_row_event(row):
    '''
    Reads the event-type code of a play-by-play row.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    str
        The text of the row's child at position 9, a short code denoting the type of event.

    '''
    cells = list(row.children)
    event_cell = cells[9]
    return event_cell.get_text()

def is_zone_field(part):
    '''Returns True if the description piece ends in 'Zone' or 'zone'.'''
    match = re.search(r'[Zz]one$', part)
    return match is not None

def is_name_field(part):
    '''Returns True if the description piece contains a '#', as used before jersey numbers.'''
    match = re.search(r'\#', part)
    return match is not None

def is_assist_field(part):
    '''Returns True if the description piece contains the word 'Assist'.'''
    match = re.search(r'Assist', part)
    return match is not None
                
def is_dist_field(part):
    '''Returns True if the description piece ends in the distance unit 'ft.'.'''
    match = re.search(r'ft\.$', part)
    return match is not None

def is_shot_type_field(part):
    '''
    Returns True if the description piece contains a recognised shot-type keyword
    (Snap, Slap, Backhand, Wrist, Deflected, Tip or Wrap).
    '''
    match = re.search(r'S[nl]ap|Backhand|Wrist|Deflected|Tip|Wrap', part)
    return match is not None

def is_miss_type_field(part):
    '''
    Returns True if the description piece contains a recognised miss keyword
    (Net, Goalpost or Crossbar).
    '''
    match = re.search(r'Net|Goalpost|Crossbar', part)
    return match is not None

def extract_distance(part):
    '''
    Extracts the shot distance from a description piece.

    Parameters
    ----------
    part : str
        A piece of an event description, e.g. '35 ft.'.

    Returns
    -------
    int
        The first run of digits that is followed somewhere later in the piece by 'ft.',
        or None if there is no such run.

    '''
    # A raw string keeps '\d' and '\.' as regex escapes; in a normal string literal they are
    # invalid escape sequences that newer Python versions warn about.
    res = re.search(r'\d+(?=.*ft\.)', part)
    if res is None:
        return None
    return int(res.group(0))

def parse_row_desc_components(parts, event):
    '''
    Interprets the pieces of a shot event description as named fields.

    The pieces are expected in the order player, shot type, how missed, zone, distance,
    assists. They are consumed from the end of the description towards the front, and each
    optional field is only consumed when the piece at the current position looks like it.

    Parameters
    ----------
    parts : list of str
        The event description split into pieces.
    event : str
        Short code denoting the type of event, e.g. 'GOAL', 'MISS', 'BLOCK' or 'SHOT'.

    Returns
    -------
    dict
        Dictionary with the keys 'shot_dist', 'event_zone', 'miss_type' and 'shot_type'.
        Fields that could not be found are None.

    '''
    parts = list(reversed(parts))
    idx = 0
    parsed = { 'shot_dist': None, 'event_zone': None,
              'miss_type': None, 'shot_type': None }

    # NOTE: every guard below uses short-circuiting 'and'. With a bitwise '&' the right-hand side
    # is always evaluated, so parts[idx] would raise IndexError once all pieces are consumed.

    # Check for assists, which only matter for goals. The assist field is always ignored.
    if idx < len(parts) and event == 'GOAL' and is_assist_field(parts[idx]):
        idx += 1

    # Next, check for a distance field.
    if idx < len(parts) and is_dist_field(parts[idx]):
        parsed['shot_dist'] = extract_distance(parts[idx])
        idx += 1

    # Next, check for a zone field.
    if idx < len(parts) and is_zone_field(parts[idx]):
        parsed['event_zone'] = parts[idx]
        idx += 1

    # Next is the miss-type field. Only MISS events carry one in the description; blocks and
    # saved shots get a fixed label. Nothing is set when no pieces remain.
    if idx < len(parts):
        if event == 'MISS':
            candidate = parts[idx]
            if is_miss_type_field(candidate):
                # An obvious miss description.
                parsed['miss_type'] = candidate
                idx += 1
            elif not (is_name_field(candidate) or is_shot_type_field(candidate)):
                # Not obviously a miss, but there is no good evidence against it being a miss
                # type either, so presumptively count it as one.
                parsed['miss_type'] = candidate
                idx += 1
        elif event == 'BLOCK':
            parsed['miss_type'] = 'Block'
        elif event == 'SHOT':
            parsed['miss_type'] = 'Save'

    # The final field to check is presumptively the shot type: it is accepted when it is an
    # obvious shot type, or at least does not look like a player name.
    if idx < len(parts):
        candidate = parts[idx]
        if is_shot_type_field(candidate) or not is_name_field(candidate):
            parsed['shot_type'] = candidate

    return parsed

def parse_row_desc(row):
    '''
    Extracts the description of an event, parsing it into fields for shot events.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    dict or str
        For events listed in SHOT_EVENTS, the dictionary produced by parse_row_desc_components
        from the description split at ', '. For all other events, the description text itself.

    '''
    event_code = parse_row_event(row)
    # Text fragments from nested tags are joined with ', ' so they split like the commas
    # already present in the description.
    description = list(row.children)[11].get_text(separator=', ')

    if event_code not in SHOT_EVENTS:
        return description
    return parse_row_desc_components(description.split(', '), event_code)


def parse_on_ice_pos(row, home):
    '''
    Counts the positions of the players on the ice for one of the teams.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.
    home : bool
        If True, the home team is read. Otherwise the away team is read.

    Returns
    -------
    Counter
        Number of players on the ice per position. Example: two centers, a left winger, two
        defense and a goaltender give Counter({'C': 2, 'L': 1, 'D': 2, 'G': 1}). None is
        returned when the team's cell contains no table.

    '''
    # The visiting team is the row's child at position 13, the home team the one at 15.
    team_cell = list(row.children)[13 + 2 * home]
    roster_table = team_cell.find('table')
    if roster_table is None:
        return None

    positions = Counter()
    # Each player has a nested table; the position is the text of its second cell.
    for player_table in roster_table.find_all('table'):
        positions[player_table.find_all('td')[1].get_text()] += 1
    return positions

def parse_penalty_shot(row):
    '''
    Tells whether an event was a penalty shot.

    Parameters
    ----------
    row : BeautifulSoup
        A single event row of the html report.

    Returns
    -------
    bool
        True if the description contains the term 'Penalty Shot', False otherwise.

    '''
    # Non-breaking spaces are normalised first so the two-word term can be found.
    description = list(row.children)[11].get_text().replace('\xa0',' ')
    return re.search('Penalty Shot', description) is not None
 
    
def parse_game_html_report(report):
    '''
    Turns the play-by-play rows of a game html report into a Pandas data frame.

    Parameters
    ----------
    report : BeautifulSoup
        BeautifulSoup object for the html report page.

    Returns
    -------
    Pandas DataFrame
        One row per event with the columns 'idx', 'period', 'strength', 'time_elapsed',
        'event', 'desc', 'is_penalty_shot', 'pos_a' and 'pos_h'.

    '''
    # Event rows are the table rows whose class contains 'evenColor' or 'oddColor'.
    play_rows = report.find_all('tr', class_ = re.compile("(evenColor|oddColor)"))

    # Row children are
    #   1: Index
    #   3: Period (In regular season, OT is 4 and SO is 5)
    #   5: Strength (Even strength = EV, Power play = PP, Short-handed = SH)
    #   7: Time elapsed / Time remaining
    #   9: Event type
    #  11: Event detailed description
    #  13: Visiting/away players on ice / jersey numbers and positions
    #  15: Home players on ice / jersey numbers and positions
    # Each output column is produced by applying one parser to every row, in this order.
    column_parsers = {
        'idx': parse_row_index,
        'period': parse_row_period,
        'strength': parse_row_strength,
        'time_elapsed': lambda row: parse_row_time(row, True),
        'event': parse_row_event,
        'desc': parse_row_desc,
        'is_penalty_shot': parse_penalty_shot,
        # Positions for players on ice, away and home respectively.
        'pos_a': lambda row: parse_on_ice_pos(row, False),
        'pos_h': lambda row: parse_on_ice_pos(row, True),
    }
    return pd.DataFrame({
        column: [parser(row) for row in play_rows]
        for column, parser in column_parsers.items()
    })

def process_parsed_report(frame):
    '''
    Performs post-parsing processing of the html report data frame.

    Parameters
    ----------
    frame : Pandas DataFrame
        Data frame that has been generated by parsing the html report for a game. It must
        contain the columns 'event' and 'desc', where 'desc' holds a dictionary with the keys
        'shot_dist', 'event_zone', 'miss_type' and 'shot_type' for every shot event. The
        input frame is not modified.

    Returns
    -------
    Pandas DataFrame
        A new data frame restricted to the events in SHOT_EVENTS, with the additional columns
        'shot_dist', 'event_zone', 'miss_type' and 'shot_type' and with the column 'desc'
        removed.

    '''
    desc_fields = ['shot_dist', 'event_zone', 'miss_type', 'shot_type']

    # Reduce the table to only shot events. Working on a copy keeps the caller's frame intact.
    shots = frame[frame['event'].isin(SHOT_EVENTS)].copy()

    # Expand the description dictionaries into one column per field. The columns are named
    # explicitly so that a game without any shot events still yields all four (empty) columns.
    expanded = pd.DataFrame(list(shots['desc']), index=shots.index, columns=desc_fields)
    for field in desc_fields:
        shots[field] = expanded[field]

    # The description is only dropped once its contents have been extracted.
    return shots.drop(columns='desc')
    
def construct_game_html_report_frame(live_feed_link, refresh = False):
    '''
    Builds the shot-event data frame of a game from its html report.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.
    refresh : bool, optional
        Passed on to get_game_html_report: if True, the raw report is downloaded again instead
        of being read from the local cache. The default is False.

    Returns
    -------
    Pandas data frame
        The parsed and processed report with an added 'game_id' column, or None when the html
        report is unavailable.

    '''
    report_soup = get_game_html_report(live_feed_link, refresh)
    if report_soup is not None:
        parsed = parse_game_html_report(report_soup)
        parsed['game_id'] = extract_id_from_live_feed_link(live_feed_link)
        return process_parsed_report(parsed)
    return None
    
def read_game_html_report_frame(live_feed_link):
    '''
    Loads the locally saved html report data frame for a game, if there is one.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.

    Returns
    -------
    Pandas data frame
        The unpickled html report data frame, or None when no saved file exists.

    '''
    pickle_path = get_game_html_report_frame_file(live_feed_link)
    if not pickle_path.exists():
        return None
    logging.info('Reading html frame ' + live_feed_link)
    return pd.read_pickle(str(pickle_path))
    
def get_game_html_report_frame(live_feed_link, refresh = False, refresh_frame=False):
    '''
    Returns the html report data frame for a game, building and saving it when necessary.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.
    refresh : bool, optional
        If True, the saved frame is ignored and the raw report is downloaded again before the
        frame is rebuilt. Existing saves are overwritten. The default is False.
    refresh_frame : bool, optional
        If True, the saved frame is ignored and rebuilt, but a locally saved raw report is
        still used. Makes no difference if refresh is True. The default is False.

    Returns
    -------
    Pandas data frame
        Data frame representing the html report data, or None if it could not be built.

    '''
    skip_saved = refresh | refresh_frame
    saved = None if skip_saved else read_game_html_report_frame(live_feed_link)
    if saved is not None:
        return saved

    built = construct_game_html_report_frame(live_feed_link, refresh)
    if built is not None:
        target = get_game_html_report_frame_file(live_feed_link)
        # Create the destination folder on first use, then save the frame.
        target.parent.resolve().mkdir(parents=True, exist_ok=True)
        built.to_pickle(str(target))
    return built

def combine_frames(live_feed_frame, html_report_frame):
    '''
    Outer-joins the live feed frame and the html report frame of a game.

    Events are matched on period, elapsed time and event type. Columns that occur in both
    frames without being join keys get the suffixes '_livefeed' and '_htmlreport'.
    '''
    join_keys = ['period', 'time_elapsed', 'event']
    return pd.merge(live_feed_frame, html_report_frame, how='outer', on=join_keys,
                    suffixes=['_livefeed', '_htmlreport'])
 
# def add_metadate(combined_frame):
#     pass

def process_combined_frame(combined_frame):
    '''
    Cleans and standardizes the merged live feed / html report frame of a game.

    The processing steps are, in order:
        1. Keep only rows whose 'is_penalty_shot' equals False and whose 'period_ord' is not
           'SO', then drop the 'is_penalty_shot' column.
        2. For 'BLOCK' events, flip 'event_team_is_home', 'event_team_code', 'event_zone' and
           'strength' so they describe the shooting team instead of the blocking team.
        3. Rotate the coordinates so that every shot is charted as attacking the end with
           positive x-coordinates.
        4. Add 'calc_dist', the distance from the event coordinates to the point (89, 0), and
           'dist_difference', its absolute difference to 'shot_dist'.

    Parameters
    ----------
    combined_frame : Pandas DataFrame
        Merged frame. The columns read here are 'is_penalty_shot', 'period_ord', 'period',
        'event', 'event_team_is_home', 'event_team_code', 'away_code', 'home_code',
        'event_zone', 'strength', 'event_coord_x', 'event_coord_y' and 'shot_dist'.

    Returns
    -------
    combined : Pandas DataFrame
        A processed copy of the input; the input frame itself is not modified.

    '''
    # Dump shootouts and penalty shots.
    # NOTE(review): rows whose 'is_penalty_shot' is missing (NaN) do not compare equal to False
    # and are therefore removed here as well - confirm that this is intended.
    combined = combined_frame[combined_frame['is_penalty_shot']==False].copy()
    combined = combined[combined['period_ord'] != 'SO'].copy()
    # The 'is_penalty_shot' column is no longer needed since penalty shots have been removed from the frame.
    combined.drop(['is_penalty_shot'], axis=1, inplace=True)
        
    # Blocks use the blocking team's viewpoint. This should be swapped to use the shooting team's viewpoint.
    # Coordinates don't need to change in the live feed, but event_team_code, event_team_is_home, strength, and
    # event_zone need to be flipped.
    # Flip event team home flag. This just swaps True and False, however it's important to coerce the type to bool
    # first so that '~' acts as a logical negation.
    combined['event_team_is_home'] = combined['event_team_is_home'].astype(bool)
    combined['event_team_is_home'] = np.where(combined['event']!='BLOCK', 
                                              combined['event_team_is_home'], 
                                              ~combined['event_team_is_home'] )
    # Flip the team code: a block credited to the away team becomes the home team's shot and vice versa.
    away_code =  combined['away_code']
    home_code =  combined['home_code']
    combined['event_team_code'] = np.where(combined['event']!='BLOCK', 
                                         combined['event_team_code'], 
                                         np.where(combined['event_team_code']==away_code, home_code, away_code) )
    # Flip zones. Here, it's a simple swap of offensive and defensive zones, with neutral zone left unchanged
    block_zone_swap = { 'Def. Zone': 'Off. Zone', 
                       'Off. Zone': 'Def. Zone'}
    combined['event_zone'] = np.where(combined['event']!='BLOCK', 
                                      combined['event_zone'], 
                                      combined['event_zone'].replace(block_zone_swap) )
    # Strengths are similar to zones. Here, 'EV' (even strength) is left alone, but 'PP' and 'SH' (power-play and 
    # short-handed) are swapped.
    block_strength_swap = { 'PP': 'SH', 
                           'SH': 'PP' }
    combined['strength'] = np.where(combined['event']!='BLOCK', 
                                      combined['strength'], 
                                      combined['strength'].replace(block_strength_swap) )    
    
    # Standardize coordinate directions.
    # Roughly half the shots are charted on each half of the ice. Flip the shots where the offensive zone has
    # negative x-coordinates.
    # Since it is still possible for a shot to be taken from a location with negative x-coordinate, the home end
    # of the ice is determined first. Teams switch ends of the ice each period. Grouping the signs of shot coordinates
    # by period and home/away allows determination of which end of the ice is being attacked.
    
    # A single event votes for "home attacks the positive end" when it is a home event at x >= 0 or an away event
    # at x <= 0. The flag is stored as a float so that it can be averaged per period.
    combined['home_attacks_positive'] = ((combined['event_coord_x'] >= 0) & combined['event_team_is_home']) | \
                                        ((combined['event_coord_x'] <= 0) & ~combined['event_team_is_home'])
    combined['home_attacks_positive'] = combined['home_attacks_positive'].astype(float)      
                           
    # In most cases, the mean will be within 0.1 of either end. It is reasonable to assume that any period with mean
    # greater than 0.5 indicates that the home attack end has positive x-coordinates. Additionally, any period with
    # mean less than 0.5 indicates that the home attack end has negative x-coordinates. These periods will be rotated
    # 180 degrees to standardize.
    # The per-period mean is broadcast back onto every event of that period.
    combined['home_end_correct'] = combined.groupby('period')[['home_attacks_positive']].transform('mean')
    # Only coordinates will change. In this case, since the intent is to rotate, x-coordinates and y-coordinates
    # are both negated for periods when the home team is attacking the negative x-coordinate end.
    
    combined['event_coord_x'] = np.where(combined['home_end_correct'] >= 0.5, 
                                         combined['event_coord_x'], 
                                         -combined['event_coord_x'])
    combined['event_coord_y'] = np.where(combined['home_end_correct'] >= 0.5, 
                                         combined['event_coord_y'], 
                                         -combined['event_coord_y'])
    
    # Now, since home is always attacking a single end, rotate the visiting team shots.
    combined['event_coord_x'] = np.where(combined['event_team_is_home'] == True, 
                                         combined['event_coord_x'], 
                                         -combined['event_coord_x'])
    combined['event_coord_y'] = np.where(combined['event_team_is_home'] == True, 
                                         combined['event_coord_y'], 
                                         -combined['event_coord_y'])
    
    # The new columns have served their purpose and can be removed.
    combined.drop(['home_end_correct', 'home_attacks_positive'], axis=1, inplace=True)
        
    # Calculate shot distance from coordinates.
    # The distance is calculated to the center (y=0) of the goal line. The goal line is 11 feet from the 
    # end boards, which are 100 feet from center ice, giving x=89.
    
    combined['calc_dist'] = np.sqrt((combined['event_coord_x']-89)**2 + combined['event_coord_y']**2)
    # Absolute disagreement between the calculated distance and the 'shot_dist' column.
    combined['dist_difference'] = np.abs(combined['calc_dist'] - combined['shot_dist'])
    return combined

def construct_combined_frame(live_feed_frame, html_report_frame):
    '''
    Merges the live feed frame with the html report frame and processes the result.

    Returns the processed frame with a fresh 0-based index.
    '''
    merged = combine_frames(live_feed_frame, html_report_frame)
    return process_combined_frame(merged).reset_index(drop=True)

def get_game_combined_frame_file(live_feed_link):
    '''
    Builds the path of the local file holding the combined data frame of a game.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.

    Returns
    -------
    pathlib.Path
        Path, relative to the current working directory, of the pickled combined frame. The
        file is not required to exist; the path can also be used as a save location.

    '''
    # The file name carries the game id in the live feed formatting for compactness.
    file_name = 'combined_' + extract_id_from_live_feed_link(live_feed_link) + '.pkl'
    return Path.cwd().joinpath(GAME_FRAME_FOLDER + file_name)
    
def read_game_combined_frame(live_feed_link):
    '''
    Loads the locally saved combined data frame for a game, if there is one.

    Parameters
    ----------
    live_feed_link : str
        The live feed link identifying the game. Example: '/api/v1/game/2018020240/feed/live'
        for the game in the 2018-2019 season with id 020240. See the documentation for
        get_game_feed_links for more information.

    Returns
    -------
    Pandas data frame
        Data frame combining the live feed and the html report data, or None when no saved
        file exists.

    '''
    pickle_path = get_game_combined_frame_file(live_feed_link)
    if not pickle_path.exists():
        return None
    logging.info('Reading combined data frame for ' + extract_id_from_live_feed_link(live_feed_link))
    return pd.read_pickle(str(pickle_path))
    
def get_game_combined_frame(live_feed_link, refresh_combine=False, refresh_all=False, refresh_feed=False, 
                            refresh_feed_frame=False, refresh_html=False, refresh_html_frame=False):
    '''
    Obtains the combined data frame for the game corresponding to live_feed_link.

    Parameters
    ----------
    live_feed_link : str
        The live feed link of the game for the frame. Example: '/api/v1/game/2018020240/feed/live' 
        for the game in the 2018-2019 season with id 020240. See the documentation for get_game_feed_links
        for more information.
    refresh_combine : bool, optional
        Re-combine the frames but otherwise use any locally-saved files. Has no effect if any other option is set.
        The default is False.
    refresh_all : bool, optional
        Forces re-downloading of the live feed and the html report. This option causes all other options to be ignored
        if set. Existing files will be overwritten. The default is False.
    refresh_feed : bool, optional
        Forces re-downloading of the live feed, but not the html report. Existing live feed files will be overwritten.
        This option is ignored if refresh_all is set. The default is False.
    refresh_feed_frame : bool, optional
        Forces re-creation of the live feed frame without re-downloading existing raw files. This option is ignored if
        refresh_feed is set. The default is False.
    refresh_html : bool, optional
        Forces re-downloading of the html report, but not the live feed. Existing html report files will be overwritten.
        This option is ignored if refresh_all is set. The default is False.
    refresh_html_frame : bool, optional
        Forces re-creation of the html report frame without re-downloading existing raw files. This option is ignored if
        refresh_html is set. The default is False.

    Returns
    -------
    Pandas data frame
        Data frame combining the live feed and the html report data. Returns None if either of the two
        constituent frames is unavailable, in which case nothing is saved.

    '''
    # If any of the refresh options are true, the combined local file shouldn't be read as it will need to be 
    # recreated.
    refresh_any = any((refresh_combine, refresh_all, refresh_feed, refresh_feed_frame, refresh_html,
                       refresh_html_frame))
    if not refresh_any:
        read_from_file = read_game_combined_frame(live_feed_link)
        if read_from_file is not None:
            return read_from_file

    # There are multiple reasons the file may need to be recreated. In the event of refresh_combine, the 
    # constituent frames can simply be read. For refresh_all, everything needs to be re-created.
    # Pass refresh states onto the individual loading functions, with refresh_all overriding everything else if true.
    feed_frame = get_game_live_feed_frame(live_feed_link, refresh_all or refresh_feed,
                                          refresh_all or refresh_feed_frame)
    html_frame = get_game_html_report_frame(live_feed_link, refresh_all or refresh_html,
                                            refresh_all or refresh_html_frame)

    # A missing constituent frame (e.g. the html report could not be downloaded) cannot be merged. Report it
    # and return None instead of letting the merge fail.
    if feed_frame is None or html_frame is None:
        logging.error('Cannot build combined frame for ' + live_feed_link + ': a constituent frame is missing')
        return None

    combined_frame = construct_combined_frame(feed_frame, html_frame)

    # Make sure that the folder exists, then save the combined frame.
    combined_frame_file = get_game_combined_frame_file(live_feed_link)
    combined_frame_file.parent.resolve().mkdir(parents=True, exist_ok=True)
    combined_frame.to_pickle(str(combined_frame_file))
    return combined_frame

def retrieve_all(link_list, refresh=False, refresh_feed=False, refresh_html=False):
    '''
    Fetches the live feed and the html report for every game in link_list.

    refresh forces both to be fetched again; refresh_feed and refresh_html do so for the live
    feed or the html report only. The fetched data is not returned.
    '''
    feed_refresh = refresh | refresh_feed
    html_refresh = refresh | refresh_html
    for link in link_list:
        get_live_feed(link, feed_refresh)
        get_game_html_report(link, html_refresh)
 
def get_game_combined_frame_from_local(live_feed_link, refresh_combine=False, refresh_all=False, refresh_feed=False, 
                                       refresh_html=False):
    '''
    Obtains the combined data frame for a game without requesting a re-download of raw data.

    The refresh options are translated into frame-level refreshes only: refresh_all or
    refresh_feed rebuilds the live feed frame, refresh_all or refresh_html rebuilds the html
    report frame. refresh_combine is passed through unchanged.
    '''
    return get_game_combined_frame(
        live_feed_link,
        refresh_combine=refresh_combine,
        refresh_feed_frame=refresh_all | refresh_feed,
        refresh_html_frame=refresh_all | refresh_html,
    )

def check_live_feeds(live_feed_links):
    '''
    Finds the live feed links whose feeds contain no plays.

    Parameters
    ----------
    live_feed_links : iterable
        Live feed links to check. Each one is loaded with get_live_feed.

    Returns
    -------
    list
        The links, in their original order, whose feed has an empty
        ['liveData']['plays']['allPlays'] entry.

    '''
    def has_no_plays(link):
        plays = get_live_feed(link)['liveData']['plays']['allPlays']
        return len(plays) == 0

    return [link for link in live_feed_links if has_no_plays(link)]



'''
Things to include yet:
    Add some validation for offensive/defensive zones. There are a lot of shots of length over 100 that are appearing
    and these are generally suspect.
    
    DONE: Flip blocked shots zones to be from viewpoint of shooter.
    DONE: Standardize locations to all use positive x-coords for goals.
    Estimate location of blocked shot.
    
    Venue correction for locations.
    
'''

    




'\nThings to include yet:\n    Add some validation for offensive/defensive zones. There are a lot of shots of length over 100 that are appearing\n    and these are generally suspect.\n    \n    DONE: Flip blocked shots zones to be from viewpoint of shooter.\n    DONE: Standardize locations to all use positive x-coords for goals.\n    Estimate location of blocked shot.\n    \n    Venue correction for locations.\n    \n'

In [5]:
# Games whose play-by-play is broken; these links are listed by hand.
links_broken = ['/api/v1/game/2010020124/feed/live', '/api/v1/game/2013020971/feed/live']
# Every game link for the seasons in SEASON_LIST, and the subset whose live feed contains no plays.
game_links = get_game_feed_links(SEASON_LIST)
links_missing = check_live_feeds(game_links)
# Every link that should be left out when the game frames are loaded.
bad_links = [*links_missing, *links_broken]

In [6]:
# Number of game feed links obtained for the seasons in SEASON_LIST.
len(game_links)

12646

In [7]:
# Number of excluded links: feeds with no plays plus the hand-listed broken games.
len(bad_links)

87

In [8]:
# Load the combined frame of every game that is not excluded, then stack them into a single frame.
usable_links = [link for link in game_links if link not in bad_links]
combined_frame_list = list(map(get_game_combined_frame_from_local, usable_links))
shot_frame = pd.concat(combined_frame_list)

In [9]:
# Summarize the columns, non-null counts and dtypes of the stacked frame.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428368 entries, 0 to 111
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428356 non-null  object 
 1   season              1428356 non-null  object 
 2   type                1428356 non-null  object 
 3   game_time           1428356 non-null  object 
 4   away_code           1428356 non-null  object 
 5   home_code           1428356 non-null  object 
 6   venue               1428356 non-null  object 
 7   venue_id            947804 non-null   object 
 8   period              1428368 non-null  int64  
 9   period_ord          1428356 non-null  object 
 10  period_type         1428356 non-null  object 
 11  time_elapsed        1428368 non-null  object 
 12  cum_time_elapsed    1428356 non-null  float64
 13  event               1428368 non-null  object 
 14  event_team_code     1428356 non-null  object 
 15  event_team_is_home 

In [10]:
# Show the rows that have no live-feed game id.
# NOTE(review): presumably HTML-report events with no matching live-feed event - confirm.
shot_frame[shot_frame.game_id_livefeed.isna()]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,venue_id,period,period_ord,...,desc,pos_a,pos_h,game_id_htmlreport,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
105,,,,,,,,,3,,...,"{'shot_dist': None, 'event_zone': 'Def. Zone',...","{'C': 2, 'L': 1, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'D': 2, 'G': 1}",2014020600,,Off. Zone,Block,Wrist,,
133,,,,,,,,,3,,...,"{'shot_dist': 36, 'event_zone': 'Off. Zone', '...","{'C': 2, 'D': 3, 'G': 1}","{'C': 1, 'L': 1, 'D': 2, 'G': 1}",2014020859,36.0,Off. Zone,Save,Wrist,,
123,,,,,,,,,3,,...,"{'shot_dist': 23, 'event_zone': 'Off. Zone', '...","{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",2014021127,23.0,Off. Zone,Save,Wrist,,
124,,,,,,,,,3,,...,"{'shot_dist': 52, 'event_zone': 'Off. Zone', '...","{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",2014021127,52.0,Off. Zone,Save,Wrist,,
125,,,,,,,,,3,,...,"{'shot_dist': 18, 'event_zone': 'Off. Zone', '...","{'L': 2, 'D': 2, 'G': 1}","{'C': 2, 'R': 1, 'L': 1, 'D': 2}",2014021127,18.0,Off. Zone,Save,Wrist,,
94,,,,,,,,,3,,...,"{'shot_dist': 40, 'event_zone': 'Off. Zone', '...","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",2016020177,40.0,Off. Zone,Wide of Net,Wrist,,
114,,,,,,,,,1,,...,"{'shot_dist': None, 'event_zone': 'Def. Zone',...","{'C': 2, 'L': 1, 'D': 2, 'G': 1}","{'C': 1, 'R': 2, 'D': 2, 'G': 1}",2019020779,,Off. Zone,Block,Wrist,,
115,,,,,,,,,1,,...,"{'shot_dist': 11, 'event_zone': 'Off. Zone', '...","{'C': 1, 'R': 1, 'L': 2, 'D': 1, 'G': 1}","{'C': 1, 'R': 1, 'D': 2, 'G': 1}",2019020779,11.0,Off. Zone,Save,Backhand,,
116,,,,,,,,,1,,...,"{'shot_dist': 21, 'event_zone': 'Off. Zone', '...","{'C': 2, 'L': 2, 'D': 1, 'G': 1}","{'C': 1, 'L': 1, 'D': 2, 'G': 1}",2019020779,21.0,Off. Zone,Save,Wrist,,
117,,,,,,,,,2,,...,"{'shot_dist': 58, 'event_zone': 'Off. Zone', '...","{'C': 1, 'R': 2, 'D': 2, 'G': 1}","{'C': 1, 'R': 1, 'L': 1, 'D': 2, 'G': 1}",2019020779,58.0,Off. Zone,Save,Wrist,,


In [14]:
# Drop, in place, the rows that have no live-feed game id.
shot_frame.dropna(subset=['game_id_livefeed'], inplace=True)

In [15]:
# Re-check the row and non-null counts after dropping rows without a live-feed game id.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428356 entries, 0 to 111
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428356 non-null  object 
 1   season              1428356 non-null  object 
 2   type                1428356 non-null  object 
 3   game_time           1428356 non-null  object 
 4   away_code           1428356 non-null  object 
 5   home_code           1428356 non-null  object 
 6   venue               1428356 non-null  object 
 7   venue_id            947804 non-null   object 
 8   period              1428356 non-null  int64  
 9   period_ord          1428356 non-null  object 
 10  period_type         1428356 non-null  object 
 11  time_elapsed        1428356 non-null  object 
 12  cum_time_elapsed    1428356 non-null  float64
 13  event               1428356 non-null  object 
 14  event_team_code     1428356 non-null  object 
 15  event_team_is_home 

In [16]:
# Drop, in place, the rows that are missing either event coordinate.
shot_frame.dropna(subset=['event_coord_x', 'event_coord_y'], inplace=True)

In [17]:
# Re-check the row and non-null counts after dropping rows without coordinates.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428328 entries, 0 to 111
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428328 non-null  object 
 1   season              1428328 non-null  object 
 2   type                1428328 non-null  object 
 3   game_time           1428328 non-null  object 
 4   away_code           1428328 non-null  object 
 5   home_code           1428328 non-null  object 
 6   venue               1428328 non-null  object 
 7   venue_id            947784 non-null   object 
 8   period              1428328 non-null  int64  
 9   period_ord          1428328 non-null  object 
 10  period_type         1428328 non-null  object 
 11  time_elapsed        1428328 non-null  object 
 12  cum_time_elapsed    1428328 non-null  float64
 13  event               1428328 non-null  object 
 14  event_team_code     1428328 non-null  object 
 15  event_team_is_home 

In [18]:
# Show the rows that have no value in the pos_a column.
shot_frame[shot_frame['pos_a'].isna()]

Unnamed: 0,game_id_livefeed,season,type,game_time,away_code,home_code,venue,venue_id,period,period_ord,...,desc,pos_a,pos_h,game_id_htmlreport,shot_dist,event_zone,miss_type,shot_type,calc_dist,dist_difference
0,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,5046,1,1st,...,"{'shot_dist': 34, 'event_zone': 'Off. Zone', '...",,,2019020876,34.0,Off. Zone,,,34.928498,0.928498
1,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,5046,1,1st,...,"{'shot_dist': 41, 'event_zone': 'Off. Zone', '...",,,2019020876,41.0,Off. Zone,,,34.928498,6.071502
2,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,5046,1,1st,...,"{'shot_dist': 34, 'event_zone': 'Off. Zone', '...",,,2019020876,34.0,Off. Zone,,,41.231056,7.231056
3,2019020876,20192020,R,2020-03-12T02:00:00Z,STL,ANA,Honda Center,5046,1,1st,...,"{'shot_dist': 41, 'event_zone': 'Off. Zone', '...",,,2019020876,41.0,Off. Zone,,,41.231056,0.231056


In [19]:
# Drop, in place, the rows that are missing pos_a or pos_h.
shot_frame.dropna(subset=['pos_a', 'pos_h'], inplace=True)

In [20]:
# Re-check the row and non-null counts after dropping rows without pos_a/pos_h.
shot_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1428324 entries, 0 to 111
Data columns (total 32 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   game_id_livefeed    1428324 non-null  object 
 1   season              1428324 non-null  object 
 2   type                1428324 non-null  object 
 3   game_time           1428324 non-null  object 
 4   away_code           1428324 non-null  object 
 5   home_code           1428324 non-null  object 
 6   venue               1428324 non-null  object 
 7   venue_id            947780 non-null   object 
 8   period              1428324 non-null  int64  
 9   period_ord          1428324 non-null  object 
 10  period_type         1428324 non-null  object 
 11  time_elapsed        1428324 non-null  object 
 12  cum_time_elapsed    1428324 non-null  float64
 13  event               1428324 non-null  object 
 14  event_team_code     1428324 non-null  object 
 15  event_team_is_home 

In [None]:
# Angle of each shot in degrees, measured at the midpoint of the goal line (89, 0) from the
# unadjusted event coordinates. The coordinates are read from shot_frame, the frame built above;
# the previous reference to `full_frame` pointed at a name that is never defined.
shot_frame['calc_angle'] = np.arctan2(shot_frame['event_coord_y'], 89 - shot_frame['event_coord_x']) * 180 / np.pi
shot_frame['calc_angle'].describe()

The shot-location data is well known to have an arena bias (see, e.g., Schuckers and Curro at http://statsportsconsulting.com/main/wp-content/uploads/Schuckers_Curro_MIT_Sloan_THoR.pdf). Schuckers and Curro provide a method for adjusting coordinates to account for this bias. This adjustment has been made to the shot data.

### Coordinate-adjustment Procedure
Each coordinate is adjusted independently. For a coordinate, the empirical distribution functions $F$ and $F_A$ are calculated, where $F$ is the CDF of all shot coordinates and $F_A$ is the CDF looking at all shot coordinates where the shot was taken by the visiting team. There are a handful of neutral-site venues. In these games, the away-team was chosen to be the team determined by the NHL to be the visiting team for that game.
In a similar fashion, empirical distribution functions $F_V$ and $F_{VA}$ are calculated for each venue independently. Here, $F_V$ is the CDF for all shots at that venue and $F_{VA}$ is the CDF for all shots by the visiting team at that venue.
Using these ECDFs, an adjusted quantile $q$ is calculated for each coordinate $c$ as $q(c) = F_V(c) - (F_{VA}(c) - F_A(c))$. This quantile is clamped to the interval $[0,1]$ by taking $q'(c) = \min(\max(q(c), 0), 1)$. Finally, the adjusted coordinate $c'$ is calculated as $c' = F^{-1}(q'(c))$, where $F^{-1}$ is the empirical quantile function of all shot coordinates. The adjusted coordinates are stored in the columns `adj_x` and `adj_y`.

In [None]:
def _coord_ecdfs(shots):
    '''Builds the ECDFs of the x- and y-coordinates of the shots in a frame, keyed by 'x' and 'y'.'''
    return {axis: ECDF(list(shots['event_coord_' + axis])) for axis in ('x', 'y')}


# Per-venue ECDFs: venue_all uses every shot at the venue, venue_away only the shots
# where event_team_is_home is False.
venue_list = list(shot_frame['venue'].unique())
venue_all = {}
venue_away = {}
for venue in venue_list:
    venue_shots = shot_frame[shot_frame['venue']==venue]
    venue_all[venue] = _coord_ecdfs(venue_shots)
    venue_away_shots = venue_shots[venue_shots['event_team_is_home']==False]
    venue_away[venue] = _coord_ecdfs(venue_away_shots)

# ECDFs over the away-team shots of all venues together, stored under the key 'all'.
away_shots = shot_frame[shot_frame['event_team_is_home']==False]
venue_away['all'] = _coord_ecdfs(away_shots)

In [None]:
def calc_adjusted(row, coord):
    '''
    Calculates the venue-adjusted value of one coordinate of a shot.

    The adjusted quantile is q = F_V(c) - (F_VA(c) - F_A(c)), where c is the shot's coordinate,
    F_V and F_VA are the ECDFs stored in venue_all and venue_away for the shot's venue, and F_A is
    the ECDF stored in venue_away['all']. q is clamped to [0, 1] and mapped back to a coordinate
    through the quantiles of the matching column of shot_frame.

    Parameters
    ----------
    row : pandas.Series or mapping
        A shot. Must provide 'venue' and the coordinate column being adjusted.
    coord : str
        Which coordinate to adjust: 'x' or 'y'.

    Returns
    -------
    The quantile of shot_frame['event_coord_<coord>'] at the clamped q, taken with
    interpolation='lower'.

    Raises
    ------
    ValueError
        If coord is neither 'x' nor 'y'.

    '''
    if coord not in ('x', 'y'):
        raise ValueError("coord must be 'x' or 'y', got " + repr(coord))
    coord_name = 'event_coord_' + coord

    venue = row['venue']
    value = row[coord_name]

    f_v = venue_all[venue][coord](value)
    f_va = venue_away[venue][coord](value)
    f_a = venue_away['all'][coord](value)

    # Use the values computed just above; the previous expression read f_r and f_ra,
    # which are never assigned.
    arg = f_v - (f_va - f_a)

    # Clamp to a valid quantile.
    if arg < 0:
        arg = 0
    elif arg > 1:
        arg = 1

    # NOTE: this takes a quantile of the full column on every call, which is costly
    # when applied row by row to a large frame.
    return shot_frame[coord_name].quantile(arg, interpolation='lower')

In [None]:
# Adjust each coordinate of every shot row by row; the results go in the columns adj_x and adj_y.
for axis_name in ('x', 'y'):
    shot_frame['adj_' + axis_name] = shot_frame.apply(calc_adjusted, axis=1, args=(axis_name,))

Using the adjusted coordinates, calculate the angle of the shot and the distance of the shot. These are determined by using the line segment starting at the midpoint of the goal line and ending at the adjusted shot location. The goal line corresponds to the line $x = 89$, indicating that the midpoint of the goal line is at coordinates $(89, 0)$. The adjusted distance `adj_dist` is simply the length of this segment. The adjusted angle `adj_angle` is the angle between this segment and the ray from the midpoint of the goal line passing through center ice (coordinates $(0,0)$).

In [None]:
# Distance and angle (in degrees) of each shot from the midpoint of the goal line at (89, 0),
# using the adjusted coordinates. Both are computed from shot_frame, the frame that holds
# adj_x and adj_y; the previous references to `combined` and `full_frame` pointed at names
# that are never defined.
shot_frame['adj_dist'] = np.sqrt((shot_frame['adj_x'] - 89)**2 + shot_frame['adj_y']**2)
shot_frame['adj_angle'] = np.arctan2(shot_frame['adj_y'], 89 - shot_frame['adj_x']) * 180 / np.pi
shot_frame['adj_angle'].describe()