In [36]:
import requests
from pyspark import SparkContext
from pyspark.sql import SQLContext
from itertools import chain, starmap
from datetime import datetime, timedelta

# Stop any SparkContext left over from a previous run of this cell so a new
# one can be created. On a fresh kernel `sc` does not exist yet, so a bare
# `sc.stop()` would raise NameError and break Restart & Run All.
try:
    sc.stop()
except NameError:
    pass

# The constructor already registers and returns the active context, so the
# extra `.getOrCreate()` call on the instance was redundant.
sc = SparkContext("local", "NHL Events")
sqlContext = SQLContext(sc)

# Query the schedule endpoint for yesterday's date (kernel's local clock).
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
r = requests.get(f"https://statsapi.web.nhl.com/api/v1/schedule/?date={yesterday}")
r.raise_for_status()  # fail here rather than with a confusing KeyError below
data = r.json()

total_games = data['totalGames']
dates = data['dates']

# Collect the gamePk of every game listed under every returned date.
game_ids = [game['gamePk'] for item in dates for game in item['games']]



[2019020866, 2019020867, 2019020868, 2019020869, 2019020870, 2019020871, 2019020872, 2019020873, 2019020874, 2019020875, 2019020876]


In [3]:
from itertools import chain, starmap

def flatten_json_iterative_solution(dictionary):
    """Collapse a nested JSON-like dict into a single-level dict.

    Keys of nested dicts are appended to their parent key with ``_``; list
    items are keyed by the parent key followed by ``_<index>``. Empty dicts
    and empty lists produce no entries, so their keys are absent from the
    result. A new dict is always returned; the input is not modified.
    """

    def expand(prefix, node):
        """Yield ``(key, value)`` pairs for ``node`` opened up by ONE level."""
        if isinstance(node, dict):
            yield from ((prefix + '_' + name, child) for name, child in node.items())
        elif isinstance(node, list):
            yield from ((prefix + '_' + str(pos), child) for pos, child in enumerate(node))
        else:
            # Atomic value: pass it through under its current key.
            yield prefix, node

    flat = dictionary
    still_nested = True
    # Re-expand every entry until no value is a dict or list any more.
    while still_nested:
        flat = dict(chain.from_iterable(starmap(expand, flat.items())))
        still_nested = any(isinstance(item, (dict, list)) for item in flat.values())

    return flat

In [75]:
# Fetch the live feed of every scheduled game and reduce each one to a small
# summary dict (ids plus home/away box-score totals). The intermediate
# bindings below are kept at cell level so they stay inspectable afterwards;
# after the loop they hold the values of the LAST game only.
enriched_games = list()
for game_id in game_ids:
    r = requests.get(f'https://statsapi.web.nhl.com/api/v1/game/{game_id}/feed/live')
    r.raise_for_status()  # surface HTTP failures instead of a KeyError further down
    data = r.json()

    game_id = data['gamePk']
    api_link = data['link']
    timestamp = data['metaData']['timeStamp']
    gameData = data['gameData']
    liveData = data['liveData']

    game = gameData['game']
    pk = game['pk']
    season = game['season']
    game_type = game['type']

    # Named `game_datetime` (not `datetime`) so the `datetime` class imported
    # in the first cell is not shadowed; shadowing it made that cell fail
    # when re-run after this one.
    game_datetime = gameData['datetime']
    start_time = game_datetime['dateTime']
    # TODO: the end time (game_datetime['endDateTime']) is not read yet.

    status = gameData['status']
    end_status = status['abstractGameState']

    teams = gameData['teams']
    away = teams['away']
    home = teams['home']

    players = gameData['players']

    venue = gameData['venue']
    venue_name = venue['name']
    venue_link = venue['link']

    # PLAYS NOT DONE
    plays = liveData['plays']
    all_plays = plays['allPlays']
    scoring_plays = plays['scoringPlays']
    penalty_plays = plays['penaltyPlays']
    plays_by_period = plays['playsByPeriod']
    # TODO (not extracted yet): plays['currentPlay'] and its
    # result / about / coordinates entries.

    # `linescore` must be bound here: every lookup below reads it. With this
    # assignment commented out the cell only ran when a stale `linescore`
    # was still in the kernel, and raised NameError on a fresh one.
    linescore = liveData['linescore']
    # TODO (not extracted yet): linescore currentPeriod, currentPeriodOrdinal,
    # currentPeriodTimeRemaining and periods (PERIODS NOT DONE).

    shootout_info = linescore['shootoutInfo']
    shootout_info_away = shootout_info['away']
    shootout_info_home = shootout_info['home']

    linescore_teams = linescore['teams']
    # team, goals, shotsOnGoal, goaliePulled, numSkaters, powerPlay
    linescore_home = linescore_teams['home']
    linescore_away = linescore_teams['away']

    power_play_strength = linescore['powerPlayStrength']
    has_shootout = linescore['hasShootout']
    # intermissionTimeRemaining, intermissionTimeElapsed, inIntermission
    intermission_info = linescore['intermissionInfo']
    # situationTimeRemaining, situationTimeElapsed, inSituation
    powerPlayInfo = linescore['powerPlayInfo']

    boxscore = liveData['boxscore']
    boxscore_teams = boxscore['teams']

    # THESE TWO NOT DONE
    boxscore_home_team = boxscore_teams['home']
    boxscore_home_team_stats = boxscore_home_team['teamStats']['teamSkaterStats']
    boxscore_home_goals = boxscore_home_team_stats['goals']
    boxscore_home_shots = boxscore_home_team_stats['shots']
    boxscore_home_blocked_shots = boxscore_home_team_stats['blocked']
    boxscore_home_pim = boxscore_home_team_stats['pim']
    boxscore_home_power_play_percent = boxscore_home_team_stats['powerPlayPercentage']

    boxscore_away_team = boxscore_teams['away']
    boxscore_away_team_stats = boxscore_away_team['teamStats']['teamSkaterStats']
    boxscore_away_goals = boxscore_away_team_stats['goals']
    boxscore_away_shots = boxscore_away_team_stats['shots']
    boxscore_away_blocked_shots = boxscore_away_team_stats['blocked']
    boxscore_away_pim = boxscore_away_team_stats['pim']
    boxscore_away_power_play_percent = boxscore_away_team_stats['powerPlayPercentage']
    boxscore_officials = boxscore['officials']

    # TODO (not extracted yet): liveData['decisions'] (winner, loser,
    # firstStar, secondStar, thirdStar) and liveData['shotPressure']
    # (gamePressures, skaterAdvantages -> homeTeam / awayTeam).

    # Per-game summary record; this is the only data carried out of the loop
    # via `enriched_games`.
    parsed_data = {
        'game_id': game_id,
        'api_link': api_link,
        'timestamp': timestamp,
        'home_team': {
            'name': home['name'],
            'goals': boxscore_home_goals,
            'shots': boxscore_home_shots,
            'blocked_shots': boxscore_home_blocked_shots,
            'pim': boxscore_home_pim,
            'pp%': boxscore_home_power_play_percent
        },
        'away_team': {
            'name': away['name'],
            'goals': boxscore_away_goals,
            'shots': boxscore_away_shots,
            'blocked_shots': boxscore_away_blocked_shots,
            'pim': boxscore_away_pim,
            'pp%': boxscore_away_power_play_percent
        }
    }

    enriched_games.append(parsed_data)

# NOTE: despite its name, `df` is an RDD of dicts (the result of
# sc.parallelize), not a Spark DataFrame.
df = sc.parallelize(enriched_games)
df.collect()




[{'game_id': 2019020866,
  'api_link': '/api/v1/game/2019020866/feed/live',
  'timestamp': '20200212_163050',
  'home_team': {'name': 'Buffalo Sabres',
   'goals': 3,
   'shots': 25,
   'blocked_shots': 11,
   'pim': 6,
   'pp%': '0.0'},
  'away_team': {'name': 'Detroit Red Wings',
   'goals': 2,
   'shots': 29,
   'blocked_shots': 13,
   'pim': 6,
   'pp%': '0.0'}},
 {'game_id': 2019020867,
  'api_link': '/api/v1/game/2019020867/feed/live',
  'timestamp': '20200212_030738',
  'home_team': {'name': 'Toronto Maple Leafs',
   'goals': 3,
   'shots': 41,
   'blocked_shots': 17,
   'pim': 6,
   'pp%': '0.0'},
  'away_team': {'name': 'Arizona Coyotes',
   'goals': 2,
   'shots': 37,
   'blocked_shots': 17,
   'pim': 6,
   'pp%': '0.0'}},
 {'game_id': 2019020868,
  'api_link': '/api/v1/game/2019020868/feed/live',
  'timestamp': '20200212_053503',
  'home_team': {'name': 'New Jersey Devils',
   'goals': 3,
   'shots': 28,
   'blocked_shots': 18,
   'pim': 4,
   'pp%': '50.0'},
  'away_team': 