<a href="https://colab.research.google.com/github/maddran/NHL-pred/blob/main/NHL_games_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
import pandas as pd
from pandas import json_normalize 
from datetime import datetime
from time import sleep

import requests

def call_nhl(startSeason, endSeason=None, timeout=30):
  """Fetch per-game team summary rows from the NHL stats REST endpoint.

  Args:
    startSeason: Lower bound for the ``seasonId`` filter. It is interpolated
      into the query string as-is (callers in this notebook pass ids such as
      "20142015").
    endSeason: Upper bound for the ``seasonId`` filter. When falsy, the
      request is limited to ``startSeason`` only.
    timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.
      Without a timeout a stalled connection would block the notebook
      indefinitely.

  Returns:
    The ``requests.Response`` object, unmodified. The HTTP status is NOT
    checked here; callers must inspect it themselves.

  Raises:
    requests.exceptions.Timeout: if the server does not answer in time.
  """
  if not endSeason:
    endSeason = startSeason

  # Header set copied from a browser session.
  # NOTE(review): presumably the API expects browser-like origin/referer
  # headers -- confirm which of these are actually required.
  headers = {
      'authority': 'api.nhle.com',
      'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
      'accept': '*/*',
      'origin': 'http://www.nhl.com',
      'sec-fetch-site': 'cross-site',
      'sec-fetch-mode': 'cors',
      'sec-fetch-dest': 'empty',
      'referer': 'http://www.nhl.com/',
      'accept-language': 'en-US,en;q=0.9',
  }

  # Query asks for non-aggregated, per-game rows sorted by gameDate descending,
  # restricted to the requested season range.
  # NOTE(review): gameTypeId=2 and limit=0 are taken verbatim from the site's
  # own request; presumably "regular season" and "no row limit" -- confirm.
  params = (
      ('isAggregate', 'false'),
      ('isGame', 'true'),
      ('sort', '[{"property":"gameDate","direction":"DESC"}]'),
      ('start', '0'),
      ('limit', '0'),
      ('factCayenneExp', 'gamesPlayed>=1'),
      ('cayenneExp', f'gameTypeId=2 and seasonId<={endSeason} and seasonId>={startSeason}'),
  )

  response = requests.get(
      'https://api.nhle.com/stats/rest/en/team/summary',
      headers=headers,
      params=params,
      timeout=timeout,
  )

  return response

def get_gameData(startYear, numSeasons):
  """Download team-game summary rows for consecutive seasons.

  Args:
    startYear: Integer year in which the first season starts (e.g. 2014
      yields the season id "20142015").
    numSeasons: Number of consecutive seasons to fetch.

  Returns:
    Dict mapping each season id string to a DataFrame built from the
    ``data`` list of the JSON response. Seasons whose request failed are
    reported on stdout and left out of the dict.
  """
  seasons = [f"{startYear+i}{startYear+i+1}" for i in range(numSeasons)]

  rows = 0
  res = {}

  for s in seasons:
    response = call_nhl(s)

    # A requests.Response is falsy for 4xx/5xx statuses.
    if not response:
      # Say which season failed and why, instead of a bare "ERROR", so a gap
      # in the returned dict can be diagnosed.
      print(f"ERROR: request for season {s} failed with HTTP status {response.status_code}")
      continue

    # Parse the payload once and reuse the row list.
    games = response.json()['data']
    rows += len(games)
    res[s] = pd.json_normalize(games)
    print(f"Number of games grabbed for {s} = {len(games)}. Total = {rows}")

  return res

def rolling_aggregate(df, window = 3):
  """Build rolling-window and cumulative features for one block of game rows.

  Missing values are replaced by 0 first. The result has:
    * ``rolling_<col>``: rolling sum over ``window`` rows for count-like
      columns, and rolling mean for the per-game rate columns;
    * ``cum_<col>``: running total for a subset of the count-like columns.

  The returned frame is indexed by the ``gameId`` column and its columns are
  in sorted order. Rows are used in the order given, so the caller decides
  the ordering.
  """
  sum_cols = ['gamesPlayed', 'goalsAgainst', 'goalsFor',
              'losses', 'otLosses',
              'points',
              'regulationAndOtWins', 'winsInShootout']
  mean_cols = ['goalsForPerGame', 'goalsAgainstPerGame',
               'shotsForPerGame', 'shotsAgainstPerGame']
  running_cols = ['gamesPlayed', 'points',
                  'goalsFor', 'goalsAgainst']

  filled = df.fillna(0)

  # Windowed statistics: sums and means are computed separately, then joined
  # on the shared row index.
  windowed_sums = filled[sum_cols].rolling(window).sum()
  windowed_means = filled[mean_cols].rolling(window).mean()
  out = windowed_sums.merge(windowed_means, left_index=True, right_index=True)
  out.columns = ["rolling_" + str(name) for name in out.columns]

  # Running totals are added one column at a time under a "cum_" prefix.
  running = filled[running_cols].cumsum()
  for name in running_cols:
    out[f"cum_{name}"] = running[name]

  out.index = filled['gameId']

  ordered = sorted(out.columns)
  return out[ordered]

def home_road(df, teamLU):
  """Summarise one game's rows as a home/road pairing.

  ``df`` holds the rows of a single game. The first row flagged 'H' in
  ``homeRoad`` supplies the home team id and the first row flagged 'R' the
  road team id. ``teamLU`` maps a team id to its display name.

  Returns a Series with entries ``home``, ``road``, ``homeName`` and
  ``roadName`` (in that order).
  """
  home_id = df.loc[df['homeRoad'] == 'H', 'teamId'].values[0]
  road_id = df.loc[df['homeRoad'] == 'R', 'teamId'].values[0]

  matchup = {
      'home': home_id,
      'road': road_id,
      'homeName': teamLU[home_id],
      'roadName': teamLU[road_id],
  }

  return pd.Series(matchup, index=list(matchup))

def process_data(raw_data, window=3):
  """Turn per-season raw frames into sorted data, rolling features and a schedule.

  Args:
    raw_data: Dict mapping a season id to a DataFrame of team-game rows. The
      frames are not modified; all derived columns go on copies.
    window: Row window handed to ``rolling_aggregate`` for each team
      (defaults to 3).

  Returns:
    Dict keyed by season id. Each value is a dict with:
      'raw_data': the season frame with ``gameDate`` parsed to datetimes, a
                  string ``seasonId`` column, sorted by ``gameDate`` with a
                  fresh 0..n-1 index;
      'rolling':  the result of applying ``rolling_aggregate`` per ``teamId``;
      'schedule': the result of applying ``home_road`` per
                  (``gameId``, ``gameDate``) group;
      'teamLU':   dict mapping ``teamId`` to ``teamFullName``.
  """
  data = {}
  for season, season_df in raw_data.items():
    # .assign builds a new frame, so the caller's DataFrame is left untouched
    # (the previous in-place column writes mutated the input).
    df = (
        season_df
        .assign(gameDate=pd.to_datetime(season_df['gameDate']),
                seasonId=str(season))
        .sort_values('gameDate', axis=0)
        .reset_index(drop=True)
    )

    # Rolling features depend on row order, hence the date sort above.
    rolling = df.groupby(['teamId']).apply(rolling_aggregate, window)

    teamLU = dict(zip(df['teamId'], df['teamFullName']))

    schedule = df.groupby(['gameId', 'gameDate']).apply(home_road, teamLU)

    data[season] = {'raw_data': df,
                    'rolling': rolling,
                    'schedule': schedule,
                    'teamLU': teamLU}

  return data


In [13]:
# Download five consecutive seasons starting with 2014-15, then derive the
# per-season rolling features, schedule and team lookup.
raw_data = get_gameData(startYear=2014, numSeasons=5)
data = process_data(raw_data)

Number of games grabbed for 20142015 = 2460. Total = 2460
Number of games grabbed for 20152016 = 2460. Total = 4920
Number of games grabbed for 20162017 = 2460. Total = 7380
Number of games grabbed for 20172018 = 2542. Total = 9922
Number of games grabbed for 20182019 = 2542. Total = 12464


In [14]:
def pretty(d, indent=0):
  """Print a tab-indented outline of a nested dictionary.

  Every key is printed on its own line at the current depth. Beneath it:
    * a DataFrame value is summarised by its shape;
    * a dict stored under the key "teamLU" is summarised by its length;
    * any other dict is expanded recursively one level deeper;
    * values of any other type get no extra line.
  """
  key_pad = '\t' * indent
  child_pad = '\t' * (indent+1)

  for label, item in d.items():
    print(key_pad + str(label))

    if isinstance(item, pd.DataFrame):
      print(child_pad + f"DataFrame: {item.shape}")
      continue

    if not isinstance(item, dict):
      continue

    if label == "teamLU":
      print(child_pad + f"Dictionary: {len(item)}")
    else:
      pretty(item, indent+1)
      

# Print an outline of the nested `data` dictionary (keys, frame shapes, lookup sizes).
pretty(data)

20142015
	raw_data
		DataFrame: (2460, 28)
	rolling
		DataFrame: (2460, 16)
	schedule
		DataFrame: (1230, 4)
	teamLU
		Dictionary: 30
20152016
	raw_data
		DataFrame: (2460, 28)
	rolling
		DataFrame: (2460, 16)
	schedule
		DataFrame: (1230, 4)
	teamLU
		Dictionary: 30
20162017
	raw_data
		DataFrame: (2460, 28)
	rolling
		DataFrame: (2460, 16)
	schedule
		DataFrame: (1230, 4)
	teamLU
		Dictionary: 30
20172018
	raw_data
		DataFrame: (2542, 28)
	rolling
		DataFrame: (2542, 16)
	schedule
		DataFrame: (1271, 4)
	teamLU
		Dictionary: 31
20182019
	raw_data
		DataFrame: (2542, 28)
	rolling
		DataFrame: (2542, 16)
	schedule
		DataFrame: (1271, 4)
	teamLU
		Dictionary: 31


In [16]:
# Preview the first rows of the date-sorted frame stored for season 20142015.
data['20142015']['raw_data'].head()

Unnamed: 0,faceoffWinPct,gameDate,gameId,gamesPlayed,goalsAgainst,goalsAgainstPerGame,goalsFor,goalsForPerGame,homeRoad,losses,opponentTeamAbbrev,otLosses,penaltyKillNetPct,penaltyKillPct,pointPct,points,powerPlayNetPct,powerPlayPct,regulationAndOtWins,shotsAgainstPerGame,shotsForPerGame,teamFullName,teamId,ties,wins,winsInRegulation,winsInShootout,seasonId
0,0.385714,2014-10-08,2014020003,1,2,2.0,4,4.0,R,0,CGY,0,1.0,1.0,1.0,2,0.333333,0.333333,1,25.0,33.0,Vancouver Canucks,23,,1,1,0,20142015
1,0.40845,2014-10-08,2014020004,1,4,4.0,0,0.0,H,1,SJS,0,0.75,0.75,0.0,0,0.0,0.0,0,30.0,34.0,Los Angeles Kings,26,,0,0,0,20142015
2,0.591549,2014-10-08,2014020004,1,0,0.0,4,4.0,R,0,LAK,0,1.0,1.0,1.0,2,0.25,0.25,1,34.0,30.0,San Jose Sharks,28,,1,1,0,20142015
3,0.409836,2014-10-08,2014020002,1,2,2.0,1,1.0,R,1,BOS,0,0.5,0.5,0.0,0,0.0,0.0,0,33.0,20.0,Philadelphia Flyers,4,,0,0,0,20142015
4,0.412698,2014-10-08,2014020001,1,4,4.0,3,3.0,H,1,MTL,0,1.0,1.0,0.0,0,0.333333,0.333333,0,32.0,27.0,Toronto Maple Leafs,10,,0,0,0,20142015


In [17]:
# Preview the first rows of the home/road schedule built for season 20182019.
data['20182019']['schedule'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,home,road,homeName,roadName
gameId,gameDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018020001,2018-10-03,10,8,Toronto Maple Leafs,Montréal Canadiens
2018020002,2018-10-03,15,6,Washington Capitals,Boston Bruins
2018020003,2018-10-03,23,20,Vancouver Canucks,Calgary Flames
2018020004,2018-10-03,28,24,San Jose Sharks,Anaheim Ducks
2018020005,2018-10-04,7,6,Buffalo Sabres,Boston Bruins


In [18]:
import pickle

# Persist the processed dictionary to "data.p". The context manager makes sure
# the file handle is flushed and closed even if pickling raises.
with open("data.p", "wb") as fh:
  pickle.dump(data, fh)