# Time Series Analysis

This notebook will explore shot and goal data over time. 

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# Relative directory that the pickled DataFrames are read from in the load cell below.
PICKLE_PATH = Path("../pickled_data/")

Load the play-by-play and game-level data from the pickled files.

In [2]:
# Load the play-level and game-level frames from PICKLE_PATH in one pass.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
game_plays, games = (
    pd.read_pickle(PICKLE_PATH / stem) for stem in ("game_plays", "games")
)

In [None]:
# Preview the first five plays to see which columns were loaded.
game_plays.head(n=5)

Unnamed: 0,play_id,game_id,team_id_for,team_id_against,event,secondaryType,x,y,period,periodType,periodTime,periodTimeRemaining,dateTime,goals_away,goals_home,description,st_x,st_y
0,2016020045_1,2016020045,,,Game Scheduled,,,,1,REGULAR,0,1200.0,2016-10-18 23:40:58,0,0,Game Scheduled,,
1,2016020045_2,2016020045,,,Period Ready,,,,1,REGULAR,0,1200.0,2016-10-19 01:35:28,0,0,Period Ready,,
2,2016020045_3,2016020045,,,Period Start,,,,1,REGULAR,0,1200.0,2016-10-19 01:40:50,0,0,Period Start,,
3,2016020045_4,2016020045,16.0,4.0,Faceoff,,0.0,0.0,1,REGULAR,0,1200.0,2016-10-19 01:40:50,0,0,Jonathan Toews faceoff won against Claude Giroux,0.0,0.0
4,2016020045_5,2016020045,16.0,4.0,Shot,Wrist Shot,-71.0,9.0,1,REGULAR,54,1146.0,2016-10-19 01:41:44,0,0,Artem Anisimov Wrist Shot saved by Michal Neuv...,71.0,-9.0


In [4]:
# Preview the first five games to see which columns were loaded.
games.head(n=5)

Unnamed: 0,game_id,season,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2016020045,20162017,R,2016-10-19T00:30:00Z,4,16,4,7,home win REG,right,United Center,/api/v1/venues/null,America/Chicago,-5,CDT
1,2017020812,20172018,R,2018-02-07T00:00:00Z,24,7,4,3,away win OT,left,KeyBank Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2015020314,20152016,R,2015-11-24T01:00:00Z,21,52,4,1,away win REG,right,MTS Centre,/api/v1/venues/null,America/Winnipeg,-5,CDT
3,2015020849,20152016,R,2016-02-17T00:00:00Z,52,12,1,2,home win REG,right,PNC Arena,/api/v1/venues/null,America/New_York,-4,EDT
4,2017020586,20172018,R,2017-12-30T03:00:00Z,20,24,1,2,home win REG,left,Honda Center,/api/v1/venues/null,America/Los_Angeles,-7,PDT


We can merge the season from `games` into the `game_plays` frame, then perform aggregations with the full data surrounding each play.

In [None]:
# Attach each play's season by looking up its game_id in `games`.
# Any `season` column left by a previous run is dropped first so this cell is
# idempotent: merging onto a frame that already has `season` would produce
# `season_x` / `season_y` and break every later `game_plays['season']` lookup.
game_plays = pd.merge(
    game_plays.drop(columns="season", errors="ignore"),
    games[['game_id', 'season']],
    how="left",
    on="game_id",
)

# Display the distinct season codes now attached to the plays.
game_plays['season'].unique()

array([20162017, 20172018, 20152016, 20142015, 20122013, 20132014,
       20112012, 20102011, 20082009, 20092010, 20072008, 20062007,
       20022003, 20032004, 20052006, 20002001, 20012002, 20192020,
       20182019])

In [15]:
# Check the dtype of every column, including the merged-in `season`.
game_plays.dtypes

play_id                 object
game_id                  int64
team_id_for            float64
team_id_against        float64
event                   object
secondaryType           object
x                      float64
y                      float64
period                   int64
periodType              object
periodTime               int64
periodTimeRemaining    float64
dateTime                object
goals_away               int64
goals_home               int64
description             object
st_x                   float64
st_y                   float64
season                   int64
dtype: object

To simplify things and make them a bit cleaner, let's relabel each season by the year in which it ends (e.g. 20162017 becomes 2017).

In [17]:
# Relabel each 8-digit season code (start year followed by end year, e.g.
# 20162017) with its end year (2017), i.e. the last four digits of the code.
# The mapping is built from the codes actually present in the data rather than
# a hand-typed table, so a season missing from such a table can no longer slip
# through in 8-digit form. Already-converted 4-digit values map to themselves,
# which keeps this cell safe to re-run.
season_replacements = {
    int(code): int(code) % 10_000
    for code in game_plays['season'].dropna().unique()
}

game_plays['season'] = game_plays['season'].replace(season_replacements)

In [20]:
# Keep only the plays whose event is 'Shot'.
shots = game_plays[game_plays['event'].eq('Shot')]

In [21]:
# Count the shot plays recorded in each season.
shots.groupby(by='season')['play_id'].agg('count')

season
2006      501
2007      644
2008      512
2009      534
2010      682
2011    72151
2012    70631
2013    42884
2014    71948
2015    71402
2016    70111
2017    72077
2018    77200
2019    77753
2020    69335
Name: play_id, dtype: int64

In [22]:
# Keep only the plays whose event is 'Goal', then count them per season.
goals = game_plays[game_plays['event'].eq('Goal')]
goals.groupby(by='season')['play_id'].agg('count')

season
2001    6633
2002    6266
2003    6327
2004    6201
2006    7617
2007    7365
2008    6900
2009    7247
2010    7149
2011    7376
2012    7233
2013    4411
2014    7377
2015    7243
2016    7082
2017    7251
2018    7992
2019    8276
2020    7399
Name: play_id, dtype: int64