In [1]:
import os
import re
import glob
import pickle
import pandas as pd
import numpy as np
from datetime import datetime

---
## Get data
### Ground Truth

In [2]:
# Read every per-player CSV from the LSTM project's data directory and stack
# them row-wise into a single frame (original per-file row indices are kept).
csv_paths = glob.glob('../LSTM-Neural-Network-for-Time-Series-Prediction/data/*.csv')
data = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

In [3]:
# Preview the first rows of the combined per-player frame.
data.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points
0,412,2013_20014,VAN,RYAN KESLER,8470616,2013-10-03 00:00:00,0.0,0.0,0.0,0.0
1,1024,2013_20030,VAN,RYAN KESLER,8470616,2013-10-05 00:00:00,1.0,0.0,0.0,1.0
2,1174,2013_20034,VAN,RYAN KESLER,8470616,2013-10-06 00:00:00,0.0,0.0,0.0,0.0
3,1520,2013_20043,VAN,RYAN KESLER,8470616,2013-10-08 00:00:00,0.0,0.0,0.0,0.0
4,2016,2013_20056,VAN,RYAN KESLER,8470616,2013-10-10 00:00:00,0.0,0.0,0.0,0.0


In [4]:
# Season labels and the date window used to slice each season out of `data`.
# The three lists are parallel: years[k] spans (start_date[k], end_date[k]].
years = ['20132014', '20142015', '20152016', '20162017', '20172018', '20182019']
# pd.Timestamp is used instead of the `pd.datetime` alias, which was
# deprecated in pandas 1.0 and removed in pandas 2.0.
start_date = [pd.Timestamp(2013, 10, 1),
              pd.Timestamp(2014, 10, 1),
              pd.Timestamp(2015, 10, 1),
              pd.Timestamp(2016, 10, 1),
              pd.Timestamp(2017, 10, 1),
              pd.Timestamp(2018, 10, 1)]
end_date = [pd.Timestamp(2014, 4, 13),
            pd.Timestamp(2015, 4, 11),
            pd.Timestamp(2016, 4, 10),
            pd.Timestamp(2017, 4, 9),
            pd.Timestamp(2018, 4, 8),
            pd.Timestamp(2019, 4, 6)]

# Parse the Date column once, up front (it does not change between seasons).
# Later cells compare `data.Date` against timestamps and rely on this dtype.
data['Date'] = pd.to_datetime(data['Date'])

# Map each season label to the rows of `data` that fall inside its window.
yearly_data = {}
for y, season_start, season_end in zip(years, start_date, end_date):
    print(y)
    print('Points...')
    # Start bound is exclusive, end bound is inclusive.
    mask = (data['Date'] > season_start) & (data['Date'] <= season_end)
    yearly_data[y] = data.loc[mask]

20132014
Points...
20142015
Points...
20152016
Points...
20162017
Points...
20172018
Points...
20182019
Points...


In [5]:
# Ground truth for the 2017-18 season: one row per Player_Id holding the
# player's name(s) and the sum of Total_Points over the season.
data_1718 = yearly_data['20172018']
data_1718_agg = (
    data_1718
    .groupby('Player_Id')
    .agg({'Player': np.unique, 'Total_Points': sum})
)
# Build a lookup key from the player name: stringify whatever the aggregation
# produced, strip every run of non-word characters, and upper-case the rest
# (e.g. "JAROMIR JAGR" -> "JAROMIRJAGR").
data_1718_agg['Name'] = [
    re.sub(r'\W+', '', f'{player}').upper()
    for player in data_1718_agg.Player
]
data_1718_agg = data_1718_agg.set_index('Name')
data_1718_agg.head()

Unnamed: 0_level_0,Player,Total_Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
JAROMIRJAGR,JAROMIR JAGR,7.0
MATTCULLEN,MATT CULLEN,22.0
ZDENOCHARA,ZDENO CHARA,24.0
JOETHORNTON,JOE THORNTON,36.0
PATRICKMARLEAU,PATRICK MARLEAU,47.0


### Predictions

In [6]:
# Load every prediction pickle and stack them into one frame. Each pickle is
# turned into a frame with `from_dict(..., orient='index')`, so each top-level
# key of the pickled object becomes one row label of `preds`.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only point this at files produced by a trusted run of the LSTM project.
preds = pd.concat([
    pd.DataFrame.from_dict(pd.read_pickle(pkl_path), orient='index')
    for pkl_path in glob.glob('../LSTM-Neural-Network-for-Time-Series-Prediction/data/*.pkl')
])
# Derive Player_Id (as a string) from each row label: keep the text before
# '.cs', then take the last '-'-separated token. Reading the index directly
# avoids a row-wise apply whose only purpose was to access the label.
preds['Player_Id'] = [label.split('.cs')[0].split('-')[-1] for label in preds.index]

---
## Comparison

In [7]:
# Spot-check a single player: line up the second row of `preds` with that
# player's actual games inside the prediction's test window.
player_pred = preds.iloc[1]
in_test_window = ((data.Player_Id == int(player_pred.Player_Id)) &
                  (data.Date >= pd.to_datetime(player_pred['test'][0])) &
                  (data.Date <= pd.to_datetime(player_pred['test'][1])))
# The first five games in the window are dropped (they seed the model's
# sliding window). `.copy()` makes this an independent frame so the column
# assignments below do not raise SettingWithCopyWarning on a slice of `data`.
comparison = data[in_test_window].iloc[5:].copy()
comparison['full-sequence'] = player_pred['full-sequence']
comparison['point-by-point'] = player_pred['point-by-point']
comparison.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points,full-sequence,point-by-point
186,153385,2016_21007,BOS,MATT BELESKEY,8473492,2016-03-10,0.0,0.0,0.0,0.0,0.536254,0.536254
187,153803,2016_21018,BOS,MATT BELESKEY,8473492,2016-03-12,0.0,0.0,0.0,0.0,0.524542,0.527461
188,154872,2016_21046,BOS,MATT BELESKEY,8473492,2016-03-15,0.0,0.0,0.0,0.0,0.506973,0.517053
189,155634,2016_21066,BOS,MATT BELESKEY,8473492,2016-03-18,0.0,0.0,0.0,0.0,0.483843,0.506013
190,156053,2016_21077,BOS,MATT BELESKEY,8473492,2016-03-19,0.0,0.0,0.0,0.0,0.499843,0.542198


In [8]:
# Build the actual-vs-predicted table for every player in `preds`.
comparison_frames = []
for _label, player in preds.iterrows():
    in_test_window = ((data.Player_Id == int(player.Player_Id)) &
                      (data.Date >= pd.to_datetime(player['test'][0])) &
                      (data.Date <= pd.to_datetime(player['test'][1])))
    # Leave first five because of the sliding window (which takes last five
    # before predicting the 82). `.copy()` detaches the slice from `data` so
    # adding columns below does not trigger SettingWithCopyWarning.
    tmp = data[in_test_window].iloc[5:].copy()
    tmp['full-sequence'] = player['full-sequence']
    tmp['point-by-point'] = player['point-by-point']
    comparison_frames.append(tmp)
comparison = pd.concat(comparison_frames)
comparison.head()

In [11]:
# NOTE(review): this repeats the preview that already ends the previous cell,
# and its execution count (11) is out of sequence with the cell before it (8).
# Consider removing this cell and re-running the notebook top to bottom.
comparison.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points,full-sequence,point-by-point
44,43467,2014_20535,STL,MARTIN BRODEUR,8455710,2014-12-29,0.0,0.0,0.0,0.0,0.11195,0.11195
45,89915,2015_20565,STL,MARTIN BRODEUR,8455710,2015-01-02,0.0,0.0,0.0,0.0,0.106384,0.11195
186,153385,2016_21007,BOS,MATT BELESKEY,8473492,2016-03-10,0.0,0.0,0.0,0.0,0.536254,0.536254
187,153803,2016_21018,BOS,MATT BELESKEY,8473492,2016-03-12,0.0,0.0,0.0,0.0,0.524542,0.527461
188,154872,2016_21046,BOS,MATT BELESKEY,8473492,2016-03-15,0.0,0.0,0.0,0.0,0.506973,0.517053
