# Get Gamelogs

Scrape `nhl.com` website to get fine-grained stats about all players. Used for LSTM predictions.

## 1. Utilities
---

In [1]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import glob
import xgboost
import tsfresh
import os
import hockey_scraper
from datetime import datetime
# from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from pprint import pprint
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import export_graphviz
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score, log_loss
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# --- Plot styling -----------------------------------------------------------
sns.set_context('notebook')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.5, 's' : 100, 'linewidths':0}
large = 22; med = 16; small = 12
# 'axes.titlesize' used to be listed twice in this dict (first `large`, then
# `med`). Python keeps the last duplicate key, so `med` was the value actually
# applied; only that effective entry is kept.
params = {'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names. Use whichever name this install
# provides; if neither exists the style sheet is skipped.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
sns.set_style("white")

# --- pandas display ---------------------------------------------------------
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Silence SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Variables
# Set to True to re-download the raw data in the "Scrape" section.
scrape = False

## 2. Scrape
---

In [2]:
if scrape:
    # Runs only when the `scrape` flag is True, so a normal run does not hit the network.
    # Output locations noted from an earlier run (machine-specific absolute paths):
    # Pbp data deposited in file - /Users/noiseuce/hockey_scraper_data/csvs/nhl_pbp20182019.csv
    # Shift data deposited in file - /Users/noiseuce/hockey_scraper_data/csvs/nhl_shifts20182019.csv
    # NOTE(review): the paths above mention the 20182019 files while the call below
    # passes [2019] — confirm which season this call actually downloads.
    # NOTE(review): the second positional argument presumably turns on shift
    # scraping — confirm against the hockey_scraper documentation.
    hockey_scraper.scrape_seasons([2019], True, docs_dir=True)

## 3. Reformat
---

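Regular-season end dates (used as the upper bound when filtering games below):

* 2013-2014 = April 13

* 2014-2015 = April 11

* 2015-2016 = April 10

* 2016-2017 = April 9

* 2017-2018 = April 8

* 2018-2019 = April 6

* 2019-2020 = March 11 (last games played before the season was suspended)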


In [3]:
%%time
years = ['20132014', '20142015', '20152016', '20162017', '20172018', '20182019', '20192020']
# Season windows, index-aligned with `years`.
# `pd.datetime` was removed in pandas 2.0; the standard-library `datetime`
# class (already imported at the top of the notebook) is a drop-in replacement.
start_date = [datetime(2013, 10, 1),
              datetime(2014, 10, 1),
              datetime(2015, 10, 1),
              datetime(2016, 10, 1),
              datetime(2017, 10, 1),
              datetime(2018, 10, 1),
              datetime(2019, 10, 1)]
end_date = [datetime(2014, 4, 13),
            datetime(2015, 4, 11),
            datetime(2016, 4, 10),
            datetime(2017, 4, 9),
            datetime(2018, 4, 8),
            datetime(2019, 4, 6),
            datetime(2020, 3, 11)]
assert len(years) == len(start_date) == len(end_date), 'one start/end date is required per season'


def _load_season_csv(path, first_day, last_day, drop_cols=()):
    """Read one season CSV, keep rows inside the season window and prefix Game_Id.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain `Date` and `Game_Id` columns.
    first_day, last_day : datetime
        Rows are kept when `first_day < Date <= last_day`.
    drop_cols : sequence of str, optional
        Columns removed right after reading (e.g. a stray index column).

    Returns
    -------
    pandas.DataFrame
        Filtered copy whose `Game_Id` is the string '<year of Date>_<original Game_Id>'.
    """
    season = pd.read_csv(path)
    if len(drop_cols) > 0:
        season = season.drop(list(drop_cols), axis=1)
    season['Date'] = pd.to_datetime(season['Date'])
    # NOTE(review): the lower bound is exclusive, so games dated exactly on
    # `first_day` are dropped — confirm this is intended.
    in_window = (season['Date'] > first_day) & (season['Date'] <= last_day)
    season = season.loc[in_window].copy()
    # Vectorised replacement for the former row-wise apply(); builds the same string.
    season['Game_Id'] = season['Date'].dt.year.astype(str) + '_' + season['Game_Id'].astype(str)
    return season


# Collect every season first and concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows on each iteration.
pbp_seasons = []
shift_seasons = []
for y, season_start, season_end in zip(years, start_date, end_date):
    print(y)
    print('Points...')
    pbp_seasons.append(
        _load_season_csv(f'../data/hockey_scraper_data/csvs/nhl_pbp{y}.csv', season_start, season_end)
    )

    print('Shifts...')
    shift_seasons.append(
        _load_season_csv(f'../data/hockey_scraper_data/csvs/nhl_shifts{y}.csv', season_start, season_end,
                         drop_cols=['Unnamed: 0'])
    )

nhl_pbp = pd.concat(pbp_seasons)
nhl_shifts = pd.concat(shift_seasons)

20132014
Points...
Shifts...
20142015
Points...
Shifts...
20152016
Points...
Shifts...
20162017
Points...
Shifts...
20172018
Points...
Shifts...
20182019
Points...
Shifts...
20192020
Points...
Shifts...
CPU times: user 3min 59s, sys: 21.3 s, total: 4min 20s
Wall time: 4min 27s


In [5]:
# For each game ID, get all players that played in it
# (one row per Game_Id/Player pair, taken from the shift records)
roster_cols = ['Game_Id', 'Team', 'Player', 'Player_Id', 'Date']
player_games = (
    nhl_shifts
    .sort_values(['Game_Id', 'Player'])
    .drop_duplicates(subset=['Game_Id', 'Player'])[roster_cols]
)

# Add team that an event is against
# The opponent is the away team when the event team is the home team, and the
# home team when it is the away team. Events whose Ev_Team is missing or matches
# neither side get NaN; the previous set-difference approach returned an
# arbitrary team for those rows (set order of strings is not stable between
# Python processes).
home_team = nhl_pbp['Home_Team'].to_numpy(dtype=object)
away_team = nhl_pbp['Away_Team'].to_numpy(dtype=object)
event_team = nhl_pbp['Ev_Team'].to_numpy(dtype=object)
nhl_pbp['Against_Team'] = np.where(
    event_team == home_team,
    away_team,
    np.where(event_team == away_team, home_team, np.nan),
)
# Keep the [home, away] pair on every row; used later to find each player's opponent.
nhl_pbp['Teams_Involved'] = list(nhl_pbp[['Home_Team', 'Away_Team']].values)

# For each game ID, get all players with goal(s) or assist(s)
goal_cols = ['Game_Id', 'Ev_Team', 'Against_Team', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID']
player_games_points = nhl_pbp[nhl_pbp['Event'] == 'GOAL'][goal_cols]
# Missing player slots become 0 so they can be recognised and skipped later
player_games_points = player_games_points.fillna(0)

In [6]:
def _count_points(goal_events, name_col, id_col, stat_name):
    """Count how many goal events credit each player in each game.

    Parameters
    ----------
    goal_events : pandas.DataFrame
        One row per goal, with a `Game_Id` column plus `name_col` and `id_col`.
        Empty player slots are expected to hold 0 (already filled by the caller).
    name_col, id_col : str
        Columns holding the credited player's name and id (e.g. 'p1_name', 'p1_ID').
    stat_name : str
        Name of the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Columns ['Game_Id', 'Player', 'Player_Id', stat_name], one row per
        (game, player) pair with at least one credited event.
    """
    # Skip placeholder rows (slot filled with 0, i.e. nobody credited)
    credited = goal_events[goal_events[name_col] != 0]
    # size() counts rows per group directly, so the result no longer depends on
    # which column happens to come first after the group keys
    counts = (
        credited
        .groupby(['Game_Id', name_col, id_col])
        .size()
        .reset_index(name=stat_name)
    )
    return counts.rename(columns={name_col: 'Player', id_col: 'Player_Id'})


# Get goals scored per player in each games
# Keep game_id, player name and id, and number of goals
player_games_goals = _count_points(player_games_points, 'p1_name', 'p1_ID', 'Goal')

# Get first assists per player in each games
# Keep game_id, player name and id, and number of assists
player_games_first_assists = _count_points(player_games_points, 'p2_name', 'p2_ID', 'First_Assist')

# Get second assists per player in each games
# Keep game_id, player name and id, and number of assists
player_games_second_assists = _count_points(player_games_points, 'p3_name', 'p3_ID', 'Second_Assist')

In [8]:
# Attach the per-game goal / first-assist / second-assist counts to every player appearance
merge_keys = ['Game_Id', 'Player', 'Player_Id']
stat_tables = [
    (player_games_goals, 'Goal'),
    (player_games_first_assists, 'First_Assist'),
    (player_games_second_assists, 'Second_Assist'),
]
player_logs = player_games
for stat_table, stat_col in stat_tables:
    player_logs = player_logs.merge(stat_table[merge_keys + [stat_col]], on=merge_keys, how='outer')
# Every missing value (no goal/assist, or a row with no matching shift record) becomes 0
player_logs = player_logs.fillna(0)

# Total points = goals + both kinds of assists
point_cols = [stat_col for _, stat_col in stat_tables]
player_logs['Total_Points'] = player_logs[point_cols].sum(axis=1)
# Rows whose Date was filled with 0 had no shift record; drop them, then de-duplicate
has_shift_record = player_logs['Date'] != 0
player_logs = player_logs[has_shift_record].drop_duplicates()

# Opponent: look up the two teams of each game (first event of the game) and
# take the one that is not the player's own team
first_event_per_game = nhl_pbp.drop_duplicates(subset='Game_Id')
teams_involved_df = first_event_per_game[['Game_Id', 'Teams_Involved']]
player_logs = player_logs.merge(teams_involved_df, on='Game_Id', how='left')
opponents = []
for team, involved_teams in zip(player_logs['Team'], player_logs['Teams_Involved']):
    other_teams = set(involved_teams) - {team}
    opponents.append(list(other_teams)[0])
player_logs['Against_Team'] = opponents
player_logs = player_logs.drop(columns='Teams_Involved')

# Full table (with opponent) for tft
player_logs.to_csv('../data/nhl_tft.csv')
# Same rows without the opponent column for the pool dataset
pool_cols = ['Game_Id', 'Team', 'Player', 'Player_Id', 'Date', 'Goal', 'First_Assist', 'Second_Assist', 'Total_Points']
player_logs[pool_cols].to_csv('../data/nhl.csv')

player_logs.head()

Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points,Against_Team
0,2013_20004,PHI,ADAM HALL,8467925,2013-10-02 00:00:00,0.0,0.0,0.0,0.0,TOR
1,2013_20004,PHI,ANDREJ MESZAROS,8471236,2013-10-02 00:00:00,0.0,0.0,0.0,0.0,TOR
2,2013_20004,PHI,BRAYDEN SCHENN,8475170,2013-10-02 00:00:00,1.0,0.0,0.0,1.0,TOR
3,2013_20004,PHI,BRAYDON COBURN,8470601,2013-10-02 00:00:00,0.0,0.0,0.0,0.0,TOR
4,2013_20004,TOR,CARL GUNNARSSON,8474125,2013-10-02 00:00:00,0.0,0.0,0.0,0.0,PHI


## 4. Save Individual Results
---

In [10]:
# Write one CSV per player (rows sorted by date) for the LSTM project.
export_cols = ['Game_Id', 'Team', 'Player', 'Player_Id', 'Date', 'Goal', 'First_Assist', 'Second_Assist', 'Total_Points']
export_dir = '../LSTM-Neural-Network-for-Time-Series-Prediction/LSTM-Neural-Network-for-Time-Series-Prediction/data'
# A single groupby pass replaces re-filtering the whole frame once per player.
# sort=False keeps players in order of first appearance, as `.unique()` did.
for plyr, player_rows in player_logs.groupby('Player_Id', sort=False):
    data_save = player_rows.sort_values('Date')[export_cols]
    # File name: first listed player name with spaces replaced by dashes, then the player id
    name_slug = str(data_save.Player.unique()[0]).replace(" ", "-")
    data_save.to_csv(f'{export_dir}/{name_slug}-{str(plyr)}.csv')

In [55]:
# Look at LSTM prediction results
# NOTE: pd.read_pickle unpickles arbitrary Python objects — only load a predictions file you trust.
# The loaded object is wrapped in a DataFrame and transposed (.T), so its rows and columns are swapped for display.
pd.DataFrame(pd.read_pickle('../LSTM-Neural-Network-for-Time-Series-Prediction/predictions.pkl')).T

Unnamed: 0,full-sequence,multiple-sequences,point-by-point,test,train
RYAN-KESLER-8470616.csv,"[0.75380015, 0.75219226, 0.75523365, 0.7533355...","[[0.75380015, 0.75219226, 0.75523365, 0.753335...","[0.7538001, 0.7523783, 0.7563757, 0.7563757, 0...","[2018-02-06 00:00:00, 2019-03-06 00:00:00]","[2013-10-03 00:00:00, 2018-02-05 00:00:00]"
ETHAN-BEAR-8478451.csv,,,,,
LOGAN-SHAW-8476400.csv,"[0.10769645, 0.112798735, 0.119258024, 0.12576...","[[0.10769645, 0.112798735, 0.119258024, 0.1257...","[0.10769646, 0.11252023, 0.118144475, 0.125722...","[2017-02-20 00:00:00, 2018-04-07 00:00:00]","[2015-10-30 00:00:00, 2017-02-19 00:00:00]"
NELSON-NOGIER-8478031.csv,,,,,
MIKE-SMITH-8469608.csv,"[0.0067961705, 0.0069396747, 0.0070885536, 0.0...","[[0.0067961705, 0.0069396747, 0.0070885536, 0....","[0.006796169, 0.006796169, 0.006796169, 0.0067...","[2017-10-29 00:00:00, 2019-04-06 00:00:00]","[2013-10-03 00:00:00, 2017-10-27 00:00:00]"
MACKENZIE-BLACKWOOD-8478406.csv,,,,,
GABRIEL-BOURQUE-8475268.csv,"[0.18078952, 0.17285362, 0.17830783, 0.1872089...","[[0.18078952, 0.17285362, 0.17830783, 0.187208...","[0.18078953, 0.17345063, 0.1774704, 0.18078953...","[2018-02-03 00:00:00, 2019-04-06 00:00:00]","[2013-10-03 00:00:00, 2018-02-01 00:00:00]"
FILIP-ZADINA-8480821.csv,,,,,
CONNOR-BROWN-8477015.csv,"[0.30577764, 0.3081794, 0.31415343, 0.3237189,...","[[0.30577764, 0.3081794, 0.31415343, 0.3237189...","[0.30577764, 0.30577764, 0.30577764, 0.3057776...","[2018-03-30 00:00:00, 2019-04-06 00:00:00]","[2016-03-17 00:00:00, 2018-03-28 00:00:00]"
PIERRE-EDOUARD-BELLEMARE-8477930.csv,"[0.13796371, 0.13673583, 0.13605505, 0.1401470...","[[0.13796371, 0.13673583, 0.13605505, 0.140147...","[0.13796371, 0.13697979, 0.13679461, 0.1415269...","[2018-03-16 00:00:00, 2019-04-06 00:00:00]","[2014-10-08 00:00:00, 2018-03-14 00:00:00]"


In [58]:
# Collect the path of every CSV in the LSTM project's data folder (glob returns matches in arbitrary order).
# NOTE(review): this folder differs from the nested
# '.../LSTM-Neural-Network-for-Time-Series-Prediction/LSTM-Neural-Network-for-Time-Series-Prediction/data'
# directory the per-player CSVs are written to — confirm which location is intended.
files = glob.glob(os.path.join('../LSTM-Neural-Network-for-Time-Series-Prediction/data', '*.csv'))

In [59]:
# Display the matched CSV paths.
files

['../LSTM-Neural-Network-for-Time-Series-Prediction/data/RYAN-KESLER-8470616.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/ETHAN-BEAR-8478451.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/LOGAN-SHAW-8476400.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/NELSON-NOGIER-8478031.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/MIKE-SMITH-8469608.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/MACKENZIE-BLACKWOOD-8478406.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/GABRIEL-BOURQUE-8475268.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/FILIP-ZADINA-8480821.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/CONNOR-BROWN-8477015.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/PIERRE-EDOUARD-BELLEMARE-8477930.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data/JONNY-BRODZINSKI-8477380.csv',
 '../LSTM-Neural-Network-for-Time-Series-Prediction/data