# Utilities

In [1]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import glob
import xgboost
import tsfresh
import os
import hockey_scraper
from datetime import datetime
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from pprint import pprint
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import export_graphviz
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score, log_loss
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Seaborn defaults for notebook-sized figures.
sns.set_context('notebook')
sns.set_color_codes()

# Shared plotting keyword arguments (not referenced in the cells visible here).
plot_kwds = {'alpha': 0.5, 's': 100, 'linewidths': 0}

# Font sizes reused by the rcParams below.
large = 22
med = 16
small = 12

# Each rcParam key appears exactly once: a repeated key in a dict literal
# silently keeps only the last value, which hides the real setting.
params = {
    'legend.fontsize': med,
    'figure.figsize': (16, 10),
    'axes.labelsize': med,
    'axes.titlesize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large,
}
plt.rcParams.update(params)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# use whichever spelling this installation actually ships.
whitegrid_style = 'seaborn-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)
sns.set_style("white")

# Show wide DataFrames without truncating columns.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Silence SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Variables
# Set to True to (re)download the raw play-by-play and shift data.
scrape = False

# Scrape

In [2]:
if scrape:
    # Example locations of the files written by a previous run:
    #   play-by-play: /Users/noiseuce/hockey_scraper_data/csvs/nhl_pbp20182019.csv
    #   shifts:       /Users/noiseuce/hockey_scraper_data/csvs/nhl_shifts20182019.csv
    first_season, last_season = 2013, 2018
    seasons_to_scrape = list(range(first_season, last_season + 1))
    # NOTE(review): the second positional argument presumably turns on shift
    # scraping -- confirm against the hockey_scraper documentation.
    hockey_scraper.scrape_seasons(seasons_to_scrape, True, docs_dir=True)

# Reformat

2013-2014 = 13 avril

2014-2015 = 11 avril

2015-2016 = 10 avril

2016-2017 = 9 avril

2017-2018 = 8 avril

2018-2019 = 6 avril


In [35]:
%%time
# Season labels as they appear in the scraped CSV file names.
years = ['20132014', '20142015', '20152016', '20162017', '20172018', '20182019']

# Date window kept for each season, index-aligned with `years`.
# pd.Timestamp is used instead of the pd.datetime alias, which pandas
# deprecated and later removed.
start_date = [pd.Timestamp('2013-10-01'),
              pd.Timestamp('2014-10-01'),
              pd.Timestamp('2015-10-01'),
              pd.Timestamp('2016-10-01'),
              pd.Timestamp('2017-10-01'),
              pd.Timestamp('2018-10-01')]
end_date = [pd.Timestamp('2014-04-13'),
            pd.Timestamp('2015-04-11'),
            pd.Timestamp('2016-04-10'),
            pd.Timestamp('2017-04-09'),
            pd.Timestamp('2018-04-08'),
            pd.Timestamp('2019-04-06')]


def _load_season_csv(path, start, end, drop_columns=None):
    """Read one scraped CSV and keep only the rows inside a date window.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain 'Date' and 'Game_Id' columns.
    start, end : pd.Timestamp
        Rows are kept when ``start < Date <= end`` (start is exclusive).
    drop_columns : list of str, optional
        Columns removed right after reading.

    Returns
    -------
    pd.DataFrame
        The filtered rows, with 'Date' parsed to datetimes and 'Game_Id'
        rewritten as '<calendar year of Date>_<original Game_Id>'.
    """
    frame = pd.read_csv(path)
    if drop_columns:
        frame = frame.drop(drop_columns, axis=1)
    frame['Date'] = pd.to_datetime(frame['Date'])
    in_window = (frame['Date'] > start) & (frame['Date'] <= end)
    # Copy so the column assignment below never writes into a view.
    frame = frame.loc[in_window].copy()
    # Vectorized equivalent of str(Date.year) + '_' + str(Game_Id) per row.
    # The prefix is the calendar year of the game date, not the season label.
    frame['Game_Id'] = frame['Date'].dt.year.astype(str) + '_' + frame['Game_Id'].astype(str)
    return frame


# Collect per-season frames and concatenate once at the end instead of
# re-copying the accumulated frame on every iteration.
pbp_frames = []
shift_frames = []
for y, season_start, season_end in zip(years, start_date, end_date):
    print(y)
    print('Points...')
    pbp_frames.append(
        _load_season_csv(f'../data/hockey_scraper_data/csvs/nhl_pbp{y}.csv',
                         season_start, season_end))

    print('Shifts...')
    shift_frames.append(
        _load_season_csv(f'../data/hockey_scraper_data/csvs/nhl_shifts{y}.csv',
                         season_start, season_end,
                         drop_columns=['Unnamed: 0']))

nhl_pbp = pd.concat(pbp_frames)
nhl_shifts = pd.concat(shift_frames)

20132014
Points...
Shifts...
20142015
Points...
Shifts...
20152016
Points...
Shifts...
20162017
Points...
Shifts...
20172018
Points...
Shifts...
20182019
Points...
Shifts...
CPU times: user 4min 51s, sys: 12 s, total: 5min 3s
Wall time: 5min 4s


In [36]:
# One row per (game, player): every player with at least one shift in the game.
roster_key = ['Game_Id', 'Player']
roster_columns = ['Game_Id', 'Team', 'Player', 'Player_Id', 'Date']
sorted_shifts = nhl_shifts.sort_values(roster_key)
player_games = sorted_shifts.drop_duplicates(subset=roster_key)[roster_columns]

# One row per goal event, with the p1/p2/p3 name and id columns that the
# following cells treat as scorer, first assist and second assist.
# Missing p2/p3 entries become 0.
scoring_columns = ['Game_Id', 'p1_name', 'p1_ID', 'p2_name', 'p2_ID', 'p3_name', 'p3_ID']
is_goal = nhl_pbp['Event'] == 'GOAL'
goal_events = nhl_pbp[is_goal]
player_games_points = goal_events[scoring_columns].fillna(0)

In [37]:
def _tally_per_player(points, name_col, id_col, tally_name, drop_unassigned=False):
    """Count goal events per (game, player) for one scoring role.

    Parameters
    ----------
    points : pd.DataFrame
        One row per goal event, with 'Game_Id' plus the role's name/id columns.
    name_col, id_col : str
        Columns holding the player's name and id for the role being counted.
    tally_name : str
        Name of the resulting count column.
    drop_unassigned : bool, default False
        When True, discard groups whose name is the placeholder 0 (the role
        was not filled on that goal).

    Returns
    -------
    pd.DataFrame
        Columns ['Game_Id', 'Player', 'Player_Id', tally_name].
    """
    # size() counts the rows of each group directly, so the result does not
    # depend on which other columns exist or whether they contain NaN.
    tally = (points.groupby(['Game_Id', name_col, id_col])
                   .size()
                   .reset_index(name=tally_name))
    if drop_unassigned:
        tally = tally[tally[name_col] != 0]
    return tally.rename(columns={name_col: 'Player', id_col: 'Player_Id'})


# Goals scored per player in each game.
player_games_goals = _tally_per_player(
    player_games_points, 'p1_name', 'p1_ID', 'Goal')

# First assists per player in each game.
player_games_first_assists = _tally_per_player(
    player_games_points, 'p2_name', 'p2_ID', 'First_Assist', drop_unassigned=True)

# Second assists per player in each game.
player_games_second_assists = _tally_per_player(
    player_games_points, 'p3_name', 'p3_ID', 'Second_Assist', drop_unassigned=True)

In [38]:
# Outer-join the roster with the goal and assist tallies, one table at a time.
merge_keys = ['Game_Id', 'Player', 'Player_Id']
player_logs = player_games
for tally in (player_games_goals,
              player_games_first_assists,
              player_games_second_assists):
    player_logs = player_logs.merge(tally, on=merge_keys, how='outer')

# Anything left unmatched by the outer joins becomes 0.
player_logs = player_logs.fillna(0)

# Total points = goals + first assists + second assists.
point_columns = ['Goal', 'First_Assist', 'Second_Assist']
player_logs['Total_Points'] = player_logs[point_columns].sum(axis=1)
player_logs.head()

Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points
0,2013_20004,PHI,ADAM HALL,8467925,2013-10-02 00:00:00,0.0,0.0,0.0,0.0
1,2013_20004,PHI,ANDREJ MESZAROS,8471236,2013-10-02 00:00:00,0.0,0.0,0.0,0.0
2,2013_20004,PHI,BRAYDEN SCHENN,8475170,2013-10-02 00:00:00,1.0,0.0,0.0,1.0
3,2013_20004,PHI,BRAYDON COBURN,8470601,2013-10-02 00:00:00,0.0,0.0,0.0,0.0
4,2013_20004,TOR,CARL GUNNARSSON,8474125,2013-10-02 00:00:00,0.0,0.0,0.0,0.0


In [45]:
# Export one player's game log as CSV into the LSTM project's data folder.
# NOTE(review): the rows selected are BRAD MARCHAND's, but the file is named
# Nikita_Kucherov.csv -- confirm which of the two is intended before relying
# on this file downstream.
player_logs[player_logs.Player == 'BRAD MARCHAND'].to_csv('../LSTM-Neural-Network-for-Time-Series-Prediction/data/Nikita_Kucherov.csv')

In [46]:
# (rows, columns) of the game log for BRAD MARCHAND.
is_marchand = player_logs['Player'] == 'BRAD MARCHAND'
player_logs[is_marchand].shape

(463, 9)