# NHL Goal Prediction Part 2: Electric Boogaloo

Going to look at trying to predict future NHL goals again by trying a couple different statistical techniques that may or may not work!

In [33]:
import os
import math
import itertools
import time
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV, LassoCV
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns

from helper_functions import cv_model, oos_stats
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [34]:
def prepare_data(dataframe: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    stat_columns = ['g', 'gp', 'toi', 'a1', 'a2', 'isf', 'iff', 'icf']
    dataframe = dataframe.merge(dataframe[stat_columns + ['api_id', 'prev_season']], 
                                how='left', left_on=['api_id', 'season'], 
                                right_on=['api_id', 'prev_season'] ,suffixes=[None, '_target'])
    dataframe['n_1'] = dataframe['season'] - 10001
    dataframe['n_2'] = dataframe['season'] - 20002
    dataframe['n_3'] = dataframe['season'] - 30003
    dataframe = dataframe.merge(dataframe[stat_columns + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_1'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_1'])
    dataframe = dataframe.merge(dataframe[stat_columns + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_2'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_2'])
    dataframe = dataframe.merge(dataframe[stat_columns + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_3'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_3'])
    
    mask = (dataframe.season != 20192020)&(dataframe.g_target.isna())
    dataframe = dataframe[~mask]
    test_x = dataframe[dataframe.season == 20192020]
    dataframe = dataframe[dataframe.season != 20192020]
    
    drop_columns = ['prev_season_target', 'n_1', 'n_2', 'n_3', 'points']
    dataframe = dataframe.drop(drop_columns, axis=1)
    test_x = test_x.drop(drop_columns, axis=1)
    
    return dataframe, test_x

In [35]:
test_data_y = pd.read_csv('csv_files/20202021stats.csv')

#reading in csv files to dataframes
ev_df = pd.read_csv('ev_player_goal_data.csv')
pp_df = pd.read_csv('pp_player_goal_data.csv')

ev_df['prev_season'] = ev_df.season - 10001
pp_df['prev_season'] = pp_df.season = 10001
#combine player stats into totals
ev_df.drop(['Unnamed: 0'], axis=1, inplace=True)
pp_df.drop(['Unnamed: 0'], axis=1, inplace=True)

ev_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()
pp_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()

ev_f_df = ev_df[ev_df.position != 'D']
ev_d_df = ev_df[ev_df.position == 'D']

pp_f_df = pp_df[pp_df.position != 'D']
pp_d_df = pp_df[pp_df.position == 'D']

In [36]:
ev_f_df_train, ev_f_df_test_data_x = prepare_data(ev_f_df)
ev_d_df_train, ev_d_df_test_data_x = prepare_data(ev_d_df)
pp_f_df_train, pp_f_df_test_data_x = prepare_data(pp_f_df)
pp_d_df_train, pp_d_df_test_data_x = prepare_data(pp_d_df)

In [37]:
selected_features = ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1']
estimator = cv_model(ev_f_df_train, 
                     ev_f_df_train[['g_target']], LinearRegression(), 
                     500, features=selected_features)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [42]:
#build models to impute missing stats for the previous years
selected_features = ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf']
estimator = cv_model(ev_f_df_train, 
                     ev_f_df_train[['gp_target']], LinearRegression(), 
                     500, features=selected_features)

Building model with features: ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf']
Baseline linear model training set metrics:
R^2 for test set: 0.3234
Mean Squared Error for training set: 453.9947
Root Mean Squared Error for training set: 21.3072
Mean Absolute Error for training set: 17.3613



In [38]:
ev_f_df_train.head()


Unnamed: 0,player,api_id,season_age,position,season,prev_season,gp,toi,g,a1,a2,isf,iff,icf,g_target,gp_target,toi_target,a1_target,a2_target,isf_target,iff_target,icf_target,g_n_1,gp_n_1,toi_n_1,a1_n_1,a2_n_1,isf_n_1,iff_n_1,icf_n_1,season_n_1,g_n_2,gp_n_2,toi_n_2,a1_n_2,a2_n_2,isf_n_2,iff_n_2,icf_n_2,season_n_2,g_n_3,gp_n_3,toi_n_3,a1_n_3,a2_n_3,isf_n_3,iff_n_3,icf_n_3,season_n_3
0,A.J. Greer,8478421,19.0,L,20162017,20152016,5.0,63.8,0.0,0.0,1.0,6.0,7.0,8.0,0.0,17.0,122.27,2.0,1.0,13.0,15.0,20.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,A.J. Greer,8478421,20.0,L,20172018,20162017,17.0,122.27,0.0,2.0,1.0,13.0,15.0,20.0,1.0,15.0,93.8,1.0,0.0,9.0,11.0,14.0,0.0,5.0,63.8,0.0,1.0,6.0,7.0,8.0,20162017.0,,,,,,,,,,,,,,,,,,
3,Aaron Downey,8465992,33.0,R,20072008,20062007,56.0,254.55,0.0,3.0,0.0,15.0,19.0,22.0,1.0,4.0,20.83,0.0,1.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Aaron Gagnon,8471451,23.0,C,20092010,20082009,2.0,15.1,0.0,0.0,0.0,2.0,4.0,4.0,0.0,19.0,143.03,0.0,2.0,8.0,18.0,25.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Aaron Gagnon,8471451,24.0,C,20102011,20092010,19.0,143.03,0.0,0.0,2.0,8.0,18.0,25.0,0.0,7.0,65.65,0.0,0.0,6.0,9.0,12.0,0.0,2.0,15.1,0.0,0.0,2.0,4.0,4.0,20092010.0,,,,,,,,,,,,,,,,,,
