# NHL Goal Prediction Part 2: Electric Boogaloo

Going to look at trying to predict future NHL goals again by trying a couple different statistical techniques that may or may not work!

In [52]:
import os
import math
import itertools
import time
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV, LassoCV
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns

from helper_functions import cv_model, oos_stats
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
'''
STAT_COLUMNS = ['g', 'gp', 'toi', 'a1', 'a2', 'isf', 'iff', 'icf', 'toi_gp', 'sh_percent', 'toi_per_g',
                'avg_goals_season']
'''
STAT_COLUMNS = ['g', 'gp', 'toi', 'a1', 'a2', 'isf', 'iff', 'icf', 'toi_gp', 'sh_percent', 'toi_per_g',]

In [53]:
def feature_engineer_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    #feature engineering
    
    #prorate goals for lockout shortened season
    dataframe.loc[:, 'g'] = np.where(dataframe['season'] == 20122013, 
                              (dataframe['g']) * (82/48), dataframe['g'])
    
    dataframe.loc[:, 'toi_gp'] = round(dataframe['toi']/dataframe['gp'], 2)
    dataframe.loc[:, 'sh_percent'] = round(dataframe['g']/dataframe['isf'], 3)
    dataframe.loc[:, 'toi_per_g'] = round(dataframe['g']/dataframe['toi'], 3)

    #avg goals over career
    #dataframe = dataframe.sort_values(['api_id', 'season'])
    #dataframe["number_season"] = dataframe.groupby("api_id")["season"].rank(method="first", ascending=True)

    #dataframe['goals_shift'] = dataframe['g'] + dataframe['g'].shift(1)
    #dataframe['cum_goals'] = np.where(dataframe['api_id'] == dataframe['api_id'].shift(1), 
                                    #dataframe['goals_shift'], 
                                    #dataframe['g'])
    #dataframe['avg_goals_season'] = dataframe['cum_goals']/dataframe['number_season']

    #avg shooting percentage over career
    #dataframe['cum_shots'] = np.where(dataframe['api_id'] == dataframe['api_id'].shift(1), 
                                        #dataframe['isf'] + dataframe['isf'].shift(1), 
                                        #dataframe['isf'])
    #dataframe['avg_sh_perc'] = round(dataframe['cum_goals']/dataframe['cum_shots'], 3)

    #diff of season sh% from career avg
    #dataframe.loc[:, 'sh_perc_diff'] = dataframe['sh_percent'] - dataframe['avg_sh_perc']
    
    dataframe = dataframe.merge(dataframe[STAT_COLUMNS + ['api_id', 'prev_season']], 
                                how='left', left_on=['api_id', 'season'], 
                                right_on=['api_id', 'prev_season'] ,
                                suffixes=[None, '_target'])
    
    return dataframe

In [54]:
def create_past_seasons_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    #pulling in previous season stats will look at past three seasons
    dataframe['n_1'] = dataframe['season'] - 10001
    dataframe['n_2'] = dataframe['season'] - 20002
    dataframe['n_3'] = dataframe['season'] - 30003
    dataframe = dataframe.merge(dataframe[STAT_COLUMNS + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_1'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_1'])
    dataframe = dataframe.merge(dataframe[STAT_COLUMNS + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_2'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_2'])
    dataframe = dataframe.merge(dataframe[STAT_COLUMNS + ['api_id', 'season']], 
                                how='left', left_on=['api_id', 'n_3'], 
                                right_on=['api_id', 'season'] ,suffixes=[None, '_n_3'])
    
    return dataframe

In [55]:
def prepare_data(dataframe: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    
    dataframe = feature_engineer_data(dataframe)
    dataframe = create_past_seasons_data(dataframe)
    
    mask = (dataframe.season != 20192020)&(dataframe.g_target.isna())
    dataframe = dataframe[~mask]
    test_x = dataframe[dataframe.season == 20192020]
    dataframe = dataframe[dataframe.season != 20192020]
    
    drop_columns = ['prev_season_target', 'n_1', 'n_2', 'n_3', 'points',]# 'cum_shots']
    dataframe = dataframe.drop(drop_columns, axis=1)
    test_x = test_x.drop(drop_columns, axis=1)
    
    return dataframe, test_x

In [56]:
test_data_y = pd.read_csv('csv_files/20202021stats.csv')

#reading in csv files to dataframes
ev_df = pd.read_csv('ev_player_goal_data.csv')
pp_df = pd.read_csv('pp_player_goal_data.csv')

ev_df['prev_season'] = ev_df.season - 10001
pp_df['prev_season'] = pp_df.season - 10001
#combine player stats into totals
ev_df.drop(['Unnamed: 0'], axis=1, inplace=True)
pp_df.drop(['Unnamed: 0'], axis=1, inplace=True)

ev_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()
pp_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()

ev_f_df = ev_df[ev_df.position != 'D']
ev_d_df = ev_df[ev_df.position == 'D']

pp_f_df = pp_df[pp_df.position != 'D']
pp_d_df = pp_df[pp_df.position == 'D']

In [57]:
ev_f_df_train, ev_f_df_test_data_x = prepare_data(ev_f_df)
ev_d_df_train, ev_d_df_test_data_x = prepare_data(ev_d_df)
pp_f_df_train, pp_f_df_test_data_x = prepare_data(pp_f_df)
pp_d_df_train, pp_d_df_test_data_x = prepare_data(pp_d_df)

In [28]:
ev_f_df_train.head()

Unnamed: 0,player,api_id,season_age,position,season,prev_season,gp,toi,g,a1,a2,isf,iff,icf,toi_gp,sh_percent,g_target,gp_target,toi_target,a1_target,a2_target,isf_target,iff_target,icf_target,toi_gp_target,sh_percent_target,g_n_1,gp_n_1,toi_n_1,a1_n_1,a2_n_1,isf_n_1,iff_n_1,icf_n_1,toi_gp_n_1,sh_percent_n_1,season_n_1,g_n_2,gp_n_2,toi_n_2,a1_n_2,a2_n_2,isf_n_2,iff_n_2,icf_n_2,toi_gp_n_2,sh_percent_n_2,season_n_2,g_n_3,gp_n_3,toi_n_3,a1_n_3,a2_n_3,isf_n_3,iff_n_3,icf_n_3,toi_gp_n_3,sh_percent_n_3,season_n_3
0,A.J. Greer,8478421,19.0,L,20162017,20152016,5.0,63.8,0.0,0.0,1.0,6.0,7.0,8.0,12.76,0.0,0.0,17.0,122.27,2.0,1.0,13.0,15.0,20.0,7.19,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,A.J. Greer,8478421,20.0,L,20172018,20162017,17.0,122.27,0.0,2.0,1.0,13.0,15.0,20.0,7.19,0.0,1.0,15.0,93.8,1.0,0.0,9.0,11.0,14.0,6.25,0.111,0.0,5.0,63.8,0.0,1.0,6.0,7.0,8.0,12.76,0.0,20162017.0,,,,,,,,,,,,,,,,,,,,,,
3,Aaron Downey,8465992,33.0,R,20072008,20062007,56.0,254.55,0.0,3.0,0.0,15.0,19.0,22.0,4.55,0.0,1.0,4.0,20.83,0.0,1.0,2.0,2.0,2.0,5.21,0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Aaron Gagnon,8471451,23.0,C,20092010,20082009,2.0,15.1,0.0,0.0,0.0,2.0,4.0,4.0,7.55,0.0,0.0,19.0,143.03,0.0,2.0,8.0,18.0,25.0,7.53,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Aaron Gagnon,8471451,24.0,C,20102011,20092010,19.0,143.03,0.0,0.0,2.0,8.0,18.0,25.0,7.53,0.0,0.0,7.0,65.65,0.0,0.0,6.0,9.0,12.0,9.38,0.0,0.0,2.0,15.1,0.0,0.0,2.0,4.0,4.0,7.55,0.0,20092010.0,,,,,,,,,,,,,,,,,,,,,,


In [16]:
ev_f_df_train = ev_f_df_train.fillna(ev_f_df_train.median())

  ev_f_df_train = ev_f_df_train.fillna(ev_f_df_train.median())


In [17]:
pp_f_df_train = pp_f_df_train.fillna(pp_f_df_train.median())

  pp_f_df_train = pp_f_df_train.fillna(pp_f_df_train.median())


In [18]:
selected_features = ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1', 'g_n_2', 'g_n_3']
estimator = cv_model(ev_f_df_train, 
                     ev_f_df_train[['g_target']], LinearRegression(), 
                     500, features=selected_features)

Building model with features: ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1', 'g_n_2', 'g_n_3']
Baseline linear model training set metrics:
R^2 for test set: 0.4381
Mean Squared Error for training set: 28.1622
Root Mean Squared Error for training set: 5.3068
Mean Absolute Error for training set: 3.9528



In [19]:
#build models to impute missing stats for the previous years
estimator = cv_model(pp_f_df_train, 
                     pp_f_df_train[['gp_target']], LinearRegression(), 
                     500, features=selected_features)

Building model with features: ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1', 'g_n_2', 'g_n_3']
Baseline linear model training set metrics:
R^2 for test set: 0.331
Mean Squared Error for training set: 448.8799
Root Mean Squared Error for training set: 21.1868
Mean Absolute Error for training set: 17.2744



In [22]:
from sklearn.linear_model import RidgeCV
def lambda_to_alpha(lambda_value, samples):
    return (lambda_value * samples) / 2.0
lambdas_ridge = [0.01, 0.025, 0.05, .075, 0.1]
alpha_ridge = [lambda_to_alpha(l, ev_f_df_train.shape[0]) for l in lambdas_ridge]
ridge_estimator = cv_model(ev_f_df_train, 
                           ev_f_df_train[['g_target']], 
                           RidgeCV(alphas=alpha_ridge, cv=10), 500,
                           features=selected_features)


Building model with features: ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1', 'g_n_2', 'g_n_3']
Baseline linear model training set metrics:
R^2 for test set: 0.4379
Mean Squared Error for training set: 28.1737
Root Mean Squared Error for training set: 5.3079
Mean Absolute Error for training set: 3.9533



In [23]:
from sklearn.linear_model import LassoCV
lasso_lambdas = [.01, .001, .002, .003, .0001]
lasso_alphas = [lambda_to_alpha(l, ev_f_df_train.shape[0]) for l in lasso_lambdas]
lasso_estimator = cv_model(ev_f_df_train, ev_f_df_train[['g_target']], 
                           LassoCV(alphas=lasso_alphas, max_iter=1000), 
                           500, features=selected_features)

Building model with features: ['g', 'isf', 'gp', 'toi', 'a1', 'a2', 'iff', 'icf', 'g_n_1', 'g_n_2', 'g_n_3']
Baseline linear model training set metrics:
R^2 for test set: 0.4225
Mean Squared Error for training set: 28.945
Root Mean Squared Error for training set: 5.3801
Mean Absolute Error for training set: 4.0243



In [25]:
ev_f_df_train.head(20)

Unnamed: 0,player,api_id,season_age,position,season,prev_season,gp,toi,g,a1,a2,isf,iff,icf,toi_gp,sh_percent,number_season,goals_shift,cum_goals,avg_goals_season,avg_sh_perc,sh_perc_diff,g_target,gp_target,toi_target,a1_target,a2_target,isf_target,iff_target,icf_target,toi_gp_target,sh_percent_target,avg_goals_season_target,g_n_1,gp_n_1,toi_n_1,a1_n_1,a2_n_1,isf_n_1,iff_n_1,icf_n_1,toi_gp_n_1,sh_percent_n_1,avg_goals_season_n_1,season_n_1,g_n_2,gp_n_2,toi_n_2,a1_n_2,a2_n_2,isf_n_2,iff_n_2,icf_n_2,toi_gp_n_2,sh_percent_n_2,avg_goals_season_n_2,season_n_2,g_n_3,gp_n_3,toi_n_3,a1_n_3,a2_n_3,isf_n_3,iff_n_3,icf_n_3,toi_gp_n_3,sh_percent_n_3,avg_goals_season_n_3,season_n_3
0,Rod Brind'Amour,8445735,37.0,C,20072008,20062007,59.0,888.78,12.0,12.0,8.0,98.0,121.0,147.0,15.06,0.122,1.0,14.0,12.0,12.0,0.122,0.0,9.0,80.0,1121.43,7.0,9.0,99.0,128.0,157.0,14.02,0.091,10.5,8.0,67.0,785.095,6.0,4.0,87.0,119.0,151.0,12.33,0.093,4.0,20122013.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
1,Rod Brind'Amour,8445735,38.0,C,20082009,20072008,80.0,1121.43,9.0,7.0,9.0,99.0,128.0,157.0,14.02,0.091,2.0,21.0,21.0,10.5,0.107,-0.016,6.0,79.0,791.23,4.0,3.0,73.0,98.0,125.0,10.02,0.082,5.0,12.0,59.0,888.78,12.0,8.0,98.0,121.0,147.0,15.06,0.122,12.0,20072008.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
3,Kris Draper,8446485,36.0,C,20072008,20062007,65.0,813.73,3.0,6.0,2.0,81.0,117.0,137.0,12.52,0.037,1.0,9.0,3.0,3.0,0.037,0.0,5.0,78.0,758.92,4.0,4.0,85.0,108.0,128.0,9.73,0.059,4.0,8.0,67.0,785.095,6.0,4.0,87.0,119.0,151.0,12.33,0.093,4.0,20122013.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
4,Kris Draper,8446485,37.0,C,20082009,20072008,78.0,758.92,5.0,4.0,4.0,85.0,108.0,128.0,9.73,0.059,2.0,8.0,8.0,4.0,0.048,0.011,6.0,81.0,881.12,8.0,6.0,97.0,124.0,152.0,10.88,0.062,3.666667,3.0,65.0,813.73,6.0,2.0,81.0,117.0,137.0,12.52,0.037,3.0,20072008.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
5,Kris Draper,8446485,38.0,C,20092010,20082009,81.0,881.12,6.0,8.0,6.0,97.0,124.0,152.0,10.88,0.062,3.0,11.0,11.0,3.666667,0.06,0.002,6.0,47.0,445.95,1.0,4.0,54.0,72.0,85.0,9.49,0.111,3.0,5.0,78.0,758.92,4.0,4.0,85.0,108.0,128.0,9.73,0.059,4.0,20082009.0,3.0,65.0,813.73,6.0,2.0,81.0,117.0,137.0,12.52,0.037,3.0,20072008.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
7,Sergei Fedorov,8446788,37.0,C,20072008,20062007,68.0,832.24,4.0,11.0,4.0,77.0,116.0,150.0,12.24,0.052,1.0,10.0,4.0,4.0,0.052,0.0,9.0,52.0,653.33,7.0,4.0,94.0,117.0,152.0,12.56,0.096,6.5,8.0,67.0,785.095,6.0,4.0,87.0,119.0,151.0,12.33,0.093,4.0,20122013.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
10,Bobby Holik,8447958,36.0,C,20072008,20062007,82.0,1049.02,11.0,6.0,7.0,114.0,154.0,199.0,12.79,0.096,1.0,16.0,11.0,11.0,0.096,0.0,4.0,62.0,604.55,1.0,3.0,76.0,96.0,117.0,9.75,0.053,7.5,8.0,67.0,785.095,6.0,4.0,87.0,119.0,151.0,12.33,0.093,4.0,20122013.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
13,Jaromir Jagr,8448208,39.0,R,20112012,20102011,73.0,934.0,11.0,14.0,7.0,117.0,162.0,213.0,12.79,0.094,2.0,29.0,29.0,14.5,0.095,-0.001,29.184028,45.0,672.52,8.0,6.0,77.0,114.0,149.0,14.94,0.379,13.394676,8.0,67.0,785.095,6.0,4.0,87.0,119.0,151.0,12.33,0.093,4.0,20122013.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
14,Jaromir Jagr,8448208,40.0,R,20122013,20112012,45.0,672.52,29.184028,8.0,6.0,77.0,114.0,149.0,14.94,0.379,3.0,40.184028,40.184028,13.394676,0.207,0.172,14.0,82.0,1297.3,16.0,14.0,181.0,246.0,305.0,15.82,0.077,10.796007,11.0,73.0,934.0,14.0,7.0,117.0,162.0,213.0,12.79,0.094,14.5,20112012.0,9.0,68.0,797.295,6.0,4.0,90.0,123.0,156.0,12.46,0.095,4.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
15,Jaromir Jagr,8448208,41.0,R,20132014,20122013,82.0,1297.3,14.0,16.0,14.0,181.0,246.0,305.0,15.82,0.077,4.0,43.184028,43.184028,10.796007,0.167,-0.09,13.0,77.0,1139.78,14.0,6.0,134.0,185.0,241.0,14.8,0.097,5.4,29.184028,45.0,672.52,8.0,6.0,77.0,114.0,149.0,14.94,0.379,13.394676,20122013.0,11.0,73.0,934.0,14.0,7.0,117.0,162.0,213.0,12.79,0.094,14.5,20112012.0,9.0,69.0,813.68,7.0,4.0,93.5,128.0,162.0,12.585,0.096,5.0,20112012.0
