# NHL Goal Prediction Part 2: Electric Boogaloo

Going to look at trying to predict future NHL goals again by trying a couple different statistical techniques that may or may not work!

In [1]:
import os
import math
import itertools
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNetCV, LassoCV
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [48]:
test_data = pd.read_csv('csv_files/20202021stats.csv')
#reading in csv files to dataframes
ev_df = pd.read_csv('ev_player_goal_data.csv')
pp_df = pd.read_csv('pp_player_goal_data.csv')

ev_df['prev_season'] = ev_df.season - 10001
pp_df['prev_season'] = pp_df.season = 10001
#combine player stats into totals
ev_df.drop(['Unnamed: 0'], axis=1, inplace=True)
pp_df.drop(['Unnamed: 0'], axis=1, inplace=True)

ev_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()
pp_df = ev_df.groupby(['player', 'api_id', 'season_age', 'position', 'season', 'prev_season'])\
        [['gp', 'toi', 'g', 'a1', 'a2', 'points', 'isf', 'iff', 'icf']].sum().reset_index()

ev_f_df = ev_df[ev_df.position != 'D']
ev_d_df = ev_df[ev_df.position == 'D']

pp_f_df = pp_df[pp_df.position != 'D']
pp_d_df = pp_df[pp_df.position == 'D']

In [49]:
# creating target data for forwards and defense at even strength
ev_f_df = ev_f_df.merge(ev_f_df, how='left', 
                        left_on=['api_id', 'season'], right_on=['api_id', 'prev_season']
                       ,suffixes=[None, '_target'])
ev_f_mask = (ev_f_df.season != 20192020)&(ev_f_df.player_target.isna())
ev_f_df = ev_f_df[~ev_f_mask]


ev_d_df = ev_d_df.merge(ev_d_df, how='left', 
                        left_on=['api_id', 'season'], right_on=['api_id', 'prev_season']
                       ,suffixes=[None, '_target'])
ev_d_mask = (ev_d_df.season != 20192020)&(ev_d_df.player_target.isna())
ev_d_df = ev_d_df[~ev_d_mask]

#repeating process for power play
pp_f_df = pp_f_df.merge(pp_f_df, how='left', 
                        left_on=['api_id', 'season'], right_on=['api_id', 'prev_season']
                       ,suffixes=[None, '_target'])
pp_f_mask = (pp_f_df.season != 20192020)&(pp_f_df.player_target.isna())
pp_f_df = pp_f_df[~pp_f_mask]


pp_d_df = pp_d_df.merge(pp_d_df, how='left', 
                        left_on=['api_id', 'season'], right_on=['api_id', 'prev_season']
                       ,suffixes=[None, '_target'])
pp_d_mask = (pp_d_df.season != 20192020)&(pp_d_df.player_target.isna())
pp_d_df = pp_d_df[~pp_d_mask]

In [51]:
pp_d_df.head(10)

Unnamed: 0,player,api_id,season_age,position,season,prev_season,gp,toi,g,a1,a2,points,isf,iff,icf,player_target,season_age_target,position_target,season_target,prev_season_target,gp_target,toi_target,g_target,a1_target,a2_target,points_target,isf_target,iff_target,icf_target
0,Aaron Ekblad,8477932,18.0,D,20142015,20132014,81.0,1485.73,6.0,8.0,8.0,22.0,126.0,172.0,256.0,Aaron Ekblad,19.0,D,20152016.0,20142015.0,78.0,1378.1,12.0,6.0,8.0,26.0,134.0,175.0,228.0
1,Aaron Ekblad,8477932,19.0,D,20152016,20142015,78.0,1378.1,12.0,6.0,8.0,26.0,134.0,175.0,228.0,Aaron Ekblad,20.0,D,20162017.0,20152016.0,68.0,1194.68,5.0,3.0,4.0,12.0,156.0,208.0,270.0
2,Aaron Ekblad,8477932,20.0,D,20162017,20152016,68.0,1194.68,5.0,3.0,4.0,12.0,156.0,208.0,270.0,Aaron Ekblad,21.0,D,20172018.0,20162017.0,82.0,1500.0,11.0,3.0,9.0,23.0,140.0,215.0,291.0
3,Aaron Ekblad,8477932,21.0,D,20172018,20162017,82.0,1500.0,11.0,3.0,9.0,23.0,140.0,215.0,291.0,Aaron Ekblad,22.0,D,20182019.0,20172018.0,82.0,1562.65,8.0,6.0,11.0,25.0,129.0,171.0,234.0
4,Aaron Ekblad,8477932,22.0,D,20182019,20172018,82.0,1562.65,8.0,6.0,11.0,25.0,129.0,171.0,234.0,Aaron Ekblad,23.0,D,20192020.0,20182019.0,67.0,1268.28,5.0,17.0,10.0,32.0,122.0,155.0,218.0
5,Aaron Ekblad,8477932,23.0,D,20192020,20182019,67.0,1268.28,5.0,17.0,10.0,32.0,122.0,155.0,218.0,,,,,,,,,,,,,,
6,Aaron Johnson,8469534,24.0,D,20072008,20062007,30.0,342.2,0.0,0.0,1.0,1.0,13.0,18.0,34.0,Aaron Johnson,25.0,D,20082009.0,20072008.0,38.0,460.53,3.0,1.0,4.0,8.0,25.0,41.0,55.0
7,Aaron Johnson,8469534,25.0,D,20082009,20072008,38.0,460.53,3.0,1.0,4.0,8.0,25.0,41.0,55.0,Aaron Johnson,26.0,D,20092010.0,20082009.0,41.0,554.34,3.0,3.0,2.0,8.0,31.0,51.0,73.0
9,Aaron Johnson,8469534,28.0,D,20112012,20102011,56.0,793.07,2.0,5.0,7.0,14.0,50.0,73.0,96.0,Aaron Johnson,29.0,D,20122013.0,20112012.0,10.0,138.4,0.0,0.0,0.0,0.0,8.0,11.0,23.0
15,Aaron Ness,8474604,25.0,D,20152016,20142015,8.0,97.35,0.0,0.0,2.0,2.0,8.0,13.0,18.0,Aaron Ness,26.0,D,20162017.0,20152016.0,2.0,25.67,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [42]:
mask = (ev_f_df.player_target.isna()) & (ev_f_df.season != 20192020)