In [1]:
# Import Dependencies
import sys
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from src.per_game_model import PerGameModel
from src.init_DFs.next_game import NextGameInit
# Project helpers: per-game modelling utilities and next-game initialiser.
pgModel = PerGameModel()
initNextGame = NextGameInit()

# Dataset selection: which season bucket and which game situation to load.
season = "recent"
situation = "5on5"

# Root folder of the CSV exports.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
csv_database = "C:/wamp64/www/bet-nhl-model/csv_database/"

# Load the per-game, per-team table for the chosen season/situation.
per_game_csv = csv_database + f"{season}/PER_GAME_BY_TEAM_{situation}_{season}.csv"
df = pd.read_csv(per_game_csv)

In [2]:
# Add target (dependent variable)
# `target` is the name of the column to create; `target_cols` / `target_operations`
# are forwarded unchanged to pgModel.add_target.
target = 'next_reg_win'
target_cols = ['reg_win']
target_operations = []

# Switches selecting which branch below runs.
include_placebo = False
include_null_targets = False
# Value written into rows whose target is missing (include_null_targets branch only).
null_target_value = 2

# not spending any more time on PLACEBO rn...
if include_placebo:
    # NOTE(review): the result of add_target is stored in `df_target` and never
    # used afterwards, so `df` only gains the target column here if add_target
    # mutates its argument in place -- confirm before enabling this branch.
    df_target = pgModel.add_target(df, target, target_cols, target_operations, include_placebo)
    # Add placebo
    placebo_name = 'next_reg_odds'
    placebo_cols = ['odds']
    placebo_operations = []
    df = pgModel.add_placebo(df, placebo_name, placebo_cols, placebo_operations, include_null_targets)
elif include_null_targets:
    df = pgModel.add_target(df, target, target_cols, target_operations, include_null_targets)
    # Fill missing targets with a single .loc assignment. The previous chained
    # form (df[target][mask] = value) writes through an intermediate Series,
    # which triggers SettingWithCopyWarning and is a no-op under Copy-on-Write.
    df.loc[df[target].isna(), target] = null_target_value
else:
    df = pgModel.add_target(df, target, target_cols, target_operations)

In [3]:
# Checkpoint 1: snapshot the frame (with its target column) and display it.
# To list rows with a missing target instead: df[pd.isnull(df["next_reg_win"])]
df_test = df.copy(deep=True)
df_test

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,unblockedShotAttempts,scoreAdjustedUnblockedShotAttempts,dZoneGiveaways,xGoalsFromxReboundsOfShots,xGoalsFromActualReboundsOfShots,reboundxGoals,totalShotCredit,scoreAdjustedTotalShotCredit,scoreFlurryAdjustedTotalShotCredit,next_reg_win
0,2021-01-13COLSTL,STL,COL,2020,2021-01-15COLSTL,4,1,1,1,0,...,32,34.686,1,0.296,0.565,0.565,1.761,1.929,1.923,0.0
1,2021-01-13COLSTL,COL,STL,2020,2021-01-15COLSTL,1,4,0,0,0,...,26,24.091,4,0.244,0.316,0.316,1.073,0.988,0.977,1.0
2,2021-01-13EDMVAN,VAN,EDM,2020,2021-01-14EDMVAN,5,3,1,1,0,...,38,40.785,4,0.414,0.195,0.195,3.032,3.260,3.233,0.0
3,2021-01-13EDMVAN,EDM,VAN,2020,2021-01-14EDMVAN,3,5,0,0,0,...,29,27.141,3,0.341,0.276,0.276,2.208,2.065,2.064,1.0
4,2021-01-13PHIPIT,PIT,PHI,2020,2021-01-15PHIPIT,3,6,0,0,0,...,37,37.660,5,0.317,0.395,0.395,1.839,1.894,1.881,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,2,5,0,0,0,...,19,17.316,3,0.200,0.000,0.000,1.329,1.221,1.220,0.0
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,4,3,1,0,1,...,41,40.351,3,0.412,0.321,0.321,2.608,2.500,2.474,1.0
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,3,2,1,0,1,...,44,41.582,2,0.488,0.000,0.000,3.438,3.224,3.145,0.0
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,4,2,1,1,0,...,27,27.030,6,0.274,0.286,0.286,1.624,1.584,1.559,1.0


In [4]:
# Training candidates: every column that pgModel does not list as ignored.
df = df.copy()
is_ignored = df.columns.isin(pgModel.ignored_cols)
selected_cols = df.columns[~is_ignored]

Index(['score', 'opp_score', 'win', 'reg_win', 'overtime', 'odds', 'opp_odds',
       'ot_odds', 'iceTime', 'opp_xGoalsPercentage',
       ...
       'scoreAdjustedShotsAttempts', 'unblockedShotAttempts',
       'scoreAdjustedUnblockedShotAttempts', 'dZoneGiveaways',
       'xGoalsFromxReboundsOfShots', 'xGoalsFromActualReboundsOfShots',
       'reboundxGoals', 'totalShotCredit', 'scoreAdjustedTotalShotCredit',
       'scoreFlurryAdjustedTotalShotCredit'],
      dtype='object', length=111)

In [5]:
# Estimator and time-ordered cross-validation splitter.
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

# Forward selection of 30 features, scored with the splitter above.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
    n_jobs=4,
)

# Min-max scale every candidate column, overwriting the values in `df`.
# NOTE(review): the scaler is fit on all rows of `df` at once, i.e. before any
# train/test split made later -- confirm this is acceptable for the backtest.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[selected_cols])
df[selected_cols] = scaled_values

In [6]:
# Run the sequential selector on the scaled candidates against the target,
# then keep the names of the columns it marked as selected.
sfs.fit(df[selected_cols], df[target])
support_mask = sfs.get_support()
predictors = selected_cols[support_mask].tolist()
# (append a bare `predictors` line to display the list)

['score',
 'ot_odds',
 'iceTime',
 'opp_corsiPercentage',
 'corsiPercentage',
 'opp_xOnGoal',
 'opp_xPlayContinuedInZone',
 'opp_shotsOnGoal',
 'opp_missedShots',
 'opp_goals',
 'opp_rebounds',
 'opp_reboundGoals',
 'opp_playStopped',
 'opp_playContinuedInZone',
 'opp_savedShotsOnGoal',
 'opp_savedUnblockedShotAttempts',
 'opp_highDangerGoals',
 'opp_unblockedShotAttempts',
 'opp_scoreAdjustedUnblockedShotAttempts',
 'xPlayStopped',
 'xPlayContinuedOutsideZone',
 'shotsOnGoal',
 'playStopped',
 'faceOffsWon',
 'giveaways',
 'mediumDangerShots',
 'lowDangerGoals',
 'totalShotCredit',
 'scoreAdjustedTotalShotCredit',
 'scoreFlurryAdjustedTotalShotCredit']

In [7]:
# MODEL 1 -> basic (non-rolling) data, backtested via pgModel.season_backtest.
predictions_1 = pgModel.season_backtest(
    df, rr, predictors, target, start=1, step=1
)
# Share of backtest rows where the prediction equals the actual outcome.
accuracy_score(
    y_true=predictions_1["actual"],
    y_pred=predictions_1["predicted"],
)

0.619140625

In [8]:
# Baseline for comparison: per `is_home` group, the fraction of rows whose
# target equals 1 (rows with any other target value still count in the total).
df.groupby("is_home").apply(
    lambda grp: len(grp[grp[target] == 1]) / len(grp)
)

is_home
0    0.370088
1    0.404100
dtype: float64

In [10]:
# Get Rolling Average(s)
# Input for the rolling step: grouping keys plus every candidate feature.
df_roll = df[["team", "season"] + list(selected_cols)]


def _team_rolling_averages(frame, window):
    """Apply pgModel.find_team_averages per (team, season) and suffix the columns.

    Args:
        frame: DataFrame holding "team", "season" and the feature columns.
        window: value forwarded as `num_games`; also used as the column suffix.

    Returns:
        (averaged, suffixed_cols): the result frame with every column renamed
        to "<col>_<window>", and the list of those new column names.
    """
    averaged = frame.groupby(["team", "season"], group_keys=False).apply(
        pgModel.find_team_averages, num_games=window
    )
    suffixed_cols = [f"{col}_{window}" for col in averaged.columns]
    averaged.columns = suffixed_cols
    return averaged, suffixed_cols


df_roll_32, roll_32_cols = _team_rolling_averages(df_roll, 32)
df_roll_16, roll_16_cols = _team_rolling_averages(df_roll, 16)
df_roll_8, roll_8_cols = _team_rolling_averages(df_roll, 8)
df_roll_4, roll_4_cols = _team_rolling_averages(df_roll, 4)

In [15]:
# Put each rolling-average frame side by side with the per-game frame.
df_32, df_16, df_8, df_4 = (
    pd.concat([df.copy(), rolled.copy()], axis=1)
    for rolled in (df_roll_32, df_roll_16, df_roll_8, df_roll_4)
)

# Drop every row that contains a missing value after the concat.
df_32, df_16, df_8, df_4 = (
    combined.dropna() for combined in (df_32, df_16, df_8, df_4)
)

# Add KNOWN next_ values (would be known before the next game is played):
# one "next_<col>" column per source column, produced by pgModel.add_col.
for frame in (df_4, df_8, df_16, df_32):
    for known_col in ("is_home", "opp_team", "game_date"):
        frame[f"next_{known_col}"] = pgModel.add_col(frame.copy(), known_col)

    # currently don't have access to this for 2nd to last games (deleted last games)
    # might want to group by season as well.. 
    # (but that mightve already been accounted for in prev functions / dropna()s)

In [20]:
# Checkpoint 2: inspect data
#df
#df_16
#df_roll_16

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedShotsAttempts_4,unblockedShotAttempts_4,scoreAdjustedUnblockedShotAttempts_4,dZoneGiveaways_4,xGoalsFromxReboundsOfShots_4,xGoalsFromActualReboundsOfShots_4,reboundxGoals_4,totalShotCredit_4,scoreAdjustedTotalShotCredit_4,scoreFlurryAdjustedTotalShotCredit_4
66,2021-01-18CGYVAN,VAN,CGY,2020,2021-01-20VANMTL,0.181818,0.454545,0.0,0.0,0.0,...,0.390505,0.347222,0.328568,0.187500,0.398087,0.067228,0.067228,0.387879,0.404278,0.406628
71,2021-01-18EDMMTL,EDM,MTL,2020,2021-01-20TOREDM,0.090909,0.272727,0.0,0.0,0.0,...,0.305983,0.337963,0.291027,0.270833,0.430079,0.080579,0.080579,0.431087,0.410778,0.405938
81,2021-01-18TORWPG,TOR,WPG,2020,2021-01-20TOREDM,0.272727,0.090909,1.0,1.0,0.0,...,0.436806,0.365741,0.348000,0.125000,0.418865,0.120391,0.120391,0.399373,0.402076,0.404291
84,2021-01-19DETCBJ,DET,CBJ,2020,2021-01-22CHIDET,0.272727,0.181818,1.0,0.0,1.0,...,0.283463,0.300926,0.271595,0.270833,0.359499,0.093214,0.093214,0.307001,0.296551,0.296102
85,2021-01-19DETCBJ,CBJ,DET,2020,2021-01-21CBJTBL,0.181818,0.272727,0.0,0.0,1.0,...,0.427357,0.384259,0.370691,0.104167,0.408641,0.068897,0.068897,0.352351,0.370151,0.369025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.369653,0.356481,0.327079,0.125000,0.419855,0.191036,0.191036,0.377847,0.373506,0.370140
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.432163,0.486111,0.440792,0.104167,0.507586,0.172123,0.172123,0.517555,0.516723,0.514393
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.300673,0.305556,0.263563,0.156250,0.395778,0.065162,0.065162,0.423145,0.407528,0.406097
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.377560,0.351852,0.341830,0.145833,0.400726,0.068659,0.068659,0.393730,0.405064,0.406416


In [21]:
# Combine team data with that of next_opp
def _merge_with_next_opp(team_frame, rolling_cols):
    """Pair each row with the rolling averages of its next opponent.

    The left side must be the frame that carries the next_* columns
    (df_4 / df_8 / df_16 / df_32). The plain `df` never receives
    "next_game_date", so using it as the left frame raises KeyError and could
    not yield the `*_x` next_* columns inspected afterwards.

    Args:
        team_frame: frame containing "team", "next_opp_team",
            "next_game_date" and the columns named in `rolling_cols`.
        rolling_cols: list of rolling-average column names to bring over
            from the opponent's row.

    Returns:
        The merged DataFrame (default inner join). Columns present on both
        sides get pandas' default suffixes: `_x` for the team's own row,
        `_y` for the next opponent's row.
    """
    opponent_view = team_frame[rolling_cols + ["next_opp_team", "next_game_date", "team"]]
    return team_frame.merge(
        opponent_view,
        # A team's row matches the row of the team whose next opponent it is,
        # on the same upcoming game date.
        left_on=["team", "next_game_date"],
        right_on=["next_opp_team", "next_game_date"],
    )


full_32 = _merge_with_next_opp(df_32, roll_32_cols)
full_16 = _merge_with_next_opp(df_16, roll_16_cols)
full_8 = _merge_with_next_opp(df_8, roll_8_cols)
full_4 = _merge_with_next_opp(df_4, roll_4_cols)
            # when 2 cols have same name, left=_x & right=_y
            # _y => cols from next_opp_team DF
            # _x => cols from original team

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roll_8["next_is_home"] = pgModel.add_col(df_roll_8.copy(), "is_home")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roll_8["next_opp_team"] = pgModel.add_col(df_roll_8.copy(), "opp_team")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_roll_8["next_game_date"] = pgModel.add_col(df_roll_8.c

In [22]:
# Checkpoint 3: show the team / next-opponent pairing produced by the merge.
checkpoint_cols = [
    'team_x',
    'next_opp_team_x',
    'team_y',
    'next_opp_team_y',
    'next_game_date',
]
full_16[checkpoint_cols]

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,dZoneGiveaways_8,xGoalsFromxReboundsOfShots_8,xGoalsFromActualReboundsOfShots_8,reboundxGoals_8,totalShotCredit_8,scoreAdjustedTotalShotCredit_8,scoreFlurryAdjustedTotalShotCredit_8,next_is_home,next_opp_team,next_game_date
169,2021-01-25VANOTT,VAN,OTT,2020,2021-01-27VANOTT,0.636364,0.090909,1.0,1.0,0.0,...,0.177083,0.370712,0.101081,0.101081,0.351541,0.355420,0.355030,1.0,OTT,2021-01-27
179,2021-01-26CGYTOR,TOR,CGY,2020,2021-01-28EDMTOR,0.363636,0.272727,1.0,1.0,0.0,...,0.166667,0.393305,0.101001,0.101001,0.382759,0.390491,0.391598,0.0,EDM,2021-01-28
193,2021-01-26WPGEDM,EDM,WPG,2020,2021-01-28EDMTOR,0.363636,0.545455,0.0,0.0,0.0,...,0.218750,0.449868,0.113597,0.113597,0.429180,0.433608,0.432175,1.0,TOR,2021-01-28
196,2021-01-27NSHCHI,CHI,NSH,2020,2021-01-29CHICBJ,0.090909,0.181818,0.0,0.0,1.0,...,0.161458,0.359993,0.130404,0.130404,0.334039,0.342525,0.343212,1.0,CBJ,2021-01-29
198,2021-01-27VANOTT,VAN,OTT,2020,2021-01-28VANOTT,0.454545,0.090909,1.0,1.0,0.0,...,0.182292,0.376154,0.147608,0.147608,0.345742,0.344805,0.344513,1.0,OTT,2021-01-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.130208,0.418041,0.129331,0.129331,0.392215,0.388289,0.386579,,,
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.125000,0.504453,0.205022,0.205022,0.487905,0.499135,0.496415,,,
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.135417,0.357850,0.048514,0.048514,0.382602,0.372641,0.373274,,,
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.130208,0.475429,0.153528,0.153528,0.464734,0.480525,0.479286,,,


In [None]:
# Declare candidates for predictive columns (no object datatypes)
def _candidate_columns(frame):
    """Split a merged frame's columns into ignored and selectable ones.

    Args:
        frame: one of the merged full_* DataFrames.

    Returns:
        (ignored, selected): `ignored` is the list of object-dtype columns
        followed by pgModel.ignored_cols; `selected` is the Index of the
        frame's columns that are not in `ignored`.
    """
    object_cols = list(frame.columns[frame.dtypes == "object"])
    ignored = object_cols + pgModel.ignored_cols
    selected = frame.columns[~frame.columns.isin(ignored)]
    return ignored, selected


ignored_cols_32, selected_cols_32 = _candidate_columns(full_32)
ignored_cols_16, selected_cols_16 = _candidate_columns(full_16)
ignored_cols_8, selected_cols_8 = _candidate_columns(full_8)
ignored_cols_4, selected_cols_4 = _candidate_columns(full_4)

In [None]:
                            # here on out: one at a time
# Re-run the sequential selector, this time on the 4-game rolling frame,
# and display the names of the columns it kept.
sfs.fit(full_4[selected_cols_4], full_4[target])
support_mask_4 = sfs.get_support()
predictors = selected_cols_4[support_mask_4].tolist()
predictors

In [None]:
# MODEL 2 -> rolling data, split by season
# Backtest on full_4, the frame the predictors above were selected from
# (the bare name `full` is never defined in this notebook).
predictions = pgModel.season_backtest(full_4, rr, predictors, target, start=1, step=1)
# Read the same result columns as the MODEL 1 cell ("actual" / "predicted"),
# which is what season_backtest returned there.
accuracy_score(predictions["actual"], predictions["predicted"])