In [1]:
# Import Dependencies
import sys
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from src.per_game_model import PerGameModel
from src.init_DFs.next_game import NextGameInit
# Project helpers used throughout the notebook.
pgModel = PerGameModel()
initNextGame = NextGameInit()

# Retrieve Data
# `season` and `situation` pick both the sub-folder and the file name of the
# per-game-by-team CSV that seeds every later cell.
season = "recent"
situation = "5on5"
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this directory exists; consider making it configurable.
csv_database = "C:/wamp64/www/bet-nhl-model/csv_database/"
per_game_csv_path = csv_database + f"{season}/PER_GAME_BY_TEAM_{situation}_{season}.csv"
df = pd.read_csv(per_game_csv_path)

In [2]:
# Add target (dependent variable)
# `target` is the name of the column that pgModel.add_target adds to `df`;
# it is what every later model in this notebook is fitted against.
target = 'next_reg_win'
target_cols = ['reg_win']
target_operations = []
include_placebo = False
include_null_targets = False
null_target_value = 2  # value written over missing targets when include_null_targets is True

# not spending any more time on PLACEBO rn...
if include_placebo:
    # NOTE(review): the frame returned by add_target is stored in `df_target`
    # but `df` (not `df_target`) is what gets passed to add_placebo below, so
    # the target column may be missing from `df` on this path -- confirm
    # against PerGameModel before enabling the placebo.
    df_target = pgModel.add_target(df, target, target_cols, target_operations, include_placebo)
    # Add placebo
    placebo_name = 'next_reg_odds'
    placebo_cols = ['odds']
    placebo_operations = []
    df = pgModel.add_placebo(df, placebo_name, placebo_cols, placebo_operations, include_null_targets)
elif include_null_targets:
    df = pgModel.add_target(df, target, target_cols, target_operations, include_null_targets)
    # Replace missing targets with the sentinel in a single .loc assignment.
    # The previous chained form (df[target][mask] = value) assigns through an
    # intermediate Series: it triggers SettingWithCopyWarning and does nothing
    # at all when pandas Copy-on-Write is active.
    df.loc[df[target].isna(), target] = null_target_value
else:
    df = pgModel.add_target(df, target, target_cols, target_operations)

In [3]:
# Checkpoint 1: inspect data
# Shows the frame right after the target column was added.
# Optional check -- rows whose target is still missing:
# df[pd.isnull(df["next_reg_win"])]
# Work on a copy so poking at `df_test` cannot disturb `df`.
df_test = df.copy()
df_test

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,unblockedShotAttempts,scoreAdjustedUnblockedShotAttempts,dZoneGiveaways,xGoalsFromxReboundsOfShots,xGoalsFromActualReboundsOfShots,reboundxGoals,totalShotCredit,scoreAdjustedTotalShotCredit,scoreFlurryAdjustedTotalShotCredit,next_reg_win
0,2021-01-13COLSTL,STL,COL,2020,2021-01-15COLSTL,4,1,1,1,0,...,32,34.686,1,0.296,0.565,0.565,1.761,1.929,1.923,0.0
1,2021-01-13COLSTL,COL,STL,2020,2021-01-15COLSTL,1,4,0,0,0,...,26,24.091,4,0.244,0.316,0.316,1.073,0.988,0.977,1.0
2,2021-01-13EDMVAN,VAN,EDM,2020,2021-01-14EDMVAN,5,3,1,1,0,...,38,40.785,4,0.414,0.195,0.195,3.032,3.260,3.233,0.0
3,2021-01-13EDMVAN,EDM,VAN,2020,2021-01-14EDMVAN,3,5,0,0,0,...,29,27.141,3,0.341,0.276,0.276,2.208,2.065,2.064,1.0
4,2021-01-13PHIPIT,PIT,PHI,2020,2021-01-15PHIPIT,3,6,0,0,0,...,37,37.660,5,0.317,0.395,0.395,1.839,1.894,1.881,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,2,5,0,0,0,...,19,17.316,3,0.200,0.000,0.000,1.329,1.221,1.220,0.0
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,4,3,1,0,1,...,41,40.351,3,0.412,0.321,0.321,2.608,2.500,2.474,1.0
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,3,2,1,0,1,...,44,41.582,2,0.488,0.000,0.000,3.438,3.224,3.145,0.0
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,4,2,1,1,0,...,27,27.030,6,0.274,0.286,0.286,1.624,1.584,1.559,1.0


In [4]:
# Specify columns needed for training
# Rebind `df` to a fresh copy, then keep every column that is not listed in
# pgModel.ignored_cols (original column order is preserved).
df = df.copy()
is_ignored = df.columns.isin(pgModel.ignored_cols)
selected_cols = df.columns[~is_ignored]

In [5]:
# Init model features
# Ridge classifier + forward sequential selection of 30 features, scored with
# a 3-fold time-ordered split.
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(
    estimator=rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
    n_jobs=4,
)

# Rescale every candidate column to [0, 1], overwriting the columns in `df`.
# NOTE(review): the scaler is fitted on the whole frame before any backtest
# split, so later seasons influence the scaling of earlier ones.
scaler = MinMaxScaler()
scaler.fit(df[selected_cols])
df[selected_cols] = scaler.transform(df[selected_cols])

In [6]:
# Find predictive columns
# Fit the selector on the scaled per-game columns and keep the names of the
# columns it marks as selected.
support_mask_1 = sfs.fit(df[selected_cols], df[target]).get_support()
predictors_1 = selected_cols[support_mask_1].tolist()
#predictors_1

In [7]:
# MODEL 1 -> basic data, split by season
# Backtest on a copy of `df` so the helper cannot modify the notebook's frame.
predictions_1 = pgModel.season_backtest(
    df.copy(),
    rr,
    predictors_1,
    target,
    start=1,
    step=1,
)
# Share of backtest rows where the predicted label equals the actual one.
accuracy_score(y_true=predictions_1["actual"], y_pred=predictions_1["predicted"])

0.6161655603279969

In [8]:
# Compare accuracy
# Baseline to read the model accuracy against: for each `is_home` value, the
# fraction of rows whose target equals 1.
def _positive_target_share(group):
    """Return (# rows with target == 1) / (# rows) for one group."""
    return (group[target] == 1).sum() / len(group)


df.groupby("is_home").apply(_positive_target_share)

is_home
0    0.370088
1    0.404100
dtype: float64

In [9]:
# Get Rolling Average(s)
# Only the grouping keys plus the candidate feature columns are needed here.
df_roll = df[["team", "season"] + list(selected_cols)]


def _rolling_team_averages(frame, num_games):
    """Apply pgModel.find_team_averages per (team, season) and suffix the result.

    Parameters
    ----------
    frame : DataFrame containing "team", "season" and the feature columns.
    num_games : int forwarded to pgModel.find_team_averages as `num_games`
        (presumably the rolling-window length -- confirm in PerGameModel).

    Returns
    -------
    (averaged, suffixed_cols) : the per-group result with every column renamed
        to "<col>_<num_games>", and the list of those new column names.
    """
    averaged = frame.groupby(["team", "season"], group_keys=False).apply(
        pgModel.find_team_averages, num_games=num_games
    )
    suffixed_cols = [f"{col}_{num_games}" for col in averaged.columns]
    averaged.columns = suffixed_cols
    return averaged, suffixed_cols


# One call per window replaces four copy-pasted stanzas; the variable names
# that later cells rely on are unchanged.
df_roll_32, roll_32_cols = _rolling_team_averages(df_roll, 32)
df_roll_16, roll_16_cols = _rolling_team_averages(df_roll, 16)
df_roll_8, roll_8_cols = _rolling_team_averages(df_roll, 8)
df_roll_4, roll_4_cols = _rolling_team_averages(df_roll, 4)

In [10]:
# Add Rolling Average(s) to DF
def _attach_rolling_and_next(base, rolling):
    """Join rolling columns onto `base`, drop incomplete rows, add next_* columns.

    Steps (same as the previous per-window copy-pasted code):
      1. column-wise concat of `base` and `rolling` (aligned on the index);
      2. drop every row containing a missing value -- i.e. games without a
         rolling average;
      3. add "next_is_home", "next_opp_team" and "next_game_date" using
         pgModel.add_col on "is_home", "opp_team" and "game_date".

    Returns the new frame; `base` and `rolling` are not modified.
    """
    combined = pd.concat([base.copy(), rolling.copy()], axis=1)

    # Remove games without rolling average
    combined = combined.dropna()

    # Add KNOWN next_ values to dataframe (would be known before game is played)
    for source_col in ("is_home", "opp_team", "game_date"):
        combined[f"next_{source_col}"] = pgModel.add_col(combined.copy(), source_col)
    return combined


# Windows are processed in the same 4 -> 8 -> 16 -> 32 order in which the
# original cell called pgModel.add_col.
df_4 = _attach_rolling_and_next(df, df_roll_4)
df_8 = _attach_rolling_and_next(df, df_roll_8)
df_16 = _attach_rolling_and_next(df, df_roll_16)
df_32 = _attach_rolling_and_next(df, df_roll_32)

# TODO: next_* values are currently not available for 2nd-to-last games
#       (the last games were deleted).
# TODO: might want to group by season as well
#       (but that might already be accounted for in prev functions / dropna()s).

In [24]:
# Checkpoint 2: inspect data
# Displays the 16-game frame after the rolling and next_* columns were added.
# Alternative views, swap one in for `df_16` as needed:
#df
df_16
#df_roll_16
#list(df_16.columns[df_16.dtypes == "object"])
#roll_16_cols
#df_16.columns

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,dZoneGiveaways_16,xGoalsFromxReboundsOfShots_16,xGoalsFromActualReboundsOfShots_16,reboundxGoals_16,totalShotCredit_16,scoreAdjustedTotalShotCredit_16,scoreFlurryAdjustedTotalShotCredit_16,next_is_home,next_opp_team,next_game_date
357,2021-02-08TORVAN,VAN,TOR,2020,2021-02-11VANCGY,0.090909,0.272727,0.0,0.0,0.0,...,0.226562,0.395531,0.127801,0.127801,0.385253,0.391290,0.390389,1.0,CGY,2021-02-11
384,2021-02-11MTLEDM,EDM,MTL,2020,2021-02-15EDMWPG,0.272727,0.000000,1.0,1.0,0.0,...,0.250000,0.425132,0.101478,0.101478,0.427756,0.435338,0.435309,1.0,WPG,2021-02-15
391,2021-02-11VANCGY,VAN,CGY,2020,2021-02-13VANCGY,0.090909,0.272727,0.0,0.0,0.0,...,0.218750,0.382833,0.131298,0.131298,0.367385,0.369155,0.368295,1.0,CGY,2021-02-13
402,2021-02-13CHICBJ,CHI,CBJ,2020,2021-02-15DETCHI,0.272727,0.181818,1.0,0.0,1.0,...,0.265625,0.365188,0.107597,0.107597,0.330277,0.331608,0.332191,0.0,DET,2021-02-15
403,2021-02-13CHICBJ,CBJ,CHI,2020,2021-02-15CARCBJ,0.181818,0.272727,0.0,0.0,1.0,...,0.164062,0.388605,0.075652,0.075652,0.346526,0.351463,0.352109,0.0,CAR,2021-02-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.130208,0.400726,0.096531,0.096531,0.379428,0.381893,0.381413,,,
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.138021,0.471224,0.201605,0.201605,0.453187,0.458534,0.457842,,,
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.135417,0.413094,0.113497,0.113497,0.412134,0.410542,0.409775,,,
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.119792,0.474522,0.138410,0.138410,0.469370,0.488297,0.486695,,,


In [25]:
# Combine team data with that of next_opp
def _merge_next_opponent(frame, rolling_cols):
    """Self-merge `frame` so each row also carries its next opponent's rolling columns.

    The left side is the full `frame`; the right side is `frame` restricted to
    `rolling_cols` plus "next_opp_team", "next_game_date" and "team". Rows are
    paired when left "team" equals right "next_opp_team" and both share the
    same "next_game_date". `how` is left at the merge default, so rows without
    a partner are dropped.

    Column names present on both sides get the default merge suffixes:
      _x => columns from the original team (left side)
      _y => columns from the next opponent's row (right side)
    """
    opponent_side = frame[rolling_cols + ["next_opp_team", "next_game_date", "team"]]
    return frame.merge(
        opponent_side,
        left_on=["team", "next_game_date"],
        right_on=["next_opp_team", "next_game_date"],
    )


# One call per window replaces four identical merge stanzas.
full_32 = _merge_next_opponent(df_32, roll_32_cols)
full_16 = _merge_next_opponent(df_16, roll_16_cols)
full_8 = _merge_next_opponent(df_8, roll_8_cols)
full_4 = _merge_next_opponent(df_4, roll_4_cols)

In [27]:
# Checkpoint 3: inspect data
# NOTE(review): both inspection lines are commented out, so this cell produces
# no output when re-run; the saved table below appears to come from an earlier
# run that displayed full_16.
#full_16
# Narrow view for checking that each team was paired with the right opponent:
#full_16[['team_x', 'next_opp_team_x', 'team_y', 'next_opp_team_y', 'next_game_date']]

Unnamed: 0,game_id,team_x,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedUnblockedShotAttempts_16_y,dZoneGiveaways_16_y,xGoalsFromxReboundsOfShots_16_y,xGoalsFromActualReboundsOfShots_16_y,reboundxGoals_16_y,totalShotCredit_16_y,scoreAdjustedTotalShotCredit_16_y,scoreFlurryAdjustedTotalShotCredit_16_y,next_opp_team_y,team_y
0,2021-02-13CHICBJ,CHI,CBJ,2020,2021-02-15DETCHI,0.272727,0.181818,1.0,0.0,1.0,...,0.249775,0.158854,0.336412,0.091028,0.091028,0.292006,0.296669,0.296712,CHI,DET
1,2021-02-13NSHDET,DET,NSH,2020,2021-02-15DETCHI,0.363636,0.181818,1.0,1.0,0.0,...,0.279859,0.265625,0.365188,0.107597,0.107597,0.330277,0.331608,0.332191,DET,CHI
2,2021-02-15DETCHI,DET,CHI,2020,2021-02-17DETCHI,0.181818,0.272727,0.0,0.0,1.0,...,0.283908,0.268229,0.364281,0.107597,0.107597,0.328122,0.331097,0.331740,DET,CHI
3,2021-02-15DETCHI,CHI,DET,2020,2021-02-17DETCHI,0.272727,0.181818,1.0,0.0,1.0,...,0.271620,0.140625,0.345894,0.093194,0.093194,0.298485,0.302959,0.302993,CHI,DET
4,2021-02-15TOROTT,OTT,TOR,2020,2021-02-17TOROTT,0.545455,0.454545,1.0,0.0,1.0,...,0.324974,0.244792,0.447724,0.094167,0.094167,0.474412,0.484169,0.482539,OTT,TOR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5265,2023-04-10LAKVAN,VAN,LAK,2022,2023-04-11ANAVAN,0.000000,0.272727,0.0,0.0,0.0,...,0.253469,0.158854,0.363456,0.147568,0.147568,0.317163,0.310324,0.309194,VAN,ANA
5266,2023-04-10NYRBUF,BUF,NYR,2022,2023-04-11NJDBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.474321,0.208333,0.530261,0.192049,0.192049,0.550940,0.562579,0.559353,BUF,NJD
5267,2023-04-10OTTCAR,CAR,OTT,2022,2023-04-11CARDET,0.181818,0.272727,0.0,0.0,0.0,...,0.269139,0.174479,0.368734,0.157422,0.157422,0.346473,0.346692,0.346797,CAR,DET
5268,2023-04-10WPGSJS,WPG,SJS,2022,2023-04-11MINWPG,0.545455,0.181818,1.0,1.0,0.0,...,0.327342,0.145833,0.394129,0.115683,0.115683,0.387409,0.399848,0.400428,WPG,MIN


In [28]:
# Declare candidates for predictive columns (no object datatypes)
def _candidate_columns(frame):
    """Split `frame`'s columns into ignored names and selectable candidates.

    Ignored = every column whose dtype is "object", followed by the names in
    pgModel.ignored_cols. Selected = all remaining columns, in original order.

    Returns
    -------
    (ignored, selected) : a list of ignored column names and a pandas Index of
        the columns left over as predictor candidates.
    """
    ignored = list(frame.columns[frame.dtypes == "object"]) + pgModel.ignored_cols
    selected = frame.columns[~frame.columns.isin(ignored)]
    return ignored, selected


# One call per window replaces eight near-identical assignments.
ignored_cols_32, selected_cols_32 = _candidate_columns(full_32)
ignored_cols_16, selected_cols_16 = _candidate_columns(full_16)
ignored_cols_8, selected_cols_8 = _candidate_columns(full_8)
ignored_cols_4, selected_cols_4 = _candidate_columns(full_4)

In [43]:
                            # here on out: one at a time
# Find predictive columns 
sfs.fit(full_32[selected_cols_32], full_32[target])
predictors_2 = list(selected_cols_32[sfs.get_support()])
predictors_2

['opp_scoreVenueAdjustedxGoals',
 'opp_faceOffsWon',
 'freeze',
 'takeaways',
 'mediumDangerxGoals',
 'lowDangerGoals',
 'mediumDangerGoals',
 'dZoneGiveaways',
 'opp_odds_32_x',
 'opp_xGoals_32_x',
 'opp_flurryAdjustedxGoals_32_x',
 'opp_scoreVenueAdjustedxGoals_32_x',
 'opp_flurryScoreVenueAdjustedxGoals_32_x',
 'opp_missedShots_32_x',
 'opp_rebounds_32_x',
 'opp_playStopped_32_x',
 'opp_mediumDangerShots_32_x',
 'opp_mediumDangerGoals_32_x',
 'opp_totalShotCredit_32_x',
 'penalityMinutes_32_x',
 'mediumDangerxGoals_32_x',
 'odds_32_y',
 'opp_xPlayContinuedInZone_32_y',
 'opp_xPlayContinuedOutsideZone_32_y',
 'opp_blockedShotAttempts_32_y',
 'opp_shotAttempts_32_y',
 'opp_scoreAdjustedShotsAttempts_32_y',
 'opp_unblockedShotAttempts_32_y',
 'opp_scoreFlurryAdjustedTotalShotCredit_32_y',
 'mediumDangerGoals_32_y']

In [44]:
# MODEL 2 -> rolling data, split by season
# Same backtest call as MODEL 1, but on the merged 32-game frame with the
# predictors chosen for it.
predictions_2 = pgModel.season_backtest(
    full_32.copy(),
    rr,
    predictors_2,
    target,
    start=1,
    step=1,
)
# Share of backtest rows where the predicted label equals the actual one.
accuracy_score(y_true=predictions_2["actual"], y_pred=predictions_2["predicted"])

0.6448966196258615

In [45]:
# Output pg_rolling_# model info
# Writes two files into the current working directory: the merged frame as CSV
# and the selected predictor names as one comma-separated line.
num = 32
df_output = full_32.copy()

df_output.to_csv(f'pg_rolling_{num}_{situation}_{season}.csv', header=True, index=False)

predictors_line = ", ".join(str(name) for name in predictors_2)
with open(f"pg_rolling_{num}_{situation}_{season}_predictors.txt", "w") as predictors_file:
    predictors_file.write(predictors_line)

In [None]:
# Export the merged frame of every rolling window to its own CSV in the
# current working directory (file names: pg_rolling_<window>_<situation>_<season>.csv).
frames_by_window = {4: full_4, 8: full_8, 16: full_16, 32: full_32}
for window, merged_frame in frames_by_window.items():
    merged_frame.to_csv(f'pg_rolling_{window}_{situation}_{season}.csv', header=True, index=False)