# import

In [1]:
import sys
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from src.constants import MODEL_CONST
from src.per_game_model import PerGameModel
from src.init_DFs.next_game import NextGameInit
# Shared helper instances used by the cells below (constructed in the same
# order as before: PerGameModel first, then NextGameInit).
pgModel, initNextGame = PerGameModel(), NextGameInit()

# configure

In [2]:
# --- Data selection: both values are interpolated into the CSV path/file name ---
season = "20_22"
situation = "5on5"

# --- Target definition: arguments handed to pgModel.add_target ---
target = "next_reg_win"
target_cols = ["reg_win"]
target_operations = list()

# --- Optional behaviours ---
include_placebo = False
include_null_targets = False
# Value written into rows whose target is missing when include_null_targets is on.
null_target_value = 2

# retrieve df

In [3]:
# Retrieve Data: read the per-game-by-team CSV for the configured season/situation.
df = pd.read_csv(MODEL_CONST['CSV_DB_PATH'] + f"{season}/PER_GAME_BY_TEAM_{situation}_{season}.csv")

# Attach the prediction target. Exactly one of the three branches runs,
# depending on the include_placebo / include_null_targets flags.
# not spending any more time on PLACEBO rn...
if include_placebo:
    # NOTE(review): df_target is never read again in this cell, and add_placebo
    # receives the original df. Unless add_target mutates df in place, the
    # target column is missing downstream -- confirm before enabling this branch.
    df_target = pgModel.add_target(df, target, target_cols, target_operations, include_placebo)
    # Add placebo
    placebo_name = 'next_reg_odds'
    placebo_cols = ['odds']
    placebo_operations = []
    df = pgModel.add_placebo(df, placebo_name, placebo_cols, placebo_operations, include_null_targets)
elif include_null_targets:
    df = pgModel.add_target(df, target, target_cols, target_operations, include_null_targets)
    # Fill missing targets with the sentinel using one .loc assignment.
    # The previous chained form (df[target][mask] = value) triggers
    # SettingWithCopyWarning and does not modify df under pandas Copy-on-Write.
    df.loc[df[target].isnull(), target] = null_target_value
else:
    df = pgModel.add_target(df, target, target_cols, target_operations)

In [4]:
# Checkpoint 1: inspect data
# Summarise a deep copy of the frame so the inspection cannot touch df itself.
# (To look at rows with a missing target instead: df[pd.isnull(df["next_reg_win"])])
df_test = df.copy(deep=True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6825 entries, 0 to 6824
Columns: 118 entries, game_id to next_reg_win
dtypes: float64(49), int64(62), object(7)
memory usage: 6.1+ MB


In [5]:
# Specify columns needed for training: every column of df that is not listed
# in pgModel.ignored_cols, kept in the frame's original column order.
df = df.copy()
is_ignored = df.columns.isin(pgModel.ignored_cols)
selected_cols = df.columns[~is_ignored]

In [6]:
# Init model features
# Classifier and time-ordered CV splitter shared by feature selection and backtests.
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

# Forward selection of 30 features, scored with the splitter above on 4 workers.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
    n_jobs=4,
)

# Rescale the candidate columns of df in place.
# NOTE(review): the scaler is fit on every row of df, including rows that the
# season backtest later treats as test data -- confirm this is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[selected_cols])
df[selected_cols] = scaled_values

In [7]:
# Find predictive columns
# Fit the sequential selector on the scaled candidates, then keep the names of
# the columns flagged by its support mask.
sfs.fit(df[selected_cols], df[target])
support_mask = sfs.get_support()
predictors_1 = [col for col, keep in zip(selected_cols, support_mask) if keep]

In [7]:
# MODEL 1 -> basic data, split by season
# Backtest on a copy of df with the selected predictors, then display the
# accuracy of the "predicted" column against the "actual" column.
predictions_1 = pgModel.season_backtest(
    df.copy(),
    rr,
    predictors_1,
    target,
    start=1,
    step=1,
)
accuracy_score(y_true=predictions_1["actual"], y_pred=predictions_1["predicted"])

0.6161655603279969

In [8]:
# Baseline to compare the model accuracy against: for each is_home group,
# the fraction of rows whose target equals 1.
def _target_rate(group):
    """Return the share of rows in `group` where the target column equals 1."""
    return (group[target] == 1).sum() / len(group)


df.groupby("is_home").apply(_target_rate)

is_home
0    0.370088
1    0.404100
dtype: float64

In [7]:
# Get Rolling Average(s)
df_roll = df[["team", "season"] + list(selected_cols)]


def _rolling_team_averages(frame, window):
    """Apply pgModel.find_team_averages per (team, season) group.

    `window` is forwarded as `num_games`. Every column of the result is renamed
    with a `_<window>` suffix. Returns the renamed frame together with the list
    of its new column names.
    """
    averaged = frame.groupby(["team", "season"], group_keys=False).apply(
        pgModel.find_team_averages, num_games=window
    )
    suffixed = [f"{col}_{window}" for col in averaged.columns]
    averaged.columns = suffixed
    return averaged, suffixed


# Same windows, computed in the same order as before: 32, 16, 8, 4.
df_roll_32, roll_32_cols = _rolling_team_averages(df_roll, 32)
df_roll_16, roll_16_cols = _rolling_team_averages(df_roll, 16)
df_roll_8, roll_8_cols = _rolling_team_averages(df_roll, 8)
df_roll_4, roll_4_cols = _rolling_team_averages(df_roll, 4)

In [8]:
# Add Rolling Average(s) to DF, then remove games without a rolling average:
# each frame is df with one window's rolling columns appended side by side,
# minus every row that contains a missing value.
df_32, df_16, df_8, df_4 = (
    pd.concat([df.copy(), rolled.copy()], axis=1).dropna()
    for rolled in (df_roll_32, df_roll_16, df_roll_8, df_roll_4)
)

# Add KNOWN next_ values to dataframe (would be known before game is played)
for frame in (df_4, df_8, df_16, df_32):
    for known_col in ("is_home", "opp_team", "game_date"):
        frame[f"next_{known_col}"] = pgModel.add_col(frame.copy(), known_col)

# currently don't have access to this for 2nd to last games (deleted last games)
# might want to group by season as well..
# (but that might've already been accounted for in prev functions / dropna()s)

In [38]:
# Checkpoint 2: inspect data
# Displays the column index of the 16-game frame. The commented-out lines are
# alternative views kept for manual inspection; uncomment one and make it the
# last line of the cell to display it instead.
#df
#df_16
#df_roll_16
#list(df_16.columns[df_16.dtypes == "object"])
#roll_16_cols
df_16.columns

Index(['game_id', 'team', 'opp_team', 'season', 'next_game_id', 'score',
       'opp_score', 'win', 'reg_win', 'overtime',
       ...
       'dZoneGiveaways_16', 'xGoalsFromxReboundsOfShots_16',
       'xGoalsFromActualReboundsOfShots_16', 'reboundxGoals_16',
       'totalShotCredit_16', 'scoreAdjustedTotalShotCredit_16',
       'scoreFlurryAdjustedTotalShotCredit_16', 'next_is_home',
       'next_opp_team', 'next_game_date'],
      dtype='object', length=229)

In [12]:
# Combine team data with that of next_opp
def _attach_next_opponent(frame, rolling_cols):
    """Self-merge `frame` so each row also carries rolling columns from the other side.

    The right-hand side is restricted to `rolling_cols` plus the join/identity
    columns. Rows are paired where the left `team` equals the right
    `next_opp_team` and both share the same `next_game_date`.
    """
    opponent_view = frame[rolling_cols + ["next_opp_team", "next_game_date", "team"]]
    return frame.merge(
        opponent_view,
        left_on=["team", "next_game_date"],
        right_on=["next_opp_team", "next_game_date"],
    )


full_32 = _attach_next_opponent(df_32, roll_32_cols)
full_16 = _attach_next_opponent(df_16, roll_16_cols)
full_8 = _attach_next_opponent(df_8, roll_8_cols)
full_4 = _attach_next_opponent(df_4, roll_4_cols)

# When two columns share a name, pandas suffixes them: left = _x, right = _y.
#   _x => cols from original team
#   _y => cols from next_opp_team DF

In [30]:
# Checkpoint 3: inspect data
# Displays the merged 16-game frame. The commented-out lines are alternative
# views: the team / next-opponent pairing columns, and the object-dtype columns.
full_16
#full_16[['team_x', 'next_opp_team_x', 'team_y', 'next_opp_team_y', 'next_game_date']]
#list(full_16.columns[full_16.dtypes == "object"])

Unnamed: 0,game_id,team_x,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedUnblockedShotAttempts_16_y,dZoneGiveaways_16_y,xGoalsFromxReboundsOfShots_16_y,xGoalsFromActualReboundsOfShots_16_y,reboundxGoals_16_y,totalShotCredit_16_y,scoreAdjustedTotalShotCredit_16_y,scoreFlurryAdjustedTotalShotCredit_16_y,next_opp_team_y,team_y
0,2014-11-11CHITBL,TBL,CHI,2014,2014-11-13TBLSJS,0.181818,0.272727,0.0,0.0,1.0,...,0.368994,0.1625,0.436758,0.109961,0.136999,0.394044,0.403195,0.401463,TBL,SJS
1,2014-11-11FLASJS,SJS,FLA,2014,2014-11-13TBLSJS,0.090909,0.363636,0.0,0.0,0.0,...,0.364088,0.1200,0.437170,0.155912,0.163124,0.406583,0.413527,0.413759,SJS,TBL
2,2014-11-11MTLWPG,MTL,WPG,2014,2014-11-13MTLBOS,0.272727,0.000000,1.0,1.0,0.0,...,0.366031,0.1175,0.411774,0.082208,0.100803,0.392620,0.392276,0.394731,MTL,BOS
3,2014-11-11VANOTT,VAN,OTT,2014,2014-11-14VANARI,0.363636,0.272727,1.0,0.0,1.0,...,0.296956,0.1075,0.366013,0.115007,0.115007,0.319122,0.314493,0.316935,VAN,ARI
4,2014-11-12ANALAK,ANA,LAK,2014,2014-11-15LAKANA,0.545455,0.454545,1.0,0.0,1.0,...,0.383868,0.2050,0.415402,0.121166,0.121166,0.373315,0.377085,0.380670,ANA,LAK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16841,2023-04-10LAKVAN,VAN,LAK,2022,2023-04-11ANAVAN,0.000000,0.272727,0.0,0.0,0.0,...,0.275263,0.1525,0.363456,0.147568,0.147568,0.317163,0.308513,0.309194,VAN,ANA
16842,2023-04-10NYRBUF,BUF,NYR,2022,2023-04-11NJDBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.483547,0.2000,0.530261,0.192049,0.192049,0.550940,0.559295,0.559353,BUF,NJD
16843,2023-04-10OTTCAR,CAR,OTT,2022,2023-04-11CARDET,0.181818,0.272727,0.0,0.0,0.0,...,0.290041,0.1675,0.368734,0.157422,0.157422,0.346473,0.344669,0.346797,CAR,DET
16844,2023-04-10WPGSJS,WPG,SJS,2022,2023-04-11MINWPG,0.545455,0.181818,1.0,1.0,0.0,...,0.344932,0.1400,0.394129,0.115683,0.115683,0.387409,0.397514,0.400428,WPG,MIN


In [28]:
# Declare candidates for predictive columns (no object datatypes)
def _candidate_columns(frame):
    """Return `(ignored, selected)` column collections for `frame`.

    `ignored` is the list of object-dtype columns followed by
    pgModel.ignored_cols; `selected` is the Index of every remaining column.
    """
    object_cols = list(frame.columns[frame.dtypes == "object"])
    ignored = object_cols + pgModel.ignored_cols
    selected = frame.columns[~frame.columns.isin(ignored)]
    return ignored, selected


ignored_cols_32, selected_cols_32 = _candidate_columns(full_32)
ignored_cols_16, selected_cols_16 = _candidate_columns(full_16)
ignored_cols_8, selected_cols_8 = _candidate_columns(full_8)
ignored_cols_4, selected_cols_4 = _candidate_columns(full_4)

In [43]:
                            # here on out: one at a time
# Find predictive columns 
sfs.fit(full_32[selected_cols_32], full_32[target])
predictors_2 = list(selected_cols_32[sfs.get_support()])
predictors_2

['opp_scoreVenueAdjustedxGoals',
 'opp_faceOffsWon',
 'freeze',
 'takeaways',
 'mediumDangerxGoals',
 'lowDangerGoals',
 'mediumDangerGoals',
 'dZoneGiveaways',
 'opp_odds_32_x',
 'opp_xGoals_32_x',
 'opp_flurryAdjustedxGoals_32_x',
 'opp_scoreVenueAdjustedxGoals_32_x',
 'opp_flurryScoreVenueAdjustedxGoals_32_x',
 'opp_missedShots_32_x',
 'opp_rebounds_32_x',
 'opp_playStopped_32_x',
 'opp_mediumDangerShots_32_x',
 'opp_mediumDangerGoals_32_x',
 'opp_totalShotCredit_32_x',
 'penalityMinutes_32_x',
 'mediumDangerxGoals_32_x',
 'odds_32_y',
 'opp_xPlayContinuedInZone_32_y',
 'opp_xPlayContinuedOutsideZone_32_y',
 'opp_blockedShotAttempts_32_y',
 'opp_shotAttempts_32_y',
 'opp_scoreAdjustedShotsAttempts_32_y',
 'opp_unblockedShotAttempts_32_y',
 'opp_scoreFlurryAdjustedTotalShotCredit_32_y',
 'mediumDangerGoals_32_y']

In [44]:
# MODEL 2 -> rolling data, split by season
# Backtest on a copy of the merged 32-game frame, then display the accuracy of
# the "predicted" column against the "actual" column.
predictions_2 = pgModel.season_backtest(
    full_32.copy(),
    rr,
    predictors_2,
    target,
    start=1,
    step=1,
)
accuracy_score(y_true=predictions_2["actual"], y_pred=predictions_2["predicted"])

0.6448966196258615

In [45]:
# Output pg_rolling_# model info
# Writes the merged frame for the chosen window as CSV, plus a text file that
# lists the selected predictor names separated by ", ".
num = 32
df_output = full_32.copy()

df_output.to_csv(f'pg_rolling_{num}_{situation}_{season}.csv', header=True, index=False)

predictor_line = ", ".join(str(name) for name in predictors_2)
with open(f"pg_rolling_{num}_{situation}_{season}_predictors.txt", "w") as predictors_file:
    predictors_file.write(predictor_line)

In [None]:
# Write the merged frame of every rolling window to its own CSV file
# (with a header row, without the index), in the order 4, 8, 16, 32.
for window, frame in ((4, full_4), (8, full_8), (16, full_16), (32, full_32)):
    frame.to_csv(f'pg_rolling_{window}_{situation}_{season}.csv', header=True, index=False)

In [None]:
# ADD PERMANENT model ADD / TEMP model REMOVE for these csvs ^^^