In [86]:
# Import Dependencies
import sys
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from src.per_game import PerGameData
from src.init_DFs.next_game import NextGameInit

perGameData = PerGameData()
nextGameInit = NextGameInit()

In [87]:
# Retrieve Data
season = "recent"
situation = "5on5"
csv_database = "C:/wamp64/www/bet-nhl-model/csv_database/"
df = pd.read_csv(csv_database + f"{season}/PER_GAME_BY_TEAM_{situation}_{season}.csv")

# Add next game data and only include data with a next game
df = nextGameInit.add_next_game_data(df)

In [88]:
# REMOVE IRRELEVANT COLS
removed_cols = nextGameInit.remove_cols
selected_cols = df.columns[~df.columns.isin(removed_cols)]

# INIT MODEL FEATURES
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

scaler = MinMaxScaler()
df[selected_cols] = scaler.fit_transform(df[selected_cols])

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedTotalShotCredit,scoreFlurryAdjustedTotalShotCredit,next_win,next_reg_win,next_overtime,next_score,next_opp_score,next_xGoals,next_opp_xGoals,next_xGoalsPercentage
0,2021-01-13COLSTL,STL,COL,2020,2021-01-15COLSTL,0.363636,0.090909,1.0,1.0,0.0,...,0.404487,0.408540,0,0,0,0,8,0.471,1.324,0.2624
1,2021-01-13COLSTL,COL,STL,2020,2021-01-15COLSTL,0.090909,0.363636,0.0,0.0,0.0,...,0.207171,0.207563,1,1,0,8,0,1.324,0.471,0.7376
2,2021-01-13EDMVAN,VAN,EDM,2020,2021-01-14EDMVAN,0.454545,0.272727,1.0,1.0,0.0,...,0.683581,0.686849,0,0,0,2,5,2.532,3.118,0.4481
3,2021-01-13EDMVAN,EDM,VAN,2020,2021-01-14EDMVAN,0.272727,0.454545,0.0,0.0,0.0,...,0.433005,0.438496,1,1,0,5,2,3.118,2.532,0.5519
4,2021-01-13PHIPIT,PIT,PHI,2020,2021-01-15PHIPIT,0.272727,0.545455,0.0,0.0,0.0,...,0.397148,0.399618,0,0,0,2,5,2.480,1.415,0.6367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.256029,0.259188,0,0,0,0,1,0.916,3.070,0.2298
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.524219,0.525600,1,1,0,5,2,4.182,3.588,0.5382
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.676033,0.668154,0,0,0,2,5,3.588,4.182,0.4618
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.332145,0.331209,1,1,0,4,3,2.625,1.532,0.6315


In [89]:
sfs.fit(df[selected_cols], df["next_reg_win"])

In [90]:
predictors = list(selected_cols[sfs.get_support()])
# predictors

In [91]:
# SPLIT DATA BY SZNs
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["next_reg_win"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["next_reg_win"], preds], axis=1)
        combined.columns = ["res", "pred"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

predictions = backtest(df, rr, predictors)
accuracy_score(predictions["res"], predictions["pred"])

0.619140625

In [92]:
df.groupby("is_home").apply(lambda x: x[x["next_reg_win"] == 1].shape[0] / x.shape[0])

is_home
0    0.370088
1    0.404100
dtype: float64

In [94]:
def find_team_averages(team):
    team = team.drop(columns=["team", "season"])
    rolling = team.rolling(10).mean()
    return rolling

df_roll = df[["team", "season"] + list(selected_cols)]
df_roll = df_roll.groupby(["team", "season"], group_keys=False).apply(find_team_averages)
# df_rolling[pd.isnull(df_rolling["score"])]

In [95]:
roll_cols = [f"{col}_10" for col in df_roll.columns]
df_roll.columns = roll_cols

df = pd.concat([df, df_roll], axis=1)
df

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedShotsAttempts_10,unblockedShotAttempts_10,scoreAdjustedUnblockedShotAttempts_10,dZoneGiveaways_10,xGoalsFromxReboundsOfShots_10,xGoalsFromActualReboundsOfShots_10,reboundxGoals_10,totalShotCredit_10,scoreAdjustedTotalShotCredit_10,scoreFlurryAdjustedTotalShotCredit_10
0,2021-01-13COLSTL,STL,COL,2020,2021-01-15COLSTL,0.363636,0.090909,1.0,1.0,0.0,...,,,,,,,,,,
1,2021-01-13COLSTL,COL,STL,2020,2021-01-15COLSTL,0.090909,0.363636,0.0,0.0,0.0,...,,,,,,,,,,
2,2021-01-13EDMVAN,VAN,EDM,2020,2021-01-14EDMVAN,0.454545,0.272727,1.0,1.0,0.0,...,,,,,,,,,,
3,2021-01-13EDMVAN,EDM,VAN,2020,2021-01-14EDMVAN,0.272727,0.454545,0.0,0.0,0.0,...,,,,,,,,,,
4,2021-01-13PHIPIT,PIT,PHI,2020,2021-01-15PHIPIT,0.272727,0.545455,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.369692,0.362963,0.335579,0.125000,0.415303,0.111538,0.111538,0.397032,0.399098,0.398364
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.428262,0.444444,0.419977,0.137500,0.476121,0.197616,0.197616,0.451473,0.461459,0.459805
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.268453,0.257407,0.225771,0.170833,0.350264,0.052161,0.052161,0.376385,0.369155,0.369938
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.484759,0.427778,0.420105,0.120833,0.483377,0.186395,0.186395,0.464075,0.486035,0.482515


In [96]:
df = df.dropna()

In [99]:
df

Unnamed: 0,game_id,team,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedShotsAttempts_10,unblockedShotAttempts_10,scoreAdjustedUnblockedShotAttempts_10,dZoneGiveaways_10,xGoalsFromxReboundsOfShots_10,xGoalsFromActualReboundsOfShots_10,reboundxGoals_10,totalShotCredit_10,scoreAdjustedTotalShotCredit_10,scoreFlurryAdjustedTotalShotCredit_10
223,2021-01-28VANOTT,VAN,OTT,2020,2021-01-30WPGVAN,0.363636,0.090909,1.0,1.0,0.0,...,0.346336,0.331481,0.310474,0.179167,0.403166,0.163732,0.163732,0.380564,0.385364,0.384767
236,2021-01-30EDMTOR,EDM,TOR,2020,2021-01-31EDMOTT,0.363636,0.272727,1.0,0.0,1.0,...,0.341115,0.353704,0.323802,0.229167,0.453430,0.123840,0.123840,0.456426,0.454624,0.453495
237,2021-01-30EDMTOR,TOR,EDM,2020,2021-02-04TORVAN,0.272727,0.363636,0.0,0.0,1.0,...,0.353347,0.303704,0.296755,0.200000,0.388918,0.086236,0.086236,0.392894,0.405641,0.406926
248,2021-01-30WPGVAN,VAN,WPG,2020,2021-02-01MTLVAN,0.363636,0.090909,1.0,1.0,0.0,...,0.323786,0.311111,0.292877,0.187500,0.379288,0.161697,0.161697,0.353856,0.357706,0.357043
252,2021-01-31ANASTL,ANA,STL,2020,2021-02-02LAKANA,0.090909,0.363636,0.0,0.0,0.0,...,0.304718,0.272222,0.248295,0.129167,0.343008,0.064240,0.064240,0.308380,0.307402,0.307946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,2023-04-12STLDAL,STL,DAL,2022,2023-04-13DALSTL,0.181818,0.454545,0.0,0.0,0.0,...,0.369692,0.362963,0.335579,0.125000,0.415303,0.111538,0.111538,0.397032,0.399098,0.398364
6821,2023-04-13BUFOTT,BUF,OTT,2022,2023-04-14CBJBUF,0.363636,0.272727,1.0,0.0,1.0,...,0.428262,0.444444,0.419977,0.137500,0.476121,0.197616,0.197616,0.451473,0.461459,0.459805
6822,2023-04-13CBJPIT,CBJ,PIT,2022,2023-04-14CBJBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.268453,0.257407,0.225771,0.170833,0.350264,0.052161,0.052161,0.376385,0.369155,0.369938
6823,2023-04-13COLWPG,COL,WPG,2022,2023-04-14NSHCOL,0.363636,0.181818,1.0,1.0,0.0,...,0.484759,0.427778,0.420105,0.120833,0.483377,0.186395,0.186395,0.464075,0.486035,0.482515


In [100]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby('team', group_keys=False).apply(lambda x: shift_col(x, col_name))

df["next_is_home"] = add_col(df.copy(), "is_home")
df["next_opp_team"] = add_col(df.copy(), "opp_team")
df["next_game_date"] = add_col(df.copy(), "game_date")
# currently don't have access to this for 2nd to last games (deleted last games)
# might want to group by season as well.. 
# (but that mightve already been accounted for in prev functions / dropna()s)

  df["next_is_home"] = add_col(df, "is_home")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["next_is_home"] = add_col(df, "is_home")
  df["next_opp_team"] = add_col(df, "opp_team")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["next_opp_team"] = add_col(df, "opp_team")
  df["next_game_date"] = add_col(df, "game_date")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [109]:
# pd.isnull(df["next_is_home"]).sum()
df_roll

Unnamed: 0,score_10,opp_score_10,win_10,reg_win_10,overtime_10,odds_10,opp_odds_10,ot_odds_10,iceTime_10,opp_xGoalsPercentage_10,...,scoreAdjustedShotsAttempts_10,unblockedShotAttempts_10,scoreAdjustedUnblockedShotAttempts_10,dZoneGiveaways_10,xGoalsFromxReboundsOfShots_10,xGoalsFromActualReboundsOfShots_10,reboundxGoals_10,totalShotCredit_10,scoreAdjustedTotalShotCredit_10,scoreFlurryAdjustedTotalShotCredit_10
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,0.354545,0.381818,0.5,0.3,0.3,0.365474,0.531873,0.686332,0.611136,0.564648,...,0.369692,0.362963,0.335579,0.125000,0.415303,0.111538,0.111538,0.397032,0.399098,0.398364
6821,0.318182,0.281818,0.7,0.3,0.5,0.369050,0.536388,0.643324,0.647931,0.529559,...,0.428262,0.444444,0.419977,0.137500,0.476121,0.197616,0.197616,0.451473,0.461459,0.459805
6822,0.163636,0.436364,0.2,0.0,0.4,0.170368,0.766603,0.446147,0.586752,0.671840,...,0.268453,0.257407,0.225771,0.170833,0.350264,0.052161,0.052161,0.376385,0.369155,0.369938
6823,0.363636,0.236364,0.8,0.5,0.4,0.662799,0.246193,0.617224,0.623944,0.453642,...,0.484759,0.427778,0.420105,0.120833,0.483377,0.186395,0.186395,0.464075,0.486035,0.482515


In [110]:
full = df.merge(
    df[roll_cols + ["next_opp_team", "next_game_date", "team"]], 
    left_on=["team", "next_game_date"], 
    right_on=["next_opp_team", "next_game_date"]
)
full

# when 2 cols have same name, left=_x & right=_y
# _y => cols from next_opp_team DF
# _x => cols from original team

Unnamed: 0,game_id,team_x,opp_team,season,next_game_id,score,opp_score,win,reg_win,overtime,...,scoreAdjustedUnblockedShotAttempts_10_y,dZoneGiveaways_10_y,xGoalsFromxReboundsOfShots_10_y,xGoalsFromActualReboundsOfShots_10_y,reboundxGoals_10_y,totalShotCredit_10_y,scoreAdjustedTotalShotCredit_10_y,scoreFlurryAdjustedTotalShotCredit_10_y,next_opp_team_y,team_y
0,2021-01-30EDMTOR,TOR,EDM,2020,2021-02-04TORVAN,0.272727,0.363636,0.0,0.0,1.0,...,0.281344,0.233333,0.378364,0.164431,0.164431,0.361818,0.365402,0.365010,TOR,VAN
1,2021-01-31BUFNJD,BUF,NJD,2020,2021-02-15BUFNYI,0.272727,0.454545,0.0,0.0,0.0,...,0.376255,0.204167,0.455541,0.076605,0.076605,0.426708,0.434305,0.432101,BUF,NYI
2,2021-01-31MINCOL,COL,MIN,2020,2021-02-02COLMIN,0.272727,0.363636,0.0,0.0,1.0,...,0.293437,0.104167,0.402639,0.115544,0.124984,0.349718,0.347389,0.347992,COL,MIN
3,2021-01-31MINCOL,MIN,COL,2020,2021-02-02COLMIN,0.363636,0.272727,1.0,0.0,1.0,...,0.325027,0.070833,0.390897,0.107502,0.116879,0.343030,0.354351,0.355577,MIN,COL
4,2021-02-02COLMIN,MIN,COL,2020,2021-02-16LAKMIN,0.090909,0.181818,0.0,0.0,0.0,...,0.252062,0.191667,0.343536,0.084456,0.090782,0.302842,0.304173,0.305205,MIN,LAK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5857,2023-04-10LAKVAN,VAN,LAK,2022,2023-04-11ANAVAN,0.000000,0.272727,0.0,0.0,0.0,...,0.251349,0.166667,0.373747,0.182899,0.182899,0.316677,0.307884,0.306883,VAN,ANA
5858,2023-04-10NYRBUF,BUF,NYR,2022,2023-04-11NJDBUF,0.272727,0.181818,1.0,0.0,1.0,...,0.470176,0.200000,0.538127,0.193738,0.193738,0.572414,0.576536,0.573975,BUF,NJD
5859,2023-04-10OTTCAR,CAR,OTT,2022,2023-04-11CARDET,0.181818,0.272727,0.0,0.0,0.0,...,0.243688,0.175000,0.354485,0.143166,0.143166,0.335298,0.335248,0.336350,CAR,DET
5860,2023-04-10WPGSJS,WPG,SJS,2022,2023-04-11MINWPG,0.545455,0.181818,1.0,1.0,0.0,...,0.315164,0.120833,0.382586,0.110235,0.110235,0.372706,0.379681,0.380880,WPG,MIN


In [114]:
full[['team_x', 'next_opp_team_x', 'team_y', 'next_opp_team_y', 'next_game_date']]

Unnamed: 0,team_x,next_opp_team_x,team_y,next_opp_team_y,next_game_date
0,TOR,VAN,VAN,TOR,2021-02-04
1,BUF,NYI,NYI,BUF,2021-02-15
2,COL,MIN,MIN,COL,2021-02-02
3,MIN,COL,COL,MIN,2021-02-02
4,MIN,LAK,LAK,MIN,2021-02-16
...,...,...,...,...,...
5857,VAN,ANA,ANA,VAN,2023-04-11
5858,BUF,NJD,NJD,BUF,2023-04-11
5859,CAR,DET,DET,CAR,2023-04-11
5860,WPG,MIN,MIN,WPG,2023-04-11


In [116]:
removed_cols = list(full.columns[full.dtypes == "object"]) + removed_cols

In [118]:
selected_cols = full.columns[~full.columns.isin(removed_cols)]

Index(['score', 'opp_score', 'win', 'reg_win', 'overtime', 'odds', 'opp_odds',
       'ot_odds', 'iceTime', 'opp_xGoalsPercentage',
       ...
       'scoreAdjustedShotsAttempts_10_y', 'unblockedShotAttempts_10_y',
       'scoreAdjustedUnblockedShotAttempts_10_y', 'dZoneGiveaways_10_y',
       'xGoalsFromxReboundsOfShots_10_y',
       'xGoalsFromActualReboundsOfShots_10_y', 'reboundxGoals_10_y',
       'totalShotCredit_10_y', 'scoreAdjustedTotalShotCredit_10_y',
       'scoreFlurryAdjustedTotalShotCredit_10_y'],
      dtype='object', length=334)

In [120]:
sfs.fit(full[selected_cols], full["next_reg_win"])

In [121]:
predictors = list(selected_cols[sfs.get_support()])
predictors

['score',
 'opp_missedShots',
 'opp_takeaways',
 'freeze',
 'faceOffsWon',
 'lowDangerGoals',
 'totalShotCredit',
 'opp_odds_10_x',
 'opp_mediumDangerxGoals_10_x',
 'opp_mediumDangerGoals_10_x',
 'opp_reboundxGoals_10_x',
 'shotAttempts_10_x',
 'faceOffsWon_10_x',
 'overtime_10_y',
 'opp_odds_10_y',
 'opp_xGoals_10_y',
 'opp_flurryAdjustedxGoals_10_y',
 'opp_scoreVenueAdjustedxGoals_10_y',
 'opp_flurryScoreVenueAdjustedxGoals_10_y',
 'opp_missedShots_10_y',
 'opp_giveaways_10_y',
 'opp_highDangerShots_10_y',
 'opp_mediumDangerxGoals_10_y',
 'opp_highDangerxGoals_10_y',
 'penalties_10_y',
 'penalityMinutes_10_y',
 'takeaways_10_y',
 'giveaways_10_y',
 'highDangerxGoals_10_y',
 'dZoneGiveaways_10_y']

In [122]:
predictions = backtest(full, rr, predictors)
accuracy_score(predictions["res"], predictions["pred"])

0.6605381165919283

In [123]:
full.to_csv(f'MODEL_PER_GAME_{situation}_{season}.csv', header=True, index=False)