In [251]:
import warnings
import pandas as pd
import numpy as np

# Silence the warning categories that pandas and scikit-learn emit repeatedly in
# the cells below. Order is kept because each call prepends to the filter list.
for noisy_category in (
    pd.errors.PerformanceWarning,
    DeprecationWarning,
    FutureWarning,
    pd.errors.SettingWithCopyWarning,
):
    warnings.simplefilter(action='ignore', category = noisy_category)

In [112]:
# Load the game table; the first CSV column is used as the row index.
games_csv = "nba_games.csv"
df = pd.read_csv(games_csv, index_col = 0)

In [113]:
# Order the games chronologically and renumber the rows to match.
# reset_index returns a new frame, so its result has to be assigned back;
# calling it without assignment only displays the renumbered frame and leaves
# `df` carrying the pre-sort labels.
df = df.sort_values("date").reset_index(drop = True)
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,...,37.5,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True
1,240.0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
2,240.0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
3,240.0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
4,240.0,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,...,57.1,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19935,240.0,240.0,34.0,92.0,0.370,11.0,35.0,0.314,15.0,19.0,...,21.5,33.4,202.0,113.0,DEN,109,0,2023,2023-06-07,False
19936,240.0,240.0,35.0,78.0,0.449,8.0,25.0,0.320,17.0,20.0,...,50.0,32.6,164.0,114.0,DEN,108,0,2023,2023-06-09,False
19937,240.0,240.0,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,...,25.2,30.3,223.0,127.0,MIA,95,1,2023,2023-06-09,True
19938,240.0,240.0,38.0,84.0,0.452,5.0,28.0,0.179,13.0,23.0,...,25.0,42.5,102.0,109.0,MIA,89,0,2023,2023-06-12,True


In [114]:
# Remove three columns that are not used further on.
df = df.drop(columns = ["mp.1", "mp_opp.1", "index_opp"])

In [115]:
def add_target(team):
    """Return ``team`` with a "target" column holding the next row's "won" value.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows for one group, in the order the games should be read. Must
        contain a "won" column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``team`` plus
        "target". The last row's target is missing because no row follows it.

    Notes
    -----
    The input frame is left untouched. Writing the column into ``team``
    directly would modify whatever object the caller handed in, which for a
    groupby chunk is the pattern pandas flags with SettingWithCopyWarning.
    """
    return team.assign(target = team["won"].shift(-1))

In [116]:
# Build the label team by team so that "next game" never crosses from one
# team's rows into another's.
games_by_team = df.groupby("team", group_keys = False)
df = games_by_team.apply(add_target)

In [117]:
# Rows whose target is missing (no following game for that team) get the
# sentinel label 2. A single .loc write is used instead of chained indexing
# (df["target"][mask] = ...), which may assign into a temporary and does
# nothing at all when pandas Copy-on-Write is enabled.
df.loc[df["target"].isna(), "target"] = 2

In [187]:
# Store both label columns as integers. The cast is allowed to raise: with
# errors = "ignore" a failed conversion would silently leave the columns
# unchanged and only surface later as a confusing estimator error.
label_columns = ["target", "won"]
df[label_columns] = df[label_columns].astype(int)

In [119]:
# Per-column count of missing values, restricted to columns that have any.
nulls = df.isnull().sum().loc[lambda counts: counts > 0]

In [120]:
# Keep only the columns that contain no missing values at all.
has_missing = df.columns.isin(nulls.index)
valid_columns = df.columns[~has_missing]
df = df.loc[:, valid_columns].copy()

In [121]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

# Ridge classifier used both as the feature-selection estimator and as the
# final model in the backtests.
rr = RidgeClassifier(alpha = 1)

# Ordered cross-validation folds: each fold validates on rows that come after
# the rows it trains on.
split = TimeSeriesSplit(n_splits = 3)

# Greedy forward selection of 30 features, scored with the folds above.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select = 30,
    direction = "forward",
    cv = split,
)

In [333]:
# Identifier and label columns that must never be offered as features.
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]

# Every remaining column is a candidate feature.
is_removed = df.columns.isin(removed_columns)
selected_columns = df.columns[~is_removed]

In [123]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every candidate feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows at once, so the min/max used
# for early seasons also reflect later seasons - confirm this is acceptable
# for the season-by-season backtest further down.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[selected_columns])
df[selected_columns] = scaled_values

In [124]:
# Run forward feature selection on the scaled single-game stats.
sfs.fit(X = df[selected_columns], y = df["target"])

In [125]:
# Names of the columns the selector kept (boolean support mask -> labels).
predictors = selected_columns[sfs.get_support()].tolist()

In [126]:
def backtest(data, model, predictors, start = 2, step = 1):
    """Evaluate ``model`` season by season, training only on earlier seasons.

    The distinct values of ``data["season"]`` are sorted. For every position
    ``start, start + step, ...`` in that sorted list the model is refitted on
    all rows from strictly earlier seasons and then predicts the rows of the
    season at that position.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "season", "target" and every column in ``predictors``.
    model : object
        Any estimator with ``fit(X, y)`` and ``predict(X)``. The same object
        is refitted on every iteration, so it is left fitted on the data used
        for the last evaluated season.
    predictors : list
        Column labels used as model inputs.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns "actual" (the true target) and "predictions", indexed like
        the predicted rows of ``data``. If no season is evaluated (for
        example fewer than ``start + 1`` seasons are present) an empty frame
        with those two columns is returned.
    """
    result_columns = ["actual", "predictions"]
    all_predictions = []

    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Only strictly earlier seasons are used for fitting, so nothing from
        # the season being predicted is seen during training.
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])

        # Re-attach the row labels so predictions line up with the targets.
        preds = pd.Series(model.predict(test[predictors]), index = test.index)

        combined = pd.concat([test["target"], preds], axis = 1)
        combined.columns = result_columns

        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly labelled frame instead.
        return pd.DataFrame(columns = result_columns)

    return pd.concat(all_predictions)



In [127]:
# Season-by-season evaluation using the selected single-game features.
predictions = backtest(data = df, model = rr, predictors = predictors)

In [128]:
from sklearn.metrics import accuracy_score

In [129]:
# Share of backtested rows whose predicted label equals the true label.
# NOTE(review): rows carrying the sentinel label 2 (no following game) are not
# filtered out before scoring - confirm that is intended.
accuracy_score(y_true = predictions["actual"], y_pred = predictions["predictions"])

0.5403675970047651

In [139]:
# Fraction of rows with won == 1 for each value of "home": the mean of a
# boolean flag within a group is exactly that fraction. The name is cleared
# so the result is an unnamed Series.
df["won"].eq(1).groupby(df["home"]).mean().rename(None)

home
0.0    0.427282
1.0    0.572718
dtype: float64

In [312]:
# Feature columns plus the result and the two grouping keys needed for the
# rolling averages.
df_rolling = df[[*selected_columns, "won", "team", "season"]]

In [313]:
def find_team_averages(team):
    """Return the trailing 10-row mean of the selected stats and "won".

    ``team`` is one group of rows. The columns averaged are the notebook-level
    ``selected_columns`` followed by "won". Rows that do not yet have 10
    observations in their window come out as NaN (pandas' default for a
    fixed-size rolling window).
    """
    stat_columns = [*selected_columns, "won"]
    return team[stat_columns].rolling(10).mean()

# Average within each (team, season) pair so a window never mixes two teams
# or two seasons.
team_seasons = df_rolling.groupby(["team", "season"], group_keys = False)
df_rolling = team_seasons.apply(find_team_averages)

In [324]:
# Suffix the rolling columns so they do not collide with the per-game stats.
# rolling_cols stays a plain list because it is later concatenated with
# another list.
rolling_cols = [f"{name}_10" for name in df_rolling.columns]
df_rolling = df_rolling.set_axis(rolling_cols, axis = 1)

# Side-by-side join; rows are matched on their index labels.
df = pd.concat([df, df_rolling], axis = "columns")

In [326]:
# Discard every row that has a missing value in any column; this includes
# rows whose rolling window was not yet full.
df = df.dropna(axis = "index", how = "any")

In [327]:
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up one row, so each row sees the next row's value.

    The index is unchanged; the last entry is missing because nothing follows it.
    """
    return team[col_name].shift(periods = -1)

def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "team" column and ``col_name``.
    col_name : str
        Column whose next-row value is wanted.

    Returns
    -------
    pandas.Series
        Indexed like ``df`` (same labels, same order) and named ``col_name``.
        The last row of each team is missing because no row follows it.

    Notes
    -----
    The built-in group-wise shift is used rather than ``groupby.apply`` with a
    per-group function: ``apply`` collapses the result into a wide DataFrame
    when there is only one group (or all groups share an index), which cannot
    be assigned to a single column. Values are paired with the same index
    labels as before, so ``df[new] = add_col(df, col)`` is unaffected.
    """
    return df.groupby("team")[col_name].shift(-1)

# For each row, record where, against whom and when that team plays next.
for source_col in ("home", "team_opp", "date"):
    df[f"{source_col}_next"] = add_col(df, source_col)

In [328]:
# Rebind df to a fresh deep copy of itself.
# NOTE(review): presumably done to consolidate the frame after the
# column-by-column inserts above - confirm.
df = df.copy(deep = True)

In [329]:
# Right-hand side of the self-join: the rolling averages plus the keys that
# identify which team and date they should be attached to.
opponent_side = df[rolling_cols + ["team_opp_next", "date_next", "team"]]

# Pair each row with the row whose upcoming opponent is this row's team on the
# same upcoming date. Column names present on both sides receive pandas'
# default _x / _y suffixes.
full = pd.merge(
    df,
    opponent_side,
    how = "inner",
    left_on = ["team", "date_next"],
    right_on = ["team_opp_next", "date_next"],
)

In [337]:
# Exclude every column that is not numeric or boolean, in addition to the
# identifier/label columns. Selecting by "not numeric" rather than by
# dtype == "object" also catches text columns stored with a dedicated string
# dtype, which would otherwise slip through to the estimator.
non_numeric_columns = list(full.select_dtypes(exclude = ["number", "bool"]).columns)
new_removed_columns = non_numeric_columns + removed_columns
new_selected_columns = full.columns[~full.columns.isin(new_removed_columns)]

In [339]:
# Re-run forward feature selection on the merged table.
sfs.fit(X = full[new_selected_columns], y = full["target"])

In [340]:
# Names of the columns kept by the second selection run.
new_predictors = new_selected_columns[sfs.get_support()].tolist()

In [341]:
# Season-by-season evaluation on the merged table with the new feature set.
new_predictions = backtest(data = full, model = rr, predictors = new_predictors)

In [342]:
# Share of backtested rows predicted correctly with the new feature set.
accuracy_score(y_true = new_predictions["actual"], y_pred = new_predictions["predictions"])

0.6304197150558336