In [1]:
import numpy as np
import pandas as pd 

In [2]:
SCRAPED_DATA = "../scrape/nba_games.csv"
SCRAPED_DATA_2023 = "../scrape/nba_games_2023.csv"
DOWNLOADED_DATA = "../../nba_games.csv"
df = pd.read_csv(SCRAPED_DATA_2023, index_col = 0)
df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [3]:
# Add a target column to df. Target is the next games win/loss bool
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team", group_keys = False).apply(add_target)

# Setting null values next games that haven't happened yet to 2
df["target"][pd.isnull(df["target"])] = 2

# Converting all target values to int
df["target"] = df["target"].astype(int, errors="ignore")

df[df["team"] == "GSW"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
4,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
52,240.0,43.0,93.0,0.462,9.0,26.0,0.346,17.0,25.0,0.680,...,37.5,151.0,118.0,HOU,92,1,2016,2015-10-30,True,1
70,240.0,46.0,84.0,0.548,17.0,30.0,0.567,25.0,35.0,0.714,...,36.1,218.0,131.0,NOP,120,1,2016,2015-10-31,True,1
101,240.0,43.0,84.0,0.512,11.0,25.0,0.440,22.0,30.0,0.733,...,44.3,106.0,126.0,MEM,69,0,2016,2015-11-02,True,1
125,240.0,39.0,85.0,0.459,10.0,26.0,0.385,24.0,31.0,0.774,...,32.9,250.0,122.0,LAC,108,0,2016,2015-11-04,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18246,240.0,35.0,74.0,0.473,12.0,35.0,0.343,12.0,13.0,0.923,...,30.7,300.0,106.0,NYK,132,1,2023,2022-12-20,False,0
18268,240.0,41.0,83.0,0.494,12.0,33.0,0.364,19.0,25.0,0.760,...,32.6,183.0,118.0,BRK,143,1,2023,2022-12-21,False,1
18316,240.0,43.0,91.0,0.473,18.0,44.0,0.409,19.0,26.0,0.731,...,37.2,208.0,123.0,MEM,109,0,2023,2022-12-25,True,1
18338,240.0,41.0,95.0,0.432,8.0,37.0,0.216,20.0,24.0,0.833,...,36.1,151.0,111.0,CHO,105,0,2023,2022-12-27,True,1


In [4]:
# Find the columns that contain null values
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls = nulls[nulls > 0]

In [5]:
# Create new df copy that does not include columns with null values
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

In [6]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rr = RidgeClassifier(alpha=1)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=35, direction="forward", cv=split)

In [7]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [8]:
# Scale b/w 0-1 to improve ridge regression performance
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [9]:
# sfs.fit(df[selected_columns], df["target"])

In [10]:
# predictors = list(selected_columns[sfs.get_support()])

In [11]:
def backtest(model, data, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions) 

In [12]:
# predictions = backtest(rr, df, predictors)

# predictions = predictions[predictions["actual"] != 2]
# accuracy_score(predictions["actual"], predictions["predictions"])

In [13]:
# Compare to baseline
# NBA has big home team bais. Calculate home team winning %

df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.426613
1.0    0.573387
dtype: float64

In [14]:
# Build new rolling df that takes average of teams previous 10 games

df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [15]:
def find_team_average(team):
    rolling = team.rolling(10).mean()
    return rolling

# Groupby team to only get team averages and grouping by season since teams change between seasons
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_average)

  rolling = team.rolling(10).mean()


In [16]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,0.737923,...,0.0579,0.1674,0.264256,0.155841,0.489100,0.560000,0.442308,0.4,0.7,2023.0
18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,0.748425,...,0.0946,0.0757,0.337631,0.156868,0.496209,0.515294,0.463462,0.3,0.5,2023.0
18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,0.698833,...,0.0499,0.0740,0.381971,0.184339,0.454502,0.535294,0.425962,0.5,0.6,2023.0
18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,0.745391,...,0.0578,0.0924,0.473585,0.166752,0.550711,0.558824,0.419231,0.5,0.7,2023.0


In [17]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [18]:
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
234,0.0,0.522727,0.406250,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.0628,0.0679,0.413522,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0
250,0.0,0.659091,0.453125,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.0613,0.0772,0.469497,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0
252,0.0,0.386364,0.406250,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.0625,0.1145,0.437841,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0
257,0.0,0.340909,0.265625,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.0699,0.1072,0.380294,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0
262,0.0,0.500000,0.406250,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.0646,0.0759,0.512159,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.0,0.636364,0.484375,0.590909,0.620690,0.621212,0.475059,0.372093,0.380952,0.626604,...,0.0579,0.1674,0.264256,0.155841,0.489100,0.560000,0.442308,0.4,0.7,2023.0
18378,0.0,0.568182,0.484375,0.514354,0.379310,0.530303,0.334917,0.372093,0.333333,0.735123,...,0.0946,0.0757,0.337631,0.156868,0.496209,0.515294,0.463462,0.3,0.5,2023.0
18379,0.0,0.659091,0.546875,0.564593,0.448276,0.378788,0.532067,0.209302,0.206349,0.666278,...,0.0499,0.0740,0.381971,0.184339,0.454502,0.535294,0.425962,0.5,0.6,2023.0
18380,0.0,0.545455,0.453125,0.511962,0.517241,0.530303,0.457245,0.186047,0.206349,0.583431,...,0.0578,0.0924,0.473585,0.166752,0.550711,0.558824,0.419231,0.5,0.7,2023.0


In [19]:
def shift_cols(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_cols(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

df[['team', 'target', 'date']]

Unnamed: 0,team,target,date
234,ATL,0,2015-11-11
250,GSW,1,2015-11-12
252,MEM,1,2015-11-13
257,TOR,0,2015-11-13
262,IND,0,2015-11-13
...,...,...,...
18377,DAL,2,2022-12-29
18378,BOS,2,2022-12-29
18379,MEM,2,2022-12-29
18380,LAC,2,2022-12-29


In [20]:
df["date"].iloc[0]

'2015-11-11'

In [21]:
def fill_missing(team, home_next, team_opp_next, date_next):
    unplayed = df[df["target"] == 2]
    index = unplayed.loc[unplayed['team'] == team].index
    df.loc[index, "home_next"] = home_next
    df.loc[index, "team_opp_next"] = team_opp_next
    df.loc[index, "date_next"] = date_next

In [22]:
fill_missing("MIN", 0, "DET", "2022-12-31")
fill_missing("DET", 1, "MIN", "2022-12-31")

fill_missing("MEM", 0, "ORL", "2022-12-31")
fill_missing("ORL", 1, "MEM", "2022-12-31")

In [49]:
df.loc[df['target'] == 2, ['team', 'target', 'date', 'home_next', 'team_opp_next', 'date_next']]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
234,0.0,0.522727,0.406250,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.124134,0.361611,0.449412,0.347115,0.4,0.8,2016.0,0.0,BOS,2015-11-13
250,0.0,0.659091,0.453125,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.219641,0.394787,0.531765,0.324038,0.5,1.0,2016.0,1.0,BRK,2015-11-14
252,0.0,0.386364,0.406250,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.138126,0.507109,0.360000,0.351923,0.6,0.4,2016.0,0.0,MIN,2015-11-15
257,0.0,0.340909,0.265625,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.273427,0.270616,0.478824,0.308654,0.6,0.7,2016.0,0.0,SAC,2015-11-15
262,0.0,0.500000,0.406250,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.133633,0.277251,0.388235,0.308654,0.4,0.6,2016.0,0.0,CHI,2015-11-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.0,0.636364,0.484375,0.590909,0.620690,0.621212,0.475059,0.372093,0.380952,0.626604,...,0.155841,0.489100,0.560000,0.442308,0.4,0.7,2023.0,,,
18378,0.0,0.568182,0.484375,0.514354,0.379310,0.530303,0.334917,0.372093,0.333333,0.735123,...,0.156868,0.496209,0.515294,0.463462,0.3,0.5,2023.0,,,
18379,0.0,0.659091,0.546875,0.564593,0.448276,0.378788,0.532067,0.209302,0.206349,0.666278,...,0.184339,0.454502,0.535294,0.425962,0.5,0.6,2023.0,0.0,ORL,2022-12-31
18380,0.0,0.545455,0.453125,0.511962,0.517241,0.530303,0.457245,0.186047,0.206349,0.583431,...,0.166752,0.550711,0.558824,0.419231,0.5,0.7,2023.0,,,


In [24]:
# Creating new full df 

full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on = ["team", "date_next"],
    right_on = ["team_opp_next", "date_next"]
)

In [51]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "target", "date", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,target,date,date_next
0,TOR,SAC,SAC,TOR,0,2015-11-13,2015-11-15
1,SAC,TOR,TOR,SAC,1,2015-11-13,2015-11-15
2,GSW,TOR,TOR,GSW,1,2015-11-14,2015-11-17
3,CLE,DET,DET,CLE,0,2015-11-14,2015-11-17
4,ORL,MIN,MIN,ORL,1,2015-11-14,2015-11-18
...,...,...,...,...,...,...,...
16103,IND,CLE,CLE,IND,1,2022-12-27,2022-12-29
16104,ORL,MEM,MEM,ORL,2,2022-12-28,2022-12-31
16105,DET,MIN,MIN,DET,2,2022-12-28,2022-12-31
16106,MIN,DET,DET,MIN,2,2022-12-28,2022-12-31


In [26]:
full[full["target"] == 2]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
16104,0.0,0.363636,0.390625,0.342105,0.172414,0.272727,0.269596,0.581395,0.52381,0.725788,...,0.381971,0.184339,0.454502,0.535294,0.425962,0.5,0.6,2023.0,ORL,MEM
16105,0.0,0.386364,0.3125,0.433014,0.62069,0.575758,0.509501,0.697674,0.619048,0.737456,...,0.456813,0.154172,0.554976,0.502353,0.495192,0.7,0.3,2023.0,DET,MIN
16106,0.0,0.522727,0.359375,0.566986,0.344828,0.378788,0.409739,0.534884,0.492063,0.708285,...,0.415618,0.22208,0.569194,0.538824,0.567308,0.4,0.2,2023.0,MIN,DET
16107,0.0,0.659091,0.546875,0.564593,0.448276,0.378788,0.532067,0.209302,0.206349,0.666278,...,0.268658,0.224134,0.61327,0.552941,0.473077,0.5,0.7,2023.0,MEM,ORL


In [27]:
# removed_columns = removed_columns + list(full.columns[full.dtypes == "object"])

In [28]:
# selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [29]:
# sfs.fit(full[selected_columns], full["target"])

In [30]:
# predictors = list(selected_columns[sfs.get_support()])
# predictors

In [31]:
predictors = ['mp',
 'trb%',
 'usg%',
 'drtg',
 'ft%_max',
 'orb_max',
 'blk_max',
 'tov_max',
 'orb%_max',
 'usg%_opp',
 'ast%_max_opp',
 'usg%_10_x',
 'ft_max_10_x',
 'fta_max_10_x',
 'drb_max_10_x',
 '+/-_max_10_x',
 'drtg_max_10_x',
 'usg%_opp_10_x',
 '3p_max_opp_10_x',
 'trb_max_opp_10_x',
 'pf_max_opp_10_x',
 'drb%_max_opp_10_x',
 'tov%_max_opp_10_x',
 'won_10_x',
 'home_next',
 'ft_10_y',
 'usg%_10_y',
 '+/-_max_10_y',
 'usg%_max_10_y',
 'ortg_max_10_y',
 'usg%_opp_10_y',
 'stl_max_opp_10_y',
 'pts_max_opp_10_y',
 'ts%_max_opp_10_y',
 'stl%_max_opp_10_y']

In [32]:
predictions = backtest(rr, full, predictors)
accuracy_score(predictions["actual"], predictions["predictions"])

0.6251578681485224

In [33]:
predictions[predictions["actual"] == 2]
predictions

Unnamed: 0,actual,predictions
4231,1,1
4232,1,1
4233,1,0
4234,1,0
4235,1,1
...,...,...
16103,1,1
16104,2,1
16105,2,0
16106,2,0


In [34]:
full["predictions"] = predictions["predictions"]
full.loc[full["target"] == 2, ["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "target", "predictions", "date", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,target,predictions,date,date_next
16104,ORL,MEM,MEM,ORL,2,1.0,2022-12-28,2022-12-31
16105,DET,MIN,MIN,DET,2,0.0,2022-12-28,2022-12-31
16106,MIN,DET,DET,MIN,2,0.0,2022-12-28,2022-12-31
16107,MEM,ORL,ORL,MEM,2,1.0,2022-12-29,2022-12-31


In [48]:
# def predict(model, data, predictors):
#     preds = model.predict(test[predictors])
#     preds = pd.Series(preds, index=test.index)

rr.predict([full.loc[16107][predictors]])[0]
# full.loc[16104][predictors]



1

In [36]:
def backtest(model, data, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i] 
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions) 