In [1]:
import numpy as np
import pandas as pd 

In [2]:
SCRAPED_DATA = "../scrape/nba_games.csv"
df = pd.read_csv(SCRAPED_DATA, index_col = 0)
df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [3]:
# Add a target column to df. Target is the next games win/loss bool
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team", group_keys = False).apply(add_target)

# Setting null values next games that haven't happened yet to 2
df["target"][pd.isnull(df["target"])] = 2

# Converting all target values to int
df["target"] = df["target"].astype(int, errors="ignore")

df[df["team"] == "GSW"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
52,240.0,43.0,93.0,0.462,9.0,26.0,0.346,17.0,25.0,0.680,...,37.5,151.0,118.0,HOU,92,1,2016,2015-10-30,True,1
68,240.0,46.0,84.0,0.548,17.0,30.0,0.567,25.0,35.0,0.714,...,36.1,218.0,131.0,NOP,120,1,2016,2015-10-31,True,1
103,240.0,43.0,84.0,0.512,11.0,25.0,0.440,22.0,30.0,0.733,...,44.3,106.0,126.0,MEM,69,0,2016,2015-11-02,True,1
128,240.0,39.0,85.0,0.459,10.0,26.0,0.385,24.0,31.0,0.774,...,32.9,250.0,122.0,LAC,108,0,2016,2015-11-04,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17316,240.0,39.0,86.0,0.453,15.0,37.0,0.405,14.0,20.0,0.700,...,35.2,300.0,117.0,BOS,88,0,2022,2022-06-05,True,0
17319,240.0,36.0,78.0,0.462,15.0,40.0,0.375,13.0,15.0,0.867,...,28.8,175.0,117.0,BOS,116,1,2022,2022-06-08,False,1
17321,240.0,40.0,91.0,0.440,15.0,43.0,0.349,12.0,15.0,0.800,...,32.4,205.0,120.0,BOS,97,1,2022,2022-06-10,True,1
17322,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1


In [4]:
# Find the columns that contain null values
nulls = pd.isnull(df)
nulls = nulls.sum()
nulls = nulls[nulls > 0]

In [None]:
df[df['target'] == 2][["target", "team", "team_opp"]]

Unnamed: 0,target,team,team_opp
17110,2,IND,BRK
17113,2,LAL,DEN
17116,2,WAS,CHO
17121,2,HOU,ATL
17122,2,NYK,TOR
17127,2,DET,PHI
17130,2,ORL,MIA
17132,2,POR,UTA
17134,2,OKC,LAC
17135,2,SAC,PHO


In [6]:
# Create new df copy that does not include columns with null values
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

In [7]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=35, direction="forward", cv=split)

In [8]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [9]:
# Scale b/w 0-1 to improve ridge regression performance
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [10]:
# sfs.fit(df[selected_columns], df["target"])

In [11]:
# predictors = list(selected_columns[sfs.get_support()])

In [12]:
def backtest(model, data, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions) 

In [13]:
# predictions = backtest(rr, df, predictors)

# predictions = predictions[predictions["actual"] != 2]
# accuracy_score(predictions["actual"], predictions["predictions"])

In [14]:
# Compare to baseline
# NBA has big home team bais. Calculate home team winning %

df.groupby("home").apply(lambda x: x[x["won"] == 1].shape[0] / x.shape[0])

home
0.0    0.429066
1.0    0.570934
dtype: float64

In [15]:
# Build new rolling df that takes average of teams previous 10 games

df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [16]:
def find_team_average(team):
    rolling = team.rolling(5).mean()
    return rolling

# Groupby team to only get team averages and grouping by season since teams change between seasons
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_average)

  rolling = team.rolling(5).mean()


In [17]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17321,0.0,0.472727,0.409375,0.461244,0.537931,0.548485,0.460808,0.283721,0.241270,0.789965,...,0.0902,0.1298,0.459958,0.232349,0.719431,0.557647,0.405769,0.4,0.6,2022.0
17322,0.0,0.454545,0.409375,0.439713,0.503448,0.560606,0.421853,0.269767,0.238095,0.758926,...,0.0428,0.1432,0.344654,0.292940,0.791469,0.487059,0.375000,0.4,0.6,2022.0
17323,0.0,0.390909,0.356250,0.398086,0.517241,0.493939,0.481473,0.339535,0.323810,0.696616,...,0.0610,0.1490,0.491614,0.221566,0.591469,0.491765,0.396154,0.6,0.4,2022.0
17324,0.0,0.450000,0.421875,0.425359,0.503448,0.563636,0.419715,0.255814,0.215873,0.821237,...,0.0438,0.1378,0.484486,0.315533,0.640758,0.487059,0.317308,0.6,0.8,2022.0


In [18]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [19]:
df = df.dropna()
d

105      0
112      1
115      1
118      1
120      0
        ..
17319    1
17320    0
17321    1
17322    1
17323    0
Name: target, Length: 16456, dtype: int64

In [20]:
def shift_cols(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_cols(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
105,0.0,0.477273,0.359375,0.509569,0.103448,0.242424,0.178147,0.441860,0.380952,0.766628,...,0.088575,0.488152,0.338824,0.361538,0.6,0.6,2016.0,0.0,POR,2015-11-05
112,0.0,0.409091,0.468750,0.339713,0.241379,0.257576,0.395487,0.372093,0.333333,0.735123,...,0.099615,0.408531,0.383529,0.317308,0.6,0.8,2016.0,1.0,BRK,2015-11-04
115,0.0,0.568182,0.468750,0.526316,0.482759,0.393939,0.554632,0.046512,0.111111,0.270712,...,0.148652,0.243602,0.378824,0.365385,0.6,0.6,2016.0,1.0,OKC,2015-11-05
118,0.0,0.500000,0.265625,0.629187,0.379310,0.318182,0.522565,0.325581,0.492063,0.380397,...,0.215661,0.392417,0.416471,0.342308,0.6,0.6,2016.0,1.0,MEM,2015-11-05
120,0.0,0.454545,0.343750,0.495215,0.275862,0.287879,0.413302,0.255814,0.206349,0.833139,...,0.113222,0.363981,0.496471,0.438462,0.4,0.6,2016.0,0.0,CHI,2015-11-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17321,0.0,0.477273,0.484375,0.409091,0.517241,0.590909,0.414489,0.255814,0.222222,0.766628,...,0.232349,0.719431,0.557647,0.405769,0.4,0.6,2022.0,1.0,BOS,2022-06-13
17322,0.0,0.500000,0.437500,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.292940,0.791469,0.487059,0.375000,0.4,0.6,2022.0,0.0,BOS,2022-06-16
17323,0.0,0.272727,0.234375,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.221566,0.591469,0.491765,0.396154,0.6,0.4,2022.0,1.0,GSW,2022-06-16
17324,0.0,0.431818,0.500000,0.344498,0.655172,0.636364,0.490499,0.162791,0.111111,1.000000,...,0.315533,0.640758,0.487059,0.317308,0.6,0.8,2022.0,,,


In [21]:
# Adding 

full = df.merge(
    df[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on = ["team", "date_next"],
    right_on = ["team_opp_next", "date_next"]
)

In [22]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date,date_next
0,MEM,POR,POR,MEM,2015-11-03,2015-11-05
1,CHI,OKC,OKC,CHI,2015-11-03,2015-11-05
2,POR,MEM,MEM,POR,2015-11-04,2015-11-05
3,OKC,CHI,CHI,OKC,2015-11-04,2015-11-05
4,TOR,ORL,ORL,TOR,2015-11-04,2015-11-06
...,...,...,...,...,...,...
16401,GSW,BOS,BOS,GSW,2022-06-08,2022-06-10
16402,BOS,GSW,GSW,BOS,2022-06-10,2022-06-13
16403,GSW,BOS,BOS,GSW,2022-06-10,2022-06-13
16404,GSW,BOS,BOS,GSW,2022-06-13,2022-06-16


In [23]:
removed_columns = removed_columns + list(full.columns[full.dtypes == "object"])

In [24]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [None]:
sfs.fit(full[selected_columns], full["target"])

In [None]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 'trb%',
 'usg%',
 'drtg',
 'ft%_max',
 'orb_max',
 'blk_max',
 'tov_max',
 'orb%_max',
 'usg%_opp',
 'ast%_max_opp',
 'usg%_10_x',
 'ft_max_10_x',
 'fta_max_10_x',
 'drb_max_10_x',
 '+/-_max_10_x',
 'drtg_max_10_x',
 'usg%_opp_10_x',
 '3p_max_opp_10_x',
 'trb_max_opp_10_x',
 'pf_max_opp_10_x',
 'drb%_max_opp_10_x',
 'tov%_max_opp_10_x',
 'won_10_x',
 'home_next',
 'ft_10_y',
 'usg%_10_y',
 '+/-_max_10_y',
 'usg%_max_10_y',
 'ortg_max_10_y',
 'usg%_opp_10_y',
 'stl_max_opp_10_y',
 'pts_max_opp_10_y',
 'ts%_max_opp_10_y',
 'stl%_max_opp_10_y']

In [None]:
predictions = backtest(rr, full, predictors)
accuracy_score(predictions["actual"], predictions["predictions"])

0.6182646091576018

In [None]:
predictions

Unnamed: 0,actual,predictions
4547,0,0
4548,1,1
4549,1,0
4550,1,0
4551,0,0
...,...,...
16401,1,0
16402,0,0
16403,1,1
16404,1,0
