In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Candidate input files; only the 2023 scrape is read below.
SCRAPED_DATA = "../scrape/nba_games.csv"
SCRAPED_DATA_2023 = "../scrape/nba_games_2023.csv"
DOWNLOADED_DATA = "../../nba_games.csv"

# Read the games, order them chronologically, renumber the rows from 0 and
# discard the "mp.1", "mp_opp.1" and "index_opp" columns.
df = (
    pd.read_csv(SCRAPED_DATA_2023, index_col=0)
    .sort_values("date")
    .reset_index(drop=True)
    .drop(columns=["mp.1", "mp_opp.1", "index_opp"])
)

In [3]:
# Regression target: the row team's total minus the opponent's total.
df["spread"] = df["total"].sub(df["total_opp"])

# Group rows by (date, team) so a single game can be looked up by key.
date_team_groups = df.groupby(["date", "team"])

# Example lookup: the GSW row(s) dated 2022-01-01.
date_team_groups.get_group(("2022-01-01", "GSW"))

  df["spread"] = df["total"] - df["total_opp"]


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,spread
15737,240.0,46.0,86.0,0.535,17.0,32.0,0.531,14.0,21.0,0.667,...,29.2,163.0,127.0,UTA,116,1,2022,2022-01-01,True,7


In [4]:
# Missing-value count per column, restricted to columns that have at least one.
null_counts = df.isna().sum()
nulls = null_counts[null_counts > 0]
nulls

+/-             18382
mp_max          18382
mp_max.1        18382
+/-_opp         18382
mp_max_opp      18382
mp_max_opp.1    18382
dtype: int64

In [5]:
# Keep only the columns that do not appear in `nulls` (i.e. have no missing values).
has_no_nulls = ~df.columns.isin(nulls.index)
valid_columns = df.columns[has_no_nulls]
df = df.loc[:, valid_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,spread
0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,-12
1,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,-16
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,2
3,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,-2
4,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,240.0,47.0,91.0,0.516,18.0,45.0,0.400,17.0,25.0,0.680,...,40.3,244.0,132.0,HOU,114,0,2023,2022-12-29,True,15
18378,240.0,44.0,91.0,0.484,11.0,39.0,0.282,17.0,22.0,0.773,...,29.4,235.0,125.0,LAC,110,0,2023,2022-12-29,True,6
18379,240.0,48.0,95.0,0.505,13.0,29.0,0.448,10.0,14.0,0.714,...,27.3,166.0,131.0,TOR,106,1,2023,2022-12-29,True,13
18380,240.0,43.0,89.0,0.483,15.0,39.0,0.385,9.0,14.0,0.643,...,39.8,248.0,119.0,BOS,116,1,2023,2022-12-29,False,-6


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_absolute_error

# Regressor used for the spread; the fixed random_state makes refits repeatable.
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Cross-validation splitter handed to the feature selector.
split = TimeSeriesSplit(n_splits=3)

# Forward sequential selection of 35 features, evaluated with `model` on `split`.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=35,
    direction="forward",
    cv=split,
)

In [7]:
# Columns excluded from the feature set; everything else in `df` is a candidate.
removed_columns = ["season", "date", "won", "spread", "team", "team_opp", "total", "total_opp"]
is_candidate = ~df.columns.isin(removed_columns)
selected_columns = df.columns[is_candidate]

In [8]:
# Rescale every candidate feature column into the [0, 1] range. The scaler is
# fitted on all rows of `df` at once.
# NOTE(review): this step was originally motivated by ridge regression, but the
# estimator defined in this notebook is a RandomForestRegressor — confirm the
# scaling is still wanted.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[selected_columns])
df[selected_columns] = scaled_values

In [9]:
def backtest(model, data, predictors, start=2, step=1, target="spread"):
    """Walk-forward evaluation of ``model``, one season at a time.

    The distinct values of ``data["season"]`` are sorted; for every season at
    position ``start``, ``start + step``, ... the model is refit on all rows
    from strictly earlier seasons and then asked to predict that season.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit on every iteration, so after the call it holds the fit made for
        the last evaluated season.
    data : pandas.DataFrame
        Must contain a ``"season"`` column, the ``target`` column and every
        column listed in ``predictors``.
    predictors : list-like of str
        Feature columns passed to ``fit`` / ``predict``.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.
    target : str, default "spread"
        Column holding the value to predict.

    Returns
    -------
    pandas.DataFrame
        Columns ``"actual"`` and ``"predictions"``, indexed like the predicted
        rows of ``data``. Empty (same two columns) when ``start`` lies beyond
        the last season, instead of failing inside ``pd.concat``.
    """
    all_predictions = []

    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # Train only on seasons that finished before the one being predicted.
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train[target])

        # Re-index the raw prediction array so it lines up with the test rows.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test[target], preds], axis=1)
        combined.columns = ["actual", "predictions"]

        all_predictions.append(combined)

    if not all_predictions:
        # pd.concat([]) raises ValueError; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=["actual", "predictions"])

    return pd.concat(all_predictions)

In [10]:
# Input for the 10-game rolling features: every candidate predictor plus the
# outcome flag and the two grouping keys.
rolling_input_columns = [*selected_columns, "won", "team", "season"]
df_rolling = df[rolling_input_columns]

def find_team_average(team, window=10):
    """Return the trailing ``window``-row mean of the numeric columns of ``team``.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of one group, in the order the window should move over them.
    window : int, default 10
        Number of consecutive rows averaged; the first ``window - 1`` rows of
        the result are NaN.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``; only numeric and boolean columns are kept
        (booleans are averaged as 0/1).

    Notes
    -----
    Non-numeric columns such as the team label are dropped explicitly. Relying
    on ``rolling`` to skip them silently only emits a FutureWarning on older
    pandas and raises on pandas 2.x.

    Each window ends at, and includes, the current row.
    NOTE(review): when the result is paired with the same row's target, that
    game's own statistics are part of its features — confirm this is intended.
    """
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(window).mean()

# Average inside each (team, season) pair: windows stay within one team, and
# within one season because teams change between seasons. group_keys=False
# leaves the original row index on the result.
by_team_season = df_rolling.groupby(["team", "season"], group_keys=False)
df_rolling = by_team_season.apply(find_team_average)

  rolling = team.rolling(10).mean()


In [11]:
# Attach the team label and the target (matched on the row index), discard
# rows containing any missing value, then move the old index into an "index" column.
df_rolling = (
    df_rolling
    .assign(team=df["team"], spread=df["spread"])
    .dropna()
    .reset_index()
)

df_rolling

Unnamed: 0,index,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,season,team,spread
0,234,0.000,0.468182,0.379688,0.482775,0.320690,0.328788,0.430641,0.346512,0.295238,...,0.0679,0.413522,0.124134,0.361611,0.449412,0.4,0.8,2016.0,ATL,8
1,250,0.000,0.529545,0.440625,0.506699,0.420690,0.392424,0.493349,0.395349,0.357143,...,0.0772,0.469497,0.219641,0.394787,0.531765,0.5,1.0,2016.0,GSW,13
2,252,0.000,0.322727,0.354687,0.323684,0.165517,0.210606,0.315558,0.444186,0.382540,...,0.1145,0.437841,0.138126,0.507109,0.360000,0.6,0.4,2016.0,MEM,1
3,257,0.000,0.375000,0.343750,0.390431,0.224138,0.239394,0.382423,0.548837,0.471429,...,0.1072,0.380294,0.273427,0.270616,0.478824,0.6,0.7,2016.0,TOR,19
4,262,0.000,0.381818,0.351562,0.396172,0.268966,0.266667,0.420071,0.374419,0.358730,...,0.0759,0.512159,0.133633,0.277251,0.388235,0.4,0.6,2016.0,IND,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16217,18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,...,0.1674,0.264256,0.155841,0.489100,0.560000,0.4,0.7,2023.0,DAL,15
16218,18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,...,0.0757,0.337631,0.156868,0.496209,0.515294,0.3,0.5,2023.0,BOS,6
16219,18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,...,0.0740,0.381971,0.184339,0.454502,0.535294,0.5,0.6,2023.0,MEM,13
16220,18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,...,0.0924,0.473585,0.166752,0.550711,0.558824,0.5,0.7,2023.0,LAC,-6


In [12]:
# predictions = backtest(model, df_rolling, predictors)
# mae = mean_absolute_error(predictions["actual"], predictions["predictions"])
# mae

In [13]:
from sklearn.feature_selection import SequentialFeatureSelector
from tqdm import tqdm

In [14]:
# X = df_rolling[selected_columns]
# y = df_rolling["spread"]

# sfs.fit(X, y)

In [15]:
# predictors = list(selected_columns[sfs.get_support()])
# predictors

In [16]:
# Hard-coded list of the 35 feature columns used by the model below.
saved_predictors = [
    # Columns describing the row's own team.
    "mp", "fg%", "blk", "pf", "pts", "ts%", "trb%", "ast%", "usg%",
    "3pa_max", "ft%_max", "ast_max", "blk_max", "tov_max", "+/-_max",
    "ortg_max", "home",
    # Columns describing the opponent ("_opp" suffix).
    "mp_opp", "ast_opp", "blk_opp", "pts_opp", "ftr_opp", "trb%_opp",
    "tov%_opp", "usg%_opp", "ortg_opp", "fg_max_opp", "orb_max_opp",
    "drb_max_opp", "stl_max_opp", "+/-_max_opp", "stl%_max_opp",
    "tov%_max_opp", "ortg_max_opp", "home_opp",
]

In [17]:
# Walk-forward backtest on the rolling features, scored with mean absolute error.
predictions = backtest(model, df_rolling, saved_predictors)
actual_spread = predictions["actual"]
predicted_spread = predictions["predictions"]
mae = mean_absolute_error(actual_spread, predicted_spread)
mae

10.543120923231308

In [18]:
# Spot check: predict the spread for the stored row labelled 16198.
# Selecting with a list of row labels yields a one-row DataFrame, so the model
# receives the same column names it was fitted with (a list-wrapped Series
# loses them and is passed as an object array).
model.predict(df_rolling.loc[[16198], saved_predictors])[0]



-1.97

In [19]:
# Un-suffixed feature names taken from the row of the first team in a matchup.
home_predictors = [
    "mp", "fg%", "blk", "pf", "pts", "ts%", "trb%", "ast%", "usg%",
    "3pa_max", "ft%_max", "ast_max", "blk_max", "tov_max", "+/-_max",
    "ortg_max", "home",
]

In [20]:
# Group the DataFrame by the 'team' column
team_groups = df_rolling.groupby('team')

# Select the group for the team 'GSW'
gsw_group = team_groups.get_group('GSW').iloc[-1]

gsw_group = gsw_group[home_predictors]

home_predictors =  gsw_group.index

home_predictors

Index(['mp', 'fg%', 'blk', 'pf', 'pts', 'ts%', 'trb%', 'ast%', 'usg%',
       '3pa_max', 'ft%_max', 'ast_max', 'blk_max', 'tov_max', '+/-_max',
       'ortg_max', 'home'],
      dtype='object')

In [21]:
# Un-suffixed feature names taken from the opposing team's own row; they are
# given an "_opp" suffix later when the model input is assembled.
opp_predictors = [
    "mp", "ast", "blk", "pts", "ftr", "trb%", "tov%", "usg%", "ortg",
    "fg_max", "orb_max", "drb_max", "stl_max", "+/-_max", "stl%_max",
    "tov%_max", "ortg_max", "home",
]

In [22]:
# Most recent rolling-feature row for DET, reduced to the opponent-side predictors.
team_groups = df_rolling.groupby("team")
det_latest_row = team_groups.get_group("DET").iloc[-1]
det_group = det_latest_row[opp_predictors]

det_group

mp              0.05
ast         0.369767
blk             0.17
pts         0.496154
ftr         0.431486
trb%        0.514865
tov%        0.442164
usg%             0.0
ortg        0.494731
fg_max      0.252381
orb_max     0.376923
drb_max         0.27
stl_max          0.2
+/-_max     0.393333
stl%_max      0.1703
tov%_max    0.408281
ortg_max    0.495261
home             0.6
Name: 16199, dtype: object

In [23]:
# Copy of the DET row whose labels carry the "_opp" suffix.
det_group_opp = det_group.rename(index=lambda label: label + "_opp")

# `opp_predictors` becomes the Index of the un-suffixed labels.
opp_predictors = det_group.index
opp_predictors

Index(['mp', 'ast', 'blk', 'pts', 'ftr', 'trb%', 'tov%', 'usg%', 'ortg',
       'fg_max', 'orb_max', 'drb_max', 'stl_max', '+/-_max', 'stl%_max',
       'tov%_max', 'ortg_max', 'home'],
      dtype='object')

In [24]:
# One model input row: GSW's features followed by DET's features relabelled
# with "_opp". The name `input` shadows the builtin but is read by the next cell.
det_as_opponent = det_group.rename(index={label: label + '_opp' for label in det_group.index})
input = pd.concat([gsw_group, det_as_opponent])

In [25]:
# Predicted spread for the single feature row assembled above.
matchup_predictions = model.predict([input])
matchup_predictions[0]



2.1

In [26]:
import pickle
x = []  # not referenced anywhere else in the visible notebook

# Bundle the fitted model, the rolling-feature table and both predictor
# indexes, then write the bundle to disk.
data = dict(model=model, data=df_rolling, home=home_predictors, away=opp_predictors)
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [27]:
# Read the bundle back and unpack it. pickle.load executes whatever the file
# encodes, so it must only be pointed at files this notebook produced.
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded, home, away = data["model"], data["home"], data["away"]

# Only the last expression of a cell is displayed, i.e. `away`.
home
away

Index(['mp', 'ast', 'blk', 'pts', 'ftr', 'trb%', 'tov%', 'usg%', 'ortg',
       'fg_max', 'orb_max', 'drb_max', 'stl_max', '+/-_max', 'stl%_max',
       'tov%_max', 'ortg_max', 'home'],
      dtype='object')

In [28]:
# Input for a second set of 10-game rolling features: the current candidate
# predictors plus the outcome flag and the two grouping keys.
roll_input_columns = [*selected_columns, "won", "team", "season"]
df_roll = df[roll_input_columns]

def find_team_average(team, window=10):
    """Return the trailing ``window``-row mean of the numeric columns of ``team``.

    This redefines the helper of the same name from the earlier rolling-average
    cell and replaces it from this point on.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of one group, in the order the window should move over them.
    window : int, default 10
        Number of consecutive rows averaged; the first ``window - 1`` rows of
        the result are NaN.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``; only numeric and boolean columns are kept
        (booleans are averaged as 0/1).

    Notes
    -----
    Non-numeric columns such as the team label are dropped explicitly. Relying
    on ``rolling`` to skip them silently only emits a FutureWarning on older
    pandas and raises on pandas 2.x.
    """
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(window).mean()

# Average inside each (team, season) pair: windows stay within one team, and
# within one season because teams change between seasons. group_keys=False
# leaves the original row index on the result.
roll_groups = df_roll.groupby(["team", "season"], group_keys=False)
df_roll = roll_groups.apply(find_team_average)

  rolling = team.rolling(10).mean()


In [29]:
# Tag every rolling column with "_10" so it is distinguishable from the
# same-game column it was derived from.
roll_cols = ["{}_10".format(name) for name in df_roll.columns]
df_roll.columns = roll_cols

# Place the rolling columns beside the original ones, matched on the row index.
df = pd.concat([df, df_roll], axis=1)

In [30]:
# `df` already carries "team", "spread" and the "_10" rolling columns (they were
# concatenated in the previous cell), so the only remaining step is to drop the
# rows that contain a missing value. The earlier assignments into the old
# `df_roll` were discarded by this rebinding and have been removed.
# NOTE(review): the result keeps the same-game columns next to their "_10"
# averages — confirm the feature selection below is meant to see both.
df_roll = df.dropna()
df_roll

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp_10,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,home_opp_10,won_10,season_10
234,0.0,0.522727,0.406250,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.311137,0.0628,0.0679,0.413522,0.124134,0.361611,0.449412,0.4,0.8,2016.0
250,0.0,0.659091,0.453125,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.411596,0.0613,0.0772,0.469497,0.219641,0.394787,0.531765,0.5,1.0,2016.0
252,0.0,0.386364,0.406250,0.358852,0.206897,0.181818,0.445368,0.511628,0.412698,0.827305,...,0.244891,0.0625,0.1145,0.437841,0.138126,0.507109,0.360000,0.6,0.4,2016.0
257,0.0,0.340909,0.265625,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.457635,0.0699,0.1072,0.380294,0.273427,0.270616,0.478824,0.6,0.7,2016.0
262,0.0,0.500000,0.406250,0.497608,0.344828,0.318182,0.475059,0.325581,0.349206,0.593932,...,0.282434,0.0646,0.0759,0.512159,0.133633,0.277251,0.388235,0.4,0.6,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.0,0.636364,0.484375,0.590909,0.620690,0.621212,0.475059,0.372093,0.380952,0.626604,...,0.386797,0.0579,0.1674,0.264256,0.155841,0.489100,0.560000,0.4,0.7,2023.0
18378,0.0,0.568182,0.484375,0.514354,0.379310,0.530303,0.334917,0.372093,0.333333,0.735123,...,0.313777,0.0946,0.0757,0.337631,0.156868,0.496209,0.515294,0.3,0.5,2023.0
18379,0.0,0.659091,0.546875,0.564593,0.448276,0.378788,0.532067,0.209302,0.206349,0.666278,...,0.332721,0.0499,0.0740,0.381971,0.184339,0.454502,0.535294,0.5,0.6,2023.0
18380,0.0,0.545455,0.453125,0.511962,0.517241,0.530303,0.457245,0.186047,0.206349,0.583431,...,0.401493,0.0578,0.0924,0.473585,0.166752,0.550711,0.558824,0.5,0.7,2023.0


In [31]:
# Extend the exclusion list with every object-dtype column, then recompute
# the candidate feature columns from the combined frame.
object_columns = df_roll.columns[df_roll.dtypes == "object"]
removed_columns = removed_columns + list(object_columns)
is_candidate = ~df_roll.columns.isin(removed_columns)
selected_columns = df_roll.columns[is_candidate]

In [None]:
# Run the forward feature selection against the spread target.
X = df_roll[selected_columns]
y = df_roll["spread"]
sfs.fit(X, y)

In [None]:
# Names of the columns the selector kept.
support_mask = sfs.get_support()
predictors = list(selected_columns[support_mask])
predictors