In [1]:
import numpy as np
import pandas as pd 

In [2]:
SCRAPED_DATA = "../scrape/nba_games.csv"
SCRAPED_DATA_2023 = "../scrape/nba_games_2023.csv"
DOWNLOADED_DATA = "../../../nba_games.csv"
df = pd.read_csv(SCRAPED_DATA_2023, index_col = 0)
df = df.sort_values("date")
df = df.reset_index(drop=True)
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [3]:
# Adding in spread value to act as our target
df["spread"] = df["total"] - df["total_opp"]

date_team_groups = df.groupby(['date', 'team'])

# You can now access the groups using the get_group() function
date_team_groups.get_group(('2022-01-01', 'GSW'))

  df["spread"] = df["total"] - df["total_opp"]


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,spread
15737,240.0,46.0,86.0,0.535,17.0,32.0,0.531,14.0,21.0,0.667,...,29.2,163.0,127.0,UTA,116,1,2022,2022-01-01,True,7


In [4]:
# Find the null values in the df
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
nulls

+/-             18382
mp_max          18382
mp_max.1        18382
+/-_opp         18382
mp_max_opp      18382
mp_max_opp.1    18382
dtype: int64

In [5]:
# Remove the columns containing null values
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,spread
0,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,-12
1,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False,-16
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,2
3,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,-2
4,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,240.0,47.0,91.0,0.516,18.0,45.0,0.400,17.0,25.0,0.680,...,40.3,244.0,132.0,HOU,114,0,2023,2022-12-29,True,15
18378,240.0,44.0,91.0,0.484,11.0,39.0,0.282,17.0,22.0,0.773,...,29.4,235.0,125.0,LAC,110,0,2023,2022-12-29,True,6
18379,240.0,48.0,95.0,0.505,13.0,29.0,0.448,10.0,14.0,0.714,...,27.3,166.0,131.0,TOR,106,1,2023,2022-12-29,True,13
18380,240.0,43.0,89.0,0.483,15.0,39.0,0.385,9.0,14.0,0.643,...,39.8,248.0,119.0,BOS,116,1,2023,2022-12-29,False,-6


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_absolute_error

model = RandomForestRegressor(n_estimators=100, random_state=42)
split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(model, n_features_to_select=35, direction="forward", cv=split)

In [7]:
removed_columns = ["season", "date", "won", "spread", "team", "team_opp", "total", "total_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [8]:
# Scale b/w 0-1 to improve ridge regression performance
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [44]:
def backtest(model, data, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["spread"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        
        combined = pd.concat([test["spread"], preds], axis=1)
        combined.columns = ["actual", "predictions"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions) 

In [10]:
# Build new dataframe that takes rolling data from previous 10 games

df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_average(team):
    rolling = team.rolling(10).mean()
    return rolling

# Groupby team to only get team averages and grouping by season since teams change between seasons
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_average)
df_rolling

  rolling = team.rolling(10).mean()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,0.737923,...,0.386797,0.0579,0.1674,0.264256,0.155841,0.489100,0.560000,0.4,0.7,2023.0
18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,0.748425,...,0.313777,0.0946,0.0757,0.337631,0.156868,0.496209,0.515294,0.3,0.5,2023.0
18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,0.698833,...,0.332721,0.0499,0.0740,0.381971,0.184339,0.454502,0.535294,0.5,0.6,2023.0
18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,0.745391,...,0.401493,0.0578,0.0924,0.473585,0.166752,0.550711,0.558824,0.5,0.7,2023.0


In [11]:
df_rolling["team"] = df["team"]
df_rolling["spread"] = df["spread"]
df_rolling = df_rolling.dropna()
df_rolling = df_rolling.reset_index()

df_rolling

Unnamed: 0,index,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,season,team,spread
0,234,0.000,0.468182,0.379688,0.482775,0.320690,0.328788,0.430641,0.346512,0.295238,...,0.0679,0.413522,0.124134,0.361611,0.449412,0.4,0.8,2016.0,ATL,8
1,250,0.000,0.529545,0.440625,0.506699,0.420690,0.392424,0.493349,0.395349,0.357143,...,0.0772,0.469497,0.219641,0.394787,0.531765,0.5,1.0,2016.0,GSW,13
2,252,0.000,0.322727,0.354687,0.323684,0.165517,0.210606,0.315558,0.444186,0.382540,...,0.1145,0.437841,0.138126,0.507109,0.360000,0.6,0.4,2016.0,MEM,1
3,257,0.000,0.375000,0.343750,0.390431,0.224138,0.239394,0.382423,0.548837,0.471429,...,0.1072,0.380294,0.273427,0.270616,0.478824,0.6,0.7,2016.0,TOR,19
4,262,0.000,0.381818,0.351562,0.396172,0.268966,0.266667,0.420071,0.374419,0.358730,...,0.0759,0.512159,0.133633,0.277251,0.388235,0.4,0.6,2016.0,IND,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16217,18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,...,0.1674,0.264256,0.155841,0.489100,0.560000,0.4,0.7,2023.0,DAL,15
16218,18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,...,0.0757,0.337631,0.156868,0.496209,0.515294,0.3,0.5,2023.0,BOS,6
16219,18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,...,0.0740,0.381971,0.184339,0.454502,0.535294,0.5,0.6,2023.0,MEM,13
16220,18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,...,0.0924,0.473585,0.166752,0.550711,0.558824,0.5,0.7,2023.0,LAC,-6


In [12]:
from sklearn.feature_selection import SequentialFeatureSelector

In [13]:
# X = df_rolling[selected_columns]
# y = df_rolling["spread"]

# sfs.fit(X, y)

In [14]:
# predictors = list(selected_columns[sfs.get_support()])
# predictors

In [15]:
saved_predictors = [
    'mp',
    'fg%',
    'blk',
    'pf',
    'pts',
    'ts%',
    'trb%',
    'ast%',
    'usg%',
    '3pa_max',
    'ft%_max',
    'ast_max',
    'blk_max',
    'tov_max',
    '+/-_max',
    'ortg_max',
    'home',
    'mp_opp',
    'ast_opp',
    'blk_opp',
    'pts_opp',
    'ftr_opp',
    'trb%_opp',
    'tov%_opp',
    'usg%_opp',
    'ortg_opp',
    'fg_max_opp',
    'orb_max_opp',
    'drb_max_opp',
    'stl_max_opp',
    '+/-_max_opp',
    'stl%_max_opp',
    'tov%_max_opp',
    'ortg_max_opp',
    'home_opp'
]

In [16]:
predictions = backtest(model, df_rolling, saved_predictors)
mae = mean_absolute_error(predictions["actual"], predictions["predictions"])
mae

preds season 2018.0 = [ 4.86 -9.47  6.47 ...  8.67 -3.05  9.37]
preds season 2019.0 = [ 8.76  3.89 -3.86 ...  6.04  9.77 -0.44]
preds season 2020.0 = [ 1.57 -9.08 -8.57 ... -3.33  5.31 -5.25]
preds season 2021.0 = [ 5.42 -3.12 10.23 ...  2.46  3.63 -1.84]
preds season 2022.0 = [ 2.1   4.92 -3.25 ...  3.43  0.99  5.01]
preds season 2023.0 = [-1.150e+00 -1.580e+01  1.120e+00  1.770e+00 -5.200e-01  8.900e-01
 -8.990e+00 -5.880e+00  1.490e+00 -1.264e+01 -5.410e+00 -1.510e+00
 -9.600e-01 -4.600e+00  5.700e+00 -4.340e+00  3.000e-01  5.190e+00
  2.500e-01 -7.600e-01 -7.460e+00 -8.330e+00  6.460e+00  5.590e+00
 -3.070e+00  7.700e-01  6.050e+00  5.860e+00  1.800e+00  1.114e+01
  1.199e+01  1.190e+00  1.930e+00 -2.810e+00 -5.000e-01 -2.600e-01
 -1.190e+01 -8.490e+00 -2.320e+00  6.000e-02 -8.550e+00 -4.500e-01
 -5.920e+00 -5.140e+00 -6.200e-01 -5.640e+00  1.234e+01 -2.010e+00
 -4.110e+00 -1.071e+01  6.200e-01  1.520e+00  7.880e+00 -7.600e+00
  6.160e+00  2.190e+00  2.600e-01  4.720e+00 -8.960e+00

10.543120923231308

In [17]:
# Predict the spread for a specific game

model.predict([df_rolling.loc[16198][saved_predictors]])[0]



-1.97

## Future Predictions

Set up home_predictors representing stats from the home team and opp_predictors which are stats from away team. Grab the specific stats from each team and combine the data to compose the input for the the prediction. This example using GSW as the home team and DET as the away team.
Note: This is for testing purposes and needs to be changed.

In [18]:
home_predictors = [
    'mp',
    'fg%',
    'blk',
    'pf',
    'pts',
    'ts%',
    'trb%',
    'ast%',
    'usg%',
    '3pa_max',
    'ft%_max',
    'ast_max',
    'blk_max',
    'tov_max',
    '+/-_max',
    'ortg_max',
    'home'
]

In [19]:
# Group the DataFrame by the 'team' column
team_groups = df_rolling.groupby('team')

# Select the group for the team 'GSW'
gsw_group = team_groups.get_group('GSW').iloc[-1]

gsw_group = gsw_group[home_predictors]

home_predictors =  gsw_group.index

home_predictors

Index(['mp', 'fg%', 'blk', 'pf', 'pts', 'ts%', 'trb%', 'ast%', 'usg%',
       '3pa_max', 'ft%_max', 'ast_max', 'blk_max', 'tov_max', '+/-_max',
       'ortg_max', 'home'],
      dtype='object')

In [20]:
opp_predictors = [
    'mp',
    'ast',
    'blk',
    'pts',
    'ftr',
    'trb%',
    'tov%',
    'usg%',
    'ortg',
    'fg_max',
    'orb_max',
    'drb_max',
    'stl_max',
    '+/-_max',
    'stl%_max',
    'tov%_max',
    'ortg_max',
    'home'
]

In [21]:
team_groups = df_rolling.groupby('team')

# Select the group for the team 'GSW'
det_group = team_groups.get_group('DET').iloc[-1]

det_group = det_group[opp_predictors]

det_group

mp              0.05
ast         0.369767
blk             0.17
pts         0.496154
ftr         0.431486
trb%        0.514865
tov%        0.442164
usg%             0.0
ortg        0.494731
fg_max      0.252381
orb_max     0.376923
drb_max         0.27
stl_max          0.2
+/-_max     0.393333
stl%_max      0.1703
tov%_max    0.408281
ortg_max    0.495261
home             0.6
Name: 16199, dtype: object

In [22]:
#opp_cols = {f"{col}_opp" for col in det_group.columns}
det_group_opp = det_group.rename(index={col: col + '_opp' for col in det_group.index})
opp_predictors = det_group.index
opp_predictors

Index(['mp', 'ast', 'blk', 'pts', 'ftr', 'trb%', 'tov%', 'usg%', 'ortg',
       'fg_max', 'orb_max', 'drb_max', 'stl_max', '+/-_max', 'stl%_max',
       'tov%_max', 'ortg_max', 'home'],
      dtype='object')

In [23]:
input = pd.concat([gsw_group, det_group.rename(index={col: col + '_opp' for col in det_group.index})])

In [24]:
model.predict([input])[0]



2.1

In [25]:
import pickle

data = {"model": model, "data": df_rolling, "home": home_predictors, "away": opp_predictors}
with open('../../app/saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [26]:
with open('../../app/saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
home = data["home"]
away = data["away"]

home
away

Index(['mp', 'ast', 'blk', 'pts', 'ftr', 'trb%', 'tov%', 'usg%', 'ortg',
       'fg_max', 'orb_max', 'drb_max', 'stl_max', '+/-_max', 'stl%_max',
       'tov%_max', 'ortg_max', 'home'],
      dtype='object')

## Improve testing error by combining df with rolling data

In [27]:
df_roll_opp = df[list(selected_columns) + ["won", "team_opp", "season"]]

def find_opp_average(opp):
    rolling = opp.rolling(10).mean()
    return rolling

df_roll_opp = df_roll_opp.groupby(["team_opp", "season"], group_keys=False).apply(find_opp_average)
df_roll_opp["team_opp"] = df["team_opp"]
df_roll_opp

  rolling = opp.rolling(10).mean()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,season,team_opp
0,,,,,,,,,,,...,,,,,,,,,,DET
1,,,,,,,,,,,...,,,,,,,,,,GSW
2,,,,,,,,,,,...,,,,,,,,,,CLE
3,,,,,,,,,,,...,,,,,,,,,,CHI
4,,,,,,,,,,,...,,,,,,,,,,NOP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.000,0.484091,0.459375,0.441148,0.513793,0.587879,0.418527,0.388372,0.344444,0.747958,...,0.0689,0.1057,0.317296,0.144544,0.526066,0.541176,0.7,0.7,2023.0,HOU
18378,0.025,0.468182,0.420312,0.445455,0.348276,0.406061,0.385154,0.402326,0.350794,0.776429,...,0.0640,0.0521,0.414570,0.157638,0.526540,0.477647,0.5,0.3,2023.0,LAC
18379,0.025,0.518182,0.340625,0.584928,0.458621,0.448485,0.471615,0.404651,0.353968,0.751109,...,0.0580,0.1322,0.370335,0.139923,0.361611,0.602353,0.5,0.8,2023.0,TOR
18380,0.025,0.529545,0.482812,0.470096,0.420690,0.463636,0.419002,0.334884,0.296825,0.741890,...,0.0449,0.0958,0.475577,0.167266,0.576777,0.509412,0.7,0.5,2023.0,BOS


In [28]:
opp_columns = df_roll_opp.columns[df_roll_opp.columns.str.contains("_opp")]
df_roll_opp = df_roll_opp[opp_columns]

In [29]:
# Build new dataframe that takes rolling data from previous 10 games

removed_columns = ["season", "date", "won", "spread", "team", "team_opp", "total", "total_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

df_roll = df[list(selected_columns) + ["won", "team", "season"]]

def find_team_average(team):
    rolling = team.rolling(10).mean()
    return rolling

# Groupby team to only get team averages and grouping by season since teams change between seasons
df_roll = df_roll.groupby(["team", "season"], group_keys=False).apply(find_team_average)
df_roll

  rolling = team.rolling(10).mean()


Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,0.737923,...,0.386797,0.0579,0.1674,0.264256,0.155841,0.489100,0.560000,0.4,0.7,2023.0
18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,0.748425,...,0.313777,0.0946,0.0757,0.337631,0.156868,0.496209,0.515294,0.3,0.5,2023.0
18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,0.698833,...,0.332721,0.0499,0.0740,0.381971,0.184339,0.454502,0.535294,0.5,0.6,2023.0
18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,0.745391,...,0.401493,0.0578,0.0924,0.473585,0.166752,0.550711,0.558824,0.5,0.7,2023.0


In [30]:
selected_cols = df_roll.columns[~df_roll.columns.isin(opp_columns)]
df_roll = df_roll[list(selected_cols)]

In [31]:
df[['team', 'team_opp', 'spread']]

Unnamed: 0,team,team_opp,spread
0,ATL,DET,-12
1,NOP,GSW,-16
2,CHI,CLE,2
3,CLE,CHI,-2
4,GSW,NOP,16
...,...,...,...
18377,DAL,HOU,15
18378,BOS,LAC,6
18379,MEM,TOR,13
18380,LAC,BOS,-6


In [32]:
full_roll = pd.concat([df_roll, df_roll_opp], axis=1)
full_roll[["team", "team_opp", "spread"]] = df[["team", "team_opp", "spread"]]
full_roll

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,team_opp,team,spread
0,,,,,,,,,,,...,,,,,,,,DET,ATL,-12
1,,,,,,,,,,,...,,,,,,,,GSW,NOP,-16
2,,,,,,,,,,,...,,,,,,,,CLE,CHI,2
3,,,,,,,,,,,...,,,,,,,,CHI,CLE,-2
4,,,,,,,,,,,...,,,,,,,,NOP,GSW,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,0.737923,...,0.0689,0.1057,0.317296,0.144544,0.526066,0.541176,0.7,HOU,DAL,15
18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,0.748425,...,0.0640,0.0521,0.414570,0.157638,0.526540,0.477647,0.5,LAC,BOS,6
18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,0.698833,...,0.0580,0.1322,0.370335,0.139923,0.361611,0.602353,0.5,TOR,MEM,13
18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,0.745391,...,0.0449,0.0958,0.475577,0.167266,0.576777,0.509412,0.7,BOS,LAC,-6


In [33]:
full_roll = full_roll.dropna()
full_roll

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,team_opp,team,spread
252,0.000,0.322727,0.354687,0.323684,0.165517,0.210606,0.315558,0.444186,0.382540,0.764177,...,0.0511,0.0731,0.351468,0.165083,0.429384,0.451765,0.5,POR,MEM,1
273,0.000,0.445455,0.359375,0.473206,0.344828,0.346970,0.443587,0.346512,0.349206,0.652742,...,0.0705,0.0709,0.290985,0.140180,0.379621,0.454118,0.4,MEM,POR,-1
274,0.025,0.534091,0.442188,0.510526,0.431034,0.398485,0.499644,0.369767,0.338095,0.736756,...,0.0655,0.0766,0.393501,0.095250,0.275355,0.461176,0.3,BRK,GSW,8
275,0.050,0.438636,0.378125,0.447608,0.317241,0.351515,0.401306,0.372093,0.373016,0.636639,...,0.0550,0.0974,0.476834,0.190629,0.297156,0.488235,0.6,MIL,CLE,-3
278,0.050,0.388636,0.332813,0.422727,0.244828,0.236364,0.426485,0.404651,0.341270,0.783897,...,0.0511,0.0875,0.561530,0.166624,0.376777,0.370588,0.5,CLE,MIL,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.050,0.443182,0.362500,0.460766,0.568966,0.616667,0.437648,0.455814,0.403175,0.737923,...,0.0689,0.1057,0.317296,0.144544,0.526066,0.541176,0.7,HOU,DAL,15
18378,0.025,0.484091,0.481250,0.420813,0.479310,0.607576,0.373753,0.423256,0.371429,0.748425,...,0.0640,0.0521,0.414570,0.157638,0.526540,0.477647,0.5,LAC,BOS,6
18379,0.000,0.559091,0.487500,0.499761,0.351724,0.433333,0.372803,0.427907,0.401587,0.698833,...,0.0580,0.1322,0.370335,0.139923,0.361611,0.602353,0.5,TOR,MEM,13
18380,0.025,0.495455,0.443750,0.462201,0.527586,0.528788,0.466390,0.372093,0.328571,0.745391,...,0.0449,0.0958,0.475577,0.167266,0.576777,0.509412,0.7,BOS,LAC,-6


In [34]:
full_roll[["team", "team_opp", "spread"]]

Unnamed: 0,team,team_opp,spread
252,MEM,POR,1
273,POR,MEM,-1
274,GSW,BRK,8
275,CLE,MIL,-3
278,MIL,CLE,3
...,...,...,...
18377,DAL,HOU,15
18378,BOS,LAC,6
18379,MEM,TOR,13
18380,LAC,BOS,-6


In [35]:
removed_columns = removed_columns + list(full_roll.columns[full_roll.dtypes == "object"])
selected_columns = full_roll.columns[~full_roll.columns.isin(removed_columns)]

In [36]:
# sfs.fit(full_roll[selected_columns], full_roll["spread"])

In [37]:
# predictors = list(selected_columns[sfs.get_support()])
# predictors

In [38]:
predictors = [
    '3p%',
     'ft',
     'ast',
     'usg%',
     'ortg',
     'drtg',
     'orb_max',
     'drb_max',
     '+/-_max',
     'ts%_max',
     'stl%_max',
     'home',
     'mp_opp',
     'fg%_opp',
     '3pa_opp',
     'fta_opp',
     'pf_opp',
     'ts%_opp',
     'ftr_opp',
     'drb%_opp',
     'trb%_opp',
     'ast%_opp',
     'tov%_opp',
     'usg%_opp',
     'ortg_opp',
     'drtg_opp',
     '3p_max_opp',
     '3p%_max_opp',
     'ft%_max_opp',
     'orb_max_opp',
     '+/-_max_opp',
     'ast%_max_opp',
     'blk%_max_opp',
     'usg%_max_opp',
     'drtg_max_opp'
]

In [39]:
predictors = [
    '3p%',
     'ft',
     'ast',
     'usg%',
     'ortg',
     'drtg',
     'orb_max',
     'drb_max',
     '+/-_max',
     'ts%_max',
     'stl%_max',
     'home',
     'fg%_opp',
     '3pa_opp',
     'fta_opp',
     'pf_opp',
     'ts%_opp',
     'ftr_opp',
     'drb%_opp',
     'trb%_opp',
     'ast%_opp',
     'tov%_opp',
     'usg%_opp',
     'ortg_opp',
     'drtg_opp',
     '3p_max_opp',
     '3p%_max_opp',
     'ft%_max_opp',
     'orb_max_opp',
     '+/-_max_opp',
     'ast%_max_opp',
     'blk%_max_opp',
     'usg%_max_opp',
     'drtg_max_opp'
]

In [59]:
predictions = backtest(model, full_roll, predictors)
mae = mean_absolute_error(predictions["actual"], predictions["predictions"])
mae

9.63592552118359

In [51]:
def fulltest(model, data, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    train = data[data["season"] < 2023]
    test = data[data["season"] == 2023]
    
    model.fit(train[predictors], train["spread"])  
    
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index)
        
    combined = pd.concat([test["spread"], preds], axis=1)
    combined.columns = ["actual", "predictions"]
                
    return combined

In [58]:
predictions = fulltest(model, df, predictors)
mae = mean_absolute_error(predictions["actual"], predictions["predictions"])
mae

1.2634185606060606

In [60]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,spread
0,0.0,0.409091,0.343750,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,106,0.0,2016,2015-10-27,False,-12
1,0.0,0.363636,0.359375,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,111,1.0,2016,2015-10-27,False,-16
2,0.0,0.409091,0.421875,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,95,0.0,2016,2015-10-27,True,2
3,0.0,0.431818,0.531250,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,97,1.0,2016,2015-10-27,False,-2
4,0.0,0.500000,0.562500,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,95,0.0,2016,2015-10-27,True,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18377,0.0,0.636364,0.484375,0.590909,0.620690,0.621212,0.475059,0.372093,0.380952,0.626604,...,0.233633,0.734597,0.647059,HOU,114,0.0,2023,2022-12-29,True,15
18378,0.0,0.568182,0.484375,0.514354,0.379310,0.530303,0.334917,0.372093,0.333333,0.735123,...,0.093710,0.691943,0.564706,LAC,110,0.0,2023,2022-12-29,True,6
18379,0.0,0.659091,0.546875,0.564593,0.448276,0.378788,0.532067,0.209302,0.206349,0.666278,...,0.066752,0.364929,0.635294,TOR,106,1.0,2023,2022-12-29,True,13
18380,0.0,0.545455,0.453125,0.511962,0.517241,0.530303,0.457245,0.186047,0.206349,0.583431,...,0.227214,0.753555,0.494118,BOS,116,1.0,2023,2022-12-29,False,-6


## Testing different models

In [41]:
from sklearn.neural_network import MLPClassifier

# Declare the model
nn = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, alpha=0.001,
                    solver='sgd', verbose=0, tol=1e-4, activation='logistic', random_state=1,
                    learning_rate_init=.1)

# {'solver': 'sgd', 'learning_rate_init': 0.1, 'hidden_layer_sizes': (10,), 'alpha': 0.001, 'activation': 'logistic'}

In [42]:
predictions = backtest(nn, full_roll, predictors)
mae = mean_absolute_error(predictions["actual"], predictions["predictions"])
mae

preds season 2018.0 = [ 7 -7  7 ...  7  7  7]
preds season 2019.0 = [ 7  7 -7 ...  7  7 -7]
preds season 2020.0 = [ 5 -7  7 ... -5  7 -5]
preds season 2021.0 = [-7  7 -7 ...  7  7 -7]
preds season 2022.0 = [ 7 -3  7 ...  3 -4  3]
preds season 2023.0 = [  9  -7  -8   8  -7  -7   9  -7   6   9  -4  -4   8  -7   9   6  -4  -7
   8  -7   5  14   9   8  -7  -7  -7  -7   9   9  -7   8  -7  -7  -7   8
   9  -7  -4  -7  -7  -8   8  -7   9  -7   8   8  -7  -7   9   9   9 -15
   9  -7   9  -7   8  -7   9   8  -8  -7   9   8   6  -7  -7   8  -7   8
  -4  -7   8   8  -7  -7  -7   9   6  -7   8   8  -7   6   5  -7   8   8
  -7  -7   8  -7   6   9  -7   9   8  -7  -7   9   9  -7   9   9   6  -8
  -7  -7  -7   5   4  -4  -7   5   8  -7  -7  -7  -7  -3   8  -7  -7   9
  -7   9   8   8  -8   5   9  -7  -4   8  -8  -7  -7   8  -7  -3  -4   6
  -7  -7   9   8  -4  -7  -4  -7   9   9  -7   8  -7  -7   9   9  -8   8
  -7   6   9  -7  -4   5   9  -7  -7   8   9  -7  -7   6  -7  -7  -4   6
  -4   8   6   8  



10.45569939475454

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets
X = full_roll[predictors]
y = full_roll['spread']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=1000, alpha=1e-4,
                    solver='sgd', verbose=0, tol=1e-4, random_state=1,
                    learning_rate_init=.1)

# Train the model on the training data
nn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = nn.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 254.8617911372792
