In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("nba_games.csv", index_col=0)

In [3]:
df = df.sort_values("date")
df = df.reset_index(drop = True)

In [4]:
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [5]:
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team", group_keys=False).apply(add_target)

In [6]:
df["target"][pd.isnull(df["target"])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [7]:
df["target"] = df["target"].astype(int, errors="ignore")

In [8]:
df["won"].value_counts()

True     10540
False    10540
Name: won, dtype: int64

In [9]:
df["target"].value_counts()

1    10525
0    10525
2       30
Name: target, dtype: int64

In [10]:
nulls = pd.isnull(df)

In [11]:
nulls = nulls.sum()

In [12]:
nulls = nulls[nulls > 0]

In [13]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [14]:
df = df[valid_columns].copy()

In [15]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

In [16]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [17]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [18]:
sfs.fit(df[selected_columns], df["target"])

In [19]:
predictors = list(selected_columns[sfs.get_support()])

In [20]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]

        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)

        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

In [21]:
predictions = backtest(df, rr, predictors)

In [22]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions["actual"] != 2]
accuracy_score(predictions["actual"], predictions["prediction"])


0.5433654089642947

In [23]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [24]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,team,season
0,0.0,0.272727,0.188406,0.373206,0.413793,0.378788,0.491686,0.767442,0.777778,0.626604,...,0.101,0.312369,0.142490,0.255924,0.611765,0.250000,1.0,True,HOU,2015
1,0.0,0.204545,0.275362,0.203349,0.103448,0.090909,0.356295,0.697674,0.603175,0.760793,...,0.046,0.403564,0.161746,0.341232,0.376471,0.423077,0.0,False,LAL,2015
2,0.0,0.295455,0.347826,0.267943,0.137931,0.106061,0.432304,0.348837,0.317460,0.722287,...,0.163,0.475891,0.236200,0.549763,0.305882,0.355769,1.0,False,ORL,2015
3,0.0,0.500000,0.594203,0.327751,0.137931,0.196970,0.279097,0.325581,0.476190,0.397900,...,0.080,0.308176,0.251605,0.236967,0.423529,0.192308,0.0,True,NOP,2015
4,0.0,0.431818,0.260870,0.521531,0.275862,0.257576,0.452494,0.348837,0.285714,0.815636,...,0.056,0.371069,0.154044,0.454976,0.529412,0.355769,1.0,False,DAL,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21075,0.0,0.613636,0.420290,0.593301,0.379310,0.424242,0.408551,0.441860,0.460317,0.611435,...,0.110,0.596436,0.139923,0.331754,0.600000,0.307692,1.0,True,SAC,2023
21076,0.0,0.545455,0.579710,0.385167,0.827586,0.863636,0.466746,0.232558,0.206349,0.750292,...,0.054,0.401468,0.148909,0.526066,0.635294,0.346154,1.0,True,DAL,2023
21077,0.0,0.500000,0.362319,0.509569,0.344828,0.318182,0.475059,0.534884,0.476190,0.736289,...,0.145,0.228512,0.181001,0.554502,0.588235,0.451923,1.0,True,POR,2023
21078,0.0,0.340909,0.391304,0.291866,0.172414,0.303030,0.247031,0.511628,0.396825,0.865811,...,0.142,0.371069,0.094994,0.308057,0.258824,0.394231,0.0,False,CHO,2023


In [25]:
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  rolling = team.rolling(10).mean()


In [26]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

In [27]:
df = df.dropna()

In [28]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")


In [29]:
df = df.copy()

In [30]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
244,0.00,0.363636,0.289855,0.404306,0.344828,0.303030,0.495249,0.465116,0.412698,0.740957,...,0.274326,0.427014,0.395294,0.338462,0.6,0.4,2015.0,0.0,WAS,2014-11-15
246,0.25,0.431818,0.521739,0.303828,0.413793,0.484848,0.395487,0.000000,0.079365,0.028005,...,0.136072,0.457820,0.391765,0.312500,0.6,0.3,2015.0,1.0,HOU,2014-11-16
248,0.00,0.454545,0.318841,0.495215,0.172414,0.060606,0.742280,0.372093,0.301587,0.824971,...,0.127985,0.422275,0.427059,0.332692,0.4,0.2,2015.0,1.0,DEN,2014-11-16
250,0.00,0.386364,0.144928,0.586124,0.344828,0.272727,0.540380,0.441860,0.380952,0.766628,...,0.148010,0.424645,0.478824,0.365385,0.6,0.4,2015.0,0.0,TOR,2014-11-15
254,0.00,0.340909,0.347826,0.325359,0.275862,0.272727,0.432304,0.232558,0.333333,0.416569,...,0.152888,0.405687,0.400000,0.288462,0.4,0.3,2015.0,0.0,CHI,2014-11-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21075,0.00,0.613636,0.420290,0.593301,0.379310,0.424242,0.408551,0.441860,0.460317,0.611435,...,0.167779,0.435545,0.581176,0.495192,0.4,0.7,2023.0,,,
21076,0.00,0.545455,0.579710,0.385167,0.827586,0.863636,0.466746,0.232558,0.206349,0.750292,...,0.147497,0.620379,0.518824,0.439423,0.5,0.4,2023.0,,,
21077,0.00,0.500000,0.362319,0.509569,0.344828,0.318182,0.475059,0.534884,0.476190,0.736289,...,0.184981,0.535545,0.536471,0.506731,0.6,0.3,2023.0,,,
21078,0.00,0.340909,0.391304,0.291866,0.172414,0.303030,0.247031,0.511628,0.396825,0.865811,...,0.162259,0.544076,0.469412,0.509615,0.5,0.4,2023.0,,,


In [31]:
#ADD FUTURE GAMES CODE
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup, Comment
import requests
import time
import urllib
import urllib.request
from datetime import datetime
from tqdm import tqdm

In [32]:
# --------- SUBJECT TO CHANGE --------
current_season = 2023
current_month = "december"
today = datetime.today().strftime('%b %#d, %Y')
test_today = "Dec 2, 2022"

In [33]:
url = f"https://www.basketball-reference.com/leagues/NBA_{current_season}_games-{current_month}.html"

opener = urllib.request.FancyURLopener({})
file = "secret/current.html"
opener.retrieve(url, file)

  opener = urllib.request.FancyURLopener({})


('secret/current.html', <http.client.HTTPMessage at 0x1fc3f728250>)

In [34]:
with open(file, "r", encoding='utf-8') as f:
    html = f.read()

soup = BeautifulSoup(html)
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

[s.decompose() for s in soup.select("tr.over_header")]
[s.decompose() for s in soup.select("tr.thread")]

schedule = pd.read_html(str(soup))
schedule = schedule[0]
for i, date in enumerate(schedule["Date"]):
    schedule["Date"][i] = date[5:]

schedule = schedule[["Date", "Visitor/Neutral", "Home/Neutral"]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schedule["Date"][i] = date[5:]


In [35]:
games_to_predict = schedule.loc[schedule['Date'] == today]

team_index = {"Atlanta Hawks" : "ATL", "Boston Celtics" : "BOS", "Charlotte Hornets" : "CHO", "Chicago Bulls" : "CHI", "Cleveland Cavaliers" : "CLE",
"Dallas Mavericks" : "DAL", "Denver Nuggets" : "DEN", "Detroit Pistons" : "DET", "Golden State Warriors" : "GSW", "Houston Rockets" : "HOU", 
"Indiana Pacers" : "IND", "Los Angeles Clippers" : "LAC", "Los Angeles Lakers" : "LAL", "Memphis Grizzlies" : "MEM", "Miami Heat" : "MIA",
"Milwaukee Bucks" : "MIL", "Minnesota Timberwolves" : "MIN", "New Orleans Pelicans" : "NOP", "New York Knicks": "NYK", "Brooklyn Nets" : "BRK",
"Oklahoma City Thunder" : "OKC", "Orlando Magic" : "ORL", "Philadelphia 76ers" : "PHI", "Phoenix Suns" : "PHO", "Portland Trail Blazers" : "POR",
"Sacramento Kings" : "SAC", "San Antonio Spurs" : "SAS", "Toronto Raptors" : "TOR", "Utah Jazz" : "UTA", "Washington Wizards" : "WAS"
}


cols = list(games_to_predict.columns)
cols[1] = "Away"
cols[2] = "Home"
games_to_predict.columns = cols

t_before = list(team_index.keys())
t_abbr = list(team_index.values())

games_to_predict["Away"] = games_to_predict["Away"].replace(t_before, t_abbr)
games_to_predict["Home"] = games_to_predict["Home"].replace(t_before, t_abbr)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_to_predict["Away"] = games_to_predict["Away"].replace(t_before, t_abbr)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_to_predict["Home"] = games_to_predict["Home"].replace(t_before, t_abbr)


In [36]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
244,0.00,0.363636,0.289855,0.404306,0.344828,0.303030,0.495249,0.465116,0.412698,0.740957,...,0.274326,0.427014,0.395294,0.338462,0.6,0.4,2015.0,0.0,WAS,2014-11-15
246,0.25,0.431818,0.521739,0.303828,0.413793,0.484848,0.395487,0.000000,0.079365,0.028005,...,0.136072,0.457820,0.391765,0.312500,0.6,0.3,2015.0,1.0,HOU,2014-11-16
248,0.00,0.454545,0.318841,0.495215,0.172414,0.060606,0.742280,0.372093,0.301587,0.824971,...,0.127985,0.422275,0.427059,0.332692,0.4,0.2,2015.0,1.0,DEN,2014-11-16
250,0.00,0.386364,0.144928,0.586124,0.344828,0.272727,0.540380,0.441860,0.380952,0.766628,...,0.148010,0.424645,0.478824,0.365385,0.6,0.4,2015.0,0.0,TOR,2014-11-15
254,0.00,0.340909,0.347826,0.325359,0.275862,0.272727,0.432304,0.232558,0.333333,0.416569,...,0.152888,0.405687,0.400000,0.288462,0.4,0.3,2015.0,0.0,CHI,2014-11-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21075,0.00,0.613636,0.420290,0.593301,0.379310,0.424242,0.408551,0.441860,0.460317,0.611435,...,0.167779,0.435545,0.581176,0.495192,0.4,0.7,2023.0,,,
21076,0.00,0.545455,0.579710,0.385167,0.827586,0.863636,0.466746,0.232558,0.206349,0.750292,...,0.147497,0.620379,0.518824,0.439423,0.5,0.4,2023.0,,,
21077,0.00,0.500000,0.362319,0.509569,0.344828,0.318182,0.475059,0.534884,0.476190,0.736289,...,0.184981,0.535545,0.536471,0.506731,0.6,0.3,2023.0,,,
21078,0.00,0.340909,0.391304,0.291866,0.172414,0.303030,0.247031,0.511628,0.396825,0.865811,...,0.162259,0.544076,0.469412,0.509615,0.5,0.4,2023.0,,,


In [37]:
games_to_predict

Unnamed: 0,Date,Away,Home
19,"Dec 4, 2022",DEN,NOP
20,"Dec 4, 2022",PHO,SAS
21,"Dec 4, 2022",BOS,BRK
22,"Dec 4, 2022",MEM,DET
23,"Dec 4, 2022",CLE,NYK
24,"Dec 4, 2022",CHI,SAC
25,"Dec 4, 2022",LAL,WAS
26,"Dec 4, 2022",IND,POR


In [38]:
df_lower = df[df["team_opp_next"].isna()]
df_upper = df.dropna()

In [39]:
df_lower

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
21043,0.25,0.727273,0.492754,0.655502,0.517241,0.469697,0.509501,0.302326,0.365079,0.513419,...,0.165597,0.493839,0.512941,0.514423,0.6,0.3,2023.0,,,
21044,0.0,0.477273,0.449275,0.409091,0.482759,0.439394,0.503563,0.325581,0.253968,0.86231,...,0.169576,0.434123,0.504706,0.401923,0.5,0.6,2023.0,,,
21045,0.0,0.522727,0.536232,0.392344,0.37931,0.393939,0.435867,0.488372,0.428571,0.750292,...,0.1181,0.359716,0.517647,0.461538,0.5,0.5,2023.0,,,
21046,0.0,0.477273,0.463768,0.397129,0.413793,0.575758,0.339667,0.55814,0.396825,0.955659,...,0.157766,0.474408,0.562353,0.411538,0.3,0.8,2023.0,,,
21047,0.0,0.431818,0.550725,0.284689,0.37931,0.530303,0.334917,0.255814,0.238095,0.708285,...,0.231451,0.427014,0.430588,0.571154,0.6,0.0,2023.0,,,
21049,0.0,0.590909,0.362319,0.62201,0.586207,0.5,0.545131,0.302326,0.285714,0.693116,...,0.158922,0.444076,0.617647,0.473077,0.3,0.7,2023.0,,,
21050,0.0,0.477273,0.463768,0.397129,0.413793,0.454545,0.41924,0.604651,0.507937,0.787631,...,0.149037,0.390995,0.498824,0.496154,0.6,0.6,2023.0,,,
21051,0.0,0.545455,0.478261,0.461722,0.310345,0.348485,0.395487,0.348837,0.365079,0.611435,...,0.157638,0.44455,0.471765,0.513462,0.6,0.3,2023.0,,,
21055,0.0,0.75,0.536232,0.638756,0.37931,0.333333,0.502375,0.395349,0.365079,0.708285,...,0.144929,0.455924,0.584706,0.477885,0.4,0.7,2023.0,,,
21057,0.0,0.568182,0.318841,0.641148,0.37931,0.424242,0.408551,0.325581,0.333333,0.628938,...,0.125931,0.404265,0.518824,0.468269,0.5,0.7,2023.0,,,


In [40]:
for i in range(len(games_to_predict)):

    df_away = df_lower.copy()
    temp = list(games_to_predict["Away"])
    temp2 = list(games_to_predict["Home"])

    df_away = df_away.loc[df_away["team"] == temp[i]]

    df_away["home_next"] = 0.0
    df_away["team_opp_next"] = temp2[i]
    df_away["date_next"] = datetime.today().strftime('%Y-%m-%d')

    df_home = df_lower.copy()
    df_home = df_home.loc[df_home["team"] == temp2[i]]
    df_home["home_next"] = 1.0
    df_home["team_opp_next"] = temp[i]
    df_home["date_next"] = datetime.today().strftime('%Y-%m-%d')

    df_upper = pd.concat([df_upper, df_home])
    df_upper = pd.concat([df_upper, df_away])


In [41]:
df = df_upper.copy()

In [42]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])


In [43]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,TOR,MEM,MEM,TOR,2014-11-19
1,ORL,DET,DET,ORL,2014-11-17
2,MEM,HOU,HOU,MEM,2014-11-17
3,UTA,OKC,OKC,UTA,2014-11-18
4,DET,ORL,ORL,DET,2014-11-17
...,...,...,...,...,...
18527,CHI,SAC,SAC,CHI,2022-12-04
18528,WAS,LAL,LAL,WAS,2022-12-04
18529,LAL,WAS,WAS,LAL,2022-12-04
18530,POR,IND,IND,POR,2022-12-04


In [44]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [45]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [46]:
sfs.fit(full[selected_columns], full["target"])

In [47]:
predictors = list(selected_columns[sfs.get_support()])

In [48]:
predictions = backtest(full, rr, predictors)
accuracy_score(predictions["actual"], predictions["prediction"])

0.629068340910731

In [49]:
new_prediction = full[full["target"] == 2]
actual_prediction = predictions[predictions["actual"] == 2]
final_prediction = [new_prediction, actual_prediction]
final_prediction = pd.concat(final_prediction, axis=1)
final_prediction = final_prediction[["team_opp_next_y", "team_y", "prediction"]]
final_prediction = final_prediction.reset_index()

for i in range(len(final_prediction)):
    final_prediction["index"][i] = i%2
    if final_prediction["prediction"][i] == 0:
        final_prediction["prediction"][i] = final_prediction["team_y"][i]
    else:
        final_prediction["prediction"][i] = final_prediction["team_opp_next_y"][i]
    final_prediction["team_y"] = final_prediction["team_y"].replace(t_abbr, t_before)
    final_prediction["team_opp_next_y"] = final_prediction["team_opp_next_y"].replace(t_abbr, t_before)
    final_prediction["prediction"] = final_prediction["prediction"].replace(t_abbr, t_before)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_prediction["index"][i] = i%2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_prediction["prediction"][i] = final_prediction["team_opp_next_y"][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_prediction["prediction"][i] = final_prediction["team_y"][i]


In [50]:
cols = list(final_prediction.columns)
cols[0] = "DISPLAY"
cols[1] = "HOME"
cols[2] = "AWAY"
cols[3] = "PREDICTION"
final_prediction.columns = cols


final_prediction = final_prediction[final_prediction["DISPLAY"] == 0]

final_prediction = final_prediction[["HOME", "AWAY", "PREDICTION"]]



In [1]:
final_prediction