In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Load and preprocess the data
matches = pd.read_csv("/Users/meritbhusal/Desktop/NBA DATA/nba_games.csv", index_col=0)
matches = matches.sort_values("date").reset_index(drop=True)

In [7]:
# Drop unnecessary columns
drop_columns = ["mp.1", "mp_opp.1", "index_opp"]
matches = matches.drop(columns=drop_columns)

In [8]:
matches

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,69.4,43.7,206.0,104.0,GSW,111,1,2016,2015-10-27,False
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,53.2,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False
2,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,30.4,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,37.5,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,33.3,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,240.0,34.0,85.0,0.400,15.0,38.0,0.395,14.0,19.0,0.737,...,42.9,36.3,133.0,112.0,GSW,107,0,2022,2022-06-10,False
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,45.0,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True
17769,240.0,31.0,75.0,0.413,11.0,32.0,0.344,21.0,31.0,0.677,...,33.3,36.2,222.0,107.0,GSW,104,1,2022,2022-06-13,False
17770,240.0,34.0,80.0,0.425,11.0,28.0,0.393,11.0,12.0,0.917,...,33.3,31.5,186.0,111.0,GSW,103,0,2022,2022-06-16,False


In [9]:
# Add target column
def add_target(group):
    group = group.copy()  # Prevent SettingWithCopyWarning
    group["target"] = group["won"].shift(-1)
    return group

matches = matches.groupby("team", group_keys=False).apply(add_target)
matches["target"] = matches["target"].fillna(2).astype(int)

  matches = matches.groupby("team", group_keys=False).apply(add_target)


In [10]:
matches[matches["team"] == "GSW"]

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
3,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
44,240.0,43.0,93.0,0.462,9.0,26.0,0.346,17.0,25.0,0.680,...,37.5,151.0,118.0,HOU,92,1,2016,2015-10-30,True,1
67,240.0,46.0,84.0,0.548,17.0,30.0,0.567,25.0,35.0,0.714,...,36.1,218.0,131.0,NOP,120,1,2016,2015-10-31,True,1
98,240.0,43.0,84.0,0.512,11.0,25.0,0.440,22.0,30.0,0.733,...,44.3,106.0,126.0,MEM,69,0,2016,2015-11-02,True,1
137,240.0,39.0,85.0,0.459,10.0,26.0,0.385,24.0,31.0,0.774,...,32.9,250.0,122.0,LAC,108,0,2016,2015-11-04,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17762,240.0,39.0,86.0,0.453,15.0,37.0,0.405,14.0,20.0,0.700,...,35.2,300.0,117.0,BOS,88,0,2022,2022-06-05,True,0
17764,240.0,36.0,78.0,0.462,15.0,40.0,0.375,13.0,15.0,0.867,...,28.8,175.0,117.0,BOS,116,1,2022,2022-06-08,False,1
17766,240.0,40.0,91.0,0.440,15.0,43.0,0.349,12.0,15.0,0.800,...,32.4,205.0,120.0,BOS,97,1,2022,2022-06-10,True,1
17768,240.0,41.0,88.0,0.466,9.0,40.0,0.225,13.0,15.0,0.867,...,94.4,300.0,112.0,BOS,94,0,2022,2022-06-13,True,1


In [12]:
matches["won"].value_counts() #checking if the wins and losses are balanced

won
False    8886
True     8886
Name: count, dtype: int64

In [13]:
matches["target"].value_counts() #not balanced cause of 2 values (sometimes team dont have next game)

target
1    8872
0    8870
2      30
Name: count, dtype: int64

In [14]:
nulls = pd.isnull(matches).sum() #checking how many comlumns have null values

In [15]:
nulls

mp          0
fg          0
fga         0
fg%         0
3p          0
           ..
home_opp    0
season      0
date        0
won         0
target      0
Length: 142, dtype: int64

In [16]:
# Remove columns with null values
nulls = matches.isnull().sum()
valid_columns = matches.columns[~matches.columns.isin(nulls[nulls > 0].index)]
matches = matches[valid_columns].copy()

In [20]:
# Define columns to exclude from scaling and feature selection
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = matches.select_dtypes(include=["number"]).columns.difference(removed_columns)

In [21]:
selected_columns

Index(['+/-_max', '+/-_max_opp', '3p', '3p%', '3p%_max', '3p%_max_opp',
       '3p%_opp', '3p_max', '3p_max_opp', '3p_opp',
       ...
       'trb_max_opp', 'trb_opp', 'ts%', 'ts%_max', 'ts%_max_opp', 'ts%_opp',
       'usg%', 'usg%_max', 'usg%_max_opp', 'usg%_opp'],
      dtype='object', length=136)

In [23]:
matches

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.363636,0.338235,0.366029,0.206897,0.212121,0.395487,0.418605,0.412698,0.654609,...,0.277279,0.554502,0.317647,GSW,0.451923,1.0,2016,2015-10-27,False,0
1,0.0,0.431818,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.160462,0.345972,0.317647,CHI,0.317308,1.0,2016,2015-10-27,False,1
2,0.0,0.409091,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.088575,0.232227,0.329412,CLE,0.298077,0.0,2016,2015-10-27,True,1
3,0.0,0.500000,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.215661,0.530806,0.505882,NOP,0.298077,0.0,2016,2015-10-27,True,1
4,0.0,0.409091,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.019255,0.203791,0.317647,DET,0.403846,0.0,2016,2015-10-27,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.0,0.340909,0.367647,0.313397,0.517241,0.515152,0.469121,0.302326,0.285714,0.693116,...,0.182285,0.208531,0.411765,GSW,0.413462,0.0,2022,2022-06-10,False,0
17768,0.0,0.500000,0.411765,0.471292,0.310345,0.545455,0.267221,0.279070,0.222222,0.844807,...,0.928113,1.000000,0.411765,BOS,0.288462,0.0,2022,2022-06-13,True,1
17769,0.0,0.272727,0.220588,0.344498,0.379310,0.424242,0.408551,0.465116,0.476190,0.623104,...,0.181001,0.630332,0.352941,GSW,0.384615,1.0,2022,2022-06-13,False,0
17770,0.0,0.340909,0.294118,0.373206,0.379310,0.363636,0.466746,0.232558,0.174603,0.903151,...,0.120668,0.459716,0.400000,GSW,0.375000,0.0,2022,2022-06-16,False,2


In [24]:
# Scale the selected columns
scaler = MinMaxScaler()
matches[selected_columns] = scaler.fit_transform(matches[selected_columns])

In [25]:
# Initialize XGBoost classifier
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [26]:
# Backtesting function
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [27]:
# Rolling averages for feature engineering
matches_rolling = matches[list(selected_columns) + ["won", "team", "season"]]

In [28]:
def find_team_averages(team):
    numeric_cols = team.select_dtypes(include=["number"])  # Numeric columns only
    return numeric_cols.rolling(10).mean()

In [29]:
matches_rolling = matches_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)
rolling_cols = [f"{col}_10" for col in matches_rolling.columns]
matches_rolling.columns = rolling_cols

matches = pd.concat([matches, matches_rolling], axis=1).dropna()

  matches_rolling = matches_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)


In [35]:
matches_rolling

Unnamed: 0,+/-_max_10,+/-_max_opp_10,3p_10,3p%_10,3p%_max_10,3p%_max_opp_10,3p%_opp_10,3p_max_10,3p_max_opp_10,3p_opp_10,...,trb_opp_10,ts%_10,ts%_max_10,ts%_max_opp_10,ts%_opp_10,usg%_10,usg%_max_10,usg%_max_opp_10,usg%_opp_10,season_10
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,0.481333,0.396000,0.468966,0.448100,0.8394,0.7404,0.398100,0.314286,0.328571,0.437931,...,0.391837,0.521086,0.6244,0.5088,0.419207,0.0,0.248267,0.170603,0.0,2022.0
17768,0.480000,0.398667,0.455172,0.440736,0.7834,0.7957,0.461045,0.385714,0.350000,0.541379,...,0.357143,0.549896,0.5967,0.5952,0.500000,0.0,0.192555,0.321566,0.0,2022.0
17769,0.444000,0.408000,0.437931,0.429572,0.8394,0.6904,0.389905,0.314286,0.335714,0.434483,...,0.385714,0.490814,0.6494,0.4933,0.422756,0.0,0.328498,0.174711,0.0,2022.0
17770,0.442667,0.409333,0.434483,0.431710,0.8194,0.7000,0.399406,0.328571,0.350000,0.462069,...,0.406122,0.471816,0.6013,0.5032,0.419415,0.0,0.254814,0.172144,0.0,2022.0


In [36]:
def shift_col(team, col_name):
    return team[col_name].shift(-1)

In [37]:
def add_col(matches, col_name):
    return matches.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

In [38]:
matches["home_next"] = add_col(matches, "home")
matches["team_opp_next"] = add_col(matches, "team_opp")
matches["date_next"] = add_col(matches, "date")

  return matches.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return matches.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return matches.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [39]:
# Merge to align next game stats
full = matches.merge(
    matches[rolling_cols + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
    suffixes=("", "_next")
)

In [40]:
# Prepare final feature set
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]
predictors = list(selected_columns)

In [41]:
# Backtest with XGBoost
predictions = backtest(full, xgb_model, predictors)

In [42]:
# Evaluate the model
print("Accuracy:", accuracy_score(predictions["actual"], predictions["prediction"]))
print("Classification Report:\n", classification_report(predictions["actual"], predictions["prediction"]))

Accuracy: 0.6170136072812472
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.61      0.62      5553
           1       0.62      0.62      0.62      5544

    accuracy                           0.62     11097
   macro avg       0.62      0.62      0.62     11097
weighted avg       0.62      0.62      0.62     11097

