In [1]:
import pandas as pd
import numpy as np

games = pd.read_csv("game.csv", index_col=0)
g = games.copy()

In [2]:
g["game_date"] = pd.to_datetime(g["game_date"])

home_cols = {
    "team_id_home": "team_id", "team_name_home": "team_name",
    "pts_home": "pts", "fgm_home":"fgm", "fga_home":"fga", "fg3m_home":"fg3m",
    "fta_home":"fta", "tov_home":"tov", "oreb_home":"oreb", "dreb_home":"dreb",
    "ast_home":"ast", "stl_home":"stl", "blk_home":"blk", "reb_home":"reb",
    "plus_minus_home":"plus_minus"
}
away_cols = {
    "team_id_away": "team_id", "team_name_away": "team_name",
    "pts_away": "pts", "fgm_away":"fgm", "fga_away":"fga", "fg3m_away":"fg3m",
    "fta_away":"fta", "tov_away":"tov", "oreb_away":"oreb", "dreb_away":"dreb",
    "ast_away":"ast", "stl_away":"stl", "blk_away":"blk", "reb_away":"reb",
    "plus_minus_away":"plus_minus"
}
home_long = g[["game_id","game_date","season_type","team_id_home","team_name_home",
               "pts_home","fgm_home","fga_home","fg3m_home","fta_home","tov_home",
               "oreb_home","dreb_home","ast_home","stl_home","blk_home","reb_home",
               "plus_minus_home","team_id_away","team_name_away","dreb_away"]].rename(columns=home_cols)
home_long["is_home"] = 1
home_long = home_long.rename(columns={"team_id_away":"opp_team_id","team_name_away":"opp_team_name",
                                      "dreb_away":"opp_dreb"})

away_long = g[["game_id","game_date","season_type","team_id_away","team_name_away",
               "pts_away","fgm_away","fga_away","fg3m_away","fta_away","tov_away",
               "oreb_away","dreb_away","ast_away","stl_away","blk_away","reb_away",
               "plus_minus_away","team_id_home","team_name_home","dreb_home"]].rename(columns=away_cols)
away_long["is_home"] = 0
away_long = away_long.rename(columns={"team_id_home":"opp_team_id","team_name_home":"opp_team_name",
                                      "dreb_home":"opp_dreb"})

long = pd.concat([home_long, away_long], ignore_index=True)

long

Unnamed: 0,game_id,game_date,season_type,team_id,team_name,pts,fgm,fga,fg3m,fta,...,dreb,ast,stl,blk,reb,plus_minus,opp_team_id,opp_team_name,opp_dreb,is_home
0,24600001,1946-11-01,Regular Season,1610610035,Toronto Huskies,66.0,25.0,,,29.0,...,,,,,,-2,1610612752,New York Knicks,,1
1,24600003,1946-11-02,Regular Season,1610610034,St. Louis Bombers,56.0,20.0,59.0,,,...,,,,,,5,1610610031,Pittsburgh Ironmen,,1
2,24600002,1946-11-02,Regular Season,1610610032,Providence Steamrollers,59.0,21.0,,,,...,,,,,,6,1610612738,Boston Celtics,,1
3,24600004,1946-11-02,Regular Season,1610610025,Chicago Stags,63.0,21.0,,,,...,,,,,,16,1610612752,New York Knicks,,1
4,24600005,1946-11-02,Regular Season,1610610028,Detroit Falcons,33.0,10.0,,,,...,,,,,,-17,1610610036,Washington Capitols,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131391,42200403,2023-06-07,Playoffs,1610612743,Denver Nuggets,109.0,41.0,80.0,5.0,27.0,...,45.0,28.0,3.0,5.0,58.0,15,1610612748,Miami Heat,23.0,0
131392,42200404,2023-06-09,Playoffs,1610612743,Denver Nuggets,108.0,39.0,79.0,14.0,21.0,...,29.0,26.0,11.0,7.0,34.0,13,1610612748,Miami Heat,29.0,0
131393,42200405,2023-06-12,Playoffs,1610612748,Miami Heat,89.0,33.0,96.0,9.0,16.0,...,33.0,18.0,9.0,7.0,44.0,-5,1610612743,Denver Nuggets,46.0,0
131394,32200001,2023-02-19,All-Star,1610616833,Team Giannis,184.0,76.0,123.0,29.0,4.0,...,36.0,43.0,8.0,1.0,46.0,9,1610616834,Team LeBron,32.0,0


In [3]:
long["eFG"] = (long["fgm"] + 0.5*long["fg3m"]) / long["fga"].replace(0, np.nan)
# TOV% = TOV / (FGA + 0.44*FTA + TOV)
long["TOVp"] = long["tov"] / (long["fga"] + 0.44*long["fta"] + long["tov"]).replace(0, np.nan)
# FTr = FTA / FGA
long["FTr"] = long["fta"] / long["fga"].replace(0, np.nan)
# ORB% = OREB / (OREB + Opp DREB)
long["ORBp"] = long["oreb"] / (long["oreb"] + long["opp_dreb"]).replace(0, np.nan)
# Point differential
long["pts_diff"] = long["pts"] - (long["plus_minus"] + long["pts"])  # plus_minus = pts - opp_pts ⇒ opp_pts = pts - plus_minus
long["opp_pts"] = long["pts"] - long["plus_minus"]
long["pts_margin"] = long["pts"] - long["opp_pts"]

for c in ["eFG","TOVp","FTr","ORBp"]:
    long[c] = long[c].replace([np.inf, -np.inf], np.nan)

long = long.sort_values(["team_id", "game_date"])
long["prev_date"] = long.groupby("team_id")["game_date"].shift(1)
long["rest_days"] = (long["game_date"] - long["prev_date"]).dt.days
long["b2b"] = (long["rest_days"] == 1).astype(int)

roll_base = ["eFG","TOVp","FTr","ORBp","pts_margin"]
for k in [3,5,10]:
    for c in roll_base:
        long[f"{c}_roll{k}"] = (
            long.groupby("team_id")[c]
            .apply(lambda s: s.shift(1).rolling(k, min_periods=1).mean())
            .values
        )

home_side = long[long["is_home"] == 1].copy()
away_side = long[long["is_home"] == 0].copy()

feat = home_side.merge(
    away_side,
    on=["game_id","game_date","season_type"],
    suffixes=("_home","_away")
)

delta_cols = []
for k in [3,5,10]:
    for c in roll_base:
        h, a = f"{c}_roll{k}_home", f"{c}_roll{k}_away"
        d = f"Δ{c}_roll{k}"
        feat[d] = feat[h] - feat[a]
        delta_cols.append(d)

feat["Δrest_days"] = feat["rest_days_home"] - feat["rest_days_away"]
feat["b2b_home"] = feat["b2b_home"].astype(int)
feat["b2b_away"] = feat["b2b_away"].astype(int)

feat["team_code_home"] = feat["team_name_home"].astype("category").cat.codes
feat["team_code_away"] = feat["team_name_away"].astype("category").cat.codes
feat["day_code"] = feat["game_date"].dt.dayofweek
feat = feat.merge(g[["game_id", "wl_home"]], on="game_id", how="left")
feat["target_home"] = (feat["wl_home"] == "W").astype(int)
feat = feat.drop(columns=["wl_home"])

games_rolling = feat.reset_index(drop=True)

predictors = (
    ["team_code_home", "team_code_away", "day_code",
     "Δrest_days", "b2b_home", "b2b_away"]
    + delta_cols
)

games_rolling = games_rolling.dropna(subset=predictors)

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = feat[(feat["game_date"] < '2021-01-01') & (feat["game_date"] > '2000-09-01')]
test = feat[feat["game_date"] > '2021-01-01']

In [5]:
rf.fit(train[predictors], train["target_home"])
preds = rf.predict(test[predictors])

In [6]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(test["target_home"], preds)
print(acc)

0.599896480331263


In [7]:
combined = pd.DataFrame(dict(actual=test["target_home"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,624,1081
1,465,1694


In [8]:
from sklearn.metrics import precision_score
precision_score(test["target_home"], preds)

0.6104504504504504

In [9]:
def make_predictions(data, predictors):
    train = data[(data["game_date"] < '2021-01-01') & (data["game_date"] > '2000-09-01')]
    test = data[data["game_date"] > '2021-01-01']
    rf.fit(train[predictors], train["target_home"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target_home"], prediction=preds), index=test.index)
    precision = precision_score(test["target_home"], preds)
    return combined, precision

In [10]:
combined, precision = make_predictions(games_rolling, predictors)

In [11]:
import joblib, json, pathlib

if "team_name_home_home" in games_rolling.columns:
    team_names = pd.Index(
        sorted(pd.unique(list(games_rolling["team_name_home_home"]) + list(games_rolling["team_name_away_home"])))
    )
    team_categories = team_names.tolist()
else:
    # fallback: derive from codes but you should prefer names
    team_categories = sorted(set(games_rolling["team_code_home"]) | set(games_rolling["team_code_away"]))
    # if these are ints, keep as ints; otherwise cast to str and be consistent

bundle = {
    "model": rf,
    "predictors": predictors,
    "label_name": "target_home",
    "team_categories": team_categories,
}

In [12]:
joblib.dump(bundle, "rf_homewin.joblib")
print("Saved rf_homewin.joblib")

Saved rf_homewin.joblib


In [13]:
feat_cols_for_app = (
    ["game_id","game_date"] +
    [c for c in games_rolling.columns if c.startswith("team_name_home")] +
    [c for c in games_rolling.columns if c.startswith("team_name_away")] +
    [bundle["label_name"]] + predictors
)

In [14]:
if not any(c.startswith("team_name_home") for c in games_rolling.columns):
    feat_cols_for_app = ["game_id","game_date","team_code_home","team_code_away", bundle["label_name"]] + predictors

features_for_app = games_rolling[feat_cols_for_app].copy()
# ensure plain date (no tz/pyarrow ext types)
features_for_app["game_date"] = pd.to_datetime(features_for_app["game_date"]).dt.date

try:
    # try pyarrow first
    features_for_app.to_parquet("features_for_app.parquet", index=False, engine="pyarrow")
    print("Saved features_for_app.parquet (pyarrow).")
except Exception as e1:
    try:
        # try fastparquet if installed
        features_for_app.to_parquet("features_for_app.parquet", index=False, engine="fastparquet")
        print("Saved features_for_app.parquet (fastparquet).")
    except Exception as e2:
        # final fallback: CSV is perfectly fine for the Gradio app
        features_for_app.to_csv("features_for_app.csv", index=False)
        print("Parquet failed, saved features_for_app.csv instead.\n",
              f"pyarrow error: {e1}\nfastparquet error: {e2}")


Saved features_for_app.parquet (pyarrow).
