<a href="https://colab.research.google.com/github/patsoong/CS506FinalProject/blob/main/notebooks/Random_Forest_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the per-team, per-season feature table.
features_df = pd.read_csv("team_season_features_v2_clean-2.csv")

# Model inputs: every numeric column except the target ("champion") and the
# split key ("season"). The original column order is kept on purpose, because
# the seeded forest's feature sub-sampling depends on it.
num_cols = [
    col
    for col in features_df.select_dtypes(include="number").columns
    if col not in ("champion", "season")
]

# +/-inf cannot be imputed, so turn them into NaN for the imputer to fill.
X = features_df[num_cols].replace([np.inf, -np.inf], np.nan)
y = features_df["champion"].astype(int)

# Temporal split: seasons up to and including 2015 train, later seasons test.
train_mask = features_df["season"].le(2015)
X_train = X.loc[train_mask]
X_test = X.loc[~train_mask]
y_train = y.loc[train_mask]
y_test = y.loc[~train_mask]

# Identifier columns for the held-out rows, used to report per-season picks.
id_test = features_df[["season", "team", "champion"]].loc[~train_mask].copy()

# Class-weighted random forest; the seed is fixed so reruns are repeatable.
base_rf = RandomForestClassifier(
    n_estimators=20000,
    max_features='sqrt',
    max_depth=None,
    min_samples_leaf=4,
    min_samples_split=8,
    bootstrap=True,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# Sigmoid calibration of the forest's scores, fitted with 3-fold CV.
rf_cal = CalibratedClassifierCV(base_rf, method="sigmoid", cv=3)

# Median imputation lives inside the pipeline so it is fit on training rows only.
rf_model = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("clf", rf_cal),
    ]
)

rf_model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# Ranking metrics computed on the raw probabilities.
print("RF ROC-AUC:", roc_auc_score(y_test, rf_probs))
print("RF Average Precision (PR-AUC):", average_precision_score(y_test, rf_probs))

# Hard labels at a 0.5 cut-off, scored with plain accuracy.
rf_pred_labels = (rf_probs > 0.5).astype(int)
print("RF Binary Accuracy:", accuracy_score(y_test, rf_pred_labels))

# Attach each held-out row's predicted probability to its identifiers.
id_test = id_test.assign(rf_proba_win=rf_probs)

# Row label of the highest-probability team within every test season.
idx = id_test.groupby("season")["rf_proba_win"].idxmax()

# One row per season: the model's pick and its probability.
rf_predicted_champs = (
    id_test.loc[idx, ["season", "team", "rf_proba_win"]]
           .reset_index(drop=True)
           .rename(columns={"team": "team_pred", "rf_proba_win": "pred_prob"})
)

# Rows flagged champion == 1 in the held-out seasons.
rf_true_champs = (
    id_test.loc[id_test["champion"].eq(1), ["season", "team"]]
           .reset_index(drop=True)
           .rename(columns={"team": "team_true"})
)

# Line up pick vs. actual per season; a season with no flagged champion keeps
# team_true as NaN and therefore scores as incorrect.
rf_eval = pd.merge(rf_predicted_champs, rf_true_champs, on="season", how="left")
rf_eval["correct"] = rf_eval["team_pred"].eq(rf_eval["team_true"]).astype(int)

print("RF Top-1 accuracy:", rf_eval["correct"].mean())
print("Predicted vs. True Champions by Season:")
print(rf_eval)

RF ROC-AUC: 0.973448275862069
RF Average Precision (PR-AUC): 0.4554329004329004
RF Binary Accuracy: 0.9566666666666667
RF Top-1 accuracy: 0.6
Predicted vs. True Champions by Season:
   season team_pred  pred_prob  team_true  correct
0    2016     Spurs   0.834684  Cavaliers        0
1    2017  Warriors   0.807888   Warriors        1
2    2018  Warriors   0.742870   Warriors        1
3    2019     Bucks   0.718634    Raptors        0
4    2020    Lakers   0.112131     Lakers        1
5    2021      Suns   0.296214      Bucks        0
6    2022      Suns   0.635315   Warriors        0
7    2023   Nuggets   0.184813    Nuggets        1
8    2024   Celtics   0.784274    Celtics        1
9    2025   Thunder   0.696842    Thunder        1
