<a href="https://colab.research.google.com/github/patsoong/CS506FinalProject/blob/main/notebooks/Random_Forest_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the team-season feature table; the code below relies on its `season`
# and `champion` columns.
features_df = pd.read_csv("team_season_features_v2_clean-2.csv")

# Numeric columns that must never be model inputs: `champion` is the target
# and `season` is the grouping key.
NON_FEATURE_COLS = ("champion", "season")


def _numeric_feature_columns(frame):
    """Return the numeric column names of `frame`, minus NON_FEATURE_COLS.

    Column order follows the frame's own column order.
    """
    return [
        col
        for col in frame.select_dtypes(include="number").columns
        if col not in NON_FEATURE_COLS
    ]


def _season_zscore(values):
    """Standardise one season's values of a single feature.

    Returns the z-scored Series when the group's standard deviation is
    strictly positive; otherwise returns the scalar 0, which pandas broadcasts
    to every row of the group. The fallback covers a zero std (constant
    feature) and a NaN std (e.g. a single-row group).
    """
    spread = values.std()  # computed once per group
    if spread > 0:
        return (values - values.mean()) / spread
    return 0


# remove features not to be used in training data
num_cols = _numeric_feature_columns(features_df)

# Add season-relative features (percentile rank and z-score within each
# season). All derived columns are collected first and attached with a single
# concat: inserting them one by one fragments the DataFrame and triggers
# pandas' "highly fragmented" PerformanceWarning on wide tables.
by_season = features_df.groupby("season")
season_features = {}
for col in num_cols:
    season_features[f"{col}_season_rank"] = by_season[col].rank(pct=True)
    season_features[f"{col}_season_zscore"] = by_season[col].transform(_season_zscore)

# A derived name that already exists would create duplicate column labels
# after the concat, so fail loudly instead of producing an ambiguous table.
name_clashes = sorted(set(season_features).intersection(features_df.columns))
if name_clashes:
    raise ValueError(
        f"Derived feature names already exist in the input: {name_clashes}"
    )

features_df = pd.concat(
    [features_df, pd.DataFrame(season_features, index=features_df.index)],
    axis=1,
)

# update feature list (original numeric features followed by the derived ones)
num_cols_extended = _numeric_feature_columns(features_df)

# Model inputs: infinities are turned into NaN so they are treated as missing.
X = features_df[num_cols_extended].replace([np.inf, -np.inf], np.nan)
y = features_df["champion"].astype(int)

# Temporal hold-out: rows whose season is <= 2015 are used for fitting, every
# other row is reserved for evaluation.
train_mask = features_df["season"].le(2015)
test_mask = ~train_mask

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

# Identifier columns of the held-out rows, copied so that prediction columns
# can be added later without touching features_df.
id_test = features_df.loc[test_mask, ["season", "team", "champion"]].copy()

# Random-forest hyperparameters, gathered in one place for easy tuning.
rf_params = dict(
    n_estimators=5000,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight="balanced",
    n_jobs=-1,
    bootstrap=True,
    max_features='sqrt',
    random_state=42,
)
base_rf = RandomForestClassifier(**rf_params)

# Wrap the forest in sigmoid calibration using 3-fold internal
# cross-validation.
rf_cal = CalibratedClassifierCV(base_rf, method="sigmoid", cv=3)

# Median-impute missing values, then hand the data to the calibrated forest.
# Because the imputer lives inside the pipeline, its medians are learned from
# the data passed to fit() only.
rf_model = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("clf", rf_cal),
    ]
)
rf_model.fit(X_train, y_train)

# Keep the second predict_proba column, i.e. the probability of the larger
# class label (champion == 1, assuming a 0/1 target).
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# --- Threshold-free metrics on the held-out rows ---------------------------
roc_auc = roc_auc_score(y_test, rf_probs)
print("RF ROC-AUC:", roc_auc)
pr_auc = average_precision_score(y_test, rf_probs)
print("RF Average Precision (PR-AUC):", pr_auc)

# --- Accuracy after thresholding the probability at 0.5 --------------------
# NOTE(review): champions are presumably rare (about one per season), so this
# number is likely dominated by the negative class -- read it alongside the
# ranking metrics rather than on its own.
above_half = rf_probs > 0.5
rf_pred_labels = above_half.astype(int)
binary_accuracy = accuracy_score(y_test, rf_pred_labels)
print("RF Binary Accuracy:", binary_accuracy)

# --- Rank teams within each season by predicted win probability ------------
id_test["rf_proba_win"] = rf_probs
id_test["ranking_score"] = id_test["rf_proba_win"]
scores_by_season = id_test.groupby("season")["ranking_score"]
# rank 1 = highest score; method="first" breaks ties by row order
id_test["rank"] = scores_by_season.rank(ascending=False, method="first")

# --- Predicted champion: the highest-probability team of each season -------
idx = id_test.groupby("season")["rf_proba_win"].idxmax()
top_rows = id_test.loc[idx, ["season", "team", "rf_proba_win"]]
rf_predicted_champs = top_rows.rename(
    columns={"team": "team_pred", "rf_proba_win": "pred_prob"}
).reset_index(drop=True)

# --- Actual champion(s) of each held-out season ----------------------------
is_champion = id_test["champion"] == 1
champ_rows = id_test.loc[is_champion, ["season", "team"]]
rf_true_champs = champ_rows.rename(columns={"team": "team_true"}).reset_index(drop=True)

# Left join keeps every predicted season, even one without a labelled champion.
rf_eval = pd.merge(rf_predicted_champs, rf_true_champs, on="season", how="left")
rf_eval["correct"] = rf_eval["team_pred"].eq(rf_eval["team_true"]).astype(int)

print("RF Top-1 accuracy:", rf_eval["correct"].mean())
print("Predicted vs. True Champions by Season:")
print(rf_eval)

# Within-season rank that the model assigned to each true champion.
true_champ_ranks = id_test.loc[is_champion, "rank"]

# print top-k accuracies: share of true champions ranked within the top k
k_values = [1, 2, 4]
print("\nTop-K Accuracy:")
for top_k in k_values:
    hit_rate = true_champ_ranks.le(top_k).mean()
    print(f" Top-{top_k}: {hit_rate:.4f}")

RF ROC-AUC: 0.9879310344827587
RF Average Precision (PR-AUC): 0.8336494252873563
RF Binary Accuracy: 0.9833333333333333
RF Top-1 accuracy: 0.7
Predicted vs. True Champions by Season:
   season team_pred  pred_prob  team_true  correct
0    2016  Warriors   0.521217  Cavaliers        0
1    2017  Warriors   0.739682   Warriors        1
2    2018  Warriors   0.483521   Warriors        1
3    2019   Raptors   0.841025    Raptors        1
4    2020     Bucks   0.078294     Lakers        0
5    2021  Clippers   0.043562      Bucks        0
6    2022  Warriors   0.710242   Warriors        1
7    2023   Nuggets   0.760820    Nuggets        1
8    2024   Celtics   0.899829    Celtics        1
9    2025   Thunder   0.841519    Thunder        1

Top-K Accuracy:
 Top-1: 0.7000
 Top-2: 1.0000
 Top-4: 1.0000
