<a href="https://colab.research.google.com/github/patsoong/CS506FinalProject/blob/main/notebooks/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the feature table from the CSV (expected to contain at least the
# "season", "team" and "champion" columns used below).
features_df = pd.read_csv("team_season_features_v2_clean-2.csv")

# Predictors: every numeric column except the label ("champion") and the
# column used for the temporal split ("season").
excluded_cols = ("champion", "season")
num_cols = [
    name
    for name in features_df.select_dtypes(include="number").columns.tolist()
    if name not in excluded_cols
]

# Turn +/-inf into NaN so the downstream imputer can treat them as missing.
X = features_df[num_cols].replace([np.inf, -np.inf], np.nan)
y = features_df["champion"].astype(int)

# Temporal split: seasons up to and including 2015 are used for training,
# every later season is held out for testing.
train_mask = features_df["season"] <= 2015
test_mask = ~train_mask

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

# Identifying columns for the held-out rows, kept aside so predictions can be
# reported per season and team later on.
id_test = features_df.loc[test_mask, ["season", "team", "champion"]].copy()

# Champion classifier pipeline:
#   1. impute  - fill missing values with the per-column median
#   2. poly    - append pairwise interaction terms (degree 2, no squared
#                terms, no bias column) to the original features
#   3. scale   - standardise every column to zero mean / unit variance
#   4. clf     - logistic regression with class re-weighting
#
# NOTE: the former `multi_class='ovr'` argument was dropped. It is deprecated
# in scikit-learn >= 1.5 (FutureWarning, scheduled for removal). For a
# two-class target the default setting already fits the same one-vs-rest
# model, so results are unchanged.
# NOTE(review): this assumes "champion" only takes the values 0 and 1 - confirm.
model = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scale", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        class_weight='balanced',  # weight classes inversely to their frequency
        max_iter=10000,           # generous cap on solver iterations
        C=1,
        random_state=42
        ))
])

model.fit(X_train, y_train)

# Second column of predict_proba = probability of the second class in
# model.classes_ (label 1 when the target is coded 0/1).
probabilities = model.predict_proba(X_test)[:, 1]

# ROC-AUC: threshold-independent ranking quality. It is the probability that
# a randomly chosen positive row (champion) receives a higher predicted
# probability than a randomly chosen negative row (non-champion).
roc_auc = roc_auc_score(y_test, probabilities)
print("ROC-AUC:", roc_auc)

# Average precision (area under the precision-recall curve): focuses on the
# positive class, i.e. how well the true champions are surfaced near the top
# of the ranking among all teams.
avg_precision = average_precision_score(y_test, probabilities)
print("Average Precision (PR-AUC):", avg_precision)

# Hard 0/1 labels using a fixed 0.5 probability cut-off, scored with plain
# accuracy.
above_cutoff = probabilities > 0.5
pred_labels = above_cutoff.astype(int)
binary_accuracy = accuracy_score(y_test, pred_labels)
print("Binary accuracy:", binary_accuracy)

# Attach the predicted champion probability to each held-out row.
id_test["proba_win"] = probabilities

# For every test season, take the row with the highest predicted probability
# as that season's predicted champion.
idx = id_test.groupby("season")["proba_win"].idxmax()

predicted_champs = id_test.loc[idx, ["season", "team", "proba_win"]]
predicted_champs = predicted_champs.rename(
    columns={"team": "team_pred", "proba_win": "pred_prob"}
)
predicted_champs = predicted_champs.reset_index(drop=True)

# Rows flagged as the actual champion.
is_champion = id_test["champion"] == 1
true_champs = id_test.loc[is_champion, ["season", "team"]]
true_champs = true_champs.rename(columns={"team": "team_true"})
true_champs = true_champs.reset_index(drop=True)

# Join prediction and ground truth on season (inner join: a season appears
# only if it has both a predicted and a true champion row).
# NOTE: the name `eval` shadows the Python builtin; it is kept unchanged here
# because code outside this block may reference it.
eval = pd.merge(predicted_champs, true_champs, on="season", suffixes=("_pred", "_true"))
eval["correct"] = eval["team_pred"].eq(eval["team_true"]).astype(int)

# Top-1 accuracy: fraction of seasons where the highest-probability team is
# the true champion.
top1_accuracy = eval["correct"].mean()
print("Top-1 accuracy:", top1_accuracy)

print("\nPredicted vs. True Champions by Season:\n")
report_columns = ["season", "team_pred", "team_true", "pred_prob", "correct"]
season_report = eval[report_columns].sort_values("season")
print(season_report.to_string(index=False))

ROC-AUC: 0.9627586206896551
Average Precision (PR-AUC): 0.5511730032419687
Binary accuracy: 0.9666666666666667
Top-1 accuracy: 0.8

Predicted vs. True Champions by Season:

 season team_pred team_true  pred_prob  correct
   2016  Warriors Cavaliers   0.994014        0
   2017  Warriors  Warriors   0.994092        1
   2018  Warriors  Warriors   0.815186        1
   2019   Raptors   Raptors   0.182558        1
   2020    Lakers    Lakers   0.007219        1
   2021      Suns     Bucks   0.075588        0
   2022  Warriors  Warriors   0.078105        1
   2023   Nuggets   Nuggets   0.048625        1
   2024   Celtics   Celtics   0.823239        1
   2025   Thunder   Thunder   0.839275        1


