<a href="https://colab.research.google.com/github/patsoong/CS506FinalProject/blob/main/notebooks/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the team/season feature table.
features_df = pd.read_csv("team_season_features_v2_clean-2.csv")

# Candidate predictors: every numeric column except the label ("champion")
# and the grouping / split key ("season").
num_cols = [
    name
    for name in features_df.select_dtypes(include="number").columns.tolist()
    if name not in ("champion", "season")
]

# Add season-relative features (percentile rank and z-score within each season).
def _season_zscore(values):
    """Return the within-group z-score of *values*.

    Falls back to the scalar 0 (broadcast over the group by ``transform``)
    when the group's standard deviation is zero or NaN, so constant or
    single-row groups do not produce inf/NaN scores.
    """
    spread = values.std()  # computed once per group instead of twice
    return (values - values.mean()) / spread if spread > 0 else 0


# Collect the new columns first and attach them with a single concat.
# Inserting two columns per feature one at a time fragments the frame and
# makes pandas emit "DataFrame is highly fragmented" PerformanceWarnings.
season_groups = features_df.groupby('season')
season_features = {}
for col in num_cols:
    season_features[f'{col}_season_rank'] = season_groups[col].rank(pct=True)
    season_features[f'{col}_season_zscore'] = season_groups[col].transform(_season_zscore)

# The dict keeps insertion order, so the resulting column order matches a
# per-column assignment (rank, then z-score, for each feature in turn).
features_df = pd.concat(
    [features_df, pd.DataFrame(season_features, index=features_df.index)],
    axis=1,
)

# Rebuild the predictor list from the current frame so that any numeric
# columns added since the first selection are included; the label and the
# season key stay out of the design matrix.
num_cols_extended = [
    name
    for name in features_df.select_dtypes(include="number").columns.tolist()
    if name not in ("champion", "season")
]

# Design matrix: work on a copy and turn +/- infinity into NaN.
X = features_df[num_cols_extended].copy().replace([np.inf, -np.inf], np.nan)
# Target: the "champion" column cast to integer labels.
y = features_df["champion"].astype(int)

# Temporal split: seasons up to and including 2015 are used for training,
# every later season is held out for evaluation.
train_mask = features_df["season"].le(2015)
X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[~train_mask], y.loc[~train_mask]

# Identifying columns for the held-out rows, copied so that prediction
# columns can be added without touching features_df.
id_test = features_df.loc[~train_mask, ["season", "team", "champion"]].copy()

# Model creation: median imputation -> pairwise interaction terms
# (no squared terms, no bias column) -> standardization -> class-weighted
# logistic regression with strong regularization (C=0.05).
model = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scale", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        # `multi_class` is intentionally not passed: the parameter is
        # deprecated since scikit-learn 1.5 and scheduled for removal.
        # For a binary target the default resolves to the same
        # one-vs-rest fit, so results are unchanged.
        class_weight='balanced',
        max_iter=10000,
        C=0.05,
        random_state=42,
    )),
])

# Fit on the training seasons, then score the held-out seasons.
model.fit(X_train, y_train)
# Keep only the second predict_proba column (presumably the champion == 1
# class, given 0/1 labels).
test_proba_matrix = model.predict_proba(X_test)
probabilities = test_proba_matrix[:, 1]

# Threshold-free metrics computed on the raw probabilities.
roc_auc = roc_auc_score(y_test, probabilities)
print("LogReg ROC-AUC:", roc_auc)
pr_auc = average_precision_score(y_test, probabilities)
print("LogReg Average Precision (PR-AUC):", pr_auc)

# Hard labels at a 0.5 cut-off, used only for plain accuracy.
pred_labels = np.greater(probabilities, 0.5).astype(int)
binary_accuracy = accuracy_score(y_test, pred_labels)
print("LogReg Binary accuracy:", binary_accuracy)

# Attach the predicted win probability to the held-out rows.
id_test["proba_win"] = probabilities

# The ranking score is the probability itself; rank teams within each
# season from highest to lowest score, breaking ties by row order.
id_test["ranking_score"] = id_test["proba_win"]
season_scores = id_test.groupby("season")["ranking_score"]
id_test["rank"] = season_scores.rank(method="first", ascending=False)

# Row label of the highest-probability team in each held-out season.
idx = id_test.groupby("season")["proba_win"].idxmax()

# One row per season: the predicted champion and its probability.
predicted_champs = id_test.loc[idx, ["season", "team", "proba_win"]]
predicted_champs = predicted_champs.rename(
    columns={"team": "team_pred", "proba_win": "pred_prob"}
)
predicted_champs = predicted_champs.reset_index(drop=True)

# Rows flagged champion == 1: the actual champions.
is_champion = id_test["champion"].eq(1)
true_champs = id_test.loc[is_champion, ["season", "team"]]
true_champs = true_champs.rename(columns={"team": "team_true"})
true_champs = true_champs.reset_index(drop=True)

# Join predicted and actual champions per season and score exact matches.
# The table is named `eval_df` rather than `eval` so the builtin `eval()`
# is not shadowed for the rest of the session.
eval_df = predicted_champs.merge(true_champs, on="season", suffixes=("_pred", "_true"))
eval_df["correct"] = (eval_df["team_pred"] == eval_df["team_true"]).astype(int)
# Top-1 accuracy: share of seasons where the top-probability team won.
print("LogReg Top-1 accuracy:", eval_df["correct"].mean())

print("\nPredicted vs. True Champions by Season:\n")
print(eval_df[["season", "team_pred", "team_true", "pred_prob", "correct"]]
      .sort_values("season")
      .to_string(index=False))


# Within-season rank that the model assigned to each actual champion.
true_champ_ranks = id_test.loc[id_test["champion"].eq(1), "rank"]

# Top-k accuracy: fraction of champions ranked k-th or better in their season.
k_values = [1, 2, 4]
print("\nTop-K Accuracy:")
for k in k_values:
    accuracy = true_champ_ranks.le(k).mean()
    print(f" Top-{k}: {accuracy:.4f}")



LogReg ROC-AUC: 0.9817241379310344
LogReg Average Precision (PR-AUC): 0.7043381180223285
LogReg Binary accuracy: 0.9666666666666667
LogReg Top-1 accuracy: 0.6

Predicted vs. True Champions by Season:

 season team_pred team_true  pred_prob  correct
   2016  Warriors Cavaliers   0.999938        0
   2017  Warriors  Warriors   0.999984        1
   2018  Warriors  Warriors   0.996921        1
   2019   Raptors   Raptors   0.992933        1
   2020     Bucks    Lakers   0.979083        0
   2021  Clippers     Bucks   0.479522        0
   2022  Warriors  Warriors   0.961972        1
   2023   Celtics   Nuggets   0.949741        0
   2024   Celtics   Celtics   0.999948        1
   2025   Thunder   Thunder   0.999990        1

Top-K Accuracy:
 Top-1: 0.6000
 Top-2: 0.9000
 Top-4: 1.0000
