In [1]:
import pandas as pd

# Season-level team stat files to audit (order doesn't matter).
csv_files = [
    "nba_team_stats_2021_22.csv",
    "nba_team_stats_2022_23.csv",
    "nba_team_stats_2023_24.csv",
    "nba_team_stats_2024_25.csv"
]

# Columns the audit reads; a file lacking any of them is reported and skipped.
REQUIRED_COLUMNS = ("season", "game_id", "team_id")

# Season shape the files are checked against.
EXPECTED_TEAMS = 30
EXPECTED_GAMES_PER_TEAM = 82
# Every game is shared by two teams, so 30 * 82 / 2 = 1230 distinct games.
EXPECTED_GAMES_PER_SEASON = EXPECTED_TEAMS * EXPECTED_GAMES_PER_TEAM // 2


def summarize_team_stats(df, expected_teams=EXPECTED_TEAMS,
                         expected_games_per_team=EXPECTED_GAMES_PER_TEAM):
    """Compute completeness statistics for one season's team-game table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns in ``REQUIRED_COLUMNS``.
    expected_teams : int
        Number of distinct ``team_id`` values a complete file should hold.
    expected_games_per_team : int
        Number of distinct ``game_id`` values each team should have.

    Returns
    -------
    dict
        ``seasons``           - unique values of the ``season`` column
        ``total_rows``        - number of rows in ``df``
        ``unique_games``      - number of distinct ``game_id`` values
        ``team_games``        - Series of distinct games per team, ascending
        ``low_count_teams``   - subset of ``team_games`` below the expected count
        ``num_teams``         - number of distinct ``team_id`` values
        ``absent_team_count`` - how many teams short of ``expected_teams``
        ``is_complete``       - True only if no team is absent or short of games

    Raises
    ------
    ValueError
        If any required column is missing from ``df``.
    """
    absent_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if absent_cols:
        raise ValueError(f"missing required columns: {absent_cols}")

    team_games = df.groupby("team_id")["game_id"].nunique().sort_values()
    low_count_teams = team_games[team_games < expected_games_per_team]
    num_teams = int(df["team_id"].nunique())
    # A team with no rows at all never shows up in `team_games`, so it has to
    # be detected from the team count rather than from the per-team tallies.
    absent_team_count = max(expected_teams - num_teams, 0)

    return {
        "seasons": df["season"].unique(),
        "total_rows": len(df),
        "unique_games": int(df["game_id"].nunique()),
        "team_games": team_games,
        "low_count_teams": low_count_teams,
        "num_teams": num_teams,
        "absent_team_count": absent_team_count,
        "is_complete": absent_team_count == 0 and len(low_count_teams) == 0,
    }


for f in csv_files:
    print("\n" + "="*80)
    print("📘 FILE:", f)
    print("="*80)

    # Best-effort audit: one unreadable file must not stop the remaining ones.
    try:
        df = pd.read_csv(f)
    except Exception as e:
        print("❌ ERROR reading file:", e)
        continue

    # Same policy for a readable file with the wrong schema.
    try:
        report = summarize_team_stats(df)
    except ValueError as e:
        print("❌ ERROR in file contents:", e)
        continue

    # 1. Seasons inside file
    print("🔎 Seasons found in file:", report["seasons"])

    # 2. Total rows
    print("📊 Total rows:", report["total_rows"])

    # 3. Unique game IDs
    print(
        "🎮 Unique games:",
        report["unique_games"],
        f"(expected ≈ {EXPECTED_GAMES_PER_SEASON} for a full season)",
    )

    # 4. Games per team
    print("\n🧮 Games per team:")
    team_games = report["team_games"]
    print(team_games)

    # 5. Detect missing or incomplete teams
    print("\n⚠️ Missing / Low game-count teams:")
    missing = report["low_count_teams"]
    if report["absent_team_count"] > 0:
        print(
            f"{report['absent_team_count']} team(s) absent from file "
            f"(found {report['num_teams']} of {EXPECTED_TEAMS})"
        )
    if len(missing) > 0:
        print(missing)
    if report["is_complete"]:
        print("✔️ No missing teams — season looks complete.")

    # 6. Number of teams found
    print("\n🏀 Total teams found:", report["num_teams"])



üìò FILE: nba_team_stats_2021_22.csv
üîé Seasons found in file: ['2021-22']
üìä Total rows: 2460
üéÆ Unique games: 1230 (expected ‚âà 1230 for a full season)

üßÆ Games per team:
team_id
1610612737    82
1610612764    82
1610612763    82
1610612762    82
1610612761    82
1610612760    82
1610612759    82
1610612758    82
1610612757    82
1610612756    82
1610612755    82
1610612754    82
1610612753    82
1610612752    82
1610612751    82
1610612750    82
1610612749    82
1610612748    82
1610612747    82
1610612746    82
1610612745    82
1610612744    82
1610612743    82
1610612742    82
1610612741    82
1610612740    82
1610612739    82
1610612738    82
1610612765    82
1610612766    82
Name: game_id, dtype: int64

‚ö†Ô∏è Missing / Low game-count teams:
‚úîÔ∏è No missing teams ‚Äî season looks complete.

üèÄ Total teams found: 30

üìò FILE: nba_team_stats_2022_23.csv
üîé Seasons found in file: ['2022-23']
üìä Total rows: 2460
üéÆ Unique games: 1230 (expected ‚âà 1230 for a 

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    log_loss,
    brier_score_loss,
)


In [3]:
OUT_DIR = "nba_fe_outputs"

# Per-window metadata table; this notebook reads its "season" column.
# (Original note: holds season, team_id, game_index, etc.)
win_df = pd.read_csv(f"{OUT_DIR}/static_features_windows_raw.csv")

# Feature matrix and target vector saved next to the metadata.
# (Original note: features are normalized; target is the final win% in [0, 1].)
X_win = np.load(f"{OUT_DIR}/X_static_windows.npy")     # (num_samples, num_features)
y_win = np.load(f"{OUT_DIR}/y_static_windows.npy")     # final_win_pct in [0, 1]

# Later cells build boolean masks from win_df and apply them to X_win / y_win,
# so all three must have one row per sample. Only the lengths can be checked
# here; matching row ORDER is assumed from how the files were written.
if not (len(win_df) == X_win.shape[0] == y_win.shape[0]):
    raise ValueError(
        "Row count mismatch between metadata and arrays: "
        f"win_df={len(win_df)}, X_win={X_win.shape[0]}, y_win={y_win.shape[0]}"
    )
if "season" not in win_df.columns:
    raise ValueError("win_df has no 'season' column; cannot split by season")

print("X_win shape:", X_win.shape)
print("y_win shape:", y_win.shape)
print("\nSeasons present in windowed dataset:")
print(win_df["season"].value_counts())


X_win shape: (1800, 22)
y_win shape: (1800,)

Seasons present in windowed dataset:
2021-22    450
2022-23    450
2023-24    450
2024-25    450
Name: season, dtype: int64


In [4]:
# Tier boundaries on the win-percentage target:
#   class 0: y <= low_thresh
#   class 1: low_thresh < y <= high_thresh
#   class 2: y > high_thresh
low_thresh = 0.45
high_thresh = 0.55

# NaN compares False against every threshold, so it belongs to no tier.
# Fail loudly instead of letting such rows receive an arbitrary label.
if np.isnan(y_win).any():
    raise ValueError("y_win contains NaN; cannot assign win-percentage tiers")

# right=True closes each bin on the right (bins[i-1] < y <= bins[i]), which
# reproduces the <= / > boundaries listed above in a single pass.
y_class = np.digitize(y_win, [low_thresh, high_thresh], right=True).astype(int)

classes, counts = np.unique(y_class, return_counts=True)
print("Class distribution (class: count):")
for c, n in zip(classes, counts):
    print(f"  {c}: {n}")


Class distribution (class: count):
  0: 600
  1: 420
  2: 780


In [5]:
# Season-based hold-out: fit on every season except the last one in sorted
# order, evaluate on that last one. sorted() compares the labels as strings.
seasons_sorted = sorted(win_df["season"].unique())
print("All seasons (sorted):", seasons_sorted)

test_season = seasons_sorted[-1]
train_seasons = seasons_sorted[:-1]

print("Train seasons:", train_seasons)
print("Test season:", test_season)

# Row-wise boolean selectors derived from the metadata table.
season_series = win_df["season"]
train_mask = season_series.isin(train_seasons)
test_mask = season_series.eq(test_season)

# Apply each selector to the feature matrix and the tier labels together.
X_train, y_train = X_win[train_mask], y_class[train_mask]
X_test, y_test = X_win[test_mask], y_class[test_mask]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


All seasons (sorted): ['2021-22', '2022-23', '2023-24', '2024-25']
Train seasons: ['2021-22', '2022-23', '2023-24']
Test season: 2024-25
Train shape: (1350, 22) Test shape: (450, 22)


In [6]:
# Multinomial (softmax) logistic regression over the tier labels.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver and three or more classes the
# default behaviour is already the multinomial formulation.
lr = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    C=1.0,
)

lr.fit(X_train, y_train)

# Hard labels for accuracy / precision / recall / F1.
y_pred = lr.predict(X_test)
# Per-class probabilities (columns ordered as lr.classes_), needed for
# Log Loss / Brier / ROC-AUC.
y_proba = lr.predict_proba(X_test)


In [7]:
# Label-based metrics on the hard predictions (macro = unweighted class mean).
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="macro"
)

# Columns of y_proba follow lr.classes_. Passing that ordering explicitly keeps
# the probability-based metrics aligned with the columns even if the test
# labels are not exactly 0..C-1 or a class is absent from y_test.
class_labels = lr.classes_

# Multi-class ROC-AUC (One-vs-Rest)
roc = roc_auc_score(y_test, y_proba, multi_class="ovr", labels=class_labels)

# Log Loss (multi-class)
ll = log_loss(y_test, y_proba, labels=class_labels)

# Brier Score (multi-class version via flattening one-hot vs probs).
# The mean is taken over all N*C cells, so this value equals the classical
# sum-over-classes multiclass Brier score divided by the number of classes.
num_classes = y_proba.shape[1]
y_onehot = (np.asarray(y_test)[:, None] == class_labels[None, :]).astype(float)  # shape: (N, C)
brier = brier_score_loss(
    y_onehot.ravel(),
    y_proba.ravel(),
)

print("Logistic Regression (tier classification on windowed data)")
print("---------------------------------------------------------")
print("Accuracy      :", acc)
print("Precision (macro):", prec)
print("Recall (macro):   ", rec)
print("F1-score (macro): ", f1)
print("ROC-AUC (ovr):    ", roc)
print("Log Loss:         ", ll)
print("Brier Score:      ", brier)


Logistic Regression (tier classification on windowed data)
---------------------------------------------------------
Accuracy      : 0.8711111111111111
Precision (macro): 0.8355307241210747
Recall (macro):    0.8242165242165242
F1-score (macro):  0.828562441506604
ROC-AUC (ovr):     0.9731930513268662
Log Loss:          0.24249721553660147
Brier Score:       0.05112923304295025


In [8]:
# Print each heading followed by the matching summary of test labels versus
# predictions: first the per-class report, then the confusion matrix.
for heading, summarize in (
    ("Classification report:", classification_report),
    ("Confusion matrix (rows = true, cols = pred):", confusion_matrix),
):
    print(heading)
    print(summarize(y_test, y_pred))


Classification report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       165
           1       0.71      0.61      0.65        90
           2       0.97      0.99      0.98       195

    accuracy                           0.87       450
   macro avg       0.84      0.82      0.83       450
weighted avg       0.87      0.87      0.87       450

Confusion matrix (rows = true, cols = pred):
[[143  22   0]
 [ 30  55   5]
 [  0   1 194]]
