In [14]:
import pandas as pd
import numpy as np
from pathlib import Path

# Prefer full_dataset.csv (your FE source). Fall back to PCA file only if you must (not recommended).
candidates = [
    Path("data/processed/full_dataset.csv"),
    Path("../data/processed/full_dataset.csv"),
    Path("../../data/processed/full_dataset.csv"),
    # fallback (avoid unless absolutely needed)
    Path("data/processed/pca_full_dataset.csv"),
    Path("../data/processed/pca_full_dataset.csv"),
    Path("../../data/processed/pca_full_dataset.csv"),
]

csv_path = next((p for p in candidates if p.exists()), None)
if csv_path is None:
    print("Working dir:", Path.cwd())
    raise FileNotFoundError("Could not find full_dataset.csv. Make sure it is in data/processed/.")

print("Using:", csv_path.resolve())
df = pd.read_csv(csv_path, low_memory=False)
df.columns = [c.strip().replace(" ", "_") for c in df.columns]
print("Shape:", df.shape)
df.head()


Using: /Users/kennethchen/CS506-FALL-2025/CS506FinalProject/data/processed/full_dataset.csv
Shape: (143966, 52)


Unnamed: 0,gameId,gameDate,teamCity,team_name,teamId,opponentTeamCity,opponentTeamName,opponentTeamId,home,win,...,pointsSecondChance,timesTied,timeoutsRemaining,seasonWins,seasonLosses,coachId,season,games_played,wins,win_pct
0,22500104,2025-10-25T21:00:00Z,Denver,Nuggets,1610612743,Phoenix,Suns,1610612756,1,1,...,16.0,1.0,1.0,1.0,1.0,,2025.0,7.0,0.0,0.0
1,22500104,2025-10-25T21:00:00Z,Phoenix,Suns,1610612756,Denver,Nuggets,1610612743,0,0,...,14.0,1.0,1.0,1.0,2.0,,2025.0,7.0,0.0,0.0
2,22500103,2025-10-25T20:00:00Z,Indiana,Pacers,1610612754,Memphis,Grizzlies,1610612763,0,0,...,10.0,3.0,0.0,0.0,2.0,,2025.0,6.0,0.0,0.0
3,22500103,2025-10-25T20:00:00Z,Memphis,Grizzlies,1610612763,Indiana,Pacers,1610612754,1,1,...,19.0,3.0,1.0,2.0,1.0,,2025.0,8.0,0.0,0.0
4,22500101,2025-10-25T19:30:00Z,Atlanta,Hawks,1610612737,Oklahoma City,Thunder,1610612760,1,0,...,9.0,5.0,1.0,1.0,2.0,,2025.0,7.0,0.0,0.0


In [15]:
assert "season" in df.columns, "Missing season"
team_col = "team_name" if "team_name" in df.columns else "team"
assert team_col in df.columns, "Missing team_name"



In [16]:
# ---- Cell 2.5: Canonicalize team names to your 30-team list ----
CANON = {
    "76ers": "Philadelphia 76ers",
    "Sixers": "Philadelphia 76ers",
    "Bucks": "Milwaukee Bucks",
    "Bulls": "Chicago Bulls",
    "Cavaliers": "Cleveland Cavaliers",
    "Cavs": "Cleveland Cavaliers",
    "Celtics": "Boston Celtics",
    "Hawks": "Atlanta Hawks",
    "Heat": "Miami Heat",
    "Hornets": "Charlotte Hornets",
    "Knicks": "New York Knicks",
    "Nets": "Brooklyn Nets",
    "Magic": "Orlando Magic",
    "Pacers": "Indiana Pacers",
    "Pistons": "Detroit Pistons",
    "Raptors": "Toronto Raptors",
    "Wizards": "Washington Wizards",
    "Mavericks": "Dallas Mavericks",
    "Mavs": "Dallas Mavericks",
    "Rockets": "Houston Rockets",
    "Grizzlies": "Memphis Grizzlies",
    "Pelicans": "New Orleans Pelicans",
    "Spurs": "San Antonio Spurs",
    "Nuggets": "Denver Nuggets",
    "Timberwolves": "Minnesota Timberwolves",
    "Wolves": "Minnesota Timberwolves",
    "Thunder": "Oklahoma City Thunder",
    "Trail Blazers": "Portland Trail Blazers",
    "Blazers": "Portland Trail Blazers",
    "Jazz": "Utah Jazz",
    "Warriors": "Golden State Warriors",
    "Clippers": "LA Clippers",
    "Lakers": "Los Angeles Lakers",
    "Suns": "Phoenix Suns",
    "Kings": "Sacramento Kings",
}

def canonize(name: str) -> str:
    s = str(name).strip()
    return CANON.get(s, s)

# Apply to the team column immediately so all downstream ops use canonical names
df[team_col] = df[team_col].astype(str).str.strip().map(canonize)



In [17]:
def find_col(d, candidates):
    for c in candidates:
        if c in d.columns:
            return c
    return None

offrtg = find_col(df, ["OffRtg","offrtg","offensive_rating","off_rating"])
defrtg = find_col(df, ["DefRtg","defrtg","defensive_rating","def_rating"])
pts    = find_col(df, ["PTS","points","pts","team_points"])
opppts = find_col(df, ["OppPTS","opp_points","points_allowed","opp_pts"])

df_fe = df.copy()
if offrtg and defrtg: df_fe["net_rating"] = df_fe[offrtg] - df_fe[defrtg]
if pts and opppts:    df_fe["point_diff"] = df_fe[pts] - df_fe[opppts]



In [18]:
# ---- Cell 4: Build opponent name and strength-of-schedule (opp_win_pct) ----

# Home/away columns (used to derive opponent per game)
home = find_col(df_fe, ["hometeamName","home_team"])
away = find_col(df_fe, ["awayteamName","away_team"])

if home and away:
    is_home = (df_fe[team_col].astype(str) == df_fe[home].astype(str))
    df_fe["opponent_name"] = np.where(is_home, df_fe[away], df_fe[home]).astype(str)
else:
    # If the dataset already has an opponent column, try that; otherwise NaN
    exist_opp = find_col(df_fe, ["opponent","opponent_name","opp_name"])
    df_fe["opponent_name"] = df_fe[exist_opp] if exist_opp else np.nan

# If you added Cell 2.5 with CANON + canonize(name), apply it here to opponent names
if "canonize" in globals():
    df_fe["opponent_name"] = df_fe["opponent_name"].astype(str).str.strip().map(canonize)

# ------------ Season win% per team ------------
if "win_pct" in df_fe.columns:
    # Use the provided season win% if present
    team_win = (
        df_fe.groupby(["season", team_col], dropna=False)["win_pct"]
             .max()
             .reset_index()
    )
else:
    # Fallback: infer wins from points if Off/Def ratings not given
    assert pts and opppts, "Need PTS/OppPTS to infer win_pct when 'win_pct' is not provided."
    df_fe["is_win"] = (df_fe[pts] > df_fe[opppts]).astype(int)
    team_win = (
        df_fe.groupby(["season", team_col], dropna=False)["is_win"]
             .mean()
             .reset_index()
             .rename(columns={"is_win":"win_pct"})
    )

# Canonicalize team names inside team_win as well (so both frames match exactly)
if "canonize" in globals():
    team_win[team_col] = team_win[team_col].astype(str).str.strip().map(canonize)

# Rank by win% within season (seed proxy)
team_win["win_pct_rank"] = (
    team_win.groupby("season")["win_pct"]
            .rank(ascending=False, method="dense")
)

# ------------ Build opponent season win% table ------------
opp_win = (
    team_win.rename(columns={team_col: "opponent_name", "win_pct": "opp_win_pct"})
            [["season", "opponent_name", "opp_win_pct"]]
)

# ---- Robust dtype alignment for merge keys ----
# Coerce to strings and clean whitespace; convert literal "nan"/"none"/"null" back to NaN
for frame, cols in [(df_fe, ["season","opponent_name"]), (opp_win, ["season","opponent_name"])]:
    for col in cols:
        frame[col] = frame[col].astype(str).str.strip()

nan_like = {"nan","none","null",""}
df_fe.loc[df_fe["opponent_name"].str.lower().isin(nan_like), "opponent_name"] = np.nan
opp_win.loc[opp_win["opponent_name"].str.lower().isin(nan_like), "opponent_name"] = np.nan

# Merge opponent win% onto each game row (left join)
df_fe = df_fe.merge(opp_win, on=["season","opponent_name"], how="left")




In [19]:
exclude = {team_col, "season", "gameId","gameDate","winner","hometeamName","awayteamName","opponent_name"}
num_cols = [c for c in df_fe.columns if c not in exclude and np.issubdtype(df_fe[c].dtype, np.number)]

agg = df_fe.groupby(["season", team_col])[num_cols].agg(["mean","std"])
agg.columns = [f"{c}_{stat}" for c, stat in agg.columns]
team_season = agg.reset_index()

# 🔧 Force identical dtypes for merge keys
team_season["season"] = team_season["season"].astype(str).str.strip()
team_season[team_col] = team_season[team_col].astype(str).str.strip()
team_win["season"] = team_win["season"].astype(str).str.strip()
team_win[team_col] = team_win[team_col].astype(str).str.strip()

team_season = team_season.merge(
    team_win[["season", team_col, "win_pct", "win_pct_rank"]],
    on=["season", team_col],
    how="left"
)

if "net_rating_mean" in team_season.columns:
    team_season["net_rating_x_win_pct"] = team_season["net_rating_mean"] * team_season["win_pct"]

team_season.head()




Unnamed: 0,season,team_name,teamId_mean,teamId_std,opponentTeamId_mean,opponentTeamId_std,home_mean,home_std,win_mean,win_std,...,games_played_mean,games_played_std,wins_mean,wins_std,win_pct_mean,win_pct_std,opp_win_pct_mean,opp_win_pct_std,win_pct,win_pct_rank
0,2025.0,Atlanta Hawks,1610613000.0,0.0,1610613000.0,7.785823,0.571429,0.534522,0.428571,0.534522,...,7.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0
1,2025.0,Boston Celtics,1610613000.0,0.0,1610613000.0,8.953584,0.5,0.547723,0.5,0.547723,...,6.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0
2,2025.0,Brooklyn Nets,1610613000.0,0.0,1342186000.0,657509500.0,0.5,0.547723,0.333333,0.516398,...,6.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0
3,2025.0,Charlotte Hornets,1610613000.0,0.0,1610613000.0,7.158079,0.428571,0.534522,0.428571,0.534522,...,7.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0
4,2025.0,Chicago Bulls,1610613000.0,0.0,1610613000.0,9.178131,0.571429,0.534522,0.714286,0.48795,...,7.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0


In [20]:
from pathlib import Path

out = Path("data/processed/team_season_features.csv")
out.parent.mkdir(parents=True, exist_ok=True)  # <-- make sure folders exist
team_season.to_csv(out, index=False)

out, team_season.shape




(PosixPath('data/processed/team_season_features.csv'), (73, 96))