# In [4]:
import pandas as pd

# Load file
df = pd.read_excel("./datasci/whl_2025.xlsx")

# --- Extract numeric game number safely ---
# Works for: 1, "1", "game_1", "game_012", etc.
# The first run of digits in each game_id is captured. expand=False keeps the
# result a Series (the default returns a one-column DataFrame); values that
# contain no digits at all come back as NaN.
game_digits = df["game_id"].astype(str).str.extract(r"(\d+)", expand=False)

# Fail early and name the offending values, rather than letting astype(int)
# raise a generic conversion error on the NaN entries.
missing_digits = game_digits.isna()
if missing_digits.any():
    bad_values = df.loc[missing_digits, "game_id"].unique().tolist()
    raise ValueError(f"game_id values without a numeric part: {bad_values!r}")

df["game_num"] = game_digits.astype(int)

# --- Sort from game 1 to max ---
# mergesort is a stable algorithm: rows that share a game number keep their
# original relative order, which the default quicksort does not guarantee.
df = df.sort_values("game_num", kind="mergesort")

# --- Format game_id with required padding ---
def format_game(n: int) -> str:
    """Return the padded game identifier for game number *n*.

    The number is zero-padded to at least four digits and prefixed with
    ``game_``; numbers with more than four digits are kept in full.
    *n* is expected to be a non-negative integer.

    >>> format_game(7)
    'game_0007'
    >>> format_game(12345)
    'game_12345'
    """
    # The 04d format spec does the zero-padding that was previously spelled
    # out as one branch per digit count.
    return f"game_{n:04d}"

# Rewrite every identifier in its padded form, derived from the numeric key.
df["game_id"] = df["game_num"].map(format_game)

# --- Cleanup ---
# game_num was a helper column for sorting and formatting; it is not written
# to the output file.
df = df.drop("game_num", axis="columns")

# --- Save result ---
# index=False leaves the DataFrame's row index out of the spreadsheet.
cleaned_path = "./datasci/cleanedup/whl_2025_cleaned.xlsx"
df.to_excel(cleaned_path, index=False)
