<a href="https://colab.research.google.com/github/mikexie360/data-visualization-final-project/blob/main/data_visualization_final_project_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection from OpenDota API for Final Project
This google colab collects data from the top 5000 teams, team information such as hero picks, win rate and rating. And also their match history from patch 7.33 and up.

Running this google colab can take upwards of a few hours to collect all the data that you need.
You also might need an OpenDota API Key.

In [58]:
## imports
import requests
import pandas as pd
from datetime import datetime, timezone
import time
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
from xgboost import XGBClassifier

from google.colab import userdata


In [37]:
# ==== CONFIG ====
OPENDOTA_API_KEY = userdata.get('opendota_api_key')
PATCH_MIN = 7.33        # 7.33 and up
LIMIT = 5000            # keep this moderate; huge pages + JSON can 400
START_OFFSET = 0
MAX_PAGES = None        # None = keep going until empty; or an int
TIMEOUT = 60

API_BASE = "https://api.opendota.com/api"
SLEEP_SEC = 0.25           # gentle rate-limit
MAX_TEAMS = None           # set an int to cap for testing; or None for all

In [23]:
## teams_df

## top 5000 teams

all_batches = []
for page in range(5):  # pages 0..4
    params = {"api_key": OPENDOTA_API_KEY, "page": page}
    resp = requests.get("https://api.opendota.com/api/teams", params=params, timeout=30)
    resp.raise_for_status()
    batch = resp.json()
    if not batch:
        break
    all_batches.append(pd.DataFrame(batch))

# combine pages
teams_df = pd.concat(all_batches, ignore_index=True)

# add readable timestamp (UTC ISO-8601)
teams_df["last_match_time_iso"] = pd.to_datetime(
    teams_df["last_match_time"], unit="s", utc=True, errors="coerce"
).dt.strftime("%Y-%m-%dT%H:%M:%SZ")

# select/ordering columns
cols = [
    "team_id", "name", "tag", "rating", "wins", "losses",
    "last_match_time", "last_match_time_iso", "logo_url"
]
teams_df = teams_df.reindex(columns=cols)

# write once
teams_df.to_csv("teams.csv", index=False, encoding="utf-8")
print(teams_df.head())
print(f"Number of rows: {len(teams_df)}")

   team_id          name         tag   rating  wins  losses  last_match_time  \
0  7119388   Team Spirit     TSpirit  1578.63   768     512       1752941276   
1  9572001    PARIVISION  PARIVISION  1559.36   179      91       1753691348   
2  8255888  BetBoom Team     BetBoom  1547.12   502     405       1753694464   
3  7412785  CyberBonch-1          CB  1520.12   267       1       1639928575   
4  8605863        Cloud9          C9  1497.49   326     243       1726305637   

    last_match_time_iso                                           logo_url  
0  2025-07-19T16:07:56Z  https://cdn.steamusercontent.com/ugc/183917912...  
1  2025-07-28T08:29:08Z  https://cdn.steamusercontent.com/ugc/247650869...  
2  2025-07-28T09:21:04Z  https://cdn.steamusercontent.com/ugc/999542643...  
3  2021-12-19T15:42:55Z  https://cdn.steamusercontent.com/ugc/184253787...  
4  2024-09-14T09:20:37Z  https://cdn.steamusercontent.com/ugc/239994188...  
Number of rows: 5000


In [11]:
## heroes_df

### All heroes in the game

heroes_url = "https://api.opendota.com/api/heroes"
resp = requests.get(heroes_url)
resp.raise_for_status()
data = resp.json()  # <- list of dicts

# optional: turn the roles list into a comma-separated string
for row in data:
    if isinstance(row.get("roles"), list):
        row["roles"] = ",".join(row["roles"])

# pick a column order (optional)
cols = ["id", "name", "localized_name", "primary_attr", "attack_type", "roles", "legs"]
heroes_df = pd.DataFrame(data)[cols]

heroes_df.to_csv("heroes.csv", index=False, encoding="utf-8")
print(heroes_df.head)


<bound method NDFrame.head of       id                          name  localized_name primary_attr  \
0      1        npc_dota_hero_antimage       Anti-Mage          agi   
1      2             npc_dota_hero_axe             Axe          str   
2      3            npc_dota_hero_bane            Bane          all   
3      4     npc_dota_hero_bloodseeker     Bloodseeker          agi   
4      5  npc_dota_hero_crystal_maiden  Crystal Maiden          int   
..   ...                           ...             ...          ...   
121  135     npc_dota_hero_dawnbreaker     Dawnbreaker          str   
122  136           npc_dota_hero_marci           Marci          all   
123  137    npc_dota_hero_primal_beast    Primal Beast          str   
124  138          npc_dota_hero_muerta          Muerta          int   
125  145             npc_dota_hero_kez             Kez          agi   

    attack_type                                    roles  legs  
0         Melee                       Carry,Escape,N

In [32]:
## matches_teams_picks_bans

API = "https://api.opendota.com/api/explorer"

# output filenames
MATCH_TEAMS_CSV = "match_teams_733_plus.csv"
PICKS_BANS_LONG_CSV = "match_picks_bans_long_733_plus.csv"
MATCHES_TEAMS_PB_WIDE_CSV = "matches_teams_picks_bans_733_plus.csv"
# ===================================================

def run_explorer(limit: int, offset: int) -> pd.DataFrame:
    sql = f"""
        SELECT
          m.match_id,
          m.start_time,
          m.radiant_team_id,
          m.dire_team_id,
          m.radiant_win,
          m.duration,
          m.radiant_score,
          m.dire_score,
          m.picks_bans,
          CASE WHEN m.radiant_win THEN m.radiant_team_id ELSE m.dire_team_id END AS winner_team_id,
          m.leagueid,
          l.name AS league_name,
          mp.patch
        FROM matches m
        JOIN match_patch mp USING (match_id)
        LEFT JOIN leagues l ON l.leagueid = m.leagueid
        WHERE m.radiant_team_id IS NOT NULL
          AND m.dire_team_id IS NOT NULL
          AND m.radiant_team_id > 0
          AND m.dire_team_id > 0
          AND NULLIF(regexp_replace(mp.patch, '[^0-9\\.]', '', 'g'), '') IS NOT NULL
          AND CAST(regexp_replace(mp.patch, '[^0-9\\.]', '', 'g') AS numeric) >= {PATCH_MIN}
        ORDER BY m.match_id
        LIMIT {limit} OFFSET {offset}
    """
    params = {"sql": sql}
    if OPENDOTA_API_KEY:
        params["api_key"] = OPENDOTA_API_KEY
    r = requests.get(API, params=params, timeout=TIMEOUT)
    r.raise_for_status()
    return pd.DataFrame(r.json().get("rows", []))

all_match_pages = []
all_pb_rows = []
page = 0
offset = START_OFFSET
prev_sample = None

while True:
    page += 1

    if prev_sample is not None:
        print(f"[Page {page}] Sample of previous page (first 5 rows):")
        print(prev_sample.to_string(index=False))
        print("-" * 80)

    print(f"Fetching page {page} (offset={offset}, limit={LIMIT}) ...")
    df = run_explorer(LIMIT, offset)
    n = len(df)
    print(f"Fetched {n} rows.")
    if n == 0:
        print("No more rows. Stopping.")
        break

    # Preview for next loop
    show_cols = [c for c in ["match_id", "radiant_team_id", "dire_team_id", "league_name", "patch"] if c in df.columns]
    prev_sample = df[show_cols].head(5).copy()

    # Accumulate matches
    all_match_pages.append(df)

    # Build a long-form picks/bans while we have the row context (to resolve team_id)
    if "picks_bans" in df.columns:
        for _, row in df.iterrows():
            match_id = row["match_id"]
            r_tid = row["radiant_team_id"]
            d_tid = row["dire_team_id"]
            pbs = row["picks_bans"]
            if isinstance(pbs, list):
                for pb in pbs:
                    side = pb.get("team")  # 0=radiant, 1=dire
                    team_id = r_tid if side == 0 else d_tid if side == 1 else None
                    all_pb_rows.append({
                        "match_id": match_id,
                        "order": pb.get("order"),
                        "is_pick": pb.get("is_pick"),
                        "hero_id": pb.get("hero_id"),
                        "team_side": side,
                        "team_id": team_id,
                    })

    offset += LIMIT
    if MAX_PAGES is not None and page >= MAX_PAGES:
        print("Reached MAX_PAGES cap. Stopping.")
        break

# ===== Build match_teams_df =====
match_teams_df = pd.concat(all_match_pages, ignore_index=True) if all_match_pages else pd.DataFrame()

# Add ISO timestamp
if "start_time" in match_teams_df.columns:
    match_teams_df["start_time_iso"] = pd.to_datetime(
        match_teams_df["start_time"], unit="s", utc=True, errors="coerce"
    ).dt.strftime("%Y-%m-%dT%H:%M:%SZ")

# Column order
preferred_cols = [
    "match_id",
    "radiant_team_id", "dire_team_id",
    "start_time", "start_time_iso",
    "radiant_win", "winner_team_id",
    "duration", "radiant_score", "dire_score",
    "leagueid", "league_name", "patch", "picks_bans"
]
match_teams_df = match_teams_df[[c for c in preferred_cols if c in match_teams_df.columns] +
                                [c for c in match_teams_df.columns if c not in preferred_cols]]

# ===== Build long-form picks/bans df (already team_id-resolved) =====
match_picks_bans_df = pd.DataFrame(all_pb_rows).sort_values(["match_id", "order"]).reset_index(drop=True)

# ===== Build WIDE matches_teams_picks_bans =====
# For each match row, create: isPick{n}, hero_id{n}, team_id{n}, order{n}
def flatten_picks_bans_row(row) -> dict:
    base = {k: row[k] for k in row.index if k != "picks_bans"}  # copy all except the JSON
    pbs = row.get("picks_bans")
    if isinstance(pbs, list):
        # ensure ordered by "order"
        pbs_sorted = sorted(pbs, key=lambda x: x.get("order", 0))
        for pb in pbs_sorted:
            k = int(pb.get("order", 0)) + 1  # 1-based suffix
            side = pb.get("team")  # 0=radiant, 1=dire
            team_id = row["radiant_team_id"] if side == 0 else row["dire_team_id"] if side == 1 else None
            base[f"isPick{k}"] = bool(pb.get("is_pick"))
            base[f"hero_id{k}"] = pb.get("hero_id")
            base[f"team_id{k}"] = team_id
            base[f"order{k}"] = pb.get("order")
    return base

if not match_teams_df.empty:
    wide_records = [flatten_picks_bans_row(r) for _, r in match_teams_df.iterrows()]
    matches_teams_picks_bans = pd.DataFrame(wide_records)
else:
    matches_teams_picks_bans = pd.DataFrame()

# Drop the raw JSON column from match_teams_df before saving (keeps file tidy)
if "picks_bans" in match_teams_df.columns:
    match_teams_df_nojson = match_teams_df.drop(columns=["picks_bans"])
else:
    match_teams_df_nojson = match_teams_df.copy()

# ===== SAVE CSVs =====
match_teams_df_nojson.to_csv(MATCH_TEAMS_CSV, index=False, encoding="utf-8")
match_picks_bans_df.to_csv(PICKS_BANS_LONG_CSV, index=False, encoding="utf-8")
matches_teams_picks_bans.to_csv(MATCHES_TEAMS_PB_WIDE_CSV, index=False, encoding="utf-8")

print("\nDone!")
print("Saved:", MATCH_TEAMS_CSV)
print("Saved:", PICKS_BANS_LONG_CSV)
print("Saved:", MATCHES_TEAMS_PB_WIDE_CSV)

print("\nShapes:")
print("match_teams_df_nojson:", match_teams_df_nojson.shape)
print("match_picks_bans_df:", match_picks_bans_df.shape)
print("matches_teams_picks_bans:", matches_teams_picks_bans.shape)

print("\nPreview matches_teams_picks_bans:")
print(matches_teams_picks_bans.head().to_string(index=False))

Fetching page 1 (offset=0, limit=5000) ...
Fetched 5000 rows.
[Page 2] Sample of previous page (first 5 rows):
  match_id  radiant_team_id  dire_team_id                                                         league_name patch
7116526799          8680612       8971308 DPC 2023 SA Spring Tour Division II – presented by ESB Liga Esports  7.33
7116656225          8893835       8864178                                                     Ancients League  7.33
7116662198          8629315       8629318                                                      Destiny league  7.33
7116708252          8864178       8893835                                                     Ancients League  7.33
7116718813          8863825       8736661          DPC 2023 SEA Spring Tour Division II - presented by Epulze  7.33
--------------------------------------------------------------------------------
Fetching page 2 (offset=5000, limit=5000) ...
Fetched 5000 rows.
[Page 3] Sample of previous page (first 5 rows)

In [38]:
## teams_heros_df

OUT_CSV = "team_hero_df.csv"

# --- Helper to fetch a team's hero stats ---
def fetch_team_heroes(team_id: int, api_key: str | None = None, timeout: int = 30) -> pd.DataFrame:
    url = f"{API_BASE}/teams/{team_id}/heroes"
    params = {"api_key": api_key} if api_key else None
    r = requests.get(url, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()  # list[dict]
    df = pd.DataFrame(data)
    if df.empty:
        return pd.DataFrame(columns=["team_id", "hero_id"])
    df["team_id"] = team_id
    return df

# --- Optional: hero lookup (id -> names) ---
def get_heroes_lookup() -> pd.DataFrame:
    # Use existing heroes_df if present; else fetch once.
    if "heroes_df" in globals() and isinstance(heroes_df, pd.DataFrame) and "id" in heroes_df.columns:
        h = heroes_df.copy()
    else:
        r = requests.get(f"{API_BASE}/heroes", timeout=30)
        r.raise_for_status()
        h = pd.DataFrame(r.json())
    # Keep common columns; rename id -> hero_id for merge
    keep = [c for c in ["id", "name", "localized_name", "primary_attr", "attack_type"] if c in h.columns]
    h = h[keep].rename(columns={"id": "hero_id"})
    return h

# --- Collect team->hero stats for all teams in teams_df ---
team_ids = teams_df["team_id"].dropna().astype(int).unique()
if MAX_TEAMS is not None:
    team_ids = team_ids[:MAX_TEAMS]

all_frames = []
for i, tid in enumerate(team_ids, start=1):
    try:
        if i == 1:
            print(f"Starting team hero fetch for {len(team_ids)} teams...")
        if i % 50 == 1 or i == len(team_ids):
            print(f"[{i}/{len(team_ids)}] team_id={tid}")
        df = fetch_team_heroes(tid, api_key=OPENDOTA_API_KEY)
        all_frames.append(df)
    except Exception as e:
        print(f"Failed team_id={tid}: {e}")
    time.sleep(SLEEP_SEC)

team_hero_df = pd.concat(all_frames, ignore_index=True) if all_frames else pd.DataFrame()

# --- Add winrate if wins/games columns exist (naming can vary) ---
wins_col = "wins" if "wins" in team_hero_df.columns else ("win" if "win" in team_hero_df.columns else None)
games_col = "games_played" if "games_played" in team_hero_df.columns else ("games" if "games" in team_hero_df.columns else None)
if wins_col and games_col:
    team_hero_df["winrate"] = (team_hero_df[wins_col] / team_hero_df[games_col]).round(4)

# --- Attach hero names (id -> names) ---
try:
    heroes_lookup = get_heroes_lookup()
    team_hero_df = team_hero_df.merge(heroes_lookup, on="hero_id", how="left")
except Exception as e:
    print("Skipping hero name merge:", e)

# --- Nice column order (best-effort; keeps extras at the end) ---
preferred = ["team_id", "hero_id", "localized_name", "name", "primary_attr", "attack_type",
             "games_played", "games", "wins", "win", "winrate"]
ordered = [c for c in preferred if c in team_hero_df.columns]
rest = [c for c in team_hero_df.columns if c not in ordered]
team_hero_df = team_hero_df[ordered + rest]

# --- Save & peek ---
team_hero_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("Saved:", OUT_CSV)
print("team_hero_df shape:", team_hero_df.shape)
print(team_hero_df.head().to_string(index=False))


Starting team hero fetch for 5000 teams...
[1/5000] team_id=7119388
[51/5000] team_id=9187066
[101/5000] team_id=9303484
[151/5000] team_id=8214850
[201/5000] team_id=8998465
[251/5000] team_id=8936568
[301/5000] team_id=1848465
[351/5000] team_id=9216247
[401/5000] team_id=3018001
[451/5000] team_id=2850822
[501/5000] team_id=8495691
[551/5000] team_id=1665758
[601/5000] team_id=8713367
[651/5000] team_id=7716098
[701/5000] team_id=7019918
[751/5000] team_id=2024718
[801/5000] team_id=8193371
[851/5000] team_id=7422393
[901/5000] team_id=8526136
[951/5000] team_id=2025160
[1001/5000] team_id=5006832
[1051/5000] team_id=6435695
[1101/5000] team_id=9683757
[1151/5000] team_id=9440022
[1201/5000] team_id=9466553
[1251/5000] team_id=7510730
[1301/5000] team_id=7578716
[1351/5000] team_id=8604954
[1401/5000] team_id=8999483
[1451/5000] team_id=9373820
[1501/5000] team_id=9530233
[1551/5000] team_id=2496863
[1601/5000] team_id=2913028
[1651/5000] team_id=9126284
[1701/5000] team_id=1709168


In [39]:
## data frames we have so far

## teams_df
print(teams_df.head)
print("number of rows " + str(len(teams_df)))
## heroes_df
print(heroes_df.head)
print("number of rows " + str(len(heroes_df)))
## matches_teams_picks_bans
print(matches_teams_picks_bans.head)
print("number of rows " + str(len(matches_teams_picks_bans)))
# match_teams_df_nojson
print(match_teams_df_nojson.head)
print("number of rows " + str(len(match_teams_df_nojson)))
# match_picks_bans_df
print(match_picks_bans_df.head)
print("number of rows " + str(len(match_picks_bans_df)))
## team_hero information
print(team_hero_df.head)
print("number of rows " + str(len(team_hero_df)))

<bound method NDFrame.head of       team_id                  name         tag   rating  wins  losses  \
0     7119388           Team Spirit     TSpirit  1578.63   768     512   
1     9572001            PARIVISION  PARIVISION  1559.36   179      91   
2     8255888          BetBoom Team     BetBoom  1547.12   502     405   
3     7412785          CyberBonch-1          CB  1520.12   267       1   
4     8605863                Cloud9          C9  1497.49   326     243   
...       ...                   ...         ...      ...   ...     ...   
4995   470237  High Skill Dickheads         HSD  1015.33     1       0   
4996  5635522            truestrike  Truestrike  1015.33     3       2   
4997  3703112             Coca Cola              1015.32     1       0   
4998  3326101               Too EzZ       !TE!.  1015.32     1       0   
4999  7358791               SVALINN       SVL |  1015.32     1       0   

      last_match_time   last_match_time_iso  \
0          1752941276  2025-07-19T

In [43]:
## that is all for data collection for now

## use match_teams_733_plus with teams_df and team_heroes
## teams_df
print(teams_df.head)
print("number of rows " + str(len(teams_df)))
# match_teams_df_nojson
print(match_teams_df_nojson.head)
print("number of rows " + str(len(match_teams_df_nojson)))
## team_hero information
print(team_hero_df.head)
print("number of rows " + str(len(team_hero_df)))
## heroes_df
print(heroes_df.head)
print("number of rows " + str(len(heroes_df)))

<bound method NDFrame.head of       team_id                  name         tag   rating  wins  losses  \
0     7119388           Team Spirit     TSpirit  1578.63   768     512   
1     9572001            PARIVISION  PARIVISION  1559.36   179      91   
2     8255888          BetBoom Team     BetBoom  1547.12   502     405   
3     7412785          CyberBonch-1          CB  1520.12   267       1   
4     8605863                Cloud9          C9  1497.49   326     243   
...       ...                   ...         ...      ...   ...     ...   
4995   470237  High Skill Dickheads         HSD  1015.33     1       0   
4996  5635522            truestrike  Truestrike  1015.33     3       2   
4997  3703112             Coca Cola              1015.32     1       0   
4998  3326101               Too EzZ       !TE!.  1015.32     1       0   
4999  7358791               SVALINN       SVL |  1015.32     1       0   

      last_match_time   last_match_time_iso  \
0          1752941276  2025-07-19T

In [45]:
print("Input shapes:")
print("teams_df:", teams_df.shape)
print("match_teams_df_nojson:", match_teams_df_nojson.shape)
print("team_hero_df:", team_hero_df.shape)
if "matches_teams_picks_bans" in globals():
    print("matches_teams_picks_bans:", matches_teams_picks_bans.shape)
if "match_picks_bans_df" in globals():
    print("match_picks_bans_df:", match_picks_bans_df.shape)

# ---------------- helpers ----------------
def slugify(name: str) -> str:
    if pd.isna(name):
        return "unknown"
    s = str(name).lower()
    s = s.replace("’","'")  # normalize curly apostrophe
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_")

# normalize team_hero_df column names to games/wins/winrate and attach a clean hero_name
print("\n[1/5] Normalizing team_hero_df ...")
th = team_hero_df.copy()

# unify games/wins columns
if "games" not in th.columns and "games_played" in th.columns:
    th = th.rename(columns={"games_played":"games"})
if "wins" not in th.columns and "win" in th.columns:
    th = th.rename(columns={"win":"wins"})

# attach hero_name (prefer heroes_df if available)
hero_name_col = None
candidate_cols = [c for c in ["localized_name","localized_name_y","localized_name_x"] if c in th.columns]
if candidate_cols:
    th["hero_name"] = th[candidate_cols[0]].copy()
    for c in candidate_cols[1:]:
        th["hero_name"] = th["hero_name"].fillna(th[c])
else:
    th["hero_name"] = np.nan

if "heroes_df" in globals() and isinstance(heroes_df, pd.DataFrame):
    hlookup = heroes_df[["id","localized_name"]].rename(columns={"id":"hero_id","localized_name":"h_localized_name"})
    th = th.merge(hlookup, on="hero_id", how="left")
    th["hero_name"] = th["hero_name"].fillna(th["h_localized_name"])

# fallback from 'name' like npc_dota_hero_tiny
if "name" in th.columns:
    fallback = th["name"].str.replace("^npc_dota_hero_", "", regex=True).str.replace("_", " ").str.title()
    th["hero_name"] = th["hero_name"].fillna(fallback)

th["hero_name"] = th["hero_name"].fillna("Unknown")
th["hero_key"] = th["hero_name"].map(slugify)

# keep only needed columns
keep_th = ["team_id","hero_id","hero_name","hero_key","games","wins","winrate"]
for col in ["games","wins","winrate"]:
    if col not in th.columns:
        th[col] = np.nan
th = th[keep_th].drop_duplicates()

print("team_hero_df normalized shape:", th.shape)

# ---------------- base matches with team metadata ----------------
print("\n[2/5] Building base match table with team metadata ...")
base_cols = ["match_id","radiant_team_id","dire_team_id","radiant_win","winner_team_id",
             "duration","patch","radiant_score","dire_score"]
base_cols = [c for c in base_cols if c in match_teams_df_nojson.columns]
base = match_teams_df_nojson[base_cols].drop_duplicates(subset=["match_id"]).copy()

# join team meta for radiant/dire
tmeta = teams_df[["team_id","name","rating","wins","losses"]].drop_duplicates()
rad = tmeta.rename(columns={
    "team_id":"radiant_team_id","name":"radiant_team_name","rating":"radiant_team_rating",
    "wins":"radiant_team_wins","losses":"radiant_team_losses"
})
dire = tmeta.rename(columns={
    "team_id":"dire_team_id","name":"dire_team_name","rating":"dire_team_rating",
    "wins":"dire_team_wins","losses":"dire_team_losses"
})
base = base.merge(rad, on="radiant_team_id", how="left")
base = base.merge(dire, on="dire_team_id", how="left")

print("Base matches shape after team joins:", base.shape)
print(base.head(3).to_string(index=False))

# ---------------- get picks (long) ----------------
print("\n[3/5] Preparing picks (one row per hero pick) ...")

if "match_picks_bans_df" in globals() and not match_picks_bans_df.empty:
    pb = match_picks_bans_df.copy()
    source = "match_picks_bans_df"
else:
    # derive long picks from matches_teams_picks_bans if available
    if "matches_teams_picks_bans" in globals() and not matches_teams_picks_bans.empty:
        print("Deriving picks from matches_teams_picks_bans (wide) ...")
        wide = matches_teams_picks_bans.copy()
        pick_rows = []
        k = 1
        # detect max K
        while f"hero_id{k}" in wide.columns:
            k += 1
        max_k = k - 1
        for ki in range(1, max_k+1):
            cols_needed = [f"isPick{ki}", f"hero_id{ki}", f"team_id{ki}"]
            have = [c for c in cols_needed if c in wide.columns]
            if len(have) < 3:
                continue
            tmp = wide[["match_id"] + cols_needed].rename(columns={
                f"isPick{ki}":"is_pick", f"hero_id{ki}":"hero_id", f"team_id{ki}":"team_id"
            })
            tmp["order"] = ki-1
            pick_rows.append(tmp)
        pb = pd.concat(pick_rows, ignore_index=True) if pick_rows else pd.DataFrame(columns=["match_id","is_pick","hero_id","team_id","order"])
        source = "matches_teams_picks_bans (derived)"
    else:
        raise RuntimeError("No picks available. Provide match_picks_bans_df or matches_teams_picks_bans.")

print(f"Using picks from: {source}")
pb = pb[pb["is_pick"] == True].copy()

# ensure team_id exists; if not, resolve from side + base table
if "team_id" not in pb.columns or pb["team_id"].isna().any():
    # if we only have 'team' = 0/1, resolve via base radiant/dire ids
    if "team" in pb.columns:
        key = base[["match_id","radiant_team_id","dire_team_id"]]
        pb = pb.merge(key, on="match_id", how="left")
        pb["team_id"] = np.where(pb["team"]==0, pb["radiant_team_id"], pb["dire_team_id"])
        pb = pb.drop(columns=["radiant_team_id","dire_team_id"])
    else:
        raise RuntimeError("Cannot resolve team_id for picks; expected 'team_id' or ('team' + base table).")

print("Picks (is_pick=True) rows:", pb.shape)

# attach team/hero prior stats
pb = pb.merge(th, on=["team_id","hero_id"], how="left")

# determine side label from base (radiant vs dire) if not present
if "side" not in pb.columns:
    key = base[["match_id","radiant_team_id","dire_team_id"]]
    pb = pb.merge(key, on="match_id", how="left")
    pb["side"] = np.where(pb["team_id"]==pb["radiant_team_id"], "radiant", "dire")
    pb = pb.drop(columns=["radiant_team_id","dire_team_id"])

print("Picks with hero stats shape:", pb.shape)
print(pb.head(5).to_string(index=False))

# ---------------- pivot hero stats to wide ----------------
print("\n[4/5] Pivoting hero stats to wide columns ...")
# Long-form metrics for pivot
metrics = ["games","wins","winrate"]
for m in metrics:
    if m not in pb.columns:
        pb[m] = np.nan

long = pb.melt(
    id_vars=["match_id","side","hero_key","hero_name"],
    value_vars=metrics,
    var_name="metric",
    value_name="value"
)

wide = long.pivot_table(
    index="match_id",
    columns=["side","hero_key","metric"],
    values="value",
    aggfunc="first"
)

# flatten multiindex columns -> radiant_team_<hero>_<metric>
wide.columns = [f"{side}_team_{hero}_{metric}" for (side, hero, metric) in wide.columns.to_list()]
wide = wide.reset_index()

print("Wide hero-stats shape:", wide.shape)
print(wide.iloc[:3, :12].to_string(index=False))  # print first few columns for sanity

# ---------------- merge base+wide and save ----------------
print("\n[5/5] Merging all pieces ...")
matches_team_herostats_df = base.merge(wide, on="match_id", how="left")

# ensure match_id unique
dups = matches_team_herostats_df["match_id"].duplicated().sum()
if dups:
    print(f"WARNING: found {dups} duplicate match_id rows; dropping duplicates by match_id")
    matches_team_herostats_df = matches_team_herostats_df.drop_duplicates(subset=["match_id"])

print("Final dataset shape:", matches_team_herostats_df.shape)
print(matches_team_herostats_df.head(3).to_string(index=False))

# save
OUT_CSV = "matches_team_herostats_df.csv"
matches_team_herostats_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"\nSaved: {OUT_CSV}")

Input shapes:
teams_df: (5000, 9)
match_teams_df_nojson: (65766, 13)
team_hero_df: (169555, 10)
matches_teams_picks_bans: (65766, 109)
match_picks_bans_df: (1564929, 6)

[1/5] Normalizing team_hero_df ...
team_hero_df normalized shape: (169555, 7)

[2/5] Building base match table with team metadata ...
Base matches shape after team joins: (65766, 17)
  match_id  radiant_team_id  dire_team_id  radiant_win  winner_team_id  duration patch  radiant_score  dire_score radiant_team_name  radiant_team_rating  radiant_team_wins  radiant_team_losses dire_team_name  dire_team_rating  dire_team_wins  dire_team_losses
7116526799          8680612       8971308        False         8971308      1849  7.33             10          42           noMERCY              1187.31              181.0                193.0      x5 Gaming           1033.77           123.0             121.0
7116656225          8893835       8864178         True         8893835      1718  7.33             34          13              

In [47]:
## some teams were included that we actually don't care about, we can drop those

print("\n[Cleanup] Dropping rows with missing/blank team names...")

def nonempty(series: pd.Series) -> pd.Series:
    return series.notna() & series.astype(str).str.strip().ne("")

before = len(matches_team_herostats_df)
mask = nonempty(matches_team_herostats_df["radiant_team_name"]) & \
       nonempty(matches_team_herostats_df["dire_team_name"])
matches_team_herostats_df = matches_team_herostats_df[mask].copy()
after = len(matches_team_herostats_df)

print(f"Dropped {before - after} rows; kept {after} rows with valid team names.")

# re-save cleaned file
OUT_CSV = "matches_team_herostats_df.csv"
matches_team_herostats_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Saved cleaned dataset: {OUT_CSV}")



[Cleanup] Dropping rows with missing/blank team names...
Dropped 46883 rows; kept 18883 rows with valid team names.
Saved cleaned dataset: matches_team_herostats_df.csv


In [48]:
print(matches_team_herostats_df.head)
print("Number of rows: " + str(len(matches_team_herostats_df)))

<bound method NDFrame.head of          match_id  radiant_team_id  dire_team_id  radiant_win  winner_team_id  \
0      7116526799          8680612       8971308        False         8971308   
4      7116718813          8863825       8736661        False         8736661   
10     7116809609          8863825       8736661        False         8736661   
18     7116959751          8961813       8893840         True         8961813   
22     7117047135          8893840       8961813         True         8893840   
...           ...              ...           ...          ...             ...   
65759  8409736964          9790546       9600141        False         9600141   
65760  8409741530          9758040       9586122         True         9758040   
65762  8409798809          9844322       9844313        False         9844313   
65763  8409825381          9586122       9758040        False         9758040   
65765  8409857991          9844322       9844313         True         9844322  

In [49]:
## add adaptive laplace smoothing with pseudocounts smoothing

print("Starting Laplace smoothing on hero stats...")
print("Input df:", "matches_team_herostats_df" in globals())
print("Shape:", matches_team_herostats_df.shape)

# Work on a copy
matches_team_herostats_laplace_smoothing = matches_team_herostats_df.copy()

# Find all hero base prefixes like:
# radiant_team_<hero_key>_games  -> base: radiant_team_<hero_key>
# dire_team_<hero_key>_games     -> base: dire_team_<hero_key>
games_cols = [c for c in matches_team_herostats_laplace_smoothing.columns
              if c.endswith("_games") and ("radiant_team_" in c or "dire_team_" in c)]

print(f"Found {len(games_cols)} hero '_games' columns to smooth.")

bases_updated = []
for gcol in games_cols:
    base = gcol[:-len("_games")]
    wcol = f"{base}_wins"
    rcol = f"{base}_winrate"

    if wcol not in matches_team_herostats_laplace_smoothing.columns:
        # No wins column -> skip this base (nothing to smooth/recompute)
        continue

    # Convert to numeric, treat NaN as 0 before adding pseudocounts
    g = pd.to_numeric(matches_team_herostats_laplace_smoothing[gcol], errors="coerce").fillna(0)
    w = pd.to_numeric(matches_team_herostats_laplace_smoothing[wcol], errors="coerce").fillna(0)

    # Laplace pseudocounts: +2 games, +1 win
    g_sm = g + 2
    w_sm = w + 1

    matches_team_herostats_laplace_smoothing[gcol] = g_sm
    matches_team_herostats_laplace_smoothing[wcol] = w_sm

    # Recompute winrate; create column if missing
    matches_team_herostats_laplace_smoothing[rcol] = (w_sm / g_sm).astype(float)

    bases_updated.append(base)

print(f"Smoothed {len(bases_updated)} hero bases (games/wins) and recomputed winrates.")

# Quick preview for a few bases
for base in bases_updated[:3]:
    gcol, wcol, rcol = f"{base}_games", f"{base}_wins", f"{base}_winrate"
    print(f"\nSample after smoothing for '{base}':")
    print(
        matches_team_herostats_laplace_smoothing[
            ["match_id", gcol, wcol, rcol]
        ].head(5).to_string(index=False)
    )

# Save
OUT_CSV = "matches_team_herostats_laplace_smoothing.csv"
matches_team_herostats_laplace_smoothing.to_csv(OUT_CSV, index=False, encoding="utf-8")

print("\nDone.")
print("Output df shape:", matches_team_herostats_laplace_smoothing.shape)
print("Saved:", OUT_CSV)

Starting Laplace smoothing on hero stats...
Input df: True
Shape: (18883, 767)
Found 250 hero '_games' columns to smooth.
Smoothed 250 hero bases (games/wins) and recomputed winrates.

Sample after smoothing for 'dire_team_abaddon':
  match_id  dire_team_abaddon_games  dire_team_abaddon_wins  dire_team_abaddon_winrate
7116526799                      2.0                     1.0                        0.5
7116718813                      2.0                     1.0                        0.5
7116809609                      2.0                     1.0                        0.5
7116959751                      2.0                     1.0                        0.5
7117047135                      2.0                     1.0                        0.5

Sample after smoothing for 'dire_team_alchemist':
  match_id  dire_team_alchemist_games  dire_team_alchemist_wins  dire_team_alchemist_winrate
7116526799                        2.0                       1.0                          0.5
71167188

In [51]:
# 1) Base table: use the smoothed wide set
df = matches_team_herostats_laplace_smoothing.copy()

# 2) (Optional but recommended) bring back start_time for time-based split
if "start_time" not in df.columns and "match_teams_df_nojson" in globals():
    df = df.merge(
        match_teams_df_nojson[["match_id","start_time"]],
        on="match_id", how="left"
    )

# 3) Target and leakage guard
y = df["radiant_win"].astype(int)

leak_cols = [c for c in ["winner_team_id","radiant_score","dire_score","duration"] if c in df.columns]
drop_non_features = ["radiant_win","match_id"] + leak_cols

# 4) Feature prep
# Cast patch to numeric if it's like "7.39"
if "patch" in df.columns:
    df["patch_num"] = pd.to_numeric(df["patch"], errors="coerce")
    drop_non_features.append("patch")

# Remove obviously non-numeric/text columns (team names, etc.)
X = df.drop(columns=drop_non_features, errors="ignore")
for c in X.columns:
    if X[c].dtype == "object":
        # keep only numeric features for this baseline
        X = X.drop(columns=[c])

# Fill any remaining NaNs (after smoothing there may still be some)
X = X.fillna(0.0)

# 5) Time-based split (last 20% by start_time)
if "start_time" in df.columns and df["start_time"].notna().any():
    df["_order"] = df["start_time"].rank(method="first")
    cutoff = df["_order"].quantile(0.8)
    train_idx = df["_order"] <= cutoff
    test_idx  = df["_order"] >  cutoff
else:
    # fallback: random split (use time split once start_time is available)
    train_idx, test_idx = train_test_split(df.index, test_size=0.2, random_state=42, shuffle=True)

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

print(f"Train size: {len(X_train)} | Test size: {len(X_test)} | Features: {X.shape[1]}")

# 6) Simple baseline model
clf = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_depth=None,
    max_leaf_nodes=31,
    min_samples_leaf=50,
    l2_regularization=0.0,
    early_stopping=True,
    random_state=42
)
clf.fit(X_train, y_train)

# 7) Eval
proba = clf.predict_proba(X_test)[:,1]
pred  = (proba >= 0.5).astype(int)
acc   = accuracy_score(y_test, pred)
auc   = roc_auc_score(y_test, proba)
ll    = log_loss(y_test, proba)

# Baseline to beat: always pick majority class in train
maj = int(y_train.mean() >= 0.5)
maj_acc = (y_test == maj).mean()

print(f"Accuracy: {acc:.4f} | ROC AUC: {auc:.4f} | LogLoss: {ll:.4f}")
print(f"Majority-class Accuracy baseline: {maj_acc:.4f}")

Train size: 15106 | Test size: 3777 | Features: 760
Accuracy: 0.7350 | ROC AUC: 0.8229 | LogLoss: 0.5210
Majority-class Accuracy baseline: 0.5234


In [60]:
# class weighting (optional)
pos = int(y_train.sum()); neg = int(len(y_train) - pos)
scale_pos_weight = (neg / pos) if pos > 0 else 1.0
print(f"scale_pos_weight={scale_pos_weight:.3f}  (pos={pos}, neg={neg})")

# DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test,  label=y_test)

# params (tweak as you like)
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss","auc"],
    "eta": 0.05,                 # learning rate
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 10,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "tree_method": "hist",       # use "gpu_hist" if you have a GPU runtime
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
}

print("Training XGBoost with early stopping (xgb.train)...")
watchlist = [(dtrain, "train"), (dvalid, "valid")]
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=200,
)

# Predict using the best iteration
if hasattr(bst, "best_ntree_limit") and bst.best_ntree_limit is not None:
    proba = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)
else:
    # fallback if attribute not available
    proba = bst.predict(dvalid)

pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
ll  = log_loss(y_test, proba)
print(f"\nXGBoost  Acc={acc:.4f}  AUC={auc:.4f}  LogLoss={ll:.4f}")

# Top features (robust to older versions)
try:
    imp = pd.Series(bst.get_score(importance_type="total_gain")).sort_values(ascending=False)
except Exception:
    try:
        imp = pd.Series(bst.get_score(importance_type="gain")).sort_values(ascending=False)
    except Exception:
        imp = pd.Series(bst.get_score(importance_type="weight")).sort_values(ascending=False)

print("\nTop 20 features:")
print(imp.head(20).to_string())

scale_pos_weight=0.976  (pos=7644, neg=7462)
Training XGBoost with early stopping (xgb.train)...
[0]	train-logloss:0.69011	train-auc:0.62959	valid-logloss:0.69087	valid-auc:0.59826
[200]	train-logloss:0.54835	train-auc:0.86635	valid-logloss:0.59154	valid-auc:0.79181
[400]	train-logloss:0.48723	train-auc:0.89557	valid-logloss:0.54649	valid-auc:0.81603
[600]	train-logloss:0.44801	train-auc:0.90842	valid-logloss:0.52139	valid-auc:0.82538
[800]	train-logloss:0.41846	train-auc:0.91956	valid-logloss:0.50798	valid-auc:0.82961
[1000]	train-logloss:0.39409	train-auc:0.92940	valid-logloss:0.50001	valid-auc:0.83294
[1200]	train-logloss:0.37341	train-auc:0.93774	valid-logloss:0.49562	valid-auc:0.83468
[1400]	train-logloss:0.35544	train-auc:0.94502	valid-logloss:0.49429	valid-auc:0.83542
[1600]	train-logloss:0.33938	train-auc:0.95183	valid-logloss:0.49271	valid-auc:0.83718
[1763]	train-logloss:0.32786	train-auc:0.95600	valid-logloss:0.49378	valid-auc:0.83693

XGBoost  Acc=0.7498  AUC=0.8369  LogLos