### Imports & Configs


In [57]:
import os
import json
import requests
from datetime import datetime, timezone
import pathlib, json
import pandas as pd
import config  

DATA_DIR = config.DATA_DIR


### Helper Function

In [58]:

def norm_text(s: str | None) -> str:
    return " ".join(str(s).lower().split()) if s else ""

def make_team_id(name: str, league_code: str, season: str) -> str:
    slug = norm_text(name).replace(" ", "_")
    code = norm_text(league_code).replace(" ", "_")
    return f"team_{slug}_{code}_{season.replace('-', '')}"


def build_team_entity(name: str, season: str, league_meta: dict) -> dict:
    now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    team_id = make_team_id(name, league_meta["code"], season)
    return {
        "entity_type": "team",
        "team_id": team_id,
        "name": norm_text(name),
        "identity": {
            "full_name": None, "country": None, "city": None,
            "founded_year": None, "club_colors": None, "crest_url": None,
        },
        "venue": {"stadium_name": None, "city": None, "capacity": None},
        "coach": {"coach_id": None, "name": None, "dob": None, "nationality": None},
        "current_season": season,
        "current_league": league_meta,
        "running_competitions": [league_meta.copy()],
        "season_stats": {"rank": None, "played": None, "wins": None, "draws": None,
                        "losses": None, "goals_for": None, "goals_against": None,
                        "goal_difference": None, "points": None},
        "form": {"last_5": None, "last_5_points": None},
        "squad": [],
        "advanced_team_stats": {"xGF": None, "xGA": None, "xGD": None, "possession_pct": None},
        "historical_performance": [],
        "external_ids": {},
        "sources": [],
        "last_updated": now_iso,
    }

def update_identity(entity, data, source=None):
    entity["identity"].update({k: v for k, v in data.items() if v})
    if source:
        entity.setdefault("sources", [])
        if source not in entity["sources"]:
            entity["sources"].append(source)

def update_season_stats(entity, stats):
    for k, v in stats.items():
        if v is not None:
            entity["season_stats"][k] = v

def save_cache(path, payload):
    pathlib.Path(path).write_text(json.dumps(payload, indent=2))

def load_cache(path):
    p = pathlib.Path(path)
    return json.loads(p.read_text()) if p.exists() else None




### Fetch data

In [59]:
import os
import time
import requests
from pathlib import Path
import json

API_BASE = "https://v3.football.api-sports.io"
API_KEY = os.getenv("API_FOOTBALL_KEY")
CACHE_DIR = Path("data/cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

SESSION = requests.Session()
SESSION.headers.update({
    "x-apisports-key": API_KEY,
    "Accept": "application/json",
})
if not API_KEY:
    raise RuntimeError("Missing API_FOOTBALL_KEY")


def api_get(path, params=None, cache_name=None, ttl_hours=12, force_refresh=False):
    cache_path = CACHE_DIR / f"{cache_name}.json" if cache_name else None

    # Đọc cache nếu còn hạn và không force_refresh
    if cache_path and cache_path.exists() and not force_refresh:
        age_s = time.time() - cache_path.stat().st_mtime
        if ttl_hours is None or age_s < ttl_hours * 3600:
            return json.loads(cache_path.read_text())

    try:
        resp = requests.get(f"{API_BASE}/{path}", params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        if cache_path:
            cache_path.write_text(json.dumps(data, indent=2))
        return data
    except requests.RequestException:
        if cache_path and cache_path.exists():
            return json.loads(cache_path.read_text())
        raise

def fetch_team_identity(team_api_id):
    data = api_get("teams", params={"id": team_api_id}, cache_name=f"team_{team_api_id}")
    team = data["response"][0]["team"]
    venue = data["response"][0].get("venue", {})
    return {
        "identity": {
            "full_name": team["name"],
            "country": team["country"],
            "city": venue.get("city"),
            "founded_year": team.get("founded"),
            "club_colors": team.get("colors", {}).get("club"),
            "crest_url": team.get("logo"),
        },
        "venue": {
            "stadium_name": venue.get("name"),
            "city": venue.get("city"),
            "capacity": venue.get("capacity"),
        },
        "external_ids": {"api_football": team_api_id},
    }

def fetch_standings(league_id, season):
    data = api_get(
        "standings",
        params={"league": league_id, "season": season.split("-")[0]},
        cache_name=f"standings_{league_id}_{season}",
    )
    return data


### Pipeline


In [60]:
big5_leagues = {
    39: {
        "league_id": "league_epl",
        "code": "EPL",
        "name": "England Premier League",
        "season": "2023-2024",
        "external_ids": {"api_football": 39},
    },
    140: {
        "league_id": "league_laliga",
        "code": "LL",
        "name": "Spain La Liga",
        "season": "2023-2024",
        "external_ids": {"api_football": 140},
    },
    135: {
        "league_id": "league_seriea",
        "code": "SA",
        "name": "Italy Serie A",
        "season": "2023-2024",
        "external_ids": {"api_football": 135},
    },
    78: {
        "league_id": "league_bundesliga",
        "code": "BL",
        "name": "Germany Bundesliga",
        "season": "2023-2024",
        "external_ids": {"api_football": 78},
    },
    61: {
        "league_id": "league_ligue1",
        "code": "L1",
        "name": "France Ligue 1",
        "season": "2023-2024",
        "external_ids": {"api_football": 61},
    },
}


def fetch_league_teams(league_meta: dict):
    league_id = league_meta["external_ids"]["api_football"]
    season_year = int(league_meta["season"].split("-")[0])
    data = api_get(
        "teams",
        params={"league": league_id, "season": season_year},
        cache_name=f"league_teams_{league_id}_{season_year}",
    )

    if data.get("errors"):
        raise RuntimeError(f"API error {data['errors']}")
    if not data.get("response"):
        print(f"[WARN] No teams for league {league_id} season {season_year}")
        return []

    return data["response"]



# season_years = list(range(2021, 2024))  # tu season 2014-2015 den 2023-2024, 2024-2025 is paid plan

# all_teams = {}
# for league_meta in big5_leagues.values():
#     league_key = league_meta["league_id"]
#     league_id = league_meta["external_ids"]["api_football"]
#     all_teams[league_key] = []

#     for season_year in season_years:
#         data = api_get(
#             "teams",
#             params={"league": league_id, "season": season_year},
#             cache_name=f"league_{league_id}_{season_year}",
#         )
#         teams = data.get("response", [])
#         for item in teams:
#             all_teams[league_key].append({
#                 "season_year": season_year,
#                 "team_api_id": item["team"]["id"],
#                 "team_name": item["team"]["name"],
#                 "league_meta": league_meta,
#                 "raw_team": item["team"],
#                 "raw_venue": item["venue"],
#             })
# print(f"Loaded {sum(len(v) for v in all_teams.values())} teams from Big 5 leagues")

    
    

### Enrich with soccerdata

In [61]:
import unicodedata
import soccerdata as sd
import os
import json
from pathlib import Path

from torch import cat

def season_to_fbref(s: str) -> str:
    s = s.strip()
    if len(s) == 5 and "-" in s:        # "23-24"
        return s.replace("-", "")
    if len(s) == 9 and "-" in s:        # "2024-2025"
        return s[2:4] + s[7:9]
    return s

def _norm_name(s: str) -> str:
    s = " ".join(s.lower().split())
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    return s.replace("-", " ")

def load_fbref_team_stats(league_code: str, seasons: list[str]):
    fb = sd.FBref(leagues=league_code, seasons=seasons)
    return fb.read_team_season_stats(stat_type="standard")

def get_team_fbref_stats(standings_df, team_name: str, season: str):
    season_key = season_to_fbref(season) 
    try:
        row = standings_df.loc[(slice(None), season_key, team_name)]
    except KeyError:
        try:
            df_season = standings_df.xs(season_key, level="season")
        except KeyError:
            return None
        mask = df_season.index.get_level_values("team").map(_norm_name) == _norm_name(team_name)
        if not mask.any():
            return None
        row = df_season[mask].droplevel("league").iloc[0]


    def num(cat, stat):
        if (cat, stat) not in row.index:
            return None
        v = row[(cat, stat)]
        return float(v) if pd.notna(v) else None


    return {
        "played": int(num("Playing Time","MP") or 0),
        "wins": int(num("Performance","W") or 0),
        "draws": int(num("Performance","D") or 0),
        "losses": int(num("Performance","L") or 0),
        "goals_for": int(num("Performance","GF") or 0),
        "goals_against": int(num("Performance","GA") or 0),
        "goal_difference": int(num("Performance","GD") or 0),
        "points": int(num("Performance","Pts") or 0),
        "xGF": num("Expected","xG"),
        "xGA": num("Expected","xGA"),
        "xGD": num("Expected","xGD"),
    }



In [62]:
import os
from pathlib import Path
import json

teams_dir = Path("data/cache")          # hoặc "data/teams" nếu bạn lưu ở đó
all_teams = {}

for path in teams_dir.glob("league_teams_*.json"):
    parts = path.stem.split("_")        # ["league","teams","39","2023-2024"]
    league_id = int(parts[2])
    league_meta = big5_leagues.get(league_id)
    if not league_meta:
        continue

    data = json.loads(path.read_text())
    entries = []
    for item in data.get("response", []):
        entries.append({
            "raw_team": item["team"],
            "raw_venue": item["venue"],
            "league_meta": league_meta,
        })
    all_teams[league_meta["league_id"]] = entries

FBREF_LEAGUES = {
    39: "ENG-Premier League",
    140: "ESP-La Liga",
    135: "ITA-Serie A",
    78: "GER-Bundesliga",
    61: "FRA-Ligue 1",
}
FBREF_SEASONS = ["2324","2223","2122","2021","1920","1819","1718","1617","1516","1415"]

_FBREF_CACHE = {}

def load_fbref_df_for_league(league_meta):
    league_api_id = league_meta["external_ids"]["api_football"]
    fbref_code = FBREF_LEAGUES[league_api_id]
    if fbref_code not in _FBREF_CACHE:
        df = load_fbref_team_stats(fbref_code, FBREF_SEASONS)
        if "player" in df.index.names:
            df = df.droplevel("player")
        _FBREF_CACHE[fbref_code] = df
    return _FBREF_CACHE[fbref_code]


output_dir = Path("data/team_metadata")
output_dir.mkdir(parents=True, exist_ok=True)



In [70]:
for league_key, team_list in all_teams.items():
    print(team_list[0]['league_meta'])

{'league_id': 'league_seriea', 'code': 'SA', 'name': 'Italy Serie A', 'season': '2023-2024', 'external_ids': {'api_football': 135}}
{'league_id': 'league_laliga', 'code': 'LL', 'name': 'Spain La Liga', 'season': '2023-2024', 'external_ids': {'api_football': 140}}
{'league_id': 'league_epl', 'code': 'EPL', 'name': 'England Premier League', 'season': '2023-2024', 'external_ids': {'api_football': 39}}
{'league_id': 'league_ligue1', 'code': 'L1', 'name': 'France Ligue 1', 'season': '2023-2024', 'external_ids': {'api_football': 61}}
{'league_id': 'league_bundesliga', 'code': 'BL', 'name': 'Germany Bundesliga', 'season': '2023-2024', 'external_ids': {'api_football': 78}}


In [None]:
entities = []
for league_key, team_list in all_teams.items():
    league_meta = team_list[0]["league_meta"] if team_list else None
    if not league_meta:
        continue
    df_fbref = load_fbref_df_for_league(league_meta)

    for entry in team_list:
        team = entry["raw_team"]
        venue = entry["raw_venue"]

        entity = build_team_entity(team["name"], league_meta["season"], league_meta)
        update_identity(entity, {
            "full_name": team["name"],
            "country": team["country"],
            "city": venue.get("city"),
            "founded_year": team.get("founded"),
            "crest_url": team.get("logo"),
        }, source="api_football")
        entity["venue"].update({
            "stadium_name": venue.get("name"),
            "city": venue.get("city"),
            "capacity": venue.get("capacity"),
        })
        entity.setdefault("external_ids", {})["api_football"] = team["id"]

        historical = []
        season_stats_map = {}
        for season_key in FBREF_SEASONS:
            season_label = f"20{season_key[:2]}-20{season_key[2:]}"
            stats = get_team_fbref_stats(df_fbref, team["name"], season_label)
            if not stats:
                continue
            season_stats_map[season_label] = stats
            historical.append({
                "season": season_label,
                "points": stats["points"],
                "goal_difference": stats["goal_difference"],
                "wins": stats["wins"],
                "draws": stats["draws"],
                "losses": stats["losses"],
            })

        current_season = league_meta["season"]
        if current_season in season_stats_map:
            stats = season_stats_map[current_season]
            update_season_stats(entity, stats)
            adv = entity["advanced_team_stats"]
            adv["xGF"] = stats["xGF"]
            adv["xGA"] = stats["xGA"]
            adv["xGD"] = stats["xGD"]
        else:
            print(f"[WARN] missing FBref stats for {team['name']} {current_season}")

        entity["historical_performance"] = historical
        entities.append(entity)

out_path = output_dir / "team_metadata_2023_2024.jsonl"
with out_path.open("w", encoding="utf-8") as f:
    for e in entities:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")

### Refill voi nhung club ko matching ten tu 2 source khac nhau

In [None]:
import json
from pathlib import Path

SEASON_LABELS = [
    "2023-2024","2022-2023","2021-2022","2020-2021","2019-2020",
    "2018-2019","2017-2018","2016-2017","2015-2016","2014-2015"
]
FBREF_SEASONS = [season_to_fbref(s) for s in SEASON_LABELS]

FBREF_LEAGUES = {
    39: ["ENG-Premier League"],
    140: ["ESP-La Liga"],
    135: ["ITA-Serie A"],
    78: ["GER-Bundesliga"],
    61: ["FRA-Ligue 1"],
}
FBREF_TEAM_ALIASES = {
    # Premier League
    (39, "newcastle"): "Newcastle United",
    (39, "nottingham forest"): "Nottingham Forest",
    (39, "luton"): "Luton Town",
    (39, "manchester united"): "Manchester Utd",

    # Serie A
    (135, "ac milan"): "Milan",
    (135, "as roma"): "Roma",
    (135, "verona"): "Hellas Verona",

    # La Liga
    (140, "alaves"): "Deportivo Alavés",
    (140, "real betis"): "Real Betis",
    (140, "granada cf"): "Granada",
    (140, "almeria"): "Almería",
    (140, "cadiz"): "Cádiz",
    (140, "atletico madrid"): "Atlético Madrid",

    # Ligue 1
    (61, "paris saint germain"): "Paris Saint-Germain",
    (61, "stade brestois 29"): "Brest",
    (61, "saint etienne"): "Saint-Étienne",

    # Bundesliga
    (78, "bayern munchen"): "Bayern Munich",
    (78, "borussia dortmund"): "Dortmund",
    (78, "borussia monchengladbach"): "Borussia Mönchengladbach",
    (78, "fsv mainz 05"): "Mainz 05",
    (78, "1899 hoffenheim"): "TSG 1899 Hoffenheim",
    (78, "bayer leverkusen"): "Bayer Leverkusen",
    (78, "eintracht frankfurt"): "Eintracht Frankfurt",
    (78, "vfb stuttgart"): "VfB Stuttgart",
    (78, "vfl wolfsburg"): "Wolfsburg",
    (78, "fc augsburg"): "Augsburg",
    (78, "sc freiburg"): "Freiburg",
    (78, "vfl bochum"): "VfL Bochum",
    (78, "1. fc heidenheim"): "Heidenheim",
    (78, "sv darmstadt 98"): "Darmstadt 98",
    (78, "1.fc koln"): "Köln",
    (78, "1. fc koln"): "Köln",
}


# load fbref DF per league (drop player level)
_FBREF_CACHE = {}
def load_league_df(league_meta):
    league_id = league_meta["external_ids"]["api_football"]
    if league_id in _FBREF_CACHE:
        return _FBREF_CACHE[league_id]
    league_codes = FBREF_LEAGUES.get(league_id)
    if not league_codes:
        return None
    df_list = []
    for code in league_codes:
        df = load_fbref_team_stats(code, FBREF_SEASONS)
        if "player" in df.index.names:
            df = df.droplevel("player")
        df_list.append(df)
    df_all = df_list[0] if len(df_list) == 1 else pd.concat(df_list)
    _FBREF_CACHE[league_id] = df_all
    return df_all

def pick_fbref_name(df_season, league_id, team_name):
    target = _norm_name(team_name)
    alias = FBREF_TEAM_ALIASES.get((league_id, target))
    if alias:
        target = _norm_name(alias)
    names = df_season.index.get_level_values("team")
    mask = names.map(_norm_name) == target
    if mask.any():
        return df_season[mask].droplevel("league").iloc[0]
    return None

def refresh_season_stats(entity, df, league_id, season_label):
    try:
        df_season = df.xs(season_to_fbref(season_label), level="season")
    except KeyError:
        return None
    row = pick_fbref_name(df_season, league_id, entity["identity"]["full_name"])
    if row is None:
        return None
    def num(cat, stat):
        if (cat, stat) not in row.index:
            return None
        v = row[(cat, stat)]
        return float(v) if pd.notna(v) else None
    return {
        "played": int(num("Playing Time","MP") or 0),
        "wins": int(num("Performance","W") or 0),
        "draws": int(num("Performance","D") or 0),
        "losses": int(num("Performance","L") or 0),
        "goals_for": int(num("Performance","GF") or 0),
        "goals_against": int(num("Performance","GA") or 0),
        "goal_difference": int(num("Performance","GD") or 0),
        "points": int(num("Performance","Pts") or 0),
        "xGF": num("Expected","xG"),
        "xGA": num("Expected","xGA"),
        "xGD": num("Expected","xGD"),
    }

def de_dupe_hist(hist):
    by_season = {}
    for item in hist:
        by_season[item["season"]] = item
    return list(by_season.values())

out_path = Path("data/team_metadata/team_metadata_2023_2024.jsonl")
entities = [json.loads(line) for line in out_path.open(encoding="utf-8")]

refilled_current = 0
refilled_hist = 0
skipped = []

for e in entities:
    league_meta = e.get("current_league") or {}
    league_id = league_meta.get("external_ids", {}).get("api_football")
    if not league_id:
        continue
    df = load_league_df(league_meta)
    if df is None:
        skipped.append(e["identity"]["full_name"])
        continue

    # season hiện tại
    stats = refresh_season_stats(e, df, league_id, "2023-2024")
    if stats:
        update_season_stats(e, stats)
        adv = e.setdefault("advanced_team_stats", {})
        adv["xGF"] = stats["xGF"]
        adv["xGA"] = stats["xGA"]
        adv["xGD"] = stats["xGD"]
        refilled_current += 1

    # historical
    hist = e.get("historical_performance") or []
    existing = {h["season"] for h in hist}
    for season_label in SEASON_LABELS:
        if season_label in existing:
            continue
        stats = refresh_season_stats(e, df, league_id, season_label)
        if not stats:
            continue
        hist.append({
            "season": season_label,
            "points": stats["points"],
            "goal_difference": stats["goal_difference"],
            "wins": stats["wins"],
            "draws": stats["draws"],
            "losses": stats["losses"],
        })
        refilled_hist += 1
    e["historical_performance"] = de_dupe_hist(hist)

print(f"Refilled current stats: {refilled_current}, historical entries: {refilled_hist}")
if skipped:
    print("Skipped:", skipped[:10])

with out_path.open("w", encoding="utf-8") as f:
    for e in entities:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")
print(f"Saved to: {out_path}")


In [81]:
# language: python
from pathlib import Path
import json

# Data file
file_path = Path("data/team_metadata/team_metadata_2023_2024.jsonl")
SEASON_LABELS = [
    "2023-2024", "2022-2023", "2021-2022", "2020-2021", "2019-2020",
    "2018-2019", "2017-2018", "2016-2017", "2015-2016", "2014-2015",
]

def missing_seasons(entity):
    have = {h["season"] for h in (entity.get("historical_performance") or [])}
    return [s for s in SEASON_LABELS if s not in have]

# Load entities
entities = []
missing_report = []

with file_path.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            entity = json.loads(line)
        except json.JSONDecodeError:
            continue
        entities.append(entity)
        miss = missing_seasons(entity)
        if miss:
            team_name = entity.get("identity", {}).get("full_name") or entity.get("name", "unknown")
            missing_report.append((team_name, miss))

# Summary
total_teams = len(entities)
teams_with_missing = len(missing_report)
print(f"Total teams: {total_teams}")
print(f"Teams with missing seasons: {teams_with_missing}")
print(f"Teams with complete data: {total_teams - teams_with_missing}\n")

# Show first N teams with missing data
N = 20
print(f"First {N} teams with missing seasons:")
for i, (team, miss) in enumerate(missing_report[:N], 1):
    print(f"{i:2d}. {team}: {miss}")

if len(missing_report) > N:
    print(f"... and {len(missing_report) - N} more teams have missing seasons.")

Total teams: 98
Teams with missing seasons: 23
Teams with complete data: 75

First 20 teams with missing seasons:
 1. Verona: ['2018-2019', '2016-2017']
 2. Alaves: ['2022-2023', '2015-2016', '2014-2015']
 3. Real Betis: ['2023-2024', '2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016', '2014-2015']
 4. Granada CF: ['2022-2023', '2018-2019', '2017-2018']
 5. Almeria: ['2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016']
 6. Cadiz: ['2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016', '2014-2015']
 7. Newcastle: ['2023-2024', '2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016', '2014-2015']
 8. Nottingham Forest: ['2023-2024', '2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2017', '2015-2016', '2014-2015']
 9. Luton: ['2022-2023', '2021-2022', '2020-2021', '2019-2020', '2018-2019', '2017-2018', '2016-2