In [None]:
from __future__ import annotations
import re
from pathlib import Path
from typing import Optional, List, Dict
import requests
import pandas as pd

# Root URL of the API; an indicator code is appended as the last path segment.
BASE = "https://ghoapi.azureedge.net/api"
# Directory holding the per-indicator parquet cache files.
CACHE_DIR = Path("data") / "who"
# Number of records requested per page (sent as the OData `$top` option).
PAGE_SIZE = 1_000
# Timeout handed to every HTTP request.
TIMEOUT = 30

# Indicator codes processed by default.
INDICATORS: List[str] = [
    "MDG_0000000020",
    "TB_1",
    "TB_e_inc_num",
    "MALARIA_EST_INCIDENCE",
    "MALARIA_TOTAL_CASES",
    "MALARIA_EST_DEATHS",
    "MALARIA_ITN_COVERAGE",
    "MALARIA_IRS_COVERAGE",
    "MALARIA_IPTP3_COVERAGE",
    "HIV_0000000026",
    "SDGHIV",
    "MDG_0000000029",
]

def _session_with_retries() -> requests.Session:
    """Build a ``requests.Session`` that retries failed HTTPS GET requests.

    The retry policy allows up to 5 retries with a backoff factor of 0.5 and
    is triggered by the status codes 429, 500, 502, 503 and 504. It is only
    mounted for ``https://`` URLs and only covers the GET method.

    Returns:
        A new session; the caller is responsible for closing it.
    """
    # Imported locally so that only this helper depends on the adapter/retry
    # machinery.
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session

def fetch_indicator_raw(
    code: str,
    include_regions: bool = False,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
    session: Optional[requests.Session] = None
) -> list[dict]:
    """Download all records of one indicator, page by page.

    Records are requested from ``{BASE}/{code}`` in pages of ``PAGE_SIZE``
    using the ``$top``/``$skip`` query options. The region and year
    restrictions are first sent to the server as a ``$filter`` expression.
    If a filtered request fails with an HTTP error, the whole download is
    restarted without the filter and the same restrictions are applied to
    the returned rows locally, so the result honours the arguments either
    way and never mixes filtered and unfiltered pages.

    Args:
        code: Indicator code, used as the last URL path segment.
        include_regions: When False, only rows whose ``SpatialDimType`` is
            ``'COUNTRY'`` are kept.
        year_min: Inclusive lower bound on ``TimeDim``; None for no bound.
        year_max: Inclusive upper bound on ``TimeDim``; None for no bound.
        session: Session to use. When omitted, a retrying session is created
            for this call and closed before returning.

    Returns:
        The raw records as a list of dicts (possibly empty).

    Raises:
        requests.HTTPError: If an unfiltered request is rejected as well.
    """
    lower = None if year_min is None else int(year_min)
    upper = None if year_max is None else int(year_max)

    # The filter does not depend on the page, so build it once.
    clauses = []
    if not include_regions:
        clauses.append("SpatialDimType eq 'COUNTRY'")
    if lower is not None:
        clauses.append(f"TimeDim ge {lower}")
    if upper is not None:
        clauses.append(f"TimeDim le {upper}")
    server_filter = " and ".join(clauses)

    owns_session = session is None
    sess = _session_with_retries() if owns_session else session

    def _get_page(skip: int, filtered: bool) -> list:
        """Fetch one page and return its records (empty list when none)."""
        params = {"$format": "json", "$top": PAGE_SIZE, "$skip": skip}
        if filtered:
            params["$filter"] = server_filter
        response = sess.get(f"{BASE}/{code}", params=params, timeout=TIMEOUT)
        response.raise_for_status()
        payload = response.json()
        page = payload.get("value") if isinstance(payload, dict) else payload
        return page or []

    def _matches_request(row) -> bool:
        """Local equivalent of the server-side filter for a single record."""
        if not isinstance(row, dict):
            return True
        # A record without the key is kept (nothing to judge it by); a record
        # with any value other than 'COUNTRY' is dropped.
        if not include_regions and row.get("SpatialDimType", "COUNTRY") != "COUNTRY":
            return False
        if lower is None and upper is None:
            return True
        try:
            year = int(row["TimeDim"])
        except (KeyError, TypeError, ValueError):
            # Like the server-side comparison, a missing year never matches.
            return False
        return (lower is None or year >= lower) and (upper is None or year <= upper)

    rows: list = []
    skip = 0
    filtered = bool(server_filter)
    previous_page = None
    try:
        while True:
            try:
                page = _get_page(skip, filtered)
            except requests.HTTPError:
                if not filtered:
                    raise
                # Offsets of the filtered and unfiltered result sets differ,
                # so start over instead of mixing pages from both.
                filtered = False
                rows, skip, previous_page = [], 0, None
                continue
            if page and page == previous_page:
                # The server returned the same page again (it ignored $skip);
                # stop instead of looping forever.
                break
            rows.extend(page)
            # A short page is the last one. A page larger than requested means
            # $top was ignored and everything available was already returned.
            if len(page) != PAGE_SIZE:
                break
            previous_page = page
            skip += PAGE_SIZE
    finally:
        if owns_session:
            sess.close()

    if server_filter and not filtered:
        rows = [row for row in rows if _matches_request(row)]
    return rows

# Leading number of a display string. Digit groups separated by a space,
# no-break space or narrow no-break space are treated as thousands groups;
# a single '.' or ',' followed by digits is the decimal part.
_DISPLAY_NUMBER_RE = re.compile(
    r"^\s*([+-]?(?:\d{1,3}(?:[ \u00a0\u202f]\d{3}(?!\d))+|\d+)(?:[.,]\d+)?)"
)
_GROUP_SEPARATORS = str.maketrans("", "", " \u00a0\u202f")


def _parse_display_value(raw):
    """Return the leading number of a display value as float, or None."""
    if pd.isna(raw):
        return None
    if isinstance(raw, (int, float)):
        return float(raw)
    match = _DISPLAY_NUMBER_RE.match(str(raw))
    if match is None:
        return None
    digits = match.group(1).translate(_GROUP_SEPARATORS).replace(",", ".")
    return float(digits)


def normalize_df(raw_rows: list[dict], indicator: str) -> pd.DataFrame:
    """Turn raw API records into a tidy, typed DataFrame.

    Known raw fields are renamed (``SpatialDim`` -> ``iso3``, ``TimeDim`` ->
    ``year``, ``NumericValue`` -> ``value``, ...), numeric columns are
    converted, and the result is reduced to the standard output columns that
    are actually present. When ``NumericValue`` is missing or entirely null,
    ``value`` is parsed from the textual ``Value`` field instead.

    ``value``/``low``/``high`` are stored as float64: float32 cannot represent
    integers above 2**24 exactly, which would silently round large counts.

    Args:
        raw_rows: Records as returned by the API (list of dicts).
        indicator: Indicator code written to the ``indicator`` column.

    Returns:
        A DataFrame with a subset of the columns ``indicator, iso3, year,
        value, low, high, dim1, dim1_type, who_region, who_region_code,
        spatial_type, Date``. For empty input an empty frame with all of
        these columns is returned.
    """
    output_columns = [
        "indicator", "iso3", "year", "value", "low", "high",
        "dim1", "dim1_type", "who_region", "who_region_code",
        "spatial_type", "Date",
    ]
    if not raw_rows:
        return pd.DataFrame(columns=output_columns)

    df = pd.DataFrame(raw_rows)
    rename_map = {
        "SpatialDim": "iso3", "TimeDim": "year", "NumericValue": "value",
        "Low": "low", "High": "high", "Dim1": "dim1", "Dim1Type": "dim1_type",
        "ParentLocation": "who_region", "ParentLocationCode": "who_region_code",
        "SpatialDimType": "spatial_type",
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # Fall back to the display string only when no usable numeric value exists.
    needs_fallback = "value" not in df.columns or df["value"].isna().all()
    if needs_fallback and "Value" in df.columns:
        df["value"] = df["Value"].map(_parse_display_value)

    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    for col in ("value", "low", "high"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")
    for col in ("iso3", "dim1", "dim1_type", "who_region", "who_region_code"):
        if col in df.columns:
            df[col] = df[col].astype("string")
    if "spatial_type" in df.columns:
        df["spatial_type"] = df["spatial_type"].astype("category")

    df["indicator"] = indicator
    return df[[c for c in output_columns if c in df.columns]].copy()

def _restrict_to_request(
    df: pd.DataFrame,
    include_regions: bool,
    year_min: Optional[int],
    year_max: Optional[int],
) -> pd.DataFrame:
    """Keep only the rows that satisfy the region and year restrictions."""
    if not include_regions:
        if "spatial_type" in df.columns:
            df = df[df["spatial_type"].astype(str) == "COUNTRY"]
        elif "iso3" in df.columns:
            # Without a spatial type, fall back to "looks like a 3-letter code".
            df = df[df["iso3"].str.fullmatch(r"[A-Z]{3}", na=False)]
    if "year" in df.columns and (year_min is not None or year_max is not None):
        years = pd.to_numeric(df["year"], errors="coerce")
        keep = pd.Series(True, index=df.index)
        # Rows without a usable year never satisfy a year bound.
        if year_min is not None:
            keep &= (years >= int(year_min)).fillna(False).astype(bool)
        if year_max is not None:
            keep &= (years <= int(year_max)).fillna(False).astype(bool)
        df = df[keep]
    return df


def load_or_download_indicator(
    code: str,
    refresh: bool = False,
    include_regions: bool = False,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
) -> pd.DataFrame:
    """Return one indicator as a DataFrame, using a parquet cache on disk.

    The cache file lives in ``CACHE_DIR``. A default query (countries only,
    no year bounds) uses ``<code>.parquet``; any other query gets its own
    file (for example ``<code>__regions__y2000-2020.parquet``) so that the
    result cached for one query is never returned for a different one.

    The region and year restrictions are always enforced on the returned
    frame, both after a download and when reading a cache file.

    Args:
        code: Indicator code.
        refresh: When True, ignore an existing cache file and download again.
        include_regions: When False, keep country rows only.
        year_min: Inclusive lower year bound; None for no bound.
        year_max: Inclusive upper year bound; None for no bound.

    Returns:
        The normalized (and, on a cache miss, freshly cached) DataFrame.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Encode every non-default query parameter in the cache file name.
    variant = []
    if include_regions:
        variant.append("regions")
    if year_min is not None or year_max is not None:
        low = "" if year_min is None else str(int(year_min))
        high = "" if year_max is None else str(int(year_max))
        variant.append(f"y{low}-{high}")
    path = CACHE_DIR / ("__".join([code, *variant]) + ".parquet")

    if path.exists() and not refresh:
        cached = pd.read_parquet(path)
        return _restrict_to_request(cached, include_regions, year_min, year_max)

    raw = fetch_indicator_raw(code, include_regions, year_min, year_max)
    df = normalize_df(raw, indicator=code)
    df = _restrict_to_request(df, include_regions, year_min, year_max)
    df.to_parquet(path, index=False)
    return df

def download_each(
    indicators: List[str] = INDICATORS,
    refresh: bool = False,
    include_regions: bool = False,
    year_min: Optional[int] = None,
    year_max: Optional[int] = None,
    save: bool = True,
    out_dir: str = "data/who",
    fmt: str = "parquet",
) -> Dict[str, pd.DataFrame]:
    """Load several indicators and optionally export each one to a file.

    Every indicator is obtained through ``load_or_download_indicator``.
    Processing is best effort: a failure for one indicator is reported with
    a ``[WARN]`` line and the remaining indicators are still processed.

    Args:
        indicators: Indicator codes to process.
        refresh: Forwarded to ``load_or_download_indicator``.
        include_regions: Forwarded to ``load_or_download_indicator``.
        year_min: Forwarded to ``load_or_download_indicator``.
        year_max: Forwarded to ``load_or_download_indicator``.
        save: When True, write each frame to ``out_dir`` as
            ``<code>.parquet`` or ``<code>.csv``.
        out_dir: Output directory; created if it does not exist.
        fmt: Export format, ``"parquet"`` or ``"csv"``.

    Returns:
        Mapping from indicator code to its DataFrame, containing only the
        indicators that could be loaded.

    Raises:
        ValueError: If ``save`` is True and ``fmt`` is not a supported format.
    """
    # Reject an unsupported format before any download, instead of reporting
    # it as a per-indicator warning after the work has been done.
    if save and fmt not in ("parquet", "csv"):
        raise ValueError("fmt deve essere 'parquet' o 'csv'")

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    dfs: Dict[str, pd.DataFrame] = {}
    for code in indicators:
        try:
            df = load_or_download_indicator(
                code, refresh, include_regions, year_min, year_max
            )
            # Registered before exporting so a failed export does not drop
            # data that was loaded successfully.
            dfs[code] = df
            if save:
                if fmt == "parquet":
                    df.to_parquet(target_dir / f"{code}.parquet", index=False)
                else:
                    df.to_csv(target_dir / f"{code}.csv", index=False)
        except Exception as e:
            # Deliberate best effort: keep going with the other indicators.
            print(f"[WARN] {code}: {e}")
    return dfs

# --- usage example ---
# dfs = download_each(include_regions=False, fmt="parquet")
# print(dfs["MDG_0000000020"].head())


# Load every indicator listed in INDICATORS (country rows only) and export
# each one as a CSV file.
dfs = download_each(indicators=INDICATORS, include_regions=False, fmt="csv")

#tb = dfs["MDG_0000000020"]
