In [2]:
# === NHS ODS Bronze (Jupyter) — baseline + sync WITH PROGRESS BARS ===
# - Spec-compliant params (_format=json), 1-based Offset
# - Overall + per-role tqdm progress (percent, time, ETA)
# - Immutable Bronze, manifests, watermarks
# - Optional flatten to CSV/Parquet

import os, json, time, hashlib, math, requests
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
from urllib.parse import urlparse

import pandas as pd
from tqdm import tqdm

# ------------------- USER SETTINGS -------------------
BRONZE_ROOT = Path(r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods")
ORD_BASE    = "https://directory.spineservices.nhs.uk/ORD/2-0-0"
RATE_LIMIT_RPS = 4

# Roles to baseline first (tweak as you like)
ROLE_IDS: List[str] = ["RO177", "RO98"]  # RO177 = Prescribing Cost Centre; RO98 = CCG (legacy)
USE_ROLES_PARAM_IF_NEEDED = True         # fallback to Roles= if PrimaryRoleId is rejected
PAGE_LIMIT = 1000                        # page size

MAKE_FLATTEN_EXTRACT = True

# ------------------- HELPERS -------------------
def now_utc_iso() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def ensure_dir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)

def write_json(path: Path, obj: Any) -> None:
    ensure_dir(path.parent)
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def read_json(path: Path, default=None):
    if not path.exists():
        return default
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()

# watermarks
def wm_path() -> Path: return BRONZE_ROOT / "_watermarks.json"
def get_wm() -> Dict[str, Any]: return read_json(wm_path(), default={}) or {}
def set_wm(key: str, val: Any) -> None:
    wm = get_wm(); wm[key] = val; write_json(wm_path(), wm)

# ------------------- ORD HTTP (spec-compliant) -------------------
session = requests.Session()
session.headers.update({"User-Agent": "ods-bronze/1.3"})  # friendly UA only

def rate_sleep():
    time.sleep(1.0 / max(RATE_LIMIT_RPS, 1))

def ord_request(url: str, params: Dict[str, Any]) -> requests.Response:
    """
    Low-level GET returning the raw Response (so we can read headers like X-Total-Count).
    - adds _format=json (lowercase)
    - uses exact param names per spec
    """
    q = dict(params)
    q["_format"] = "json"
    rate_sleep()
    r = session.get(url, params=q, timeout=60, allow_redirects=True)
    return r

def ord_get_json(url: str, params: Dict[str, Any]) -> Dict[str, Any]:
    r = ord_request(url, params)
    if not (200 <= r.status_code < 300):
        raise RuntimeError(f"ORD GET failed {r.status_code}. URL: {r.url}\nBody: {r.text}")
    return r.json()

def ord_get_full_org(link: str) -> Dict[str, Any]:
    if "_format=" not in link:
        sep = "&" if "?" in link else "?"
        link = f"{link}{sep}_format=json"
    rate_sleep()
    r = session.get(link, timeout=60, allow_redirects=True)
    if not (200 <= r.status_code < 300):
        raise RuntimeError(f"ORD org GET failed {r.status_code}. URL: {link}\nBody: {r.text}")
    return r.json()

# ------------------- TOTAL COUNT DISCOVERY -------------------
def get_total_for_role(search_url: str, role_id: str, use_roles_param: bool) -> Tuple[Optional[int], Dict[str, Any]]:
    """
    Try to fetch X-Total-Count for a role using a tiny page (Limit=1, Offset=1).
    Returns (total_or_None, params_used).
    Some deployments don’t set X-Total-Count — we'll handle that.
    """
    # Try PrimaryRoleId first
    params = {"PrimaryRoleId": role_id, "Limit": 1, "Offset": 1}
    r = ord_request(search_url, params)
    if 200 <= r.status_code < 300:
        total = r.headers.get("X-Total-Count")
        if total and total.isdigit():
            return int(total), {"PrimaryRoleId": role_id}
        else:
            return None, {"PrimaryRoleId": role_id}
    # If rejected and fallback allowed, try Roles=
    if use_roles_param:
        params = {"Roles": role_id, "Limit": 1, "Offset": 1}
        r = ord_request(search_url, params)
        if 200 <= r.status_code < 300:
            total = r.headers.get("X-Total-Count")
            if total and total.isdigit():
                return int(total), {"Roles": role_id}
            else:
                return None, {"Roles": role_id}
        else:
            raise RuntimeError(f"Unable to get total for {role_id}. URL: {r.url}\nBody: {r.text}")
    else:
        raise RuntimeError(f"Unable to get total for {role_id}. URL: {r.url}\nBody: {r.text}")

# ------------------- BASELINE (with progress) -------------------
def baseline_roles_with_progress(role_ids: List[str]) -> Path:
    ensure_dir(BRONZE_ROOT)
    release_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    search_url   = ORD_BASE.rstrip("/") + "/organisations"

    base_dir  = BRONZE_ROOT / f"release_date={release_date}" / "source=ord" / "release_type=api_baseline" / "dataset=roles"
    chunk_dir = base_dir / "chunks"
    ensure_dir(chunk_dir)

    # 1) discover totals per role (so we can show overall + per-role %)
    role_plans = []
    grand_total = 0
    totals_known = True
    for rid in role_ids:
        total, base_params = get_total_for_role(search_url, rid, USE_ROLES_PARAM_IF_NEEDED)
        role_plans.append({"role": rid, "total": total, "base_params": base_params})
        if total is None:
            totals_known = False
        else:
            grand_total += total

    # 2) manifests
    manifest = {
        "api": ORD_BASE, "release_date": release_date, "release_type": "api_baseline",
        "downloaded_at_utc": now_utc_iso(), "roles": role_ids,
        "role_plans": role_plans,
        "summary_chunks": [], "org_records": []
    }

    # 3) Overall progress bar (if we know totals)
    overall = tqdm(total=grand_total if totals_known else None, unit="org", desc="ALL ROLES", leave=True)

    # 4) crawl each role with a per-role progress bar
    def page_once(params: Dict[str, Any], offset: int, limit: int) -> Dict[str, Any]:
        p = dict(params)
        # 1-based Offset
        p["Limit"]  = limit
        p["Offset"] = max(1, offset)
        return ord_get_json(search_url, p)

    for plan in role_plans:
        rid = plan["role"]
        total = plan["total"]
        base_params = plan["base_params"]

        per_role = tqdm(total=total if total is not None else None,
                        unit="org", desc=f"Role {rid}", leave=False)

        # 1-based paging
        offset = 1
        while True:
            try:
                data = page_once(base_params, offset, PAGE_LIMIT)
            except RuntimeError as e:
                # last-chance param flip if needed
                if "PrimaryRoleId" in base_params and USE_ROLES_PARAM_IF_NEEDED:
                    base_params = {"Roles": rid}
                    data = page_once(base_params, offset, PAGE_LIMIT)
                else:
                    per_role.close()
                    overall.close()
                    raise

            orgs = data.get("Organisations", []) or []
            if not orgs:
                per_role.close()
                break

            # save summary
            chunk_name = f"search_{rid}_{offset:09d}.json"
            write_json(chunk_dir / chunk_name, data)
            manifest["summary_chunks"].append({"role": rid, "file": chunk_name, "count": len(orgs)})

            # fetch each full record
            for rec in orgs:
                link = rec.get("OrgLink"); oid = rec.get("OrgId")
                if not link or not oid:
                    continue
                try:
                    full = ord_get_full_org(link)
                    ofile = f"org_{oid}.json"
                    write_json(chunk_dir / ofile, full)
                    manifest["org_records"].append({"org_id": oid, "file": ofile})
                except Exception as ex:
                    manifest.setdefault("errors", []).append({"org": oid, "error": str(ex)})

            # progress
            per_role.update(len(orgs))
            overall.update(len(orgs))

            # advance offset by page size (1-based stepping: 1, 1001, 2001, ...)
            offset += PAGE_LIMIT

            # stop if short page
            if len(orgs) < PAGE_LIMIT:
                per_role.close()
                break

    overall.close()
    write_json(base_dir / "_manifest.json", manifest)

    # watermarks
    set_wm("ord_api_baseline_date", release_date)
    set_wm("ord_last_change_date", (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d"))

    print(f"[OK] Baseline complete → {base_dir}")
    print("Org files:", len(manifest["org_records"]))
    return base_dir

# ------------------- INCREMENTAL SYNC (with progress) -------------------
def incremental_sync_with_progress():
    wm = get_wm()
    since = wm.get("ord_last_change_date")
    assert since, "No watermark found. Run baseline first."

    sync_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    sync_url  = ORD_BASE.rstrip("/") + "/sync"

    base_dir  = BRONZE_ROOT / f"release_date={sync_date}" / "source=ord" / "release_type=api_sync" / "dataset=all"
    chunk_dir = base_dir / "chunks"
    ensure_dir(chunk_dir)

    data = ord_get_json(sync_url, {"LastChangeDate": since})
    write_json(chunk_dir / f"sync_list_since_{since}.json", data)

    changed = data.get("Organisations", []) or []
    bar = tqdm(total=len(changed), unit="org", desc="SYNC download", leave=True)
    for o in changed:
        link = o.get("OrgLink")
        if not link: 
            bar.update(1); 
            continue
        full = ord_get_full_org(link)
        oid  = full.get("OrgId") or urlparse(link).path.rstrip("/").split("/")[-1].split("?",1)[0]
        write_json(chunk_dir / f"org_{oid}.json", full)
        bar.update(1)
    bar.close()

    write_json(base_dir / "_manifest.json", {
        "api": ORD_BASE, "release_date": sync_date, "release_type": "api_sync",
        "downloaded_at_utc": now_utc_iso(), "params":{"LastChangeDate": since},
        "changed_count": len(changed)
    })

    set_wm("ord_last_change_date", datetime.now(timezone.utc).strftime("%Y-%m-%d"))
    print(f"[OK] Sync complete → {base_dir} (changed orgs: {len(changed)})")
    return base_dir

# ------------------- FLATTEN (optional) -------------------
def flatten_latest_baseline_to_tabular():
    candidates = sorted(BRONZE_ROOT.glob("release_date=*/source=ord/release_type=api_baseline/dataset=*/chunks"))
    assert candidates, "No baseline found."
    latest_chunks = candidates[-1]

    rows=[]
    for f in latest_chunks.glob("org_*.json"):
        o = read_json(f, {})
        rows.append({
            "OrgId": o.get("OrgId"),
            "Name": o.get("Name"),
            "Status": o.get("Status"),
            "OrgRecordClass": o.get("OrgRecordClass"),
            "PostCode": o.get("PostCode"),
            "LastChangeDate": o.get("LastChangeDate"),
            "PrimaryRoles": ",".join(sorted({
                (r.get("id") or r.get("idCode") or r.get("Id"))
                for r in (o.get("Roles") or []) if isinstance(r, dict)
            }))
        })
    df = pd.DataFrame(rows).drop_duplicates(subset=["OrgId"]).sort_values("OrgId")

    extracts = BRONZE_ROOT / "extracts"
    ensure_dir(extracts)
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    csv_path  = extracts / f"snapshot_{stamp}.csv"
    parq_path = extracts / f"snapshot_{stamp}.parquet"
    df.to_csv(csv_path, index=False)
    df.to_parquet(parq_path, index=False)

    print("Flattened CSV:", csv_path)
    print("Flattened Parquet:", parq_path)
    return df

# ------------------- RUN -------------------
ensure_dir(BRONZE_ROOT)
print("Bronze root:", BRONZE_ROOT.resolve())

# BASELINE with progress bars
baseline_dir = baseline_roles_with_progress(ROLE_IDS)

# LATER: run incremental sync with progress
# sync_dir = incremental_sync_with_progress()
# _df2 = flatten_latest_baseline_to_tabular(); display(_df2.head())


Bronze root: C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods


ALL ROLES:   0%|          | 0/15613 [00:00<?, ?org/s]
[Ae RO177:   0%|          | 0/15270 [00:00<?, ?org/s]
ALL ROLES:   6%|▋         | 1000/15613 [04:56<1:12:15,  3.37org/s]]
ALL ROLES:  13%|█▎        | 2000/15613 [10:19<1:10:45,  3.21org/s]]
ALL ROLES:  19%|█▉        | 3000/15613 [15:49<1:07:20,  3.12org/s]]
ALL ROLES:  26%|██▌       | 4000/15613 [21:15<1:02:25,  3.10org/s]]
ALL ROLES:  32%|███▏      | 5000/15613 [26:34<56:47,  3.11org/s]   
ALL ROLES:  38%|███▊      | 6000/15613 [31:56<51:31,  3.11org/s]]
ALL ROLES:  45%|████▍     | 7000/15613 [37:16<46:06,  3.11org/s]]
ALL ROLES:  51%|█████     | 8000/15613 [42:23<40:10,  3.16org/s]]
ALL ROLES:  58%|█████▊    | 9000/15613 [48:20<36:15,  3.04org/s]]
ALL ROLES:  64%|██████▍   | 10000/15613 [53:17<29:52,  3.13org/s]]
ALL ROLES:  70%|███████   | 11000/15613 [58:14<24:01,  3.20org/s]]
ALL ROLES:  77%|███████▋  | 12000/15613 [1:03:13<18:33,  3.24org/s]]
ALL ROLES:  83%|████████▎ | 13000/15613 [1:08:07<13:14,  3.29org/s]]
ALL ROLES:  90%

[OK] Baseline complete → C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\release_date=2025-09-22\source=ord\release_type=api_baseline\dataset=roles
Org files: 15611
Flattened CSV: C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\snapshot_2025-09-22.csv
Flattened Parquet: C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\snapshot_2025-09-22.parquet


Unnamed: 0,OrgId,Name,Status,OrgRecordClass,PostCode,LastChangeDate,PrimaryRoles
0,,,,,,,
