In [1]:
# =========================
# ODS → Curated tables (LOCAL) + ONSPD enrichment + GP practices (RO76)
# - Newest-per-OrgId across ALL bronze runs (no re-download)
# - Tables: orgs, org_dates, org_roles, org_rels, org_succs
# - Enrichment: AreaCode_{LAD,LSOA,MSOA,ICB}, Latitude, Longitude (+ ctry)
# - GP slice: any org with RoleId RO76 (GP PRACTICE). Fallback: RO177 + GP-code regex.
# - Progress bars + fixed coverage regexes + robust joins
# =========================

import os, re, json
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ---------- USER SETTINGS ----------
BRONZE_ROOT = Path(r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods")
EXTRACTS_DIR = BRONZE_ROOT / "extracts" / "curated"
# Point this at your local ONSPD CSV (leave as None to skip enrichment)
ONSPD_CSV: Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\ONSPD_MAY_2025_UK.csv"

MAX_WORKERS = 16
FULL_RESCAN = False  # True only if you want to force re-parse of every org_*.json

# Role codes
GP_ROLE_IDS = {"RO76"}      # GP PRACTICE
PRESCRIBE_ROLE = "RO177"    # PRESCRIBING COST CENTRE

# ---------- ultra-fast JSON (optional) ----------
try:
    import orjson
    def read_json_fast(p: Path):
        try:
            return orjson.loads(p.read_bytes())
        except Exception:
            with p.open("r", encoding="utf-8") as f:
                return json.load(f)
except Exception:
    def read_json_fast(p: Path):
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)

# ---------- helpers ----------
def ensure_dir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)

def write_json(path: Path, obj: Any) -> None:
    ensure_dir(path.parent)
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def read_json(path: Path, default=None):
    try:
        with path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return default

def scalar(x):
    if x is None or isinstance(x, (str, int, float, bool)): return x
    if isinstance(x, dict):
        for k in ("extension","value","_","text","#text","displayName","code","id"):
            if k in x and isinstance(x[k], (str, int, float, bool)): return x[k]
        if len(x)==1:
            v = next(iter(x.values()))
            if isinstance(v,(str,int,float,bool)): return v
    if isinstance(x, list):
        for it in x:
            s = scalar(it)
            if s is not None: return s
    try: return json.dumps(x, ensure_ascii=False, separators=(",",":"))
    except: return str(x)

def ensure_str(x): return "" if x is None else str(x)

def pick_org(j):
    if isinstance(j, dict) and "Organisation" in j and isinstance(j["Organisation"], dict): return j["Organisation"]
    if isinstance(j, dict): return j
    return {}

def get_org_id(org):
    raw = org.get("OrgId")
    if isinstance(raw, dict) and "extension" in raw: return str(raw["extension"])
    return str(scalar(raw) or "")

def get_record_class(org):
    rc = org.get("orgRecordClass") or org.get("OrgRecordClass")
    rc = str(scalar(rc) or "")
    label = {"RC1":"HSCOrg", "RC2":"HSCSite"}.get(rc.upper(), "")
    return rc, label

def deep_find_first_key(obj, key_regex):
    pat = re.compile(key_regex, re.IGNORECASE)
    stack = [obj]; seen=set()
    while stack:
        cur = stack.pop()
        if id(cur) in seen: continue
        seen.add(id(cur))
        if isinstance(cur, dict):
            for k,v in cur.items():
                if isinstance(k, str) and pat.search(k):
                    val = scalar(v)
                    if val not in (None,"","null"): return val
            for v in cur.values():
                if isinstance(v,(dict,list)): stack.append(v)
        elif isinstance(cur, list):
            for it in cur:
                if isinstance(it,(dict,list)): stack.append(it)
    return None

def clean_postcode(pc):
    if not pc: return "", ""
    s = re.sub(r"\s+","", str(pc)).upper()
    spaced = s[:-3] + " " + s[-3:] if len(s)>3 else s
    return s, spaced

def extract_address_fields(org):
    candidates=[]
    for path in [("GeoLoc","Location"), ("PostalAddress",), ("Address",), ("Contact","Address")]:
        cur = org; ok=True
        for k in path:
            if isinstance(cur, dict) and k in cur: cur = cur[k]
            else: ok=False; break
        if ok and isinstance(cur, dict): candidates.append(cur)
    addr={}
    for cand in candidates:
        keys = {k.lower() for k in cand.keys()}
        if keys & {"addrln1","addrln2","addrln3","addrln4","addrl1","addrl2","town","city","county","postcode"}:
            addr=cand; break
        if any(k.lower()=="postcode" for k in cand.keys()):
            addr=cand; break
    def pick(*names):
        for n in names:
            if isinstance(addr, dict) and n in addr: return scalar(addr[n])
        return None
    line1 = pick("AddrLn1","Addrl1","Address1")
    line2 = pick("AddrLn2","Addrl2","Address2")
    line3 = pick("AddrLn3","Addrl3","Address3")
    line4 = pick("AddrLn4","Addrl4","Address4")
    city  = pick("Town","City","Locality")
    county= pick("County")
    country = pick("Country")
    pc = pick("PostCode","Postcode","Post_Code","Post Code") or deep_find_first_key(org, r"post\s*code|postcode")
    parts = [line1,line2,line3,line4,city,county,country]
    addr_full = ", ".join([ensure_str(x).strip() for x in parts if x and str(x).strip()!=""])
    pc_compact, pc_spaced = clean_postcode(pc)
    return {
        "AddrLine1": ensure_str(line1), "AddrLine2": ensure_str(line2),
        "AddrLine3": ensure_str(line3), "AddrLine4": ensure_str(line4),
        "TownCity": ensure_str(city), "County": ensure_str(county), "Country": ensure_str(country),
        "PostCode": pc_compact, "PostCodeSpaced": pc_spaced, "AddressFull": addr_full
    }

def extract_dates(org):
    out=[]; d = org.get("Date")
    if isinstance(d, list):
        for item in d:
            if isinstance(item, dict):
                out.append({"DateType": ensure_str(item.get("Type")),
                            "Start": ensure_str(item.get("Start")),
                            "End": ensure_str(item.get("End"))})
    return out

def extract_roles(org):
    out=[]; R = org.get("Roles")
    def emit(it):
        if not isinstance(it, dict): return
        rid = scalar(it.get("id") or it.get("idCode") or it.get("Id") or it.get("code"))
        primary = bool(it.get("primaryRole", False))
        rstat = ensure_str(scalar(it.get("Status")))
        dates=[]; d = it.get("Date")
        if isinstance(d, list):
            for di in d:
                if isinstance(di, dict):
                    dates.append({"Type": ensure_str(di.get("Type")),
                                  "Start": ensure_str(di.get("Start")),
                                  "End": ensure_str(di.get("End"))})
        out.append({"RoleId": ensure_str(rid), "PrimaryRole": primary,
                    "RoleStatus": rstat, "RoleDates": dates})
    if isinstance(R, list):
        for it in R: emit(it)
    elif isinstance(R, dict):
        rl = R.get("Role")
        if isinstance(rl, list):
            for it in rl: emit(it)
        elif isinstance(rl, dict):
            emit(rl)
    return out

def extract_rels(org):
    out=[]; R = org.get("Rels")
    if not isinstance(R, dict): return out
    rl = R.get("Rel")
    items = rl if isinstance(rl, list) else ([rl] if isinstance(rl, dict) else [])
    for it in items:
        if not isinstance(it, dict): continue
        rel_id = ensure_str(scalar(it.get("id"))); rstat  = ensure_str(scalar(it.get("Status")))
        target = it.get("Target") or {}
        tgt_org_raw = target.get("OrgId")
        tgt_org = ensure_str(tgt_org_raw.get("extension") if isinstance(tgt_org_raw, dict) and "extension" in tgt_org_raw else scalar(tgt_org_raw))
        tgt_role_raw = target.get("PrimaryRoleId") or {}
        tgt_role = ensure_str((tgt_role_raw.get("id") if isinstance(tgt_role_raw, dict) else tgt_role_raw) or "")
        dates=[]; d = it.get("Date")
        if isinstance(d, list):
            for di in d:
                if isinstance(di, dict):
                    dates.append({"Type": ensure_str(di.get("Type")),
                                  "Start": ensure_str(di.get("Start")),
                                  "End": ensure_str(di.get("End"))})
        out.append({"RelId": rel_id, "RelStatus": rstat, "TargetOrgId": tgt_org,
                    "TargetPrimaryRoleId": tgt_role, "RelDates": dates})
    return out

def extract_succs(org):
    out=[]; S = org.get("Succs")
    if not isinstance(S, dict): return out
    sc = S.get("Succ")
    items = sc if isinstance(sc, list) else ([sc] if isinstance(sc, dict) else [])
    for it in items:
        if not isinstance(it, dict): continue
        typ = ensure_str(scalar(it.get("Type"))); target = it.get("Target") or {}
        tgt_org_raw = target.get("OrgId")
        tgt_org = ensure_str(tgt_org_raw.get("extension") if isinstance(tgt_org_raw, dict) and "extension" in tgt_org_raw else scalar(tgt_org_raw))
        tgt_role_raw = target.get("PrimaryRoleId") or {}
        tgt_role = ensure_str((tgt_role_raw.get("id") if isinstance(tgt_role_raw, dict) else tgt_role_raw) or "")
        dates=[]; d = it.get("Date")
        if isinstance(d, list):
            for di in d:
                if isinstance(di, dict):
                    dates.append({"Type": ensure_str(di.get("Type")),"Start": ensure_str(di.get("Start")),"End": ensure_str(di.get("End"))})
        out.append({"SuccType": typ, "TargetOrgId": tgt_org,
                    "TargetPrimaryRoleId": tgt_role, "SuccDates": dates})
    return out

# ---------- discovery ----------
def list_chunk_dirs_desc() -> List[Path]:
    # newest → oldest by release_date
    return sorted(
        BRONZE_ROOT.glob("release_date=*/source=ord/*/dataset=*/chunks"),
        key=lambda d: d.parent.parent.parent.name.replace("release_date=",""),
        reverse=True
    )

def oid_from_file(fp: Path) -> Optional[str]:
    name = fp.stem
    return name[4:] if name.startswith("org_") else None

# ---------- core build ----------
def build_curated_tables(onspd_csv: Optional[str]=None, full_rescan: bool=False, max_workers: int=16):
    ensure_dir(EXTRACTS_DIR)
    chunk_dirs = list_chunk_dirs_desc()
    if not chunk_dirs:
        raise RuntimeError("No chunks found under bronze/ods.")

    # PASS A: newest-per-OrgId selection (no file opens)
    chosen_oids = set()
    dir_need_files: Dict[str, List[Path]] = {}
    for cdir in chunk_dirs:
        files = list(cdir.glob("org_*.json"))
        if full_rescan:
            dir_need_files[str(cdir)] = files
            continue
        need = []
        for fp in files:
            oid = oid_from_file(fp)
            if not oid:
                continue
            if oid not in chosen_oids:
                need.append(fp)
                chosen_oids.add(oid)
        if need:
            dir_need_files[str(cdir)] = need

    total_unique_orgs = sum(len(v) for v in dir_need_files.values())
    if total_unique_orgs == 0:
        raise RuntimeError("Nothing to parse (already consolidated).")

    # PASS B: parse with one overall progress bar
    chosen: Dict[str, Dict[str, Any]] = {}
    overall = tqdm(total=total_unique_orgs, unit="org", desc="Building curated tables", leave=True)

    for cdir in chunk_dirs:
        key = str(cdir)
        if key not in dir_need_files:
            continue
        need_files = dir_need_files[key]

        def parse_one(fp: Path):
            j = read_json_fast(fp)
            org = pick_org(j); oid = get_org_id(org)
            return oid, org

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futures = {ex.submit(parse_one, fp): fp for fp in need_files}
            for fut in as_completed(futures):
                try:
                    oid, org = fut.result()
                    if oid:
                        chosen[oid] = org
                except Exception:
                    pass
                finally:
                    overall.update(1)
    overall.close()

    # Build tidy rows
    org_rows, date_rows, role_rows, rel_rows, succ_rows = [], [], [], [], []

    for oid, org in chosen.items():
        name   = ensure_str(scalar(org.get("Name")))
        status = ensure_str(scalar(org.get("Status"))); is_active = (status.strip().lower()=="active")
        lcd    = ensure_str(scalar(org.get("LastChangeDate")))
        rc_code, rc_label = get_record_class(org)
        addr = extract_address_fields(org)

        org_rows.append({
            "OrgId": oid, "Name": name, "Status": status, "IsActive": is_active,
            "OrgRecordClass": rc_code, "OrgRecordClassLabel": rc_label,
            "LastChangeDate": lcd, **addr
        })

        for d in extract_dates(org):
            date_rows.append({"OrgId": oid, **d})

        for r in extract_roles(org):
            if r["RoleDates"]:
                for rd in r["RoleDates"]:
                    role_rows.append({
                        "OrgId": oid, "RoleId": r["RoleId"], "PrimaryRole": r["PrimaryRole"],
                        "RoleStatus": r["RoleStatus"], "RoleDateType": rd["Type"],
                        "RoleStart": rd["Start"], "RoleEnd": rd["End"]
                    })
            else:
                role_rows.append({
                    "OrgId": oid, "RoleId": r["RoleId"], "PrimaryRole": r["PrimaryRole"],
                    "RoleStatus": r["RoleStatus"], "RoleDateType": "", "RoleStart": "", "RoleEnd": ""
                })

        for rel in extract_rels(org):
            if rel["RelDates"]:
                for rd in rel["RelDates"]:
                    rel_rows.append({
                        "OrgId": oid, "RelId": rel["RelId"], "RelStatus": rel["RelStatus"],
                        "TargetOrgId": rel["TargetOrgId"], "TargetPrimaryRoleId": rel["TargetPrimaryRoleId"],
                        "RelDateType": rd["Type"], "RelStart": rd["Start"], "RelEnd": rd["End"]
                    })
            else:
                rel_rows.append({
                        "OrgId": oid, "RelId": rel["RelId"], "RelStatus": rel["RelStatus"],
                        "TargetOrgId": rel["TargetOrgId"], "TargetPrimaryRoleId": rel["TargetPrimaryRoleId"],
                        "RelDateType": "", "RelStart": "", "RelEnd": ""
                })

        for sc in extract_succs(org):
            if sc["SuccDates"]:
                for sd in sc["SuccDates"]:
                    succ_rows.append({
                        "OrgId": oid, "SuccType": sc["SuccType"], "TargetOrgId": sc["TargetOrgId"],
                        "TargetPrimaryRoleId": sc["TargetPrimaryRoleId"], "SuccDateType": sd["Type"],
                        "SuccStart": sd["Start"], "SuccEnd": sd["End"]
                    })
            else:
                succ_rows.append({
                    "OrgId": oid, "SuccType": sc["SuccType"], "TargetOrgId": sc["TargetOrgId"],
                    "TargetPrimaryRoleId": sc["TargetPrimaryRoleId"], "SuccDateType": "",
                    "SuccStart": "", "SuccEnd": ""
                })

    # Safe builders
    def build_df(rows, required_cols, sort_cols):
        df = pd.DataFrame(rows)
        if df.empty:
            df = pd.DataFrame(columns=required_cols)
        else:
            for c in required_cols:
                if c not in df.columns: df[c] = ""
        sort_by = [c for c in sort_cols if c in df.columns]
        if sort_by: df = df.sort_values(sort_by)
        return df.reset_index(drop=True)

    orgs_df  = build_df(
        org_rows,
        ["OrgId","Name","Status","IsActive","OrgRecordClass","OrgRecordClassLabel",
         "LastChangeDate","AddrLine1","AddrLine2","AddrLine3","AddrLine4",
         "TownCity","County","Country","PostCode","PostCodeSpaced","AddressFull"],
        ["OrgId"]
    ).drop_duplicates(subset=["OrgId"])

    dates_df = build_df(date_rows, ["OrgId","DateType","Start","End"], ["OrgId","DateType","Start"])
    roles_df = build_df(role_rows, ["OrgId","RoleId","PrimaryRole","RoleStatus","RoleDateType","RoleStart","RoleEnd"], ["OrgId","RoleId","RoleStart"])
    rels_df  = build_df(rel_rows,  ["OrgId","RelId","RelStatus","TargetOrgId","TargetPrimaryRoleId","RelDateType","RelStart","RelEnd"], ["OrgId","RelId","RelStart"])
    succ_df  = build_df(succ_rows, ["OrgId","SuccType","TargetOrgId","TargetPrimaryRoleId","SuccDateType","SuccStart","SuccEnd"], ["OrgId","SuccType","SuccStart"])

    # ---------- ONSPD enrichment (FULL merge; coverage uses England filter later) ----------
    if onspd_csv:
        try:
            print("Enriching with ONSPD (AreaCodes + lat/long) ...")
            hdr_lower = pd.read_csv(onspd_csv, nrows=0).columns.str.lower().tolist()
            # map long vs short schemas (with spaced postcode + lat/long + ctry)
            keep_cols_long  = {"PCDS":"pcds","LADCD":"lad","LSOA21CD":"lsoa21","MSOA21CD":"msoa21","ICB23CD":"icb","CTRY":"ctry","LAT":"lat","LONG":"long"}
            keep_cols_short = {"pcds":"pcds","oslaua":"lad","lsoa21":"lsoa21","msoa21":"msoa21","icb":"icb","ctry":"ctry","lat":"lat","long":"long"}
            use_map = keep_cols_short if "oslaua" in hdr_lower else keep_cols_long
            usecols = [c for c in use_map.keys() if c.lower() in hdr_lower]
            onspd = pd.read_csv(onspd_csv, dtype=str, usecols=usecols).rename(columns={k:v for k,v in use_map.items() if k in usecols})
            onspd = onspd.drop_duplicates(subset=["pcds"])

            # Merge on spaced postcode (keep ctry for coverage calc)
            right_cols = [c for c in ["pcds","lad","lsoa21","msoa21","icb","lat","long","ctry"] if c in onspd.columns]
            orgs_df = (
                orgs_df.merge(onspd[right_cols], left_on="PostCodeSpaced", right_on="pcds", how="left")
                       .drop(columns=["pcds"])
            )
            # Canonical columns
            if "lat" in orgs_df.columns:
                orgs_df["Latitude"]  = pd.to_numeric(orgs_df["lat"], errors="coerce")
                orgs_df["Longitude"] = pd.to_numeric(orgs_df["long"], errors="coerce")
            orgs_df["AreaCode_LAD"]  = orgs_df.get("lad")
            orgs_df["AreaCode_LSOA"] = orgs_df.get("lsoa21")
            orgs_df["AreaCode_MSOA"] = orgs_df.get("msoa21")
            orgs_df["AreaCode_ICB"]  = orgs_df.get("icb")
            orgs_df.drop(columns=[c for c in ["lad","lsoa21","msoa21","icb","lat","long"] if c in orgs_df.columns], inplace=True, errors="ignore")
        except Exception as e:
            print("ONSPD enrichment skipped due to error:", e)

    # ---------- GP practices slice ----------
    gp_ids_ro76 = set(roles_df.loc[roles_df["RoleId"].isin(GP_ROLE_IDS), "OrgId"])
    # Fallback: some data may only show RO177. Restrict by GP-like code pattern: letter + 5 digits (e.g., A81002, H85063)
    gp_like_ids_ro177 = set(
        roles_df.loc[roles_df["RoleId"].eq(PRESCRIBE_ROLE), "OrgId"].astype(str).str.match(r"^[A-Z]\d{5}$", na=False)
    )
    gp_like_ids_ro177 = set(roles_df.loc[roles_df["RoleId"].eq(PRESCRIBE_ROLE), "OrgId"][roles_df["RoleId"].eq(PRESCRIBE_ROLE)
                          & roles_df["OrgId"].astype(str).str.match(r"^[A-Z]\d{5}$", na=False)])

    gp_ids = gp_ids_ro76 | gp_like_ids_ro177

    orgs_gp = orgs_df[orgs_df["OrgId"].isin(gp_ids)].copy()
    if not orgs_gp.empty:
        orgs_gp["__is_active__"] = orgs_gp["IsActive"].fillna(False)
        orgs_gp = (orgs_gp.sort_values(["OrgId","__is_active__","LastChangeDate"], ascending=[True, False, False])
                         .drop_duplicates(subset=["OrgId"])
                         .drop(columns="__is_active__", errors="ignore"))

    # ---------- Output ----------
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    out_dir = EXTRACTS_DIR / stamp
    latest_dir = EXTRACTS_DIR / "latest"
    ensure_dir(out_dir); ensure_dir(latest_dir)

    # Main tables
    orgs_df.to_parquet(out_dir / "orgs.parquet", index=False)
    orgs_df.to_csv(out_dir / "orgs.csv", index=False)
    dates_df.to_csv(out_dir / "org_dates.csv", index=False)
    roles_df.to_csv(out_dir / "org_roles.csv", index=False)
    rels_df.to_csv(out_dir / "org_rels.csv", index=False)
    succ_df.to_csv(out_dir / "org_succs.csv", index=False)

    # GP practices table
    orgs_gp.to_csv(out_dir / "orgs_gp.csv", index=False)

    # Mirror to latest/
    orgs_df.to_parquet(latest_dir / "orgs.parquet", index=False)
    orgs_df.to_csv(latest_dir / "orgs.csv", index=False)
    dates_df.to_csv(latest_dir / "org_dates.csv", index=False)
    roles_df.to_csv(latest_dir / "org_roles.csv", index=False)
    rels_df.to_csv(latest_dir / "org_rels.csv", index=False)
    succ_df.to_csv(latest_dir / "org_succs.csv", index=False)
    orgs_gp.to_csv(latest_dir / "orgs_gp.csv", index=False)

    # ---------- Coverage report ----------
    def pct(x, y):
        try: return 100.0 * (x / y) if y else 0.0
        except: return 0.0

    # Prefer ONSPD ctry if present; else text Country
    if "ctry" in orgs_df.columns:
        eng_mask = orgs_df["ctry"].eq("E92000001")
    else:
        eng_mask = orgs_df["Country"].str.upper().eq("ENGLAND")

    total_eng = int(eng_mask.sum())

    # LAD codes: E06/E07/E08/E09xxxxx
    lad_cov_mask  = eng_mask & orgs_df["AreaCode_LAD"].astype(str).str.match(r"^E0(6|7|8|9)\d{5}$", na=False)
    lsoa_cov_mask = eng_mask & orgs_df["AreaCode_LSOA"].astype(str).str.match(r"^E010\d{6}$", na=False)
    msoa_cov_mask = eng_mask & orgs_df["AreaCode_MSOA"].astype(str).str.match(r"^E020\d{5}$", na=False)
    icb_cov_mask  = eng_mask & orgs_df["AreaCode_ICB"].astype(str).str.match(r"^E54\d+$",    na=False)

    lad_cov_ct  = int(lad_cov_mask.sum())
    lsoa_cov_ct = int(lsoa_cov_mask.sum())
    msoa_cov_ct = int(msoa_cov_mask.sum())
    icb_cov_ct  = int(icb_cov_mask.sum())

    gp_total   = len(orgs_gp)
    gp_latlon  = orgs_gp.dropna(subset=["Latitude","Longitude"]) if {"Latitude","Longitude"} <= set(orgs_gp.columns) else pd.DataFrame()
    gp_latlon_ct = len(gp_latlon)

    report = [
        f"Snapshot: {stamp}",
        f"Total unique orgs: {len(orgs_df):,}",
        f"English orgs: {total_eng:,}",
        f"LAD coverage (England):  {lad_cov_ct:,} / {total_eng:,} = {pct(lad_cov_ct, total_eng):.1f}%",
        f"LSOA coverage (England): {lsoa_cov_ct:,} / {total_eng:,} = {pct(lsoa_cov_ct, total_eng):.1f}%",
        f"MSOA coverage (England): {msoa_cov_ct:,} / {total_eng:,} = {pct(msoa_cov_ct, total_eng):.1f}%",
        f"ICB coverage (England):  {icb_cov_ct:,} / {total_eng:,} = {pct(icb_cov_ct, total_eng):.1f}%",
        f"GP practices (RO76∪RO177-gpcode): {gp_total:,}",
        f"GP with lat/long:   {gp_latlon_ct:,} / {gp_total:,} = {pct(gp_latlon_ct, gp_total):.1f}%",
    ]
    (out_dir / "coverage_report.txt").write_text("\n".join(report), encoding="utf-8")
    (latest_dir / "coverage_report.txt").write_text("\n".join(report), encoding="utf-8")

    print("\n".join(["✓ Wrote curated tables:",
                     f"- Snapshot: {out_dir}",
                     f"- Latest:   {latest_dir}",
                     "",
                     "Coverage:",
                     *report[2:]]))

    return {
        "orgs": orgs_df,
        "dates": dates_df,
        "roles": roles_df,
        "rels": rels_df,
        "succs": succ_df,
        "orgs_gp": orgs_gp
    }

# ---------- RUN ----------
tables = build_curated_tables(
    onspd_csv=ONSPD_CSV,
    full_rescan=FULL_RESCAN,
    max_workers=MAX_WORKERS
)

# Peek a few rows
try:
    display(tables["orgs"].head(10))
    display(tables["orgs_gp"].head(10))
except Exception:
    print(tables["orgs"].head(10).to_string(index=False))
    print(tables["orgs_gp"].head(10).to_string(index=False))


Building curated tables: 100%|██████████| 286680/286680 [14:48<00:00, 322.62org/s] 


Enriching with ONSPD (AreaCodes + lat/long) ...
✓ Wrote curated tables:
- Snapshot: C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\2025-09-26_1110
- Latest:   C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest

Coverage:
English orgs: 275,786
LAD coverage (England):  0 / 275,786 = 0.0%
LSOA coverage (England): 0 / 275,786 = 0.0%
MSOA coverage (England): 275,786 / 275,786 = 100.0%
ICB coverage (England):  275,786 / 275,786 = 100.0%
GP practices (RO76∪RO177-gpcode): 16,371
GP with lat/long:   16,348 / 16,371 = 99.9%


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,PostCode,PostCodeSpaced,AddressFull,ctry,Latitude,Longitude,AreaCode_LAD,AreaCode_LSOA,AreaCode_MSOA,AreaCode_ICB
0,002,DYFED,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
1,003,GWENT,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
2,004,GWYNEDD,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
3,005,MID GLAMORGAN,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
4,006,POWYS,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
5,007,SOUTH GLAMORGAN,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
6,008,WEST GLAMORGAN,Inactive,False,RC1,HSCOrg,2014-02-20,**UNKNOWN**,,,...,,,"**UNKNOWN**, WALES",,,,,,,
7,00D,"NHS DURHAM DALES, EASINGTON AND SEDGEFIELD CCG",Inactive,False,RC1,HSCOrg,2023-09-21,SEDGEFIELD COMMUNITY HOSPITAL,SALTERS LANE,SEDGEFIELD,...,TS213EE,TS21 3EE,"SEDGEFIELD COMMUNITY HOSPITAL, SALTERS LANE, S...",E92000001,54.661229,-1.448141,E06000047,E01020828,E02004339,E54000050
8,00DAA,"NHS DURHAM DALES, EASINGTON AND SEDGEFIELD HQ",Inactive,False,RC2,HSCSite,2021-04-26,SEDGEFIELD COMMUNITY HOSPITAL,SALTERS LANE,SEDGEFIELD,...,TS213EE,TS21 3EE,"SEDGEFIELD COMMUNITY HOSPITAL, SALTERS LANE, S...",E92000001,54.661229,-1.448141,E06000047,E01020828,E02004339,E54000050
9,00F,NHS GATESHEAD CCG,Inactive,False,RC1,HSCOrg,2021-03-10,RIVERSIDE HOUSE,GOLDCREST WAY,NEWBURN RIVERSIDE,...,NE158NY,NE15 8NY,"RIVERSIDE HOUSE, GOLDCREST WAY, NEWBURN RIVERS...",E92000001,54.96972,-1.712379,E08000021,E01008387,E02001728,E54000050


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,PostCode,PostCodeSpaced,AddressFull,ctry,Latitude,Longitude,AreaCode_LAD,AreaCode_LSOA,AreaCode_MSOA,AreaCode_ICB
33787,A81002,QUEENS PARK MEDICAL CENTRE,Active,True,RC1,HSCOrg,2024-06-11,FARRER STREET,,,...,TS182AW,TS18 2AW,"FARRER STREET, STOCKTON ON TEES, CLEVELAND, EN...",E92000001,54.569175,-1.313893,E06000004,E01012267,E02007060,E54000050
33788,A81003,VICTORIA MEDICAL PRACTICE,Inactive,False,RC1,HSCOrg,2023-08-22,HEALTH CENTRE,VICTORIA ROAD,,...,TS268DB,TS26 8DB,"HEALTH CENTRE, VICTORIA ROAD, HARTLEPOOL, CLEV...",E92000001,54.68525,-1.217613,E06000001,E01011999,E02002489,E54000050
33789,A81004,ACKLAM MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,TRIMDON AVENUE,ACKLAM,,...,TS58SB,TS5 8SB,"TRIMDON AVENUE, ACKLAM, MIDDLESBROUGH, ENGLAND",E92000001,54.538198,-1.262142,E06000002,E01012024,E02002510,E54000050
33791,A81005,SPRINGWOOD SURGERY,Active,True,RC1,HSCOrg,2023-08-22,RECTORY LANE,,,...,TS147DJ,TS14 7DJ,"RECTORY LANE, GUISBOROUGH, CLEVELAND, ENGLAND",E92000001,54.53261,-1.055459,E06000003,E01012117,E02002532,E54000050
33792,A81006,TENNANT STREET MEDICAL PRACTICE,Active,True,RC1,HSCOrg,2024-08-31,TENNANT STREET,,,...,TS182AT,TS18 2AT,"TENNANT STREET, STOCKTON-ON-TEES, CLEVELAND, E...",E92000001,54.568616,-1.313593,E06000004,E01012267,E02007060,E54000050
33793,A81007,BANKHOUSE SURGERY,Active,True,RC1,HSCOrg,2023-08-22,ONE LIFE HARTLEPOOL,PARK ROAD,,...,TS247PW,TS24 7PW,"ONE LIFE HARTLEPOOL, PARK ROAD, HARTLEPOOL, CL...",E92000001,54.682342,-1.214164,E06000001,E01011999,E02002489,E54000050
33794,A81008,ALBERT HOUSE CLINIC,Inactive,False,RC1,HSCOrg,2023-08-22,LOW GRANGE HEALTH VILLAGE,NORMANBY ROAD,,...,TS66TD,TS6 6TD,"LOW GRANGE HEALTH VILLAGE, NORMANBY ROAD, MIDD...",E92000001,54.571137,-1.168043,E06000003,E01033470,E02006910,E54000050
33795,A81009,VILLAGE MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,400-404 LINTHORPE ROAD,,,...,TS56HF,TS5 6HF,"400-404 LINTHORPE ROAD, MIDDLESBROUGH, CLEVELA...",E92000001,54.562285,-1.241897,E06000002,E01012078,E02002500,E54000050
33796,A81011,CHADWICK PRACTICE,Active,True,RC1,HSCOrg,2023-08-22,PARK ROAD,,,...,TS247PW,TS24 7PW,"PARK ROAD, HARTLEPOOL, CLEVELAND, ENGLAND",E92000001,54.682342,-1.214164,E06000001,E01011999,E02002489,E54000050
33797,A81012,WESTBOURNE MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,NORTH ORMESBY HEALTH VILLAGE,7 TRINITY MEWS,NORTH ORMESBY,...,TS36AL,TS3 6AL,"NORTH ORMESBY HEALTH VILLAGE, 7 TRINITY MEWS, ...",E92000001,54.571738,-1.216246,E06000002,E01012059,E02002497,E54000050


In [4]:
tables["orgs_gp"].sample(300)


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,PostCode,PostCodeSpaced,AddressFull,ctry,Latitude,Longitude,AreaCode_LAD,AreaCode_LSOA,AreaCode_MSOA,AreaCode_ICB
120347,G82802,CANTERBURY HEALTH CENTRE,Active,True,RC1,HSCOrg,2023-08-22,26 OLD DOVER ROAD,,,...,CT13JH,CT1 3JH,"26 OLD DOVER ROAD, CANTERBURY, KENT, ENGLAND",E92000001,51.274249,1.082980,E07000106,E01024044,E02005025,E54000032
131041,M85176,KIRPAL MEDICAL PRACTICE,Active,True,RC1,HSCOrg,2023-08-22,251 SOHO ROAD,HANDSWORTH,,...,B219RY,B21 9RY,"251 SOHO ROAD, HANDSWORTH, BIRMINGHAM, WEST MI...",E92000001,52.503109,-1.934939,E08000025,E01034921,E02001865,E54000055
274393,W00079,RCT LHB & MERTHYR LHB OOH,Active,True,RC1,HSCOrg,2023-08-22,NAVIGATION PARK,ABERCYNON,RHINDA CYNON TAFF,...,CF454SN,CF45 4SN,"NAVIGATION PARK, ABERCYNON, RHINDA CYNON TAFF,...",W92000004,51.641569,-3.328932,W06000016,W01001143,W02000263,W99999999
281799,Y06469,TB COMMUNITY SERVICE,Active,True,RC1,HSCOrg,2020-03-25,HARTINGTON RD HLTH CLINIC,LESSEPS ROAD,,...,L80RD,L8 0RD,"HARTINGTON RD HLTH CLINIC, LESSEPS ROAD, LIVER...",E92000001,53.395668,-2.945005,E08000012,E01006556,E02001385,E54000008
277249,Y00068,EAST SURREY INTERMEDIATE CARE TEAM,Inactive,False,RC1,HSCOrg,2023-08-22,TANDRIDGE HEIGHTS,MEMORIAL CARE HOME,MEMORIAL CLOSE,...,RH80NH,RH8 0NH,"TANDRIDGE HEIGHTS, MEMORIAL CARE HOME, MEMORIA...",E92000001,51.264411,-0.010904,E07000215,E01030836,E02006433,E54000063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100447,F86085,HAINAULT SURGERY,Active,True,RC1,HSCOrg,2024-04-01,34 NEW NORTH ROAD,HAINAULT,,...,IG62XG,IG6 2XG,"34 NEW NORTH ROAD, HAINAULT, ILFORD, ESSEX, EN...",E92000001,51.602180,0.086133,E09000026,E01003704,E02000756,E54000029
151555,P87613,CLEGGS LANE MEDICAL PRACTICE/129,Active,True,RC1,HSCOrg,2023-08-22,129-131 CLEGGS LANE,LITTLE HULTON,,...,M389RS,M38 9RS,"129-131 CLEGGS LANE, LITTLE HULTON, MANCHESTER...",E92000001,53.534956,-2.419334,E08000006,E01005660,E02001157,E54000057
132713,N81012,GUARDIAN STREET MED/CTR,Active,True,RC1,HSCOrg,2023-08-22,GUARDIAN STREET,,,...,WA51UD,WA5 1UD,"GUARDIAN STREET, WARRINGTON, CHESHIRE, ENGLAND",E92000001,53.391917,-2.606296,E06000007,E01033302,E02002607,E54000008
122524,H83004,FARLEY ROAD MEDICAL PRACTICE,Active,True,RC1,HSCOrg,2024-04-01,53 FARLEY ROAD,,,...,CR27NG,CR2 7NG,"53 FARLEY ROAD, SOUTH CROYDON, SURREY, ENGLAND",E92000001,51.345803,-0.066472,E09000008,E01001125,E02000226,E54000031


In [5]:
tables["orgs_gp"].shape

(16371, 24)