In [56]:
# =========================
# ODS → Curated tables + ONSPD enrichment + NHSPD Names & Lookup files + NHSPD master fallback (NHSER)
#        + optional ICB CSV + GP slice + relationship roll-ups
# NOTE: "Is Partner To" prefers PCN (RE8→RO272). If absent, falls back to RE6.
# If "High Level Health Geography Name" is entirely null, it is dropped (code kept).
# =========================

import os, re, json
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

# ---------- USER SETTINGS ----------
BRONZE_ROOT = Path(r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods")
EXTRACTS_DIR = BRONZE_ROOT / "extracts" / "curated"

# NHSPD folders (we scan BOTH if they exist)
NHSPD_NAMES_DIR:  Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\NHSPD_AUG_2025_UK_FULL\Documents\Names and Codes"
NHSPD_LOOKUP_DIR: Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\NHSPD_AUG_2025_UK_FULL\Documents\Lookup files"

# NHSPD master (headerless) for robust NHSER fallback (recommended)
NHSPD_MASTER_CSV: Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\NHSPD_AUG_2025_UK_FULL\Data\nhg25aug.csv"

# ONSPD CSV (LAD/LSOA/MSOA/ICB + lat/long; may or may not have RGN/NHSER)
ONSPD_CSV: Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\ONSPD_MAY_2025_UK.csv"

# OPTIONAL: explicit ICB names CSV (overrides/augments)
# e.g. "Integrated Care Boards (December 2024) Names and Codes in England.csv"
ICB_NAMES_CSV: Optional[str] = r"C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest\Integrated_Care_Boards_(December_2024)_Names_and_Codes_in_EN.csv"

MAX_WORKERS = 16
FULL_RESCAN = False

# Role codes
GP_ROLE_IDS = {"RO76"}      # GP PRACTICE
PRESCRIBE_ROLE = "RO177"    # PRESCRIBING COST CENTRE
PCN_ROLE_ID   = "RO272"     # Primary Care Network (for RE8)

# ---------- fast JSON ----------
try:
    import orjson
    def read_json_fast(p: Path):
        try:
            return orjson.loads(p.read_bytes())
        except Exception:
            with p.open("r", encoding="utf-8") as f:
                return json.load(f)
except Exception:
    def read_json_fast(p: Path):
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)

# ---------- helpers ----------
def ensure_dir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)

def scalar(x):
    if x is None or isinstance(x, (str, int, float, bool)): return x
    if isinstance(x, dict):
        for k in ("extension","value","_","text","#text","displayName","code","id"):
            if k in x and isinstance(x[k], (str, int, float, bool)): return x[k]
        if len(x)==1:
            v = next(iter(x.values()))
            if isinstance(v,(str,int,float,bool)): return v
    if isinstance(x, list):
        for it in x:
            s = scalar(it)
            if s is not None: return s
    try: return json.dumps(x, ensure_ascii=False, separators=(",",":"))
    except: return str(x)

def ensure_str(x): return "" if x is None else str(x)

def pick_org(j):
    if isinstance(j, dict) and "Organisation" in j and isinstance(j["Organisation"], dict): return j["Organisation"]
    if isinstance(j, dict): return j
    return {}

def get_org_id(org):
    raw = org.get("OrgId")
    if isinstance(raw, dict) and "extension" in raw: return str(raw["extension"])
    return str(scalar(raw) or "")

def get_record_class(org):
    rc = org.get("orgRecordClass") or org.get("OrgRecordClass")
    rc = str(scalar(rc) or "")
    label = {"RC1":"HSCOrg", "RC2":"HSCSite"}.get(rc.upper(), "")
    return rc, label

def deep_find_first_key(obj, key_regex):
    pat = re.compile(key_regex, re.IGNORECASE)
    stack = [obj]; seen=set()
    while stack:
        cur = stack.pop()
        if id(cur) in seen: continue
        seen.add(id(cur))
        if isinstance(cur, dict):
            for k,v in cur.items():
                if isinstance(k, str) and pat.search(k):
                    val = scalar(v)
                    if val not in (None,"","null"): return val
            for v in cur.values():
                if isinstance(v,(dict,list)): stack.append(v)
        elif isinstance(cur, list):
            for it in cur:
                if isinstance(it,(dict,list)): stack.append(it)
    return None

def clean_postcode(pc):
    if not pc: return "", ""
    s = re.sub(r"\s+","", str(pc)).upper()
    spaced = s[:-3] + " " + s[-3:] if len(s)>3 else s
    return s, spaced

def extract_address_fields(org):
    candidates=[]
    for path in [("GeoLoc","Location"), ("PostalAddress",), ("Address",), ("Contact","Address")]:
        cur = org; ok=True
        for k in path:
            if isinstance(cur, dict) and k in cur: cur = cur[k]
            else: ok=False; break
        if ok and isinstance(cur, dict): candidates.append(cur)
    addr={}
    for cand in candidates:
        keys = {k.lower() for k in cand.keys()}
        if keys & {"addrln1","addrln2","addrln3","addrln4","addrl1","addrl2","town","city","county","postcode"}:
            addr=cand; break
        if any(k.lower()=="postcode" for k in cand.keys()):
            addr=cand; break
    def pick(*names):
        for n in names:
            if isinstance(addr, dict) and n in addr: return scalar(addr[n])
        return None
    line1 = pick("AddrLn1","Addrl1","Address1")
    line2 = pick("AddrLn2","Addrl2","Address2")
    line3 = pick("AddrLn3","Addrl3","Address3")
    line4 = pick("AddrLn4","Addrl4","Address4")
    city  = pick("Town","City","Locality")
    county= pick("County")
    country = pick("Country")
    pc = pick("PostCode","Postcode","Post_Code","Post Code") or deep_find_first_key(org, r"post\s*code|postcode")
    parts = [line1,line2,line3,line4,city,county,country]
    addr_full = ", ".join([ensure_str(x).strip() for x in parts if x and str(x).strip()!=""])
    pc_compact, pc_spaced = clean_postcode(pc)
    return {
        "AddrLine1": ensure_str(line1), "AddrLine2": ensure_str(line2),
        "AddrLine3": ensure_str(line3), "AddrLine4": ensure_str(line4),
        "TownCity": ensure_str(city), "County": ensure_str(county), "Country": ensure_str(country),
        "PostCode": pc_compact, "PostCodeSpaced": pc_spaced, "AddressFull": addr_full
    }

def extract_dates(org):
    out=[]; d = org.get("Date")
    if isinstance(d, list):
        for item in d:
            if isinstance(item, dict):
                out.append({"DateType": ensure_str(item.get("Type")),
                            "Start": ensure_str(item.get("Start")),
                            "End": ensure_str(item.get("End"))})
    return out

def extract_roles(org):
    out=[]; R = org.get("Roles")
    def emit(it):
        if not isinstance(it, dict): return
        rid = scalar(it.get("id") or it.get("idCode") or it.get("Id") or it.get("code"))
        primary = bool(it.get("primaryRole", False))
        rstat = ensure_str(scalar(it.get("Status")))
        dates=[]; d = it.get("Date")
        if isinstance(d, list):
            for di in d:
                if isinstance(di, dict):
                    dates.append({"Type": ensure_str(di.get("Type")),
                                  "Start": ensure_str(di.get("Start")),
                                  "End": ensure_str(di.get("End"))})
        out.append({"RoleId": ensure_str(rid), "PrimaryRole": primary,
                    "RoleStatus": rstat, "RoleDates": dates})
    if isinstance(R, list):
        for it in R: emit(it)
    elif isinstance(R, dict):
        rl = R.get("Role")
        if isinstance(rl, list):
            for it in rl: emit(it)
        elif isinstance(rl, dict):
            emit(rl)
    return out

def extract_rels(org):
    out=[]; R = org.get("Rels")
    if not isinstance(R, dict): return out
    rl = R.get("Rel")
    items = rl if isinstance(rl, list) else ([rl] if isinstance(rl, dict) else [])
    for it in items:
        if not isinstance(it, dict): continue
        rel_id = ensure_str(scalar(it.get("id"))).strip().upper()
        rstat  = ensure_str(scalar(it.get("Status")))
        target = it.get("Target") or {}
        tgt_org_raw = target.get("OrgId")
        tgt_org = ensure_str(tgt_org_raw.get("extension") if isinstance(tgt_org_raw, dict) and "extension" in tgt_org_raw else scalar(tgt_org_raw)).strip()
        tgt_role_raw = target.get("PrimaryRoleId") or {}
        tgt_role = ensure_str((tgt_role_raw.get("id") if isinstance(tgt_role_raw, dict) else tgt_role_raw) or "")
        dates=[]; d = it.get("Date")
        if isinstance(d, list):
            for di in d:
                if isinstance(di, dict):
                    dates.append({"Type": ensure_str(di.get("Type")),
                                  "Start": ensure_str(di.get("Start")),
                                  "End": ensure_str(di.get("End"))})
        out.append({"RelId": rel_id, "RelStatus": rstat, "TargetOrgId": tgt_org,
                    "TargetPrimaryRoleId": tgt_role, "RelDates": dates})
    return out

# ---------- discovery ----------
def list_chunk_dirs_desc() -> List[Path]:
    return sorted(
        BRONZE_ROOT.glob("release_date=*/source=ord/*/dataset=*/chunks"),
        key=lambda d: d.parent.parent.parent.name.replace("release_date=",""),
        reverse=True
    )

def oid_from_file(fp: Path) -> Optional[str]:
    name = fp.stem
    return name[4:] if name.startswith("org_") else None

# ---------- NHSPD Names/Lookup discovery ----------
def _read_any_csv_or_xlsx(path: Path, usecols: Optional[List[str]]=None, dtype="string") -> pd.DataFrame:
    if path.suffix.lower() in (".xlsx", ".xls"):
        try:
            return pd.read_excel(path, dtype=dtype, usecols=usecols)
        except Exception:
            return pd.read_excel(path, usecols=usecols)
    return pd.read_csv(path, dtype=dtype, usecols=usecols)

def _best_col(cols: List[str], patterns: List[str]) -> Optional[str]:
    for pat in patterns:
        for c in cols:
            if re.search(pat, c, re.IGNORECASE):
                return c
    return None

def _collect_lookup_dirs(names_dir: Optional[str], lookup_dir: Optional[str]) -> List[Path]:
    out = []
    for d in [names_dir, lookup_dir]:
        if d and Path(d).exists():
            out.append(Path(d))
    # try sibling and common folders
    for base in [names_dir, lookup_dir]:
        if not base: continue
        p = Path(base)
        for cand in [p.parent / "Lookup files", p.parent / "Lookups", p.parent / "Lookup", p.parent / "lookup files"]:
            if cand.exists(): out.append(cand)
        if len(p.parents) >= 2:
            for extra in [p.parents[1] / "Documents" / "Lookup files",
                          p.parents[1] / "Data" / "Lookup files",
                          p.parents[1] / "Lookup files"]:
                if extra.exists(): out.append(extra)
    # de-dup preserve order
    seen=set(); uniq=[]
    for d in out:
        if d not in seen:
            uniq.append(d); seen.add(d)
    return uniq

def _read_latest(dirs: List[Path], patterns: List[str]) -> Optional[pd.DataFrame]:
    cands=[]
    for nd in dirs:
        for pat in patterns:
            cands += list(nd.rglob(pat))
    if not cands: return None
    cands.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    for p in cands:
        try:
            return _read_any_csv_or_xlsx(p)
        except Exception:
            continue
    return None

def _clean_map(df: Optional[pd.DataFrame], code_regexes: List[str], name_regexes: List[str]) -> Optional[pd.DataFrame]:
    if df is None: return None
    cols = df.columns.tolist()
    code_col = _best_col(cols, code_regexes)
    name_col = _best_col(cols, name_regexes) or _best_col(cols, [r"name$", r"nm$"])
    if not code_col or not name_col: return None
    out = df[[code_col, name_col]].dropna().drop_duplicates()
    out.columns = ["code","name"]
    out["code"] = out["code"].astype("string").str.strip().str.upper()
    out["name"] = out["name"].astype("string").str.strip()
    return out

def load_nhspd_lookups(names_dir: Optional[str], lookup_dir: Optional[str]) -> dict:
    """
    Returns dict with DataFrames for: lad, icb, rgn, nhser, ctry
    Scans BOTH 'Names and Codes' and 'Lookup files' (and sensible siblings).
    """
    lookups = {"lad": None, "icb": None, "rgn": None, "nhser": None, "ctry": None}
    dirs = _collect_lookup_dirs(names_dir, lookup_dir)

    # Broad patterns
    lad_df   = _read_latest(dirs, [
        "*LA_UA*names*codes*UK*.csv","*LA_UA*names*codes*UK*.xlsx",
        "*Local*Authority*Names*Codes*UK*.csv","*Local*Authority*Names*Codes*UK*.xlsx"
    ])
    icb_df   = _read_latest(dirs, [
        "*ICB*names*codes*EN*.csv","*ICB*names*codes*EN*.xlsx",
        "*ICB*names*codes*UK*.csv","*ICB*names*codes*UK*.xlsx",
        "*Integrated*Care*Board*Names*Codes*Eng*.csv","*Integrated*Care*Boards*Names*Codes*Eng*.csv",
        "*Integrated*Care*Board*Names*Codes*.xlsx","*Integrated*Care*Boards*Names*Codes*.xlsx"
    ])
    rgn_df   = _read_latest(dirs, [
        "*Region*names*codes*EN*.csv","*Region*names*codes*EN*.xlsx",
        "*Government*Office*Region*Names*Codes*Eng*.csv","*GOR*Names*Codes*.csv","*GOR*Names*Codes*.xlsx"
    ])
    nhser_df = _read_latest(dirs, ["*NHSER*names*codes*EN*.csv","*NHSER*names*codes*EN*.xlsx"])
    ctry_df  = _read_latest(dirs, ["*Country*names*codes*UK*.csv","*Country*names*codes*UK*.xlsx"])

    lookups["lad"]   = _clean_map(lad_df,   [r"^lad\d*cd$", r"^laua\d*cd$"], [r"^lad\d*nm$", r"^laua\d*nm$"])
    lookups["icb"]   = _clean_map(icb_df,   [r"^icb\d{2}cdh?$", r"^icb\d{2}cd$"], [r"^icb\d{2}nm$"])
    lookups["rgn"]   = _clean_map(rgn_df,   [r"^rgn\d*cd$", r"^gor\d*cd$"],      [r"^rgn\d*nm$", r"^gor\d*nm$"])
    lookups["nhser"] = _clean_map(nhser_df, [r"^nhse\d*cd$", r"^nhsercd$"],      [r"^nhse\d*nm$", r"^nhsernm$"])
    lookups["ctry"]  = _clean_map(ctry_df,  [r"^ctry\d*cd$"],                    [r"^ctry\d*nm$"])

    # Minimal fallbacks
    if lookups["nhser"] is None or lookups["nhser"].empty:
        lookups["nhser"] = pd.DataFrame({
            "code": ["Y56","Y58","Y59","Y60","Y61","Y62","Y63","W92","S92","N92","L93","M83"],
            "name": ["London","South West","South East","Midlands","East of England","North West","North East and Yorkshire",
                     "(pseudo) Wales","(pseudo) Scotland","(pseudo) Northern Ireland","(pseudo) Channel Islands","(pseudo) Isle of Man"]
        })
    if lookups["ctry"] is None or lookups["ctry"].empty:
        lookups["ctry"] = pd.DataFrame({
            "code":["E92000001","W92000004","S92000003","N92000002"],
            "name":["England","Wales","Scotland","Northern Ireland"]
        })

    return lookups

# ---------- NHSPD master (headerless) fallback: detect NHSER per postcode ----------
def _detect_nhser_cols(sample: pd.DataFrame) -> Optional[tuple]:
    """Return (pcds_col_idx, nhser_col_idx) detected from a headerless NHSPD sample."""
    allowed_nhser = {"Y56","Y58","Y59","Y60","Y61","Y62","Y63","W92","S92","N92","L93","M83"}
    pcds_idx = None; nhser_idx = None
    best_pc_score = -1.0; best_nhser_score = -1.0
    for col in sample.columns:
        s = sample[col].astype("string").str.upper()
        s_clean = s.str.replace(r"\s+", " ", regex=True)
        pc_mask = s_clean.str.match(r"^[A-Z]{1,2}\d[A-Z0-9]?\s\d[A-Z]{2}$", na=False)
        pc_score = pc_mask.mean()
        if pc_score > best_pc_score:
            best_pc_score = pc_score; pcds_idx = col
        nh_mask = s.str.match(r"^[A-Z]\d{2}$", na=False)
        in_allowed = s.isin(list(allowed_nhser))
        nh_score = 0.6 * nh_mask.mean() + 0.4 * in_allowed.mean()
        if nh_score > best_nhser_score:
            best_nhser_score = nh_score; nhser_idx = col
    if pcds_idx is None or best_pc_score < 0.2:
        return None
    if nhser_idx is None or best_nhser_score < 0.1:
        return None
    return (pcds_idx, nhser_idx)

def load_nhspd_master_nhser(master_csv: Optional[str]) -> Optional[pd.DataFrame]:
    """Read headerless NHSPD master just enough to get [pcds, nhser]."""
    if not master_csv or not Path(master_csv).exists():
        return None
    sample = pd.read_csv(master_csv, header=None, dtype="string", nrows=2000)
    detected = _detect_nhser_cols(sample)
    if not detected:
        print("NHSPD master fallback: could not detect columns for pcds/nhser.")
        return None
    pc_idx, nhser_idx = detected
    try:
        df = pd.read_csv(master_csv, header=None, dtype="string", usecols=[pc_idx, nhser_idx])
    except Exception:
        df = pd.read_csv(master_csv, header=None, dtype="string")[[pc_idx, nhser_idx]]
    df.columns = ["pcds","nhser"]
    df["pcds"] = df["pcds"].astype("string").str.upper().str.replace(r"\s+"," ", regex=True)
    df["nhser"] = df["nhser"].astype("string").str.strip().str.upper()
    df = df.dropna(subset=["pcds"]).drop_duplicates(subset=["pcds"])
    print(f"NHSPD master fallback: detected pcds col {pc_idx}, nhser col {nhser_idx}; rows={len(df):,}")
    return df

# ---------- Optional ICB Names override ----------
def attach_icb_names_from_csv(orgs_df: pd.DataFrame, icb_names_csv: Optional[str]) -> pd.DataFrame:
    if not icb_names_csv or not Path(icb_names_csv).exists():
        if "Geographic Primary Care Organisation Name" not in orgs_df.columns:
            orgs_df["Geographic Primary Care Organisation Name"] = pd.NA
        return orgs_df
    icb_names = pd.read_csv(icb_names_csv, dtype="string")
    cols = icb_names.columns.tolist()
    code_col = next((c for c in cols if re.fullmatch(r"ICB\d{2}CDH?", c, flags=re.I)), None)
    name_col = next((c for c in cols if re.fullmatch(r"ICB\d{2}NM",  c, flags=re.I)), None)
    if code_col is None or name_col is None:
        if "Geographic Primary Care Organisation Name" not in orgs_df.columns:
            orgs_df["Geographic Primary Care Organisation Name"] = pd.NA
        return orgs_df
    icb_map = (
        icb_names[[code_col, name_col]].dropna().drop_duplicates()
        .rename(columns={code_col: "Geographic Primary Care Organisation Code",
                         name_col: "Geographic Primary Care Organisation Name"})
    )
    icb_map["Geographic Primary Care Organisation Code"] = icb_map["Geographic Primary Care Organisation Code"].str.strip().str.upper()
    orgs_df["Geographic Primary Care Organisation Code"] = (
        orgs_df.get("Geographic Primary Care Organisation Code", pd.Series([], dtype="string"))
        .astype("string").str.strip().str.upper()
    )
    orgs_df = orgs_df.merge(icb_map, on="Geographic Primary Care Organisation Code", how="left", suffixes=("", "_icbfix"))
    if "Geographic Primary Care Organisation Name_icbfix" in orgs_df.columns:
        base = orgs_df.get("Geographic Primary Care Organisation Name")
        fix  = orgs_df["Geographic Primary Care Organisation Name_icbfix"]
        orgs_df["Geographic Primary Care Organisation Name"] = base.fillna(fix) if base is not None else fix
        orgs_df.drop(columns=["Geographic Primary Care Organisation Name_icbfix"], inplace=True, errors="ignore")
    return orgs_df

# ---------- core build ----------
def build_curated_tables(onspd_csv: Optional[str]=None, full_rescan: bool=False, max_workers: int=16):
    ensure_dir(EXTRACTS_DIR)
    chunk_dirs = list_chunk_dirs_desc()
    if not chunk_dirs:
        raise RuntimeError("No chunks found under bronze/ods.")

    # PASS A: newest-per-OrgId
    chosen_oids = set()
    dir_need_files: Dict[str, List[Path]] = {}
    for cdir in chunk_dirs:
        files = list(cdir.glob("org_*.json"))
        if full_rescan:
            dir_need_files[str(cdir)] = files
            continue
        need = []
        for fp in files:
            oid = oid_from_file(fp)
            if oid and oid not in chosen_oids:
                need.append(fp); chosen_oids.add(oid)
        if need:
            dir_need_files[str(cdir)] = need

    total_unique_orgs = sum(len(v) for v in dir_need_files.values())
    if total_unique_orgs == 0:
        raise RuntimeError("Nothing to parse (already consolidated).")

    # PASS B: parse with progress bar
    overall = tqdm(total=total_unique_orgs, unit="org", desc="Building curated tables",
                   position=0, leave=True, dynamic_ncols=True, mininterval=0.5, smoothing=0.3)

    chosen: Dict[str, Dict[str, Any]] = {}
    for cdir in chunk_dirs:
        key = str(cdir)
        if key not in dir_need_files: continue
        need_files = dir_need_files[key]

        def parse_one(fp: Path):
            j = read_json_fast(fp)
            org = pick_org(j); oid = get_org_id(org)
            return oid, org

        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futures = {ex.submit(parse_one, fp): fp for fp in need_files}
            for fut in as_completed(futures):
                try:
                    oid, org = fut.result()
                    if oid: chosen[oid] = org
                finally:
                    overall.update(1)
    overall.close()

    # Build rows
    org_rows, date_rows, role_rows, rel_rows, succ_rows = [], [], [], [], []

    for oid, org in chosen.items():
        name   = ensure_str(scalar(org.get("Name")))
        status = ensure_str(scalar(org.get("Status"))); is_active = (status.strip().lower()=="active")
        lcd    = ensure_str(scalar(org.get("LastChangeDate")))
        rc_code, rc_label = get_record_class(org)
        addr = extract_address_fields(org)

        org_rows.append({
            "OrgId": oid, "Name": name, "Status": status, "IsActive": is_active,
            "OrgRecordClass": rc_code, "OrgRecordClassLabel": rc_label,
            "LastChangeDate": lcd, **addr
        })

        for d in extract_dates(org):
            date_rows.append({"OrgId": oid, **d})

        for r in extract_roles(org):
            if r["RoleDates"]:
                for rd in r["RoleDates"]:
                    role_rows.append({
                        "OrgId": oid, "RoleId": r["RoleId"], "PrimaryRole": r["PrimaryRole"],
                        "RoleStatus": r["RoleStatus"], "RoleDateType": rd["Type"],
                        "RoleStart": rd["Start"], "RoleEnd": rd["End"]
                    })
            else:
                role_rows.append({
                    "OrgId": oid, "RoleId": r["RoleId"], "PrimaryRole": r["PrimaryRole"],
                    "RoleStatus": r["RoleStatus"], "RoleDateType": "", "RoleStart": "", "RoleEnd": ""
                })

        for rel in extract_rels(org):
            if rel["RelDates"]:
                for rd in rel["RelDates"]:
                    rel_rows.append({
                        "OrgId": oid, "RelId": rel["RelId"], "RelStatus": rel["RelStatus"],
                        "TargetOrgId": rel["TargetOrgId"], "TargetPrimaryRoleId": rel["TargetPrimaryRoleId"],
                        "RelDateType": rd["Type"], "RelStart": rd["Start"], "RelEnd": rd["End"]
                    })
            else:
                rel_rows.append({
                    "OrgId": oid, "RelId": rel["RelId"], "RelStatus": rel["RelStatus"],
                    "TargetOrgId": rel["TargetOrgId"], "TargetPrimaryRoleId": rel["TargetPrimaryRoleId"],
                    "RelDateType": "", "RelStart": "", "RelEnd": ""
                })

        S = org.get("Succs")
        if isinstance(S, dict):
            sc = S.get("Succ")
            items = sc if isinstance(sc, list) else ([sc] if isinstance(sc, dict) else [])
            for it in items:
                if not isinstance(it, dict): continue
                typ = ensure_str(scalar(it.get("Type"))); target = it.get("Target") or {}
                tgt_org_raw = target.get("OrgId")
                tgt_org = ensure_str(tgt_org_raw.get("extension") if isinstance(tgt_org_raw, dict) and "extension" in tgt_org_raw else scalar(tgt_org_raw))
                tgt_role_raw = target.get("PrimaryRoleId") or {}
                tgt_role = ensure_str((tgt_role_raw.get("id") if isinstance(tgt_role_raw, dict) else tgt_role_raw) or "")
                d = it.get("Date")
                if isinstance(d, list):
                    for sd in d:
                        if isinstance(sd, dict):
                            succ_rows.append({
                                "OrgId": oid, "SuccType": typ, "TargetOrgId": tgt_org,
                                "TargetPrimaryRoleId": tgt_role, "SuccDateType": ensure_str(sd.get("Type")),
                                "SuccStart": ensure_str(sd.get("Start")), "SuccEnd": ensure_str(sd.get("End"))
                            })

    def build_df(rows, required_cols, sort_cols):
        df = pd.DataFrame(rows)
        if df.empty:
            df = pd.DataFrame(columns=required_cols)
        else:
            for c in required_cols:
                if c not in df.columns: df[c] = ""
        sort_by = [c for c in sort_cols if c in df.columns]
        if sort_by: df = df.sort_values(sort_by)
        return df.reset_index(drop=True)

    orgs_df  = build_df(
        org_rows,
        ["OrgId","Name","Status","IsActive","OrgRecordClass","OrgRecordClassLabel",
         "LastChangeDate","AddrLine1","AddrLine2","AddrLine3","AddrLine4",
         "TownCity","County","Country","PostCode","PostCodeSpaced","AddressFull"],
        ["OrgId"]
    ).drop_duplicates(subset=["OrgId"])

    dates_df = build_df(date_rows, ["OrgId","DateType","Start","End"], ["OrgId","DateType","Start"])
    roles_df = build_df(role_rows, ["OrgId","RoleId","PrimaryRole","RoleStatus","RoleDateType","RoleStart","RoleEnd"], ["OrgId","RoleId","RoleStart"])
    rels_df  = build_df(rel_rows,  ["OrgId","RelId","RelStatus","TargetOrgId","TargetPrimaryRoleId","RelDateType","RelStart","RelEnd"], ["OrgId","RelId","RelStart"])
    succ_df  = build_df(succ_rows, ["OrgId","SuccType","TargetOrgId","TargetPrimaryRoleId","SuccDateType","SuccStart","SuccEnd"], ["OrgId","SuccType","SuccStart"])

    # ---------- Relationship roll-ups ----------
    def _pick_latest_target(df: pd.DataFrame) -> pd.DataFrame:
        if df.empty:
            return pd.DataFrame(columns=["OrgId", "TargetOrgId"])
        cand = df.copy()
        cand["__active__"]   = cand["RelStatus"].astype(str).str.lower().eq("active")
        cand["RelEnd_dt"]    = pd.to_datetime(cand["RelEnd"].replace("", pd.NA),  errors="coerce")
        cand["RelStart_dt"]  = pd.to_datetime(cand["RelStart"].replace("", pd.NA),errors="coerce")
        cand = cand.sort_values(["OrgId", "__active__", "RelEnd_dt", "RelStart_dt"],
                                ascending=[True, False, False, False])
        return cand.groupby("OrgId", as_index=False).head(1)[["OrgId","TargetOrgId"]]

    # Commissioned By → RE4 (as before)
    comm_pool = rels_df[rels_df["RelId"].astype(str).str.upper().eq("RE4")]
    comm_df   = _pick_latest_target(comm_pool).rename(columns={"TargetOrgId":"Is Commissioned By - Code"})

    # Partner To:
    #   Prefer PCN: RE8 + TargetPrimaryRoleId == RO272
    pcn_pool = rels_df[
        rels_df["RelId"].astype(str).str.upper().eq("RE8") &
        rels_df["TargetPrimaryRoleId"].astype(str).str.upper().eq(PCN_ROLE_ID)
    ]
    partner_pcn = _pick_latest_target(pcn_pool).rename(columns={"TargetOrgId":"Is Partner To - Code"})

    #   Fallback: RE6 (historical commissioning/partner linkage)
    re6_pool = rels_df[rels_df["RelId"].astype(str).str.upper().eq("RE6")]
    partner_re6 = _pick_latest_target(re6_pool).rename(columns={"TargetOrgId":"Is Partner To - Code"})

    # Merge preference: start with RE6 then overwrite with PCN if available
    rels_rollup = orgs_df[["OrgId"]].merge(partner_re6, on="OrgId", how="left") \
                                     .merge(partner_pcn, on="OrgId", how="left", suffixes=("", "_pcn"))
    # prefer PCN code if present
    rels_rollup["Is Partner To - Code"] = rels_rollup["Is Partner To - Code_pcn"].fillna(rels_rollup["Is Partner To - Code"])
    rels_rollup.drop(columns=[c for c in rels_rollup.columns if c.endswith("_pcn")], inplace=True, errors="ignore")

    # Attach names for both Partner/Commission
    tgt_names = orgs_df[["OrgId","Name"]].rename(columns={"OrgId":"__TgtId","Name":"__TgtName"})
    rels_rollup = (rels_rollup
        .merge(tgt_names, left_on="Is Partner To - Code", right_on="__TgtId", how="left")
        .drop(columns="__TgtId").rename(columns={"__TgtName":"Is Partner To - Name"})
        .merge(comm_df, on="OrgId", how="left")
        .merge(tgt_names, left_on="Is Commissioned By - Code", right_on="__TgtId", how="left")
        .drop(columns="__TgtId").rename(columns={"__TgtName":"Is Commissioned By - Name"}))

    for c in ["Is Partner To - Code","Is Partner To - Name","Is Commissioned By - Code","Is Commissioned By - Name"]:
        if c not in rels_rollup.columns: rels_rollup[c] = pd.NA

    orgs_df = orgs_df.merge(rels_rollup, on="OrgId", how="left")

    # ---------- ONSPD enrichment ----------
    if onspd_csv and Path(onspd_csv).exists():
        try:
            hdr = pd.read_csv(onspd_csv, nrows=0).columns
            hdr_lower = [c.lower() for c in hdr]
            base_long  = {"PCDS":"pcds","LADCD":"lad","LSOA21CD":"lsoa21","MSOA21CD":"msoa21",
                          "ICB23CD":"icb","CTRY":"ctry","LAT":"lat","LONG":"long",
                          "RGN":"rgn","RGNNM":"rgnnm","NHSECD":"nhser","NHSENM":"nhsernm"}
            base_short = {"pcds":"pcds","oslaua":"lad","lsoa21":"lsoa21","msoa21":"msoa21",
                          "icb":"icb","ctry":"ctry","lat":"lat","long":"long",
                          "rgn":"rgn","rgnnm":"rgnnm","nhsecd":"nhser","nhsernm":"nhsernm"}

            use_map = base_short if "oslaua" in hdr_lower else base_long
            usecols = [c for c in use_map if c in hdr]
            onspd = pd.read_csv(onspd_csv, dtype="string", usecols=usecols).rename(columns={k:v for k,v in use_map.items() if k in hdr})
            onspd = onspd.drop_duplicates(subset=["pcds"])

            orgs_df = (orgs_df
                       .merge(onspd, left_on="PostCodeSpaced", right_on="pcds", how="left")
                       .drop(columns=["pcds"]))

            if "lat" in orgs_df.columns:
                orgs_df["Latitude"]  = pd.to_numeric(orgs_df["lat"], errors="coerce")
                orgs_df["Longitude"] = pd.to_numeric(orgs_df["long"], errors="coerce")

            orgs_df["AreaCode_LAD"]  = orgs_df.get("lad",   orgs_df.get("AreaCode_LAD"))
            orgs_df["AreaCode_LSOA"] = orgs_df.get("lsoa21",orgs_df.get("AreaCode_LSOA"))
            orgs_df["AreaCode_MSOA"] = orgs_df.get("msoa21",orgs_df.get("AreaCode_MSOA"))
            orgs_df["AreaCode_ICB"]  = orgs_df.get("icb",   orgs_df.get("AreaCode_ICB"))

            if "rgn" in orgs_df.columns:
                orgs_df["Geographic Government Office Region Code"] = orgs_df["rgn"]
            if "nhser" in orgs_df.columns:
                orgs_df["High Level Health Geography Code"] = orgs_df["nhser"]
            if "ctry" in orgs_df.columns:
                orgs_df["National Grouping Code"] = orgs_df["ctry"]

            orgs_df.drop(columns=[c for c in ["lat","long","lad","lsoa21","msoa21","icb","rgn","rgnnm","nhser","nhsernm","ctry"]
                                  if c in orgs_df.columns],
                         inplace=True, errors="ignore")
        except Exception as e:
            print("ONSPD enrichment skipped:", e)

    # ---------- NHSPD master fallback for NHSER (fills code if still missing) ----------
    nhser_needed = ("High Level Health Geography Code" not in orgs_df.columns) or orgs_df["High Level Health Geography Code"].isna().all()
    if nhser_needed:
        nhser_df = load_nhspd_master_nhser(NHSPD_MASTER_CSV)
        if nhser_df is not None and not nhser_df.empty:
            orgs_df["__pc_join__"] = orgs_df["PostCodeSpaced"].astype("string").str.upper().str.replace(r"\s+"," ", regex=True)
            nhser_df["pcds"] = nhser_df["pcds"].astype("string").str.upper().str.replace(r"\s+"," ", regex=True)
            orgs_df = orgs_df.merge(
                nhser_df.rename(columns={"pcds":"__pc_join__","nhser":"High Level Health Geography Code"}),
                on="__pc_join__", how="left"
            )
            orgs_df.drop(columns=["__pc_join__"], inplace=True, errors="ignore")

    # ---------- NHSPD names (LAD/ICB/RGN/NHSER/CTRY) ----------
    lookups = load_nhspd_lookups(NHSPD_NAMES_DIR, NHSPD_LOOKUP_DIR)

    def attach_name(df, code_col, out_name_col, lk_key):
        lk = lookups.get(lk_key)
        if lk is None or not isinstance(lk, pd.DataFrame) or lk.empty:
            if out_name_col not in df.columns: df[out_name_col] = pd.NA
            return df
        if code_col not in df.columns:
            df[code_col] = pd.NA
        df[code_col] = df[code_col].astype("string").str.strip().str.upper()
        lk = lk.drop_duplicates(subset=["code"])
        return df.merge(lk.rename(columns={"code":code_col, "name":out_name_col}), on=code_col, how="left")

    # Ensure code cols exist prior to attaching names
    for col in ["AreaCode_LAD","AreaCode_ICB","Geographic Government Office Region Code",
                "High Level Health Geography Code","National Grouping Code"]:
        if col not in orgs_df.columns: orgs_df[col] = pd.NA

    # Promote canonical columns used for naming
    orgs_df["Geographic Local Authority Code"]           = orgs_df["AreaCode_LAD"]
    orgs_df["Geographic Primary Care Organisation Code"] = orgs_df["AreaCode_ICB"]

    # Attach names
    orgs_df = attach_name(orgs_df, "Geographic Local Authority Code",           "Geographic Local Authority Name",           "lad")
    orgs_df = attach_name(orgs_df, "Geographic Primary Care Organisation Code", "Geographic Primary Care Organisation Name", "icb")
    orgs_df = attach_name(orgs_df, "Geographic Government Office Region Code",  "Geographic Government Office Region Name",  "rgn")
    orgs_df = attach_name(orgs_df, "High Level Health Geography Code",          "High Level Health Geography Name",          "nhser")
    orgs_df = attach_name(orgs_df, "National Grouping Code",                    "National Grouping Name",                    "ctry")

    # Optional ICB name overlay
    orgs_df = attach_icb_names_from_csv(orgs_df, ICB_NAMES_CSV)

    # If NHSER name is entirely null, drop the column (keep the code)
    if "High Level Health Geography Name" in orgs_df.columns:
        col = orgs_df["High Level Health Geography Name"].astype("string").str.strip()
        all_null = (~col.replace({"": pd.NA}).notna()).all()
        if all_null:
            orgs_df.drop(columns=["High Level Health Geography Name"], inplace=True, errors="ignore")

    # ---------- GP practices slice ----------
    gp_ids_ro76 = set(roles_df.loc[roles_df["RoleId"].isin(GP_ROLE_IDS), "OrgId"])
    gp_like_ids_ro177 = set(
        roles_df.loc[
            roles_df["RoleId"].eq(PRESCRIBE_ROLE) &
            roles_df["OrgId"].astype(str).str.match(r"^[A-Z]\d{5}$", na=False),
            "OrgId"
        ]
    )
    gp_ids = gp_ids_ro76 | gp_like_ids_ro177

    orgs_gp = orgs_df[orgs_df["OrgId"].isin(gp_ids)].copy()
    if not orgs_gp.empty:
        orgs_gp["__is_active__"] = orgs_gp["IsActive"].fillna(False)
        orgs_gp = (orgs_gp.sort_values(["OrgId","__is_active__","LastChangeDate"], ascending=[True, False, False])
                         .drop_duplicates(subset=["OrgId"])
                         .drop(columns="__is_active__", errors="ignore"))

    # ---------- Output ----------
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    out_dir = EXTRACTS_DIR / stamp
    latest_dir = EXTRACTS_DIR / "latest"
    ensure_dir(out_dir); ensure_dir(latest_dir)

    orgs_df.to_parquet(out_dir / "orgs.parquet", index=False)
    orgs_df.to_csv(out_dir / "orgs.csv", index=False)
    dates_df.to_csv(out_dir / "org_dates.csv", index=False)
    roles_df.to_csv(out_dir / "org_roles.csv", index=False)
    rels_df.to_csv(out_dir / "org_rels.csv", index=False)
    succ_df.to_csv(out_dir / "org_succs.csv", index=False)
    orgs_gp.to_csv(out_dir / "orgs_gp.csv", index=False)

    # Mirror to latest/
    orgs_df.to_parquet(latest_dir / "orgs.parquet", index=False)
    orgs_df.to_csv(latest_dir / "orgs.csv", index=False)
    dates_df.to_csv(latest_dir / "org_dates.csv", index=False)
    roles_df.to_csv(latest_dir / "org_roles.csv", index=False)
    rels_df.to_csv(latest_dir / "org_rels.csv", index=False)
    succ_df.to_csv(latest_dir / "org_succs.csv", index=False)
    orgs_gp.to_csv(latest_dir / "orgs_gp.csv", index=False)

    # ---------- Coverage ----------
    def pct(x, y):
        try: return 100.0 * (x / y) if y else 0.0
        except: return 0.0

    eng_mask = orgs_df.get("National Grouping Code", pd.Series([], dtype="string")).eq("E92000001")
    if eng_mask.empty:
        eng_mask = orgs_df["Country"].astype(str).str.upper().eq("ENGLAND")

    total_eng = int(eng_mask.sum())
    lad_cov_ct  = int((eng_mask & orgs_df["Geographic Local Authority Code"].astype(str).str.match(r"^E0(6|7|8|9)\d{5}$", na=False)).sum())
    icb_cov_ct  = int((eng_mask & orgs_df["Geographic Primary Care Organisation Code"].astype(str).str.match(r"^E54\d+$", na=False)).sum())

    report = [
        f"Snapshot: {stamp}",
        f"Total unique orgs: {len(orgs_df):,}",
        f"English orgs: {total_eng:,}",
        f"LAD code coverage (England): {lad_cov_ct:,} / {total_eng:,} = {pct(lad_cov_ct, total_eng):.1f}%",
        f"ICB code coverage (England): {icb_cov_ct:,} / {total_eng:,} = {pct(icb_cov_ct, total_eng):.1f}%",
    ]
    (out_dir / "coverage_report.txt").write_text("\n".join(report), encoding="utf-8")
    (latest_dir / "coverage_report.txt").write_text("\n".join(report), encoding="utf-8")

    print("\n".join(["✓ Wrote curated tables:",
                     f"- Snapshot: {out_dir}",
                     f"- Latest:   {latest_dir}",
                     "", *report]))

    return {
        "orgs": orgs_df, "dates": dates_df, "roles": roles_df,
        "rels": rels_df, "succs": succ_df, "orgs_gp": orgs_gp
    }

# ---------- RUN ----------
tables = build_curated_tables(
    onspd_csv=ONSPD_CSV,
    full_rescan=FULL_RESCAN,
    max_workers=MAX_WORKERS
)

# Quick peek
try:
    display(tables["orgs"].head(10))
    display(tables["orgs_gp"].head(10))
except Exception:
    print(tables["orgs"].head(10).to_string(index=False))
    print(tables["orgs_gp"].head(10).to_string(index=False))


Building curated tables:   0%|          | 0/286680 [00:00<?, ?org/s]

NHSPD master fallback: detected pcds col 0, nhser col 7; rows=2,717,882
✓ Wrote curated tables:
- Snapshot: C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\2025-10-03_1703
- Latest:   C:\Users\NikhilYadav\Desktop\NHS ODS\bronze\ods\extracts\curated\latest

Snapshot: 2025-10-03_1703
Total unique orgs: 286,680
English orgs: 275,786
LAD code coverage (England): 0 / 275,786 = 0.0%
ICB code coverage (England): 275,786 / 275,786 = 100.0%


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,Geographic Government Office Region Code,National Grouping Code,High Level Health Geography Code,Geographic Local Authority Code,Geographic Primary Care Organisation Code,Geographic Local Authority Name,Geographic Primary Care Organisation Name,Geographic Government Office Region Name,High Level Health Geography Name,National Grouping Name
0,002,DYFED,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
1,003,GWENT,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
2,004,GWYNEDD,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
3,005,MID GLAMORGAN,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
4,006,POWYS,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
5,007,SOUTH GLAMORGAN,Inactive,False,RC1,HSCOrg,2013-05-08,**UNKNOWN**,,,...,,,,,,,,,,
6,008,WEST GLAMORGAN,Inactive,False,RC1,HSCOrg,2014-02-20,**UNKNOWN**,,,...,,,,,,,,,,
7,00D,"NHS DURHAM DALES, EASINGTON AND SEDGEFIELD CCG",Inactive,False,RC1,HSCOrg,2023-09-21,SEDGEFIELD COMMUNITY HOSPITAL,SALTERS LANE,SEDGEFIELD,...,E12000001,E92000001,116.0,E06000047,E54000050,County Durham,NHS North East and North Cumbria Integrated Ca...,North East,,England
8,00DAA,"NHS DURHAM DALES, EASINGTON AND SEDGEFIELD HQ",Inactive,False,RC2,HSCSite,2021-04-26,SEDGEFIELD COMMUNITY HOSPITAL,SALTERS LANE,SEDGEFIELD,...,E12000001,E92000001,116.0,E06000047,E54000050,County Durham,NHS North East and North Cumbria Integrated Ca...,North East,,England
9,00F,NHS GATESHEAD CCG,Inactive,False,RC1,HSCOrg,2021-03-10,RIVERSIDE HOUSE,GOLDCREST WAY,NEWBURN RIVERSIDE,...,E12000001,E92000001,107.0,E08000021,E54000050,Newcastle upon Tyne,NHS North East and North Cumbria Integrated Ca...,North East,,England


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,Geographic Government Office Region Code,National Grouping Code,High Level Health Geography Code,Geographic Local Authority Code,Geographic Primary Care Organisation Code,Geographic Local Authority Name,Geographic Primary Care Organisation Name,Geographic Government Office Region Name,High Level Health Geography Name,National Grouping Name
33787,A81002,QUEENS PARK MEDICAL CENTRE,Active,True,RC1,HSCOrg,2024-06-11,FARRER STREET,,,...,E12000001,E92000001,114,E06000004,E54000050,Stockton-on-Tees,NHS North East and North Cumbria Integrated Ca...,North East,,England
33788,A81003,VICTORIA MEDICAL PRACTICE,Inactive,False,RC1,HSCOrg,2023-08-22,HEALTH CENTRE,VICTORIA ROAD,,...,E12000001,E92000001,111,E06000001,E54000050,Hartlepool,NHS North East and North Cumbria Integrated Ca...,North East,,England
33789,A81004,ACKLAM MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,TRIMDON AVENUE,ACKLAM,,...,E12000001,E92000001,112,E06000002,E54000050,Middlesbrough,NHS North East and North Cumbria Integrated Ca...,North East,,England
33791,A81005,SPRINGWOOD SURGERY,Active,True,RC1,HSCOrg,2023-08-22,RECTORY LANE,,,...,E12000001,E92000001,113,E06000003,E54000050,Redcar and Cleveland,NHS North East and North Cumbria Integrated Ca...,North East,,England
33792,A81006,TENNANT STREET MEDICAL PRACTICE,Active,True,RC1,HSCOrg,2024-08-31,TENNANT STREET,,,...,E12000001,E92000001,114,E06000004,E54000050,Stockton-on-Tees,NHS North East and North Cumbria Integrated Ca...,North East,,England
33793,A81007,BANKHOUSE SURGERY,Active,True,RC1,HSCOrg,2023-08-22,ONE LIFE HARTLEPOOL,PARK ROAD,,...,E12000001,E92000001,111,E06000001,E54000050,Hartlepool,NHS North East and North Cumbria Integrated Ca...,North East,,England
33794,A81008,ALBERT HOUSE CLINIC,Inactive,False,RC1,HSCOrg,2023-08-22,LOW GRANGE HEALTH VILLAGE,NORMANBY ROAD,,...,E12000001,E92000001,113,E06000003,E54000050,Redcar and Cleveland,NHS North East and North Cumbria Integrated Ca...,North East,,England
33795,A81009,VILLAGE MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,400-404 LINTHORPE ROAD,,,...,E12000001,E92000001,112,E06000002,E54000050,Middlesbrough,NHS North East and North Cumbria Integrated Ca...,North East,,England
33796,A81011,CHADWICK PRACTICE,Active,True,RC1,HSCOrg,2023-08-22,PARK ROAD,,,...,E12000001,E92000001,111,E06000001,E54000050,Hartlepool,NHS North East and North Cumbria Integrated Ca...,North East,,England
33797,A81012,WESTBOURNE MEDICAL CENTRE,Active,True,RC1,HSCOrg,2023-08-22,NORTH ORMESBY HEALTH VILLAGE,7 TRINITY MEWS,NORTH ORMESBY,...,E12000001,E92000001,112,E06000002,E54000050,Middlesbrough,NHS North East and North Cumbria Integrated Ca...,North East,,England


In [57]:
sample=tables["orgs"][tables["orgs"].columns[:21]]

In [58]:
sample.sample(50)

Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,TownCity,County,Country,PostCode,PostCodeSpaced,AddressFull,Is Partner To - Code,Is Partner To - Name,Is Commissioned By - Code,Is Commissioned By - Name
218297,V07758,DENTAL SURGERY,Inactive,False,RC1,HSCOrg,2024-03-05,498 NARBOROUGH ROAD,,,...,LEICESTER,LEICESTERSHIRE,ENGLAND,LE32FU,LE3 2FU,"498 NARBOROUGH ROAD, LEICESTER, LEICESTERSHIRE...",,,QK1,"NHS LEICESTER, LEICESTERSHIRE AND RUTLAND INTE..."
173096,RRE1L,MENTAL HEALTH 1,Inactive,False,RC2,HSCSite,2024-04-02,ST. MICHAELS HOSPITAL,OLD NURSES HOME,TRENT VALLEY ROAD,...,LICHFIELD,STAFFORDSHIRE,ENGLAND,WS136EF,WS13 6EF,"ST. MICHAELS HOSPITAL, OLD NURSES HOME, TRENT ...",RRE,MIDLANDS PARTNERSHIP UNIVERSITY NHS FOUNDATION...,,
97894,EE150852,KING'S LANDER PRIMARY ACADEMY,Active,True,RC1,HSCOrg,2024-09-25,PENNINGTON ROAD,LITHERLAND,,...,LIVERPOOL,,ENGLAND,L218HY,L21 8HY,"PENNINGTON ROAD, LITHERLAND, LIVERPOOL, ENGLAND",,,,
286531,Z8G4B,COPELAND NPC - COVID LOCAL VACCINATION SERVICE 2,Active,True,RC2,HSCSite,2024-08-12,3 CASTLE MEADOWS,,,...,WHITEHAVEN,,ENGLAND,CA287RG,CA28 7RG,"3 CASTLE MEADOWS, WHITEHAVEN, ENGLAND",A82041,LOWTHER MEDICAL CENTRE,,
276207,X0C5C,THE GALTRES CENTRE - COVID LOCAL VACCINATION S...,Inactive,False,RC2,HSCSite,2021-06-25,MARKET PLACE,EASINGWOLD,,...,YORK,,ENGLAND,YO613AD,YO61 3AD,"MARKET PLACE, EASINGWOLD, YORK, ENGLAND",B82033,PICKERING MEDICAL PRACTICE,,
65606,DE825,CROSSLEY MANOR CHILDRENS HOME,Active,True,RC2,HSCSite,2020-04-03,MILL LANE,RAINHILL,,...,PRESCOT,MERSEYSIDE,ENGLAND,L356NE,L35 6NE,"MILL LANE, RAINHILL, PRESCOT, MERSEYSIDE, ENGLAND",DE8,ELYSIUM HEALTHCARE,,
39458,AGX2,HOME ADDRESS LTD,Active,True,RC1,HSCOrg,2019-06-04,PROSPECT PLACE,PROSPECT PLACE,,...,CRYMYCH,DYFED,WALES,SA413QQ,SA41 3QQ,"PROSPECT PLACE, PROSPECT PLACE, CRYMYCH, DYFED...",,,,
241652,VM3W9,MARTINS CARE - THE ANGELS,Inactive,False,RC1,HSCOrg,2020-01-23,38 COUNTY CLOSE,WOODGATE,,...,BIRMINGHAM,WEST MIDLANDS,ENGLAND,B323SZ,B32 3SZ,"38 COUNTY CLOSE, WOODGATE, BIRMINGHAM, WEST MI...",AEVN,SHARON JANE MARTIN,,
144477,NYNAV,SPECSAVERS HEARCARE - BLACK COUNTRY - BROWNHILLS,Active,True,RC2,HSCSite,2020-04-05,UNIT 1 12-28 HIGH STREET,BROWNHILLS,,...,WALSALL,WEST MIDLANDS,ENGLAND,WS86EQ,WS8 6EQ,"UNIT 1 12-28 HIGH STREET, BROWNHILLS, WALSALL,...",NYN,SPECSAVERS HEARCARE GROUP LTD,,
254694,VN8A9,SHINE PARTNERSHIPS,Active,True,RC1,HSCOrg,2024-05-20,2 MOSS HALL CRESCENT,,,...,LONDON,GREATER LONDON,ENGLAND,N128NY,N12 8NY,"2 MOSS HALL CRESCENT, LONDON, GREATER LONDON, ...",AWGW,SHINE PARTNERSHIPS LTD,,


In [59]:
tables["orgs"].sample(200)

Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,Geographic Government Office Region Code,National Grouping Code,High Level Health Geography Code,Geographic Local Authority Code,Geographic Primary Care Organisation Code,Geographic Local Authority Name,Geographic Primary Care Organisation Name,Geographic Government Office Region Name,High Level Health Geography Name,National Grouping Name
276914,X7J0Y,UNIVERSITY HOSPITAL SOUTHAMPTON NHS FT - VALNE...,Active,True,RC2,HSCSite,2021-08-23,TREMONA ROAD,,,...,E12000008,E92000001,814,E06000045,E54000042,Southampton,NHS Hampshire and Isle of Wight Integrated Car...,South East,,England
39937,AJ60B,FLAGG COURT HEALTH CENTRE,Active,True,RC2,HSCSite,2020-04-03,7 FLAGG COURT,,,...,E12000001,E92000001,109,E08000023,E54000050,South Tyneside,NHS North East and North Cumbria Integrated Ca...,North East,,England
78258,EE116894,IVINGTON COFE PRIMARY AND PRE-SCHOOL,Active,True,RC1,HSCOrg,2023-02-09,IVINGTON,,,...,E12000005,E92000001,415,E06000019,E54000019,"Herefordshire, County of",NHS Herefordshire and Worcestershire Integrate...,West Midlands,,England
235540,VLNJF,NORWAY LODGE NURSING HOME,Active,True,RC1,HSCOrg,2021-02-12,RESERVOIR ROAD,PRENTON,,...,E12000002,E92000001,319,E08000015,E54000008,Wirral,NHS Cheshire and Merseyside Integrated Care Board,North West,,England
270785,VNV6P,THE PAVILIONS,Active,True,RC1,HSCOrg,2024-07-24,50 ALMA ROAD,MILLFIELD,,...,E12000006,E92000001,624,E06000031,E54000056,Peterborough,NHS Cambridgeshire and Peterborough Integrated...,East of England,,England
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125600,J84018,GROVE HOUSE SURGERY,Inactive,False,RC1,HSCOrg,2023-08-22,102 ALBERT STREET,,,...,E12000008,E92000001,803,E06000046,E54000042,Isle of Wight,NHS Hampshire and Isle of Wight Integrated Car...,South East,,England
267048,VNP1X,CHITIM CARE LIMITED,Inactive,False,RC1,HSCOrg,2025-07-18,14 BOYD CLOSE,MOSTON,,...,E12000002,E92000001,326,E06000049,E54000008,Cheshire East,NHS Cheshire and Merseyside Integrated Care Board,North West,,England
146304,P04A,U & U LTD,Active,True,RC1,HSCOrg,2012-05-15,57-59 FOXHALL ROAD,,,...,E12000006,E92000001,609,E07000202,E54000023,Ipswich,NHS Suffolk and North East Essex Integrated Ca...,East of England,,England
276584,X3H9C,KLEYN HEALTHCARE - LOWER VALLEY PCN,Active,True,RC2,HSCSite,2022-02-10,16A CHURCH LANE,,,...,E12000003,E92000001,210,E08000033,E54000054,Calderdale,NHS West Yorkshire Integrated Care Board,Yorkshire and The Humber,,England


In [60]:
oid = "C83617"
df = tables["orgs"]

# 1) Exact match (case-insensitive, trims spaces)
row = df[df["OrgId"].astype(str).str.strip().str.upper().eq(oid.upper())]
display(row)            # in notebooks
print(row.to_string())  # in scripts/console


Unnamed: 0,OrgId,Name,Status,IsActive,OrgRecordClass,OrgRecordClassLabel,LastChangeDate,AddrLine1,AddrLine2,AddrLine3,...,Geographic Government Office Region Code,National Grouping Code,High Level Health Geography Code,Geographic Local Authority Code,Geographic Primary Care Organisation Code,Geographic Local Authority Name,Geographic Primary Care Organisation Name,Geographic Government Office Region Name,High Level Health Geography Name,National Grouping Name
60685,C83617,ABBEYVIEW SURGERY,Active,True,RC1,HSCOrg,2023-08-22,CROWLAND HEALTH CENTRE,THORNEY ROAD,CROWLAND,...,E12000004,E92000001,503,E07000140,E54000013,South Holland,NHS Lincolnshire Integrated Care Board,East Midlands,,England


        OrgId               Name  Status  IsActive OrgRecordClass OrgRecordClassLabel LastChangeDate               AddrLine1     AddrLine2 AddrLine3 AddrLine4      TownCity          County  Country PostCode PostCodeSpaced                                                                            AddressFull Is Partner To - Code          Is Partner To - Name Is Commissioned By - Code   Is Commissioned By - Name  Latitude  Longitude AreaCode_LAD AreaCode_LSOA AreaCode_MSOA AreaCode_ICB Geographic Government Office Region Code National Grouping Code High Level Health Geography Code Geographic Local Authority Code Geographic Primary Care Organisation Code Geographic Local Authority Name Geographic Primary Care Organisation Name Geographic Government Office Region Name High Level Health Geography Name National Grouping Name
60685  C83617  ABBEYVIEW SURGERY  Active      True            RC1              HSCOrg     2023-08-22  CROWLAND HEALTH CENTRE  THORNEY ROAD  CROWLAND            PETERBORO