In [7]:
from pathlib import Path
import re, json, gzip, hashlib, time

# --- config (override if your root isn't 'out') ---
# Root of the local output tree. The diff cell reads partitions from
# OUTPUT_DIR / "bronze" / "data" and is invoked as diff_ingests(OUTPUT_DIR).
OUTPUT_DIR = Path("out")  # <- change if needed

# ========== helpers ==========
def _token(parts, key):
    for p in parts:
        if isinstance(p, str) and p.startswith(f"{key}="):
            return p.split("=", 1)[1]
    return None

def _sha256_of_gzip_uncompressed(p: Path, max_bytes=16*1024*1024):
    """Fast-ish hash for change detection: read up to 16MB uncompressed."""
    h = hashlib.sha256()
    with gzip.open(p, "rb") as g:
        while True:
            chunk = g.read(1024*256)
            if not chunk:
                break
            h.update(chunk)
            max_bytes -= len(chunk)
            if max_bytes <= 0:
                break
    return h.hexdigest()

def _bronze_data_base(out_root: Path) -> Path:
    return out_root / "bronze" / "data"

def _list_ingest_dates(out_root: Path):
    """
    List the distinct ingest dates present under ``<out_root>/bronze/data``.

    Every directory named ``ingest_date=<value>`` at any depth is considered;
    only values shaped like ``YYYY-MM-DD`` are kept. Returns the dates as a
    sorted list of strings. When the data directory does not exist, a hint is
    printed and an empty list is returned.
    """
    data_root = _bronze_data_base(out_root)
    if not data_root.exists():
        print(f"[DIFF] Base not found: {data_root} — run your downloader first (or set OUTPUT_DIR).")
        return []

    iso_day = re.compile(r"\d{4}-\d{2}-\d{2}")
    candidates = {
        entry.name.partition("=")[2]
        for entry in data_root.rglob("ingest_date=*")
        if entry.is_dir()
    }
    return sorted(day for day in candidates if iso_day.fullmatch(day))

def _load_partition_index(out_root: Path, ingest_date: str):
    """
    Build an index of the partitions found on disk for one ingest date.

    Returns: {(profile_key, area_type_id): {"status": ok|empty|ok_no_meta|missing, "sha": str|None, "rows": int|None}}

    Two passes are made over ``<out_root>/bronze/data``:

    1. Every ``_meta.json`` whose path carries ``profile_key=``,
       ``area_type_id=`` and ``ingest_date=<ingest_date>`` components. The
       status is ``empty`` when the meta has a truthy ``"empty"`` field,
       otherwise ``ok`` if the sibling ``all_data_by_profile.csv.gz`` exists,
       otherwise ``missing``. ``sha`` and ``rows`` are read from the meta's
       ``sha256_uncompressed`` and ``row_count_estimate`` fields.
    2. Every ``all_data_by_profile.csv.gz`` not already indexed, recorded as
       ``ok_no_meta`` with a locally computed hash.

    A meta file that cannot be read, is not valid JSON, or does not contain a
    JSON object is treated as an empty mapping. Partitions whose
    ``area_type_id`` is not an integer are skipped with a printed warning
    instead of aborting the whole scan.

    NOTE(review): hashes computed here come from
    ``_sha256_of_gzip_uncompressed`` (capped read), whereas hashes taken from
    ``_meta.json`` are whatever the writer stored — confirm both are produced
    the same way before comparing them across dates.
    """
    base = _bronze_data_base(out_root)
    idx = {}
    if not base.exists():
        return idx

    def _partition_key(path: Path):
        """Return (profile_key, int(area_type_id)) if ``path`` belongs to ``ingest_date``, else None."""
        parts = list(path.parts)
        pkey = _token(parts, "profile_key")
        atid = _token(parts, "area_type_id")
        idate = _token(parts, "ingest_date")
        if not (pkey and atid and idate) or idate != ingest_date:
            return None
        try:
            return (pkey, int(atid))
        except ValueError:
            print(f"[DIFF] Skipping partition with non-integer area_type_id: {path.parent}")
            return None

    # authoritative: _meta.json
    for meta_path in base.rglob("_meta.json"):
        key = _partition_key(meta_path)
        if key is None:
            continue

        csv_path = meta_path.parent / "all_data_by_profile.csv.gz"
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            meta = {}
        if not isinstance(meta, dict):
            # Valid JSON that is not an object (e.g. a list) carries no usable fields.
            meta = {}

        sha = meta.get("sha256_uncompressed")
        rows = meta.get("row_count_estimate")
        if meta.get("empty"):
            status = "empty"
        elif csv_path.exists():
            status = "ok"
        else:
            status = "missing"
        # if ok but sha missing, compute a quick one so we can diff
        if status == "ok" and not sha:
            sha = _sha256_of_gzip_uncompressed(csv_path)
        idx[key] = {"status": status, "sha": sha, "rows": rows}

    # also consider csvs with no meta (older runs)
    for csv_path in base.rglob("all_data_by_profile.csv.gz"):
        key = _partition_key(csv_path)
        if key is None or key in idx:
            continue
        idx[key] = {
            "status": "ok_no_meta",
            "sha": _sha256_of_gzip_uncompressed(csv_path),
            "rows": None,
        }
    return idx

def diff_ingests(out_root: Path, cur_date: str = None, prev_date: str = None, write_reports=True):
    """
    Compare the partitions of one ingest date against an earlier ingest date.

    Parameters
    ----------
    out_root : Path
        Output root containing ``bronze/data``.
    cur_date : str, optional
        Ingest date (``YYYY-MM-DD``) treated as current. Defaults to the latest
        date found on disk.
    prev_date : str, optional
        Ingest date to compare against. Defaults to the latest date strictly
        before ``cur_date``; when there is none, the previous index is empty.
    write_reports : bool
        When true, one CSV per bucket is written to
        ``<out_root>/bronze/diffs/ingest_date=<cur_date>/``.

    Returns
    -------
    dict
        ``{"error": ..., "message": ...}`` when no ingest dates exist.
        Otherwise a dict with ``current_date``, ``previous_date``, ``counts``
        and one list of tuples per bucket:

        - ``new_ok``       (profile_key, area_type_id, sha, rows)
        - ``changed_ok``   (profile_key, area_type_id, prev_sha, cur_sha)
        - ``became_empty`` (profile_key, area_type_id)
        - ``became_ok``    (profile_key, area_type_id, sha, rows)
        - ``disappeared``  (profile_key, area_type_id)
        - ``unchanged_ok`` (profile_key, area_type_id, sha)

    Notes
    -----
    Statuses ``ok`` and ``ok_no_meta`` both count as "ok". A pair of ok
    partitions where either hash is unavailable is reported as unchanged.
    New partitions that are not ok, and transitions involving the ``missing``
    status, are not placed in any bucket.
    """
    out_root = Path(out_root)
    dates = _list_ingest_dates(out_root)
    if not dates:
        return {"error": "no_ingests_found", "message": f"No ingest_date folders under {_bronze_data_base(out_root)}"}
    if cur_date is None:
        cur_date = dates[-1]
    if prev_date is None:
        # Dates are zero-padded YYYY-MM-DD strings, so string order is date order.
        prevs = [d for d in dates if d < cur_date]
        prev_date = prevs[-1] if prevs else None

    print(f"[DIFF] Current: {cur_date}  | Previous: {prev_date or '(none)'}")
    cur_idx = _load_partition_index(out_root, cur_date)
    prev_idx = _load_partition_index(out_root, prev_date) if prev_date else {}

    cur_keys = set(cur_idx.keys())
    prev_keys = set(prev_idx.keys())

    new_ok = []
    changed_ok = []
    became_empty = []
    became_ok = []
    disappeared = []
    unchanged_ok = []

    def _is_ok(entry):
        """True for both 'ok' and 'ok_no_meta' partitions."""
        return entry["status"].startswith("ok")

    # new or changed
    for k in sorted(cur_keys):
        cur = cur_idx[k]
        if k not in prev_keys:
            if _is_ok(cur):
                new_ok.append((k[0], k[1], cur["sha"], cur["rows"]))
            # empty-but-new isn't actionable usually; skip
            continue

        prev = prev_idx[k]
        if _is_ok(cur) and _is_ok(prev):
            if cur["sha"] and prev["sha"] and cur["sha"] != prev["sha"]:
                changed_ok.append((k[0], k[1], prev["sha"], cur["sha"]))
            else:
                unchanged_ok.append((k[0], k[1], cur["sha"]))
        elif _is_ok(prev) and cur["status"] == "empty":
            became_empty.append((k[0], k[1]))
        elif prev["status"] == "empty" and _is_ok(cur):
            became_ok.append((k[0], k[1], cur["sha"], cur["rows"]))

    # disappeared (present before, missing now)
    for k in sorted(prev_keys - cur_keys):
        disappeared.append((k[0], k[1]))

    # summary
    print("\n[DIFF] Summary")
    print(f"  Total current partitions: {len(cur_keys)}")
    print(f"  Total previous partitions: {len(prev_keys)}")
    print(f"  New OK partitions: {len(new_ok)}")
    print(f"  Changed OK partitions: {len(changed_ok)}")
    print(f"  Became EMPTY: {len(became_empty)}")
    print(f"  Became OK (from empty): {len(became_ok)}")
    print(f"  Disappeared (prev but not current): {len(disappeared)}")
    print(f"  Unchanged OK: {len(unchanged_ok)}")

    out = {
        "current_date": cur_date,
        "previous_date": prev_date,
        "counts": {
            "new_ok": len(new_ok),
            "changed_ok": len(changed_ok),
            "became_empty": len(became_empty),
            "became_ok": len(became_ok),
            "disappeared": len(disappeared),
            "unchanged_ok": len(unchanged_ok),
        },
        "new_ok": new_ok,
        "changed_ok": changed_ok,
        "became_empty": became_empty,
        "became_ok": became_ok,
        "disappeared": disappeared,
        "unchanged_ok": unchanged_ok,
    }

    if write_reports:
        # Write next to the data that was actually diffed: use the out_root
        # argument, not the notebook-level OUTPUT_DIR constant.
        diffs_dir = out_root / "bronze" / "diffs" / f"ingest_date={cur_date}"
        diffs_dir.mkdir(parents=True, exist_ok=True)

        # write minimal CSVs without pandas
        def _w(name, rows, hdr):
            p = diffs_dir / name
            with p.open("w", encoding="utf-8") as f:
                f.write(",".join(hdr) + "\n")
                for r in rows:
                    f.write(",".join("" if x is None else str(x) for x in r) + "\n")
            return p

        reports = [
            ("new_ok.csv", new_ok, ["profile_key","area_type_id","sha","row_count_estimate"]),
            ("changed_ok.csv", changed_ok, ["profile_key","area_type_id","prev_sha","cur_sha"]),
            ("became_empty.csv", became_empty, ["profile_key","area_type_id"]),
            ("became_ok.csv", became_ok, ["profile_key","area_type_id","sha","row_count_estimate"]),
            ("disappeared.csv", disappeared, ["profile_key","area_type_id"]),
            ("unchanged_ok.csv", unchanged_ok, ["profile_key","area_type_id","sha"]),
        ]
        written = [_w(name, rows, hdr) for name, rows, hdr in reports]

        print("\n[DIFF] Reports written to:")
        for p in written:
            print(" ", p)

    return out

# ---- run it (auto-picks latest vs previous) ----
# The returned dict is kept in `diff_result` because later cells read its
# buckets. With the default write_reports=True this call also writes the
# per-bucket CSV reports.
diff_result = diff_ingests(OUTPUT_DIR)


[DIFF] Current: 2025-09-17  | Previous: (none)

[DIFF] Summary
  Total current partitions: 1980
  Total previous partitions: 0
  New OK partitions: 244
  Changed OK partitions: 0
  Became EMPTY: 0
  Became OK (from empty): 0
  Disappeared (prev but not current): 0
  Unchanged OK: 0

[DIFF] Reports written to:
  out\bronze\diffs\ingest_date=2025-09-17\new_ok.csv
  out\bronze\diffs\ingest_date=2025-09-17\changed_ok.csv
  out\bronze\diffs\ingest_date=2025-09-17\became_empty.csv
  out\bronze\diffs\ingest_date=2025-09-17\became_ok.csv
  out\bronze\diffs\ingest_date=2025-09-17\disappeared.csv
  out\bronze\diffs\ingest_date=2025-09-17\unchanged_ok.csv


In [8]:
# Show each bucket's size and preview its first 10 entries.
# "unchanged_ok" is not listed here.
for name in ["new_ok","changed_ok","became_ok","became_empty","disappeared"]:
    rows = diff_result.get(name, [])
    print(f"{name}: {len(rows)}")
    for r in rows[:10]:
        # Every entry starts with (profile_key, area_type_id); new_ok and
        # became_ok append (sha, rows), changed_ok appends (prev_sha, cur_sha).
        print(" ", r)


new_ok: 244
  ('amr-local-indicators', 7, '4d18bf8800eb1a7cf71350056e679e5bcb58c1229eed86b243ef501f4bdc7b1f', 1088577)
  ('amr-local-indicators', 15, '0e59a223bb3280bec772e2681f51272270941948a914675e5b265a7ae2a7cbe2', 26349)
  ('amr-local-indicators', 66, 'ae17d3b426447a34c8c978d5b1c4bd6c1f7fe30ff790050df57cff72964bc06b', 421989)
  ('amr-local-indicators', 118, '3df1da969ac00d708385d5b03930333d764e36ffcaaf00d7ff5546f0313a7099', 392732)
  ('amr-local-indicators', 129, '9dd99077f488c48d7073af8550bb329a36646fd73e04ed9316f843ff52530d28', 2521)
  ('amr-local-indicators', 221, '55e60bb9264ed63eeb443c284495108e8476b6aa3bbf54aa694bed6ba8324ec7', 1589)
  ('cancerservices', 7, 'ed04397b3b5f59c238081d5f43decc60c95189785652922b0e0d32f41acc9275', 3276947)
  ('cancerservices', 15, '2b0e168661d8cf9f89e10364341878129bbf103e72bdd06e6d867a955db2fa24', 2477)
  ('cancerservices', 66, '26ec336f0f4adb86730c15d0b138ce0e7db5f933a02ec54af3ceccf7ccf8c343', 57472)
  ('cancerservices', 204, 'c92931ee847cac78436f2

In [9]:
from pathlib import Path
def check_local_exists(items, today, base=None):
    """
    Report which expected partition files are absent on disk.

    Parameters
    ----------
    items : iterable of tuples
        Entries whose first two fields are ``(profile_key, area_type_id)``;
        any further fields are ignored.
    today : str
        Ingest date used for the ``ingest_date=<today>`` path component.
    base : Path or str, optional
        Directory holding the ``profile_key=...`` folders. Defaults to
        ``out/bronze/data`` (the previous hard-coded location), so existing
        two-argument calls behave as before.

    Returns
    -------
    list of str
        Paths of every ``_meta.json`` / ``all_data_by_profile.csv.gz`` that
        does not exist, in input order.
    """
    base = Path("out/bronze/data") if base is None else Path(base)
    expected_names = ("_meta.json", "all_data_by_profile.csv.gz")
    missing = []
    for item in items:
        pkey, atid = item[0], int(item[1])
        part_dir = base / f"profile_key={pkey}" / f"area_type_id={atid}" / f"ingest_date={today}"
        for name in expected_names:
            p = part_dir / name
            if not p.exists():
                missing.append(str(p))
    return missing

# Partitions worth uploading: the new, changed and empty->ok buckets from the diff.
to_upload = (diff_result.get("new_ok", []) +
             diff_result.get("changed_ok", []) +
             diff_result.get("became_ok", []))
# Check that both expected files of each of those partitions exist locally
# for the current ingest date.
missing_paths = check_local_exists(to_upload, diff_result["current_date"])
print("Missing local files:", len(missing_paths))
# Preview at most 10 missing paths.
for m in missing_paths[:10]: print("  ", m)


Missing local files: 0


In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import boto3
from botocore.exceptions import ClientError
from boto3.s3.transfer import TransferConfig
from tqdm.auto import tqdm

# ---------- Helpers ----------

def mime_for(path: Path):
    """
    Choose S3 content headers from a file name (case-insensitive).

    ``.csv.gz`` and ``.json.gz`` get their base media type plus
    ``ContentEncoding: gzip``; plain ``.json`` gets ``application/json``;
    anything else falls back to ``application/octet-stream``.
    """
    lowered = path.name.lower()
    # Checked in order; built per call so each caller gets its own dict.
    by_suffix = (
        (".csv.gz", {"ContentType": "text/csv", "ContentEncoding": "gzip"}),
        (".json.gz", {"ContentType": "application/json", "ContentEncoding": "gzip"}),
        (".json", {"ContentType": "application/json"}),
    )
    for suffix, headers in by_suffix:
        if lowered.endswith(suffix):
            return headers
    return {"ContentType": "application/octet-stream"}

def iter_files_for_ingest(root: Path, ingest_date: str):
    """
    Generate the local files that belong to one ingest date.

    First yields every regular file below ``<root>/bronze`` whose POSIX-style
    path contains the component ``/ingest_date=<ingest_date>/``. Then, if it
    exists, yields ``<root>/bronze/manifest_ingest_date=<ingest_date>.json``.
    """
    bronze_dir = root / "bronze"
    needle = f"/ingest_date={ingest_date}/"

    # Compare against the forward-slash form so the check is separator-independent.
    yield from (
        candidate
        for candidate in bronze_dir.rglob("*")
        if candidate.is_file() and needle in candidate.as_posix()
    )

    manifest_path = bronze_dir / f"manifest_ingest_date={ingest_date}.json"
    if manifest_path.exists():
        yield manifest_path

def build_session(profile: str | None, region: str | None):
    """
    Create a boto3 session for ``region``.

    ``profile_name`` is only passed when ``profile`` is truthy; otherwise the
    session is created with the region alone.
    """
    session_kwargs = {"region_name": region}
    if profile:
        session_kwargs["profile_name"] = profile
    return boto3.Session(**session_kwargs)

def s3_key_for_local(prefix: str, out_root: Path, local_path: Path):
    """
    Map a local file to its S3 object key.

    The key is ``local_path`` relative to ``out_root`` (forward slashes),
    placed under ``prefix`` with leading/trailing slashes removed. An empty
    prefix yields the relative path alone.
    """
    relative_key = local_path.relative_to(out_root).as_posix()  # "bronze/..."
    cleaned = prefix.strip("/")
    if not cleaned:
        return relative_key
    return "/".join((cleaned, relative_key))

def object_exists_same_size(s3_client, bucket: str, key: str, size: int) -> bool:
    """
    Tell whether ``s3://bucket/key`` exists with a ``ContentLength`` equal to ``size``.

    Returns False when the HEAD request fails with HTTP status 404. Any other
    ``ClientError`` (for example access denied) is re-raised so the caller
    sees it.
    """
    try:
        head = s3_client.head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        http_status = err.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if http_status != 404:
            # if access denied or other, re-raise so user sees it
            raise
        return False
    return head.get("ContentLength") == size

# ---------- Main upload ----------

def upload_bronze_ingest(
    bucket: str,
    prefix: str,
    ingest_date: str,
    out_dir: str = "out",
    aws_profile: str | None = None,
    region: str | None = None,
    workers: int = 8,
    dry_run: bool = False,
):
    """
    Upload every local file of one ingest date to S3; nothing is deleted.

    Parameters
    ----------
    bucket : str
        Destination bucket name.
    prefix : str
        Key prefix; surrounding slashes are ignored. Empty means keys start
        with ``bronze/``.
    ingest_date : str
        Ingest date whose files are uploaded (see ``iter_files_for_ingest``).
    out_dir : str
        Local output root containing ``bronze``.
    aws_profile, region : str or None
        Passed to ``build_session``.
    workers : int
        Number of files uploaded concurrently. The same value is used as the
        per-transfer ``max_concurrency``.
    dry_run : bool
        When true, print the plan (first 10 entries) and return without
        uploading.

    Raises
    ------
    SystemExit
        If ``<out_dir>/bronze`` is missing or no files match ``ingest_date``.
    botocore.exceptions.ClientError
        If a HEAD request fails with anything other than 404 while planning
        (for example credentials or permission problems).

    Notes
    -----
    Files whose key already exists in S3 with the same size are skipped.
    Individual upload failures are printed and counted; they do not raise.
    """
    sess = build_session(aws_profile, region)
    s3 = sess.client("s3")
    out_root = Path(out_dir).resolve()

    if not (out_root / "bronze").exists():
        raise SystemExit(f"❌ Not found: {out_root/'bronze'}  (run your downloader first)")

    # Collect files
    files = list(iter_files_for_ingest(out_root, ingest_date))
    if not files:
        raise SystemExit(f"❌ No files found under {out_root}/bronze for ingest_date={ingest_date}")

    # Build upload plan (skip if already same size in S3).
    # A non-404 ClientError propagates from here on purpose, so credential or
    # permission issues surface before any upload starts.
    plan = []
    skipped_existing = 0
    for p in files:
        key = s3_key_for_local(prefix, out_root, p)
        if object_exists_same_size(s3, bucket, key, p.stat().st_size):
            skipped_existing += 1
            continue
        plan.append((p, key))

    print(f"[PLAN] ingest_date={ingest_date}")
    print(f"  Found local files: {len(files)}")
    print(f"  Already present (same size) in S3: {skipped_existing}")
    print(f"  To upload now: {len(plan)}")
    if dry_run:
        for p, k in plan[:10]:
            print("  ->", p, "→", f"s3://{bucket}/{k}")
        if len(plan) > 10:
            print(f"  ...and {len(plan)-10} more")
        print("Dry-run: exiting without changes.")
        return

    # Transfer config
    # NOTE(review): max_concurrency applies to each transfer, on top of the
    # `workers` files uploaded in parallel below — confirm that is intended.
    cfg = TransferConfig(
        multipart_threshold=8*1024*1024,
        multipart_chunksize=8*1024*1024,
        max_concurrency=workers,
        use_threads=True,
    )

    def _upload_one(local_path: Path, key: str):
        extra = mime_for(local_path)
        s3.upload_file(str(local_path), bucket, key, ExtraArgs=extra, Config=cfg)
        return key

    uploaded = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=workers) as ex:
        # Remember which key each future belongs to so failures can name it.
        key_by_future = {ex.submit(_upload_one, p, k): k for (p, k) in plan}
        for f in tqdm(as_completed(key_by_future), total=len(key_by_future), desc="S3 Upload", leave=True):
            try:
                f.result()
                uploaded += 1
            except Exception as e:
                errors += 1
                print("[ERROR] upload failed:", key_by_future[f], "-", e)

    # Avoid "s3://bucket//bronze/" when the prefix is empty.
    clean_prefix = prefix.strip("/")
    dest_root = f"s3://{bucket}/{clean_prefix}/bronze/" if clean_prefix else f"s3://{bucket}/bronze/"
    # Do not announce success when some uploads failed.
    headline = "✅ Upload complete" if errors == 0 else "⚠️ Upload finished with errors"
    print(f"\n{headline}: uploaded {uploaded}/{len(plan)}; skipped (already present) {skipped_existing}; errors {errors}")
    print(f"   Destination prefix: {dest_root} ... (kept all historical ingest_date folders)")

# ---------- CLI ----------

def parse_args():
    """
    Parse the uploader's command-line options from ``sys.argv``.

    ``--bucket`` and ``--ingest-date`` are required. ``--aws-profile``
    defaults to the ``AWS_PROFILE`` environment variable and ``--region`` to
    ``AWS_REGION`` or, failing that, ``AWS_DEFAULT_REGION``.
    """
    env = os.environ
    option_specs = [
        ("--bucket", dict(required=True, help="S3 bucket name")),
        ("--prefix", dict(default="", help="S3 key prefix (e.g. 'my/project')")),
        ("--ingest-date", dict(required=True, help="Ingest date to upload, e.g. 2025-09-15")),
        ("--out-dir", dict(default="out", help="Local output root (default: out)")),
        ("--aws-profile", dict(default=env.get("AWS_PROFILE"), help="AWS CLI profile name")),
        (
            "--region",
            dict(default=env.get("AWS_REGION") or env.get("AWS_DEFAULT_REGION"), help="AWS region"),
        ),
        ("--workers", dict(type=int, default=8, help="Concurrent uploads (default: 8)")),
        ("--dry-run", dict(action="store_true", help="Show plan without uploading")),
    ]

    parser = argparse.ArgumentParser(description="Upload a bronze ingest_date to S3 (no deletions).")
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()



In [11]:
# no import needed if upload_bronze_ingest is already defined in this notebook
from pathlib import Path

def autodetect_ingest(out_dir="out"):
    """
    Return the latest ingest date found under ``<out_dir>/bronze/data``.

    Directories named ``ingest_date=<value>`` are collected at any depth.
    Only values shaped like ``YYYY-MM-DD`` are considered, so a stray folder
    such as ``ingest_date=latest`` cannot sort after the real dates and be
    picked by mistake.

    Raises
    ------
    SystemExit
        If no valid ``ingest_date=YYYY-MM-DD`` folder exists.
    """
    import re  # local import: this cell should not rely on an earlier cell's imports

    root = Path(out_dir) / "bronze" / "data"
    iso_day = re.compile(r"\d{4}-\d{2}-\d{2}")
    candidates = {p.name.split("=", 1)[1] for p in root.rglob("ingest_date=*") if p.is_dir()}
    # Zero-padded ISO dates sort chronologically as plain strings.
    dates = sorted(d for d in candidates if iso_day.fullmatch(d))
    if not dates:
        raise SystemExit(f"No ingest_date folders found under {root}")
    print("Auto-detected ingest_date:", dates[-1])
    return dates[-1]

# Pick the latest ingest date found under out/bronze/data.
ingest_date = autodetect_ingest("out")

# Run the upload
# NOTE(review): bucket, profile and region are hard-coded for this run;
# consider moving them to a config cell. dry_run=False uploads for real.
upload_bronze_ingest(
    bucket="test-nhs-fingertips",
    prefix="",                 # keep empty so keys start with 'bronze/...'
    ingest_date=ingest_date,   # e.g. "2025-09-15"
    out_dir="out",
    region="eu-west-2",
    aws_profile="mybronze",    # or None if you rely on env/role
    workers=8,
    dry_run=False,             # try True first to preview
)


Auto-detected ingest_date: 2025-09-17
[PLAN] ingest_date=2025-09-17
  Found local files: 2329
  Already present (same size) in S3: 0
  To upload now: 2329


S3 Upload:   0%|          | 0/2329 [00:00<?, ?it/s]


✅ Upload complete: uploaded 2329/2329; skipped (already present) 0; errors 0
   Destination prefix: s3://test-nhs-fingertips//bronze/ ... (kept all historical ingest_date folders)
