In [ ]:
%run ./_utils

### Update daily snapshot metadata
Runs after all export tasks complete.
1. Reads per-entity manifests and builds a combined `manifest.json` per format at `daily/{date}/{format}/manifest.json`
2. Updates `daily/meta/latest.json` with the latest snapshot date

In [None]:
import json

# Snapshot date used in every path below; get_snapshot_date() is provided by
# the ./_utils notebook pulled in via %run.
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")

# Entities whose per-entity manifests are merged, in the order they appear in
# the combined manifest and in the printed summary.
ENTITIES = [
    "works",
    "authors",
    "institutions",
    "sources",
    "publishers",
    "funders",
    "topics",
    "subfields",
    "fields",
    "domains",
    "concepts",
    "keywords",
    "awards",
    "continents",
    "countries",
    "institution-types",
    "languages",
    "licenses",
    "sdgs",
    "source-types",
    "work-types",
]

# Export formats; one combined manifest is written per format.
FORMATS = [
    "jsonl",
    "parquet",
    "avro",
]

# Build combined per-format manifests.
#
# For each format, every entity's manifest.json is read and folded into one
# combined manifest written to {S3_BASE}/{date_str}/{fmt}/manifest.json.
# Reading stays best-effort: an entity whose manifest cannot be read or parsed
# is recorded with zero counts and no files, but the problem is now reported
# in the output instead of being swallowed silently.

# Maximum number of bytes read per entity manifest. The previous 64 KiB limit
# cut larger manifests short, which made json.loads() fail and the entity
# silently show up as empty in the combined manifest.
MANIFEST_MAX_BYTES = 16 * 1024 * 1024


def _load_entity_manifest(path):
    """Return the parsed manifest stored at *path*, or None if it is unusable.

    A warning naming the path and the cause is printed whenever None is
    returned, so missing or corrupt manifests are visible in the job output.
    """
    try:
        content = dbutils.fs.head(path, MANIFEST_MAX_BYTES)
    except Exception as err:
        # The exception type raised for a missing file is not visible from
        # here, so the original broad catch is kept. Only the first line of
        # the message is printed to keep the warning readable.
        reason = str(err).partition("\n")[0]
        print(f"WARNING: could not read {path}: {type(err).__name__}: {reason}")
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError as err:
        hint = ""
        if len(content.encode("utf-8")) >= MANIFEST_MAX_BYTES:
            hint = " (read limit reached; the file was probably truncated)"
        print(f"WARNING: invalid JSON in {path}: {err}{hint}")
        return None


for fmt in FORMATS:
    combined_entries = []
    unavailable = []  # entities recorded as empty for this format
    for entity in ENTITIES:
        entity_manifest_path = f"{S3_BASE}/{date_str}/{fmt}/{entity}/manifest.json"
        manifest = _load_entity_manifest(entity_manifest_path)
        if manifest is None:
            unavailable.append(entity)
            manifest = {"entries": [], "meta": {"content_length": 0, "record_count": 0}}

        combined_entries.append({
            "entity": entity,
            "record_count": manifest["meta"]["record_count"],
            "content_length": manifest["meta"]["content_length"],
            "files": manifest["entries"],
        })

    total_records = sum(e["record_count"] for e in combined_entries)
    total_size = sum(e["content_length"] for e in combined_entries)

    combined_manifest = {
        "date": date_str,
        "format": fmt,
        "meta": {
            "record_count": total_records,
            "content_length": total_size,
        },
        "entities": combined_entries,
    }

    combined_path = f"{S3_BASE}/{date_str}/{fmt}/manifest.json"
    dbutils.fs.put(combined_path, json.dumps(combined_manifest, indent=2), overwrite=True)
    print(f"{fmt}: {total_records:,} records, {total_size / (1024**2):.1f} MB across {len(ENTITIES)} entities")
    if unavailable:
        print(
            f"  WARNING: {len(unavailable)} of {len(ENTITIES)} {fmt} manifests "
            f"unavailable, recorded as empty: {', '.join(unavailable)}"
        )

# Publish the date pointer: {S3_BASE}/meta/latest.json holds nothing but the
# date of the snapshot processed by this run.
meta_path = f"{S3_BASE}/meta/latest.json"
latest = {"latest_date": date_str}
latest_json = json.dumps(latest, indent=2)
dbutils.fs.put(meta_path, latest_json, overwrite=True)
print(f"\nUpdated {meta_path}")

# Print per-entity summary.
# combined_entries and fmt still hold the values from the final iteration of
# the format loop, so these counts describe the last format in FORMATS only;
# the header names that format so the numbers are not mistaken for totals
# across all formats.
print(f"\nPer-entity record counts ({fmt}):")
for entry in combined_entries:
    print(f"  {entry['entity']:25s} {entry['record_count']:>14,}")