In [ ]:
%run ./_utils

### Update daily snapshot metadata
Runs after all export tasks complete. Updates `daily/meta/latest.json`
with the latest snapshot date, the available entities/formats, and per-entity stats read from each export manifest.

In [ ]:
import json

# Resolve the snapshot date this run is publishing metadata for.
# get_snapshot_date() and S3_BASE are expected to come from `%run ./_utils`.
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")

# Entities whose exports are summarized in latest.json.
ENTITIES = [
    "works", "authors", "institutions", "sources", "publishers", "funders",
    "topics", "subfields", "fields", "domains",
    "concepts", "keywords", "awards",
    "continents", "countries", "institution-types", "languages",
    "licenses", "sdgs", "source-types", "work-types",
]

# Export formats checked for every entity.
FORMATS = ["jsonl", "parquet", "avro"]

# Maximum number of bytes read from each manifest. dbutils.fs.head returns at
# most this many bytes, so a larger manifest arrives truncated and will not
# parse as JSON.
MANIFEST_MAX_BYTES = 65536

# Build per-entity summary by checking manifests.
entity_stats = []
for entity in ENTITIES:
    stats = {"entity": entity, "formats": {}}
    for fmt in FORMATS:
        manifest_path = f"{S3_BASE}/{date_str}/{fmt}/{entity}/manifest"
        try:
            content = dbutils.fs.head(manifest_path, MANIFEST_MAX_BYTES)
            manifest = json.loads(content)
            stats["formats"][fmt] = {
                "record_count": manifest["meta"]["record_count"],
                "content_length": manifest["meta"]["content_length"],
                "file_count": len(manifest["entries"]),
            }
        except Exception as exc:
            # Best-effort by design: a missing or unreadable manifest must not
            # abort the metadata update, so the entry falls back to zeros.
            # Report the failure, though, so that a broken export is not
            # silently published as an empty one.
            reason = (str(exc).splitlines() or [""])[0]
            hint = ""
            if (
                isinstance(exc, json.JSONDecodeError)
                and len(content.encode("utf-8")) >= MANIFEST_MAX_BYTES
            ):
                # The read hit the byte cap, so the JSON was most likely cut off.
                hint = f" (manifest may exceed {MANIFEST_MAX_BYTES} bytes and was truncated)"
            print(
                f"  WARNING: no stats for {entity}/{fmt} from {manifest_path}: "
                f"{type(exc).__name__}: {reason}{hint}"
            )
            stats["formats"][fmt] = {"record_count": 0, "content_length": 0, "file_count": 0}
    entity_stats.append(stats)

# Write latest.json
latest = {
    "latest_date": date_str,
    "formats": FORMATS,
    "entities": ENTITIES,
    "entity_stats": entity_stats,
}

meta_path = f"{S3_BASE}/meta/latest.json"
dbutils.fs.put(meta_path, json.dumps(latest, indent=2), overwrite=True)
print(f"Updated {meta_path}")

# Print summary (record counts are taken from the jsonl export of each entity).
for s in entity_stats:
    jsonl_rc = s['formats'].get('jsonl', {}).get('record_count', 0)
    print(f"  {s['entity']:25s} {jsonl_rc:>10,} records")