In [ ]:
%run ./_utils

### Update daily snapshot metadata
Runs after all export tasks complete.
1. Reads per-entity metadata from `_meta/{format}/{entity}.json` and builds a combined `manifest.json` per format at `daily/{date}/{format}/manifest.json`
2. Updates `daily/latest.json` with the list of available dates, dropping dates older than 60 days (used by the API for date listing)
3. Cleans up `_meta/` and `_temp/` directories

In [None]:
import json
from datetime import datetime, timedelta, timezone

# Snapshot date for this run. NOTE(review): get_snapshot_date is not defined
# in this cell; presumably it comes from the `%run ./_utils` cell -- confirm.
date_str = get_snapshot_date()
print("Snapshot date: {}".format(date_str))

# Entity names to look up metadata for. The manifest loop walks this list in
# order, so this is also the order of entries in each combined manifest.
ENTITIES = [
    "works",
    "authors",
    "institutions",
    "sources",
    "publishers",
    "funders",
    "topics",
    "subfields",
    "fields",
    "domains",
    "concepts",
    "keywords",
    "awards",
    "continents",
    "countries",
    "institution-types",
    "languages",
    "licenses",
    "sdgs",
    "source-types",
    "work-types",
]

# Export formats; one combined manifest is written per format.
FORMATS = ["jsonl", "parquet"]

# Build combined per-format manifests from _meta/ files
def _load_entity_meta(fmt, entity):
    """Return ``(meta, found)`` for one entity's export metadata.

    Reads ``_meta/{fmt}/{entity}.json`` under the snapshot directory (at most
    the first 65536 bytes). When the file cannot be read or parsed, or does
    not contain a JSON object, a zeroed placeholder is returned with
    ``found=False`` and a warning is printed so the gap shows up in the job
    output instead of silently becoming "0 records".
    """
    placeholder = {"entity": entity, "filename": None, "record_count": 0, "content_length": 0}
    path = f"{S3_BASE}/{date_str}/_meta/{fmt}/{entity}.json"
    try:
        loaded = json.loads(dbutils.fs.head(path, 65536))
    except Exception as exc:
        # Kept broad on purpose: an unreadable file is treated as an entity
        # with no output. NOTE(review): presumably export tasks write no
        # metadata file for empty entities -- confirm.
        print(f"  Warning: no usable metadata for {fmt}/{entity} ({type(exc).__name__}); recording 0 records")
        return placeholder, False

    if not isinstance(loaded, dict):
        print(f"  Warning: metadata for {fmt}/{entity} is not a JSON object; recording 0 records")
        return placeholder, False

    # Fill in any keys the metadata file omitted so later lookups cannot fail.
    meta = {**placeholder, **loaded}
    # A null count would break the totals; treat it as zero.
    for key in ("record_count", "content_length"):
        meta[key] = meta[key] or 0
    return meta, True


for fmt in FORMATS:
    combined_entries = []
    found_any = False
    for entity in ENTITIES:
        meta, found = _load_entity_meta(fmt, entity)
        found_any = found_any or found

        # Build file list (single file or empty for zero-record entities)
        files = []
        if meta.get("filename"):
            files.append({
                "url": f"s3://{S3_BUCKET}/daily/{date_str}/{fmt}/{meta['filename']}",
                "meta": {
                    "content_length": meta["content_length"],
                    "record_count": meta["record_count"],
                },
            })

        combined_entries.append({
            "entity": entity,
            "record_count": meta["record_count"],
            "content_length": meta["content_length"],
            "files": files,
        })

    if ENTITIES and not found_any:
        # No metadata at all for this format. This is what a re-run looks like
        # once _meta/ has been cleaned up; writing now would replace a good
        # manifest with an all-zero one, so leave whatever is there untouched.
        print(f"{fmt}: Warning: no entity metadata found; manifest not written")
        continue

    total_records = sum(e["record_count"] for e in combined_entries)
    total_size = sum(e["content_length"] for e in combined_entries)

    combined_manifest = {
        "date": date_str,
        "format": fmt,
        "meta": {
            "record_count": total_records,
            "content_length": total_size,
        },
        "entities": combined_entries,
    }

    combined_path = f"{S3_BASE}/{date_str}/{fmt}/manifest.json"
    dbutils.fs.put(combined_path, json.dumps(combined_manifest, indent=2), overwrite=True)
    print(f"{fmt}: {total_records:,} records, {total_size / (1024**2):.1f} MB across {len(ENTITIES)} entities")

# Remove the per-snapshot scratch directories (_meta/ then _temp/).
# Cleanup is best-effort: a failure is reported and the notebook carries on.
# NOTE(review): this runs before latest.json is updated, and the manifest
# loop reads from _meta/, so a re-run after this point finds no metadata.
cleanup_steps = (
    ("_meta", "\nCleaned up _meta/ directory", "\nWarning: could not clean up _meta/ directory"),
    ("_temp", "Cleaned up _temp/ directory", "Warning: could not clean up _temp/ directory"),
)
for subdir, ok_message, fail_message in cleanup_steps:
    try:
        dbutils.fs.rm(f"{S3_BASE}/{date_str}/{subdir}", recurse=True)
    except Exception:
        print(fail_message)
    else:
        print(ok_message)

# Update latest.json with available dates
meta_path = f"{S3_BASE}/latest.json"
# Oldest date to keep, formatted as YYYY-MM-DD so it can be compared as a
# string. NOTE(review): this assumes date_str uses the same format -- confirm.
cutoff = (datetime.now(timezone.utc) - timedelta(days=60)).strftime("%Y-%m-%d")

# Read existing dates. Any failure (including the file not existing yet)
# starts a fresh list; the warning makes an unexpected reset visible.
try:
    existing = json.loads(dbutils.fs.head(meta_path, 65536))
    previous_dates = existing.get("available_dates", [])
except Exception as exc:
    print(f"Warning: could not read {meta_path} ({type(exc).__name__}); starting a new date list")
    previous_dates = []

# A null or otherwise malformed value must not crash the update.
if not isinstance(previous_dates, list):
    print(f"Warning: 'available_dates' in {meta_path} is not a list; starting a new date list")
    previous_dates = []

# Add today, prune old dates, deduplicate, sort descending.
# The set removes duplicates already in the file; non-string entries are
# dropped because they cannot be compared against the cutoff.
known_dates = {d for d in previous_dates if isinstance(d, str)}
known_dates.add(date_str)
available_dates = sorted((d for d in known_dates if d >= cutoff), reverse=True)

latest = {"available_dates": available_dates}
dbutils.fs.put(meta_path, json.dumps(latest, indent=2), overwrite=True)
print(f"\nUpdated {meta_path} ({len(available_dates)} dates)")

# Per-entity summary. combined_entries is rebuilt on every pass of the
# manifest loop, so this reports the last format in FORMATS only.
print("\nPer-entity record counts:")
for row in combined_entries:
    entity_name = row["entity"]
    record_total = row["record_count"]
    # Name padded to 25 columns; count right-aligned with thousands separators.
    print("  {:25s} {:>14,}".format(entity_name, record_total))