In [ ]:
import json
from datetime import datetime
from pyspark.sql import functions as F

# The snapshot folder is keyed by the date (naive local time) on which this
# cell runs.
date_str = datetime.now().strftime("%Y-%m-%d")
s3_base_path = f"s3://openalex-snapshots/full/{date_str}"

# Entity name -> array columns whose NULL values are replaced with empty
# arrays before export. Insertion order is the order in which entities are
# exported.
_ARRAY_COLUMNS_BY_ENTITY = {
    "continents": ["display_name_alternatives", "countries"],
    "countries": ["display_name_alternatives"],
    "institution-types": [],
    "languages": [],
    "licenses": [],
    "sdgs": [],
    "source-types": [],
    "work-types": [],
}

# Table names all follow one rule: the entity name with "-" swapped for "_",
# wrapped as "<name>_api" (source) and "openalex_<name>_snapshot" (snapshot)
# inside the openalex.common schema.
ENTITIES = [
    {
        "name": entity_name,
        "source_table": f"openalex.common.{entity_name.replace('-', '_')}_api",
        "snapshot_table": f"openalex.common.openalex_{entity_name.replace('-', '_')}_snapshot",
        "array_columns": array_columns,
    }
    for entity_name, array_columns in _ARRAY_COLUMNS_BY_ENTITY.items()
]

### Export common entities to S3 snapshot
Exports continents, countries, institution-types, languages, licenses, sdgs, source-types, and work-types to S3 as gzip-compressed JSON Lines files.

In [ ]:
def rename_files_and_cleanup(output_path):
    """Rename Spark's partition directories and part files to the snapshot layout.

    For every ``_partition_date=<date>/`` directory directly under
    ``output_path``, each ``.gz`` file is moved to
    ``updated_date=<date>/part_NNNN.gz`` (numbered in file-name order) and the
    source directory is then removed. Afterwards every remaining root-level
    entry whose name starts with ``_`` is deleted.

    Moves are not guarded: a failing ``dbutils.fs.mv`` propagates to the
    caller. Deletions are best effort: a failure is reported with a printed
    warning and processing continues.

    Args:
        output_path: Directory the partitioned JSON export was written to.
    """
    partition_prefix = "_partition_date="
    partitions_to_process = [
        p for p in dbutils.fs.ls(output_path) if p.name.startswith(partition_prefix)
    ]

    for partition in partitions_to_process:
        date_value = partition.name.replace(partition_prefix, "").rstrip("/")
        new_partition_path = f"{output_path}/updated_date={date_value}/"

        # Sort by name so the part numbering is deterministic.
        json_files = sorted(
            (f for f in dbutils.fs.ls(partition.path) if f.name.endswith(".gz")),
            key=lambda f: f.name,
        )

        for idx, file_info in enumerate(json_files):
            new_name = f"part_{str(idx).zfill(4)}.gz"
            dbutils.fs.mv(file_info.path, f"{new_partition_path}{new_name}")
            print(f"  Moved {file_info.name} -> updated_date={date_value}/{new_name}")

        # The recursive delete also removes any leftover non-.gz files in the
        # old partition directory, so they need no separate per-file pass.
        try:
            dbutils.fs.rm(partition.path, recurse=True)
        except Exception as exc:
            print(f"  Warning: could not remove {partition.path}: {exc}")

    # Clean up root-level entries whose names start with "_" (the original
    # comment describes these as Spark metadata).
    try:
        root_files = dbutils.fs.ls(output_path)
    except Exception as exc:
        print(f"  Warning: could not list {output_path} for cleanup: {exc}")
        return

    for f in root_files:
        if not f.name.startswith("_"):
            continue
        # Guard each delete on its own so one failure does not abandon the rest.
        try:
            dbutils.fs.rm(f.path, recurse=True)
        except Exception as exc:
            print(f"  Warning: could not remove {f.path}: {exc}")


def create_manifest(output_path, entity_type):
    """Write a ``manifest`` JSON file describing every exported ``.gz`` file.

    Walks each ``updated_date=*`` directory directly under ``output_path``,
    counts the lines of every ``.gz`` file by reading it as text with Spark,
    and records the file's size and record count under a URL of the form
    ``s3://openalex/data/<entity_type>/...``. The manifest (entries sorted by
    URL, plus overall totals) is written to ``<output_path>/manifest``,
    overwriting any existing file.

    Args:
        output_path: Directory containing the ``updated_date=*`` partitions.
        entity_type: Entity directory name; the published URL keeps the part
            of each file path starting at ``/<entity_type>/``.

    Raises:
        ValueError: If a file path does not contain ``/<entity_type>/``, since
            no valid manifest URL can be derived from it.
    """
    marker = f"/{entity_type}/"
    # Newest partition first; this only affects the order of the progress
    # output because the entries are re-sorted by URL below.
    partitions_to_process = sorted(
        (p for p in dbutils.fs.ls(output_path) if p.name.startswith("updated_date=")),
        key=lambda p: p.name,
        reverse=True,
    )

    entries = []
    for partition in partitions_to_process:
        for file_info in dbutils.fs.ls(partition.path):
            if not file_info.name.endswith(".gz"):
                continue

            # One JSON document per line, so the line count is the record count.
            record_count = spark.read.text(file_info.path).count()

            # Only the path from "/<entity_type>/" onwards is published, so the
            # storage scheme and bucket prefix of the listing are irrelevant.
            idx = file_info.path.find(marker)
            if idx == -1:
                # Slicing with -1 would silently yield a one-character URL.
                raise ValueError(
                    f"Cannot derive manifest URL: {marker!r} not found in {file_info.path!r}"
                )
            s3_url = f"s3://openalex/data{file_info.path[idx:]}"

            entries.append({
                "url": s3_url,
                "meta": {
                    "content_length": file_info.size,
                    "record_count": record_count
                }
            })

            print(f"  {partition.name}{file_info.name}: {record_count:,} records, {file_info.size/(1024*1024):.2f} MB")

    entries.sort(key=lambda entry: entry["url"])

    total_content_length = sum(entry["meta"]["content_length"] for entry in entries)
    total_record_count = sum(entry["meta"]["record_count"] for entry in entries)

    manifest = {
        "entries": entries,
        "meta": {
            "content_length": total_content_length,
            "record_count": total_record_count
        }
    }

    manifest_path = f"{output_path}/manifest"
    dbutils.fs.put(manifest_path, json.dumps(manifest, indent=2), overwrite=True)

    print(f"  Manifest: {len(entries)} files, {total_content_length / (1024**2):.2f} MB, {total_record_count:,} records")

In [ ]:
# Export every configured entity: refresh its snapshot table, write the
# partitioned gzip JSON files, normalise the file layout, then build the manifest.
for entity in ENTITIES:
    name = entity["name"]
    source_table = entity["source_table"]
    snapshot_table = entity["snapshot_table"]
    output_path = f"{s3_base_path}/{name}"

    print(f"\n{'='*60}")
    print(f"Processing {name}")
    print(f"{'='*60}")

    # Load the source rows, substituting an empty array for NULL in each
    # configured array column.
    df = spark.read.table(source_table)
    for col_name in entity["array_columns"]:
        df = df.withColumn(col_name, F.coalesce(F.col(col_name), F.array()))

    # Replace the snapshot table, schema included.
    snapshot_writer = df.write.mode("overwrite").option("overwriteSchema", "true")
    snapshot_writer.saveAsTable(snapshot_table)
    print(f"  Snapshot table: {entity['snapshot_table']}")

    # Re-read from the snapshot table so the export is built from the saved table.
    df = spark.read.table(snapshot_table)
    record_count = df.count()
    print(f"  Records: {record_count:,}")

    # Partition key: updated_date as a date, else created_date, else today.
    partition_date = F.coalesce(
        F.to_date("updated_date"),
        F.col("created_date"),
        F.current_date(),
    )
    df = df.withColumn("_partition_date", partition_date)

    # Collapse to one Spark partition before writing gzip-compressed JSON.
    json_writer = df.coalesce(1).write.mode("overwrite").option("compression", "gzip")
    json_writer.partitionBy("_partition_date").json(output_path)

    # Move files into the updated_date=<date>/part_NNNN.gz layout.
    rename_files_and_cleanup(output_path)

    # Describe the exported files in <output_path>/manifest.
    create_manifest(output_path, name)

    print(f"  Done: {output_path}")

print(f"\nAll exports complete!")