In [ ]:
%run ./_utils

### Daily Snapshot — Small Entities
Exports publishers, funders, topics, subfields, fields, domains,
continents, countries, institution-types, languages, licenses, sdgs,
source-types, and work-types.

Each entity is exported in both JSONL and Parquet formats.
Entities with zero updates on the snapshot day get empty manifests.

In [ ]:
from pyspark.sql import functions as F

# Resolve the snapshot date once; the same value is passed to every
# get_daily_df / export_all_formats call in the export loop further down.
# NOTE(review): get_snapshot_date is not defined in this notebook — presumably
# it comes from the `%run ./_utils` cell; confirm.
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")

# -----------------------------------------------------------------------
# Entity definitions
# -----------------------------------------------------------------------
# SMALL_ENTITIES holds one config dict per exported entity:
#   name          -- entity name (S3 dir name)
#   source_table  -- table the daily slice is read from
#   id_transform  -- optional DataFrame -> DataFrame callable; present only
#                    for entities whose "id" column gets a URL prefix
#   array_columns -- columns whose nulls are coalesced to empty arrays


def _prefix_id(url_prefix):
    """Build a transform that prepends *url_prefix* to the ``id`` column."""

    def _transform(df):
        return df.withColumn("id", F.concat(F.lit(url_prefix), F.col("id")))

    return _transform


def _entity(name, source_table, array_columns, id_prefix=None):
    """Assemble one entity config dict.

    The ``id_transform`` key is only added when *id_prefix* is given, so
    consumers can keep using ``cfg.get("id_transform")`` to detect it.
    """
    cfg = {"name": name, "source_table": source_table}
    if id_prefix is not None:
        cfg["id_transform"] = _prefix_id(id_prefix)
    cfg["array_columns"] = array_columns
    return cfg


SMALL_ENTITIES = [
    # --- Publishers ---
    _entity(
        "publishers",
        "openalex.publishers.publishers_api",
        ["lineage", "alternate_titles", "country_codes", "roles", "counts_by_year"],
        id_prefix="https://openalex.org/P",
    ),
    # --- Funders ---
    _entity(
        "funders",
        "openalex.funders.funders_api",
        ["alternate_titles", "roles", "counts_by_year"],
        id_prefix="https://openalex.org/F",
    ),
    # --- Topic hierarchy ---
    _entity(
        "topics",
        "openalex.common.topics_api",
        ["keywords", "siblings"],
        id_prefix="https://openalex.org/T",
    ),
    _entity(
        "subfields",
        "openalex.common.subfields_api",
        ["display_name_alternatives", "topics", "siblings"],
    ),
    _entity(
        "fields",
        "openalex.common.fields_api",
        ["display_name_alternatives", "subfields", "siblings"],
    ),
    _entity(
        "domains",
        "openalex.common.domains_api",
        ["display_name_alternatives", "fields", "siblings"],
    ),
    # --- Common entities ---
    _entity(
        "continents",
        "openalex.common.continents_api",
        ["display_name_alternatives", "countries"],
    ),
    _entity(
        "countries",
        "openalex.common.countries_api",
        ["display_name_alternatives"],
    ),
    _entity("institution-types", "openalex.common.institution_types_api", []),
    _entity("languages", "openalex.common.languages_api", []),
    _entity("licenses", "openalex.common.licenses_api", []),
    _entity("sdgs", "openalex.common.sdgs_api", []),
    _entity("source-types", "openalex.common.source_types_api", []),
    _entity("work-types", "openalex.common.work_types_api", []),
]

In [ ]:
# Export every configured small entity for the snapshot date, one at a time.
total_entities = len(SMALL_ENTITIES)
# NOTE(review): total_records is initialised here but never updated in this cell.
total_records = 0

rule = "=" * 56  # full-width separator used under each header and at the end

for idx, cfg in enumerate(SMALL_ENTITIES, 1):
    name = cfg["name"]
    print(f"\n[{idx}/{total_entities}] {'=' * 50}")
    print(f"Entity: {name}")
    print(rule)

    # Daily slice of the entity's source table
    df = get_daily_df(spark, cfg["source_table"], date_str)

    # Optional per-entity ID rewrite (key is absent for most entities)
    transform = cfg.get("id_transform")
    df = transform(df) if transform else df

    # Replace null array values with empty arrays, column by column
    for array_col in cfg["array_columns"]:
        non_null = F.coalesce(F.col(array_col), F.array())
        df = df.withColumn(array_col, non_null)

    # Write all output formats (the zero-record case is handled inside the helper)
    export_all_formats(spark, dbutils, df, date_str, name)

print(f"\n{rule}")
print(f"All {total_entities} small entities exported for {date_str}")