In [ ]:
%run ./_utils

### Daily Snapshot — Medium Entities
Exports institutions, sources, concepts, keywords, and awards.

Each entity gets all 3 formats (JSONL, Parquet, Avro) with a manifest.
Entities with zero updates on the snapshot day get empty manifests.

In [ ]:
from pyspark.sql import functions as F

# Resolve the snapshot day once; every entity is exported for this date.
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")


def _prepend_to_id(prefix):
    """Return a DataFrame transform that prepends *prefix* to the ``id`` column."""
    def _transform(df):
        return df.withColumn("id", F.concat(F.lit(prefix), F.col("id")))
    return _transform


# Per-entity export configuration read by the export loop.
#   name          -- entity name handed to export_all_formats
#   source_table  -- table passed to get_daily_df
#   array_columns -- columns whose NULLs are coalesced to empty arrays
#   id_transform  -- optional callable(df) -> df, applied right after the read
#   drop_columns  -- optional columns removed before export
MEDIUM_ENTITIES = [
    # --- Institutions ---
    dict(
        name="institutions",
        source_table="openalex.institutions.institutions_api",
        array_columns=[
            "lineage",
            "display_name_acronyms",
            "display_name_alternatives",
            "roles",
            "repositories",
            "topics",
            "topic_share",
            "associated_institutions",
            "counts_by_year",
        ],
    ),
    # --- Sources ---
    dict(
        name="sources",
        source_table="openalex.sources.sources_api",
        array_columns=[
            "issn",
            "host_organization_lineage",
            "apc_prices",
            "societies",
            "alternate_titles",
            "topics",
            "topic_share",
            "counts_by_year",
        ],
    ),
    # --- Concepts ---
    dict(
        name="concepts",
        source_table="openalex.common.concepts_api",
        id_transform=_prepend_to_id("https://openalex.org/C"),
        array_columns=[],
    ),
    # --- Keywords ---
    dict(
        name="keywords",
        source_table="openalex.common.keywords_api",
        array_columns=[],
    ),
    # --- Awards ---
    dict(
        name="awards",
        source_table="openalex.awards.awards_api",
        id_transform=_prepend_to_id("https://openalex.org/G"),
        array_columns=["investigators", "funded_outputs"],
        drop_columns=["funder_id"],
    ),
]

In [ ]:
# Export every configured medium entity for the snapshot date.
# Per entity: read the daily slice, apply the optional id transform, drop
# unwanted columns, replace NULL arrays with empty arrays, then write all formats.
total_entities = len(MEDIUM_ENTITIES)
rule = "=" * 56

for idx, cfg in enumerate(MEDIUM_ENTITIES, start=1):
    entity_name = cfg["name"]
    print(f"\n[{idx}/{total_entities}] {'='*50}")
    print(f"Entity: {entity_name}")
    print(rule)

    # Daily slice of the entity's source table.
    df = get_daily_df(spark, cfg["source_table"], date_str)

    # Optional id rewrite (only some entities define one).
    transform = cfg.get("id_transform")
    if transform:
        df = transform(df)

    # Remove columns that must not appear in the export.
    unwanted = cfg.get("drop_columns", [])
    if unwanted:
        df = df.drop(*unwanted)

    # NULL array columns become empty arrays.
    for array_col in cfg["array_columns"]:
        non_null = F.coalesce(F.col(array_col), F.array())
        df = df.withColumn(array_col, non_null)

    # Write every format; per the original note, the zero-record case is
    # handled inside export_all_formats.
    export_all_formats(spark, dbutils, df, date_str, entity_name)

print(f"\n{rule}")
print(f"All {total_entities} medium entities exported for {date_str}")