In [ ]:
%run ./_utils

### Daily Snapshot — Authors
Exports authors updated on the snapshot day to JSONL, Parquet, and Avro.
Transforms match the full snapshot (export_authors.ipynb).

In [ ]:
from pyspark.sql import functions as F

# Resolve the snapshot day once and echo it to the notebook output.
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")

# Rows to export for that day. The row selection itself lives in get_daily_df
# (presumably authors updated on date_str — confirm against _utils).
df = get_daily_df(spark, "openalex.authors.openalex_authors", date_str)


def _or_empty(column):
    """Return *column* with null values replaced by an empty array."""
    return F.coalesce(column, F.array())


# Rebuilds every x_concepts element with its id expanded to a full URL.
# NOTE(review): `c.col4` is read as the concept level; it looks like an
# auto-generated struct field name — confirm against the source table schema.
_x_concepts_sql = """
        transform(x_concepts, c -> named_struct(
            'id', concat('https://openalex.org/C', cast(c.id as string)),
            'wikidata', c.wikidata,
            'display_name', c.display_name,
            'level', c.col4,
            'score', c.score,
            'count', c.count
        ))
    """

# Transform to match API format (ported from full snapshot).
# Everything is expressed as one projection: each output column is listed
# once, in its final position, with its full expression.
df_transformed = df.select(
    # Numeric author id -> full OpenAlex URL.
    F.concat(
        F.lit("https://openalex.org/A"),
        F.col("id").cast("string"),
    ).alias("id"),
    "display_name",
    _or_empty(F.col("display_name_alternatives")).alias("display_name_alternatives"),
    "orcid",
    "works_count",
    "cited_by_count",
    "summary_stats",
    "ids",
    _or_empty(F.col("affiliations")).alias("affiliations"),
    _or_empty(F.col("last_known_institutions")).alias("last_known_institutions"),
    # Keep only the first five entries, then fall back to [] when null.
    _or_empty(F.slice(F.col("topics"), 1, 5)).alias("topics"),
    _or_empty(F.slice(F.col("topic_share"), 1, 5)).alias("topic_share"),
    _or_empty(F.expr(_x_concepts_sql)).alias("x_concepts"),
    _or_empty(F.col("sources")).alias("sources"),
    _or_empty(F.col("counts_by_year")).alias("counts_by_year"),
    "works_api_url",
    "updated_date",
    "created_date",
)

# Hand the API-shaped frame to the shared exporter under the "authors" entity.
export_all_formats(spark, dbutils, df_transformed, date_str, "authors")