In [ ]:
%run ./_utils

### Daily Snapshot — Works
Exports works updated on the snapshot day to JSONL and Parquet.
Ports the full 52-field transformation from the full snapshot.

For JSONL, uses hash-based salting when daily volume exceeds 10M records
to control file sizes.

In [ ]:
import json
import math
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType, MapType, StringType
from pyspark.sql.functions import udf

# Resolve the day being exported. get_snapshot_date is not defined in this
# notebook (presumably provided by the `%run ./_utils` cell -- confirm).
date_str = get_snapshot_date()
print(f"Snapshot date: {date_str}")


@udf(StringType())
def truncate_abstract_index_string(raw_json: str, max_bytes: int = 32760) -> str:
    """Return ``raw_json`` cut down to at most ``max_bytes`` UTF-8 bytes of valid JSON.

    The input is expected to be a JSON object serialized as a string whose
    values are arrays (the truncation relies on ``]`` closing a value and on
    ``}`` closing the object).

    Args:
        raw_json: Serialized JSON text, or None/empty.
        max_bytes: Upper bound on the UTF-8 encoded size of the result.

    Returns:
        * ``None`` when the input is empty, is not valid JSON, or no valid
          truncated form can be found.
        * ``raw_json`` unchanged when it already fits within ``max_bytes``.
        * Otherwise the longest prefix (among the candidates tried) that ends
          on a ``]`` and still parses as JSON once ``}`` is appended.

    Any unexpected error yields ``None`` so a single bad record cannot fail
    the Spark task.
    """
    try:
        if not raw_json:
            return None
        try:
            json.loads(raw_json)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None

        # A character needs at most 4 bytes in UTF-8, so a short enough string
        # is guaranteed to fit without paying for the encode.
        if len(raw_json) <= (max_bytes // 4):
            return raw_json
        encoded = raw_json.encode('utf-8')
        if len(encoded) <= max_bytes:
            return raw_json

        # Reserve one byte for the closing brace appended below so the result
        # never exceeds max_bytes. errors='ignore' drops a multi-byte
        # character that the byte cut may have split.
        budget = max(max_bytes - 1, 0)
        truncated = encoded[:budget].decode('utf-8', errors='ignore')

        # The last ']' is normally the end of a complete value array, but it
        # can also sit inside a key (e.g. the token "[1]"). Validate each
        # candidate and walk back to earlier brackets until one parses; the
        # attempt cap keeps pathological inputs from going quadratic.
        max_attempts = 16
        cut = truncated.rfind(']')
        for _ in range(max_attempts):
            if cut == -1:
                return None
            candidate = truncated[:cut + 1] + '}'
            try:
                json.loads(candidate)
            except ValueError:
                cut = truncated.rfind(']', 0, cut)
            else:
                return candidate
        return None
    except Exception:
        # Deliberate best-effort: a malformed value becomes null instead of
        # aborting the export.
        return None


def sanitize_name(col_name: str):
    """Build a Column expression that cleans up the text in ``col_name``.

    Three steps are applied in order:
      1. remove every character that is not a Unicode letter, number,
         punctuation mark, symbol or separator;
      2. replace each run of whitespace with a single space;
      3. trim the result.
    """
    disallowed_chars = r"[^\p{L}\p{N}\p{P}\p{S}\p{Z}]"
    whitespace_run = r"\s+"

    without_junk = F.regexp_replace(F.col(col_name), disallowed_chars, "")
    single_spaced = F.regexp_replace(without_junk, whitespace_run, " ")
    return F.trim(single_spaced)


def sanitize_string(col_name: str, max_len: int = 32000):
    """Build a Column with the first ``max_len`` characters of ``col_name``.

    Null input values stay null.
    """
    source = F.col(col_name)
    clipped = F.substring(source, 1, max_len)
    return F.when(source.isNotNull(), clipped).otherwise(None)


# Explicitly typed empty array, used further down as the fallback when
# sustainable_development_goals is null.
empty_sdg_array = F.array().cast(
    "array<struct<id:string,display_name:string,score:double>>"
)

In [ ]:
# Read daily slice and apply full 52-field transformation
# get_daily_df is not defined in this notebook (presumably from ./_utils --
# confirm); it is handed the source table name and the snapshot date.
df_raw = get_daily_df(spark, "openalex.works.openalex_works", date_str)

# The steps below are order-dependent: several columns are overwritten more
# than once and later steps read the already-rewritten values.
df_transformed = (
    df_raw
    # display_name starts as a copy of title; both are sanitized further down.
    .withColumn("display_name", F.col("title"))
    # Normalize the date columns to proper timestamp/date types.
    .withColumn("created_date", F.to_timestamp("created_date"))
    .withColumn("updated_date", F.to_timestamp("updated_date"))
    .withColumn("publication_date", F.to_date("publication_date"))
    # Rebuild each concept with its id turned into a full OpenAlex URL.
    .withColumn(
        "concepts",
        F.transform(
            F.col("concepts"),
            lambda c: F.struct(
                F.concat(F.lit("https://openalex.org/C"), c.id).alias("id"),
                c.wikidata.alias("wikidata"),
                c.display_name.alias("display_name"),
                c.level.alias("level"),
                c.score.alias("score")
            )
        )
    )
    # Null out dates that fall outside the accepted range.
    .withColumn(
        "created_date",
        F.when(
            F.col("created_date").between(F.lit("1000-01-01"), F.lit("9999-12-31")),
            F.col("created_date")
        ).otherwise(F.lit(None).cast("timestamp"))
    )
    .withColumn(
        "updated_date",
        F.when(
            F.col("updated_date").between(F.lit("1000-01-01"), F.lit("9999-12-31")),
            F.col("updated_date")
        ).otherwise(F.lit(None).cast("timestamp"))
    )
    # publication_date uses a tighter upper bound (2050-12-31).
    .withColumn(
        "publication_date",
        F.when(
            F.col("publication_date").between(F.lit("1000-01-01"), F.lit("2050-12-31")),
            F.col("publication_date")
        ).otherwise(F.lit(None).cast("date"))
    )
    # Turn the work id into a full OpenAlex URL.
    .withColumn("id", F.concat(F.lit("https://openalex.org/W"), F.col("id")))
    # Derived from the range-checked publication_date, so it is null when the
    # date was rejected above.
    .withColumn("publication_year", F.year("publication_date"))
    .withColumn("title", sanitize_name("title"))
    .withColumn("display_name", sanitize_name("display_name"))
    # In the ids map, only the "doi" entry is rewritten (prefixed with the
    # doi.org resolver URL); every other entry is kept as-is.
    .withColumn("ids",
        F.transform_values("ids",
            lambda k, v: F.when(k == "doi",
                    F.concat(F.lit("https://doi.org/"), v)).otherwise(v)
        )
    )
    # Cap plain string columns at sanitize_string's default length.
    .withColumn("doi", sanitize_string("doi"))
    .withColumn("language", sanitize_string("language"))
    .withColumn("type", sanitize_string("type"))
    .withColumn("abstract", sanitize_string("abstract"))
    # Prefix every referenced work id with the OpenAlex work URL.
    .withColumn("referenced_works",
                F.expr("transform(referenced_works, x -> 'https://openalex.org/W' || x)"))
    # Count is 0 (not null) when referenced_works is null.
    .withColumn("referenced_works_count",
                F.when(F.col("referenced_works").isNotNull(), F.size("referenced_works")).otherwise(0))
    # Validate / size-limit the serialized inverted index via the Python UDF.
    .withColumn("abstract_inverted_index", truncate_abstract_index_string(F.col("abstract_inverted_index")))
    # Rebuild open_access with the same four fields; only oa_status is capped.
    .withColumn("open_access", F.struct(
        F.col("open_access.is_oa"),
        sanitize_string("open_access.oa_status").alias("oa_status"),
        F.col("open_access.any_repository_has_fulltext"),
        F.col("open_access.oa_url")
    ))
    # Rebuild each authorship, capping raw_author_name and every
    # raw_affiliation_strings entry at 32000 characters.
    .withColumn("authorships", F.expr("""
        transform(authorships, x -> named_struct(
            'author', x.author,
            'affiliations', x.affiliations,
            'countries', x.countries,
            'raw_author_name', substring(x.raw_author_name, 1, 32000),
            'is_corresponding', x.is_corresponding,
            'raw_affiliation_strings', transform(x.raw_affiliation_strings, aff -> substring(aff, 1, 32000)),
            'institutions', x.institutions
        ))
    """))
    # Rebuild each location: is_published is derived from version, and the
    # two URL fields are capped at 32000 characters.
    .withColumn("locations", F.expr("""
        transform(locations, x -> named_struct(
            'native_id', x.native_id,
            'source', x.source,
            'is_oa', x.is_oa,
            'is_published', x.version = 'publishedVersion',
            'landing_page_url', substring(x.landing_page_url, 1, 32000),
            'pdf_url', substring(x.pdf_url, 1, 32000),
            'raw_source_name', x.raw_source_name,
            'raw_type', x.raw_type,
            'provenance', x.provenance,
            'license', x.license,
            'license_id', x.license_id,
            'version', x.version,
            'is_accepted', x.is_accepted
        ))
    """))
    # Keep at most the first 40 concepts.
    .withColumn("concepts", F.slice(F.col("concepts"), 1, 40))
    # indexed_in is computed from the rebuilt locations column: each location
    # contributes zero or more index names, which are flattened, stripped of
    # nulls, de-duplicated and sorted.
    .withColumn("indexed_in", F.expr("""
        array_sort(
            array_distinct(
                array_compact(
                    flatten(
                        TRANSFORM(locations, loc ->
                            CASE
                            WHEN loc.provenance IN ('crossref', 'pubmed', 'datacite')
                                THEN array(loc.provenance, IF(loc.source.is_in_doaj, 'doaj', NULL))
                            WHEN loc.provenance = 'repo' AND lower(loc.native_id) like 'oai:arxiv.org%'
                                THEN array('arxiv')
                            WHEN loc.provenance = 'repo' AND lower(loc.native_id) like 'oai:doaj.org/%'
                                THEN array('doaj')
                            WHEN loc.provenance = 'mag' AND lower(loc.source.display_name) = 'pubmed'
                                THEN array('pubmed')
                            ELSE array()
                            END
                        )
                    )
                )
            )
        )
    """))
    # True whenever the fulltext column is non-null.
    .withColumn("has_fulltext", F.col("fulltext").isNotNull())
    # Replace nulls with defaults (empty arrays, or 0 for fwci).
    # NOTE(review): F.lit([]) relies on lit() accepting a Python list, which
    # depends on the PySpark version -- confirm against the runtime in use.
    .withColumn("corresponding_author_ids", F.coalesce(F.col("corresponding_author_ids"), F.lit([])))
    .withColumn("corresponding_institution_ids", F.coalesce(F.col("corresponding_institution_ids"), F.lit([])))
    .withColumn("sustainable_development_goals", F.coalesce(F.col("sustainable_development_goals"), empty_sdg_array))
    .withColumn("related_works", F.coalesce(F.col("related_works"), F.lit([])))
    .withColumn("fwci", F.coalesce(F.col("fwci"), F.lit(0)))
    .withColumn("mesh", F.coalesce(F.col("mesh"), F.lit([])))
    .withColumn("authorships", F.coalesce(F.col("authorships"), F.lit([])))
    # Final column set and order of the export. Columns not touched above are
    # passed through from df_raw unchanged.
    # NOTE(review): this select lists 51 columns while the notebook text says
    # 52 -- confirm whether a field is missing.
    .select(
        "id",
        "doi",
        "title",
        "display_name",
        "ids",
        "indexed_in",
        "publication_date",
        "publication_year",
        "language",
        "type",
        "authorships",
        "authors_count",
        "corresponding_author_ids",
        "corresponding_institution_ids",
        "primary_topic",
        "topics",
        "keywords",
        "concepts",
        "locations",
        "locations_count",
        "primary_location",
        "best_oa_location",
        "sustainable_development_goals",
        "awards",
        "funders",
        "institutions",
        "countries_distinct_count",
        "institutions_distinct_count",
        "open_access",
        "is_paratext",
        "is_retracted",
        "is_xpac",
        "biblio",
        "abstract",
        "referenced_works",
        "referenced_works_count",
        "related_works",
        "abstract_inverted_index",
        "cited_by_count",
        "counts_by_year",
        "apc_list",
        "apc_paid",
        "fwci",
        "citation_normalized_percentile",
        "cited_by_percentile_year",
        "mesh",
        "has_abstract",
        "has_content",
        "has_fulltext",
        "created_date",
        "updated_date",
    )
)

# Only the lazy plan has been built at this point; nothing in this cell
# triggers a Spark action.
print(f"Transformation complete for {date_str}")

#### Export to all formats
For works, we use `export_all_formats` which caches the DataFrame,
writes JSONL (with abstract_inverted_index parsed to map) and Parquet,
then writes per-entity metadata for each.

In [ ]:
# Hand the transformed DataFrame to the shared exporter for the "works" entity.
# export_all_formats is not defined in this notebook (presumably provided by
# ./_utils -- confirm); both record-per-file limits are set to 500,000.
export_all_formats(
    spark, dbutils, df_transformed, date_str, "works",
    jsonl_records_per_file=500_000,
    columnar_records_per_file=500_000,
)