In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.0-py3-none-any.whl

In [0]:
import dlt
import pyspark.sql.functions as F

from openalex.dlt.normalize import normalize_title_udf, normalize_license, normalize_license_udf, walden_works_schema
from openalex.dlt.transform import apply_initial_processing, apply_final_merge_key_and_filter, enrich_with_features_and_author_keys

@dlt.table(name="mag_enriched", temporary=True, comment="MAG data after full parsing and enrichment.")
def mag_enriched():
    """Build the temporary ``mag_enriched`` dataset from the MAG works change feed.

    Streams ``openalex.mag.mag_works`` with the change feed option enabled,
    keeps rows whose ``_change_type`` is ``insert`` or ``update_postimage``,
    removes the change-feed metadata columns, and tags every row with
    ``provenance = "mag"``. The result is then passed, in order, through
    ``apply_initial_processing``, ``enrich_with_features_and_author_keys`` and
    ``apply_final_merge_key_and_filter`` from ``openalex.dlt.transform``.

    Returns:
        The streaming DataFrame produced by ``apply_final_merge_key_and_filter``.
    """
    # Columns added by the change feed; they are only needed for the filter below.
    cdf_metadata_columns = ("_change_type", "_commit_version", "_commit_timestamp")

    change_feed = (
        spark.readStream
        .option("readChangeFeed", "true")
        .table("openalex.mag.mag_works")
    )

    # Keep new rows and the post-update image of changed rows; every other
    # change type is dropped from the stream.
    current_rows = change_feed.filter(
        F.col("_change_type").isin("insert", "update_postimage")
    )

    tagged_rows = (
        current_rows
        .drop(*cdf_metadata_columns)
        .withColumn("provenance", F.lit("mag"))
    )

    return apply_final_merge_key_and_filter(
        enrich_with_features_and_author_keys(
            apply_initial_processing(tagged_rows, "mag", walden_works_schema)
        )
    )

# Declare the streaming table that the apply_changes call below writes into.
# IMPORTANT: the source is read from the fully qualified "openalex.mag.mag_works",
# but the name here carries no catalog or schema. The target is expected to land
# in a different catalog (e.g., 'openalex_dlt') — presumably the one set in the
# pipeline configuration; TODO confirm against the pipeline settings.
dlt.create_streaming_table(
    name="mag_dlt_works",  # Final target table name
    comment="Final MAG works table with unique identifiers and CDF applied.",
    # Turns on the Delta change data feed for this table.
    table_properties={"quality": "gold", "delta.enableChangeDataFeed": "true"}
)

# Apply rows from "mag_enriched" to "mag_dlt_works": rows are matched on
# native_id, and updated_date orders competing changes for the same key.
# NOTE(review): no delete condition or SCD type is passed, so the library
# defaults apply — confirm these are the intended ones.
dlt.apply_changes(
    target="mag_dlt_works",
    source="mag_enriched",
    keys=["native_id"],
    sequence_by="updated_date"
)
