In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
import pyspark.sql.functions as F

from openalex.utils.environment import *

from openalex.dlt.normalize import normalize_title_udf, normalize_license, normalize_license_udf, walden_works_schema
from openalex.dlt.transform import apply_initial_processing, apply_final_merge_key_and_filter, enrich_with_features_and_author_keys

@dlt.table(name="mag_enriched", comment="MAG data after full parsing and enrichment.")
def mag_enriched():
    """Build the streaming ``mag_enriched`` table from raw MAG works.

    Reads ``openalex.mag.mag_works`` as a stream, stamps every row with a
    ``provenance`` value of ``"mag"``, removes rows whose ``source_name`` is
    exactly "Deleted Journal", and then hands the stream to the three shared
    transform helpers (initial processing, enrichment, final merge-key/filter)
    in that order.

    Returns:
        The DataFrame returned by ``apply_final_merge_key_and_filter``.
    """
    source_name = F.col("source_name")
    # Rows with a NULL source_name are kept on purpose: the inequality alone
    # evaluates to NULL for them, which the filter would treat as "drop".
    not_deleted_journal = source_name.isNull() | (source_name != "Deleted Journal")

    # Plain streaming read of the table; no change-data-feed options are set here.
    raw_stream = spark.readStream.table("openalex.mag.mag_works")
    tagged_stream = raw_stream.withColumn("provenance", F.lit("mag"))
    kept_stream = tagged_stream.filter(not_deleted_journal)

    return apply_final_merge_key_and_filter(
        enrich_with_features_and_author_keys(
            apply_initial_processing(kept_stream, "mag", walden_works_schema)
        )
    )

# Declare the streaming table that receives the merged MAG works.
# ENV is expected to come from the star import of openalex.utils.environment.
dlt.create_streaming_table(
    name="mag_dlt_works",
    table_properties={
        "quality": "gold",
        # Turn on the Delta change data feed for this target table.
        "delta.enableChangeDataFeed": "true",
    },
    comment=f"Final MAG works table with unique identifiers and CDF applied in {ENV.upper()} environment.",
)

# Merge rows from mag_enriched into mag_dlt_works: rows are matched on
# native_id, and updated_date gives the ordering of changes for a given key.
dlt.apply_changes(
    source="mag_enriched",
    target="mag_dlt_works",
    keys=["native_id"],
    sequence_by="updated_date",
)