In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
from functools import reduce
from pyspark.sql import functions as F

from openalex.utils.environment import *

# Maps a short source key to the fully-qualified upstream works table that is
# streamed into the unified locations_parsed pipeline. ENV_SUFFIX is spliced
# into the catalog name of every entry. Insertion order is preserved and is the
# order in which the sources are read and unioned.
UPSTREAM_SOURCES = dict(
    crossref=f"openalex{ENV_SUFFIX}.crossref.crossref_works",
    datacite=f"openalex{ENV_SUFFIX}.datacite.datacite_works",
    pdf=f"openalex{ENV_SUFFIX}.pdf.pdf_works",
    pubmed=f"openalex{ENV_SUFFIX}.pubmed.pubmed_works",
    repo=f"openalex{ENV_SUFFIX}.repo.repo_works",
    landing_page=f"openalex{ENV_SUFFIX}.landing_page.landing_page_works",
    mag=f"openalex{ENV_SUFFIX}.mag.mag_dlt_works",
)

# Step 1: Union upstreams into a streaming view
@dlt.view(name="locations_parsed_union")
def locations_parsed_union():
    """Union the change feeds of all upstream works tables into one streaming view.

    Every table in ``UPSTREAM_SOURCES`` is read as a stream with
    ``readChangeFeed`` enabled, keeping only rows whose ``_change_type`` is
    ``insert``, ``update_postimage`` or ``delete``. Sources other than the
    canonical one that lack ``raw_native_type`` get it added as a NULL string
    column. The streams are then unioned by column name and projected onto the
    column list of the canonical ``repo`` source, which fixes the output
    column order.

    Returns:
        A streaming DataFrame containing exactly the columns of the ``repo``
        change-feed stream, in that stream's order.

    Raises:
        ValueError: if ``UPSTREAM_SOURCES`` has no ``repo`` entry, because the
            output column order could not be determined.
    """
    # The repo source defines the canonical column order (it carries
    # raw_native_type in the expected position).
    canonical_source = "repo"
    if canonical_source not in UPSTREAM_SOURCES:
        # Fail early with a clear message instead of an opaque
        # "argument after * must be an iterable" TypeError at select() time.
        raise ValueError(
            f"UPSTREAM_SOURCES must contain '{canonical_source}': its columns "
            "define the column order of locations_parsed_union"
        )

    canonical_columns = None
    dfs = []
    for key, table_name in UPSTREAM_SOURCES.items():
        df = (
            spark.readStream
            .option("readChangeFeed", "true")
            .table(table_name)
            .filter(F.col("_change_type").isin("insert", "update_postimage", "delete"))
        )
        if key == canonical_source:
            canonical_columns = df.columns
        elif "raw_native_type" not in df.columns:
            # Add raw_native_type as NULL for sources that do not provide it.
            df = df.withColumn("raw_native_type", F.lit(None).cast("string"))
        dfs.append(df)

    # Union by name so differing column orders across sources line up;
    # columns missing from a source are filled with NULL.
    df_all = reduce(lambda d1, d2: d1.unionByName(d2, allowMissingColumns=True), dfs)

    # Rows with empty titles are intentionally NOT filtered out: doing so
    # excluded many pdf_works records that are needed for pdf_urls.
    return df_all.select(*canonical_columns)

# Step 2: Define the final SCD1 table and apply changes
# Target streaming table for the unified works data. The comment lists every
# upstream in UPSTREAM_SOURCES (MAG was previously missing from the text).
# Change data feed is enabled on the table, along with optimized writes and
# auto-compaction.
dlt.create_streaming_table(
    name="locations_parsed",
    comment=f"Unified parsed works data in {ENV.upper()} environment from Crossref, DataCite, PDF, PubMed, Repo, Landing Page and MAG.",
    table_properties={
        "quality": "gold",
        "delta.enableChangeDataFeed": "true",
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
)

dlt.apply_changes(
    # Wiring: merge the unioned change-feed view into the SCD type 1 target.
    source="locations_parsed_union",
    target="locations_parsed",
    stored_as_scd_type=1,
    # Rows are matched on native_id and ordered by updated_date.
    keys=["native_id"],
    sequence_by="updated_date",
    # Change-feed rows marked as deletes are applied as deletes on the target.
    apply_as_deletes="lower(_change_type) = 'delete'",
    # Change-feed metadata columns are excluded from the target table.
    except_column_list=["_change_type", "_commit_version", "_commit_timestamp"],
    # ignore_null_updates:
    #   True  -> NULL fields in an incoming update do not overwrite existing
    #            values (useful when upstream sends sparse updates and the
    #            existing data should be preserved).
    #   False -> every update is applied as-is, so a NULL in the update
    #            overwrites the stored value.
    # Set to False (changed from True) so that all upstream updates
    # propagate, including sparse partial ones.
    ignore_null_updates=False,
)