In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

from openalex.utils.environment import *

# Upstream works tables whose change feeds are merged into locations_parsed.
# Maps a short source key to the fully qualified table name for the current
# environment (ENV_SUFFIX is provided by openalex.utils.environment).
UPSTREAM_SOURCES = dict(
    crossref=f"openalex{ENV_SUFFIX}.crossref.crossref_works",
    datacite=f"openalex{ENV_SUFFIX}.datacite.datacite_works",
    pdf=f"openalex{ENV_SUFFIX}.pdf.pdf_works",
    pubmed=f"openalex{ENV_SUFFIX}.pubmed.pubmed_works",
    repo=f"openalex{ENV_SUFFIX}.repo.repo_works",
    landing_page=f"openalex{ENV_SUFFIX}.landing_page.landing_page_works",
    mag=f"openalex{ENV_SUFFIX}.mag.mag_dlt_works",
)

# Step 1: Union upstreams into a streaming view
@dlt.view(name="locations_parsed_union")
def locations_parsed_union():
    """Union the change data feeds of all upstream works tables into one streaming view.

    Every table in ``UPSTREAM_SOURCES`` is read as a stream with ``readChangeFeed``
    enabled. Only ``insert``, ``update_postimage`` and ``delete`` change rows are kept.
    Insert/update rows without a non-empty ``title`` are dropped. ``delete`` rows are
    always passed through, so that a record which lost its title upstream before
    being deleted is still removed from the target table.

    For the ``repo`` source, CiteSeerX records are excluded. For every other source a
    NULL string column ``raw_native_type`` is added when the source does not have one.

    Returns:
        A streaming DataFrame: the by-name union of all sources. Each source is first
        projected onto the repo table's columns (in the repo table's order); columns a
        source lacks are null-filled by ``unionByName(allowMissingColumns=True)``.
    """
    # Get canonical column order from repo (which has raw_native_type in correct position).
    # Only the column list is used, so no row limit is needed on this read. The
    # change-feed option is kept so the _change_type/_commit_* columns are part of
    # the canonical list and survive the select() below.
    canonical_columns = (
        spark.readStream
        .option("readChangeFeed", "true")
        .table(UPSTREAM_SOURCES["repo"])
        .columns
    )

    is_delete = F.col("_change_type") == "delete"
    # do not bother saving anything that has no title - it's a waste of space per Jason
    has_title = F.col("title").isNotNull() & (F.length(F.col("title")) > 0)

    dfs = []
    for key, table_name in UPSTREAM_SOURCES.items():
        df = (
            spark.readStream
            .option("readChangeFeed", "true")
            .table(table_name)
            .filter(F.col("_change_type").isin("insert", "update_postimage", "delete"))
            # Deletes bypass the title check: a delete row carries the values of the
            # deleted record, and dropping it because that record's title is empty
            # would leave a previously ingested row in the target forever.
            .filter(is_delete | has_title)
        )
        if key == "repo":
            # remove CiteSeerX (better suited upstream but more painful to re-run)
            # should this move to the locations_validated logic perhaps? That's where Sources become handy (after locations_w_sources)
            df = df.where("""NOT(native_id ILIKE 'oai:CiteSeerX%'
                OR (size(urls) = 1 AND urls[0].url ILIKE 'http://citeseerx.ist.psu.edu%'))""")
        elif "raw_native_type" not in df.columns:
            # Add raw_native_type as NULL for non-repo sources
            df = df.withColumn("raw_native_type", F.lit(None).cast("string"))

        # Select columns in canonical order (only columns that exist in this DataFrame)
        # This ensures raw_native_type appears before type for all sources
        available = set(df.columns)
        df = df.select(*[name for name in canonical_columns if name in available])

        dfs.append(df)

    # Each DataFrame now follows the canonical order for the columns it has; any
    # column a source is missing is filled with NULL by the by-name union.
    return reduce(lambda d1, d2: d1.unionByName(d2, allowMissingColumns=True), dfs)

# Step 2: Define the final SCD1 table and apply changes
# Declares the target streaming table that dlt.apply_changes writes into.
# Change data feed is enabled on the table, and Delta optimized writes and
# auto-compaction are turned on via table properties.
dlt.create_streaming_table(
    name="locations_parsed",
    # The description lists every source in UPSTREAM_SOURCES, including MAG.
    comment=f"Unified parsed works data in {ENV.upper()} environment from Crossref, DataCite, PDF, PubMed, Repo, Landing Page and MAG.",
    table_properties={
        "quality": "gold",
        "delta.enableChangeDataFeed": "true",
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
)

# Merge the unioned change feed into locations_parsed as an SCD type 1 table:
# one row per native_id, with changes ordered by updated_date.
# NOTE(review): a 'delete' change row presumably carries the deleted record's own
# updated_date, which may tie with the last upsert for that key - confirm that
# deletes are sequenced as intended.
dlt.apply_changes(
    source="locations_parsed_union",
    target="locations_parsed",
    keys=["native_id"],
    sequence_by="updated_date",
    # Change rows flagged as deletes remove the matching key instead of upserting it.
    apply_as_deletes="lower(_change_type) = 'delete'",
    # Change-feed metadata columns are kept out of the target table.
    except_column_list=["_change_type", "_commit_version", "_commit_timestamp"],
    # Optional safety for sparse updates (see the DLT apply_changes API for exact semantics).
    ignore_null_updates=True,
    stored_as_scd_type=1,
)