In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

from openalex.utils.environment import *

# Fully-qualified upstream works tables, keyed by a short source name.
# ENV_SUFFIX is presumably supplied by the star import of
# openalex.utils.environment above -- TODO confirm.
UPSTREAM_SOURCES = dict(
    crossref=f"openalex{ENV_SUFFIX}.crossref.crossref_works",
    datacite=f"openalex{ENV_SUFFIX}.datacite.datacite_works",
    pdf=f"openalex{ENV_SUFFIX}.pdf.pdf_works",
    pubmed=f"openalex{ENV_SUFFIX}.pubmed.pubmed_works",
    repo=f"openalex{ENV_SUFFIX}.repo.repo_works",
    landing_page=f"openalex{ENV_SUFFIX}.landing_page.landing_page_works",
    mag=f"openalex{ENV_SUFFIX}.mag.mag_dlt_works",
)

# Step 1: Expose every upstream change feed as a single streaming view
@dlt.view(name="locations_parsed_union")
def locations_parsed_union():
    """Return the union of the change feeds of all tables in UPSTREAM_SOURCES.

    Each table is read as a stream with ``readChangeFeed`` enabled, and only
    rows whose ``_change_type`` is ``insert``, ``update_postimage`` or
    ``delete`` are kept. The per-source streams are then combined by column
    name, in the iteration order of ``UPSTREAM_SOURCES``.
    """
    kept_change_types = ("insert", "update_postimage", "delete")

    def read_change_feed(table_name):
        # Streaming read of one upstream table's change feed, pre-filtered.
        feed = spark.readStream.option("readChangeFeed", "true").table(table_name)
        return feed.filter(F.col("_change_type").isin(*kept_change_types))

    feeds = [read_change_feed(table_name) for table_name in UPSTREAM_SOURCES.values()]
    # unionByName matches columns by name rather than by position.
    return reduce(lambda merged, feed: merged.unionByName(feed), feeds)

# Step 2: Define the final SCD1 table and apply changes
# The table is declared empty here; it is populated by the dlt.apply_changes
# call that targets "locations_parsed".
dlt.create_streaming_table(
    name="locations_parsed",
    # Keep the source list in this comment in sync with UPSTREAM_SOURCES
    # (it previously omitted MAG, which is one of the unioned upstreams).
    comment=f"Unified parsed works data in {ENV.upper()} environment from Crossref, DataCite, PDF, PubMed, Repo, Landing Page and MAG.",
    table_properties={
        "quality": "gold",
        # Enable the Delta change data feed on this table.
        "delta.enableChangeDataFeed": "true",
        # Delta auto-optimize settings for writes and small-file compaction.
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
)

# Merge the unioned change rows into the target table as SCD type 1,
# keyed on native_id and ordered by updated_date.
# NOTE(review): a change-feed 'delete' row presumably carries the deleted
# row's last updated_date, so it may tie with the preceding upsert for the
# same key -- confirm that tie ordering is acceptable.
# NOTE(review): ignore_null_updates=True also means a column genuinely set
# to NULL upstream keeps its old value in the target -- confirm intended.
dlt.apply_changes(
    target="locations_parsed",
    source="locations_parsed_union",
    stored_as_scd_type=1,
    keys=["native_id"],
    sequence_by="updated_date",
    # Rows whose change type is 'delete' are applied as deletes.
    apply_as_deletes="lower(_change_type) = 'delete'",
    # Null values in an incoming row leave the existing target value in place.
    ignore_null_updates=True,
    # Change-feed metadata columns are excluded from the target table.
    except_column_list=["_change_type", "_commit_version", "_commit_timestamp"],
)