In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

from openalex.utils.environment import *

# Shared leading segment of every upstream table name; ENV_SUFFIX comes from
# openalex.utils.environment and varies per deployment environment.
_TABLE_PREFIX = f"openalex{ENV_SUFFIX}"

# Short source label -> fully qualified three-part table name of that source's
# parsed works. Insertion order is significant: locations_parsed() unions the
# tables in this order.
UPSTREAM_SOURCES = dict(
    crossref=f"{_TABLE_PREFIX}.crossref.crossref_works",
    datacite=f"{_TABLE_PREFIX}.datacite.datacite_works",
    pdf=f"{_TABLE_PREFIX}.pdf.pdf_works",
    pubmed=f"{_TABLE_PREFIX}.pubmed.pubmed_works",
    repo=f"{_TABLE_PREFIX}.repo.repo_works",
    landing_page=f"{_TABLE_PREFIX}.landing_page.landing_page_works",
    # MAG is the only source whose table name carries a "_dlt" infix.
    mag=f"{_TABLE_PREFIX}.mag.mag_dlt_works",
)

@dlt.table(
    name="locations_parsed",
    comment=f"Unified parsed works data in {ENV.upper()} environment from Crossref, DataCite, PDF, PubMed, Repo, Landing Page and MAG.",
    cluster_by=["provenance", "native_id"],
    table_properties={
        "delta.enableChangeDataFeed": "true",
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true"
    }
)
def locations_parsed():
    """Build the unified ``locations_parsed`` streaming table.

    Opens a streaming read on every table listed in ``UPSTREAM_SOURCES`` and
    unions them into a single DataFrame, matching columns by name.

    Returns:
        A streaming DataFrame containing the rows of all upstream sources.

    Notes:
        ``unionByName`` is called without ``allowMissingColumns``, so every
        upstream table has to expose the same set of column names; a source
        with a missing or extra column makes the union fail rather than
        silently null-filling.
    """
    # Plain streaming table reads, one per upstream source, in the insertion
    # order of UPSTREAM_SOURCES.
    # NOTE(review): an earlier comment said these reads use Change Data Feed,
    # but no change-feed option is set here. Change Data Feed is only enabled
    # on the *output* table via table_properties above. Confirm the upstream
    # tables are append-only, or whether updates/deletes need explicit handling.
    source_streams = [
        spark.readStream.table(table_name)
        for table_name in UPSTREAM_SOURCES.values()
    ]

    # Fold the per-source streams pairwise into one DataFrame; columns are
    # aligned by name, not by position.
    return reduce(
        lambda combined, nxt: combined.unionByName(nxt),
        source_streams,
    )
