In [0]:
import dlt
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

# Upstream tables feeding the unified table, keyed by a short source label.
# Each value is the table identifier handed to `spark.readStream...table(...)`
# in `locations_parsed`, which iterates the values in insertion order.
UPSTREAM_SOURCES = {
    "crossref": "openalex_dev.crossref.crossref_works",
    "datacite": "openalex_dev.datacite.datacite_works",
    "pdf": "openalex_dev.pdf.pdf_works",
    "pubmed": "openalex_dev.pubmed.pubmed_works",
    "repo": "openalex_dev.repo.repo_works",
    "landing_page": "openalex_dev.landing_page.landing_page_works",
}

@dlt.table(
    name="locations_parsed",
    comment="Unified parsed works data from Crossref, DataCite, PDF, PubMed, Repo and Landing Page.",
    cluster_by=["provenance", "native_id"],
    table_properties={
        "delta.enableChangeDataFeed": "true",
        "delta.autoOptimize.optimizeWrite": "true",
        "delta.autoOptimize.autoCompact": "true",
    },
)
def locations_parsed():
    """Stream the change feeds of all upstream tables into one DataFrame.

    For every table listed in ``UPSTREAM_SOURCES`` the change data feed is
    read as a stream, only rows whose ``_change_type`` is ``insert`` or
    ``update_postimage`` are kept, and the change-feed metadata columns are
    dropped. The per-source streams are then combined with ``unionByName``.

    Returns:
        The union (by column name) of the filtered change streams, in the
        iteration order of ``UPSTREAM_SOURCES``.
    """
    # Change types that carry the current version of a row; other change
    # types emitted by the feed are filtered out.
    kept_change_types = ("insert", "update_postimage")
    # Columns added by the change feed that are removed before the union.
    feed_metadata_columns = ("_change_type", "_commit_version", "_commit_timestamp")

    def _stream_changes(table_name):
        """Return the filtered change-feed stream for one upstream table."""
        change_feed = spark.readStream.option("readChangeFeed", "true").table(table_name)
        carries_current_row = F.col("_change_type").isin(*kept_change_types)
        return change_feed.filter(carries_current_row).drop(*feed_metadata_columns)

    source_streams = [
        _stream_changes(table_name) for table_name in UPSTREAM_SOURCES.values()
    ]

    # NOTE(review): unionByName is called without allowMissingColumns, so the
    # upstream tables presumably share the same column set — confirm.
    return reduce(
        lambda merged, upcoming: merged.unionByName(upcoming),
        source_streams,
    )
