In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.1-py3-none-any.whl

In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.window import Window

from openalex.utils.environment import *

@dlt.table(
    name="base_sources",
    table_properties={"quality": "bronze"},
    comment="Sources from original postgresql table."
)
def base_sources():
    """Load the Postgres-exported sources and parse ``issns`` into an array.

    The ``is_in_doaj`` column of the input is dropped. The ``issns`` column
    is rewritten by stripping one leading ``[`` and one trailing ``]``,
    removing every double quote, and splitting what is left on commas
    (together with any whitespace that follows them). A null ``issns``
    stays null.

    Returns:
        The source DataFrame with ``issns`` replaced by the parsed array.
    """
    # presumably a JSON-style text array such as '["1234-5678", "8765-4321"]'
    # -- TODO confirm against the upstream table.
    issns_text = col('issns')

    unbracketed = regexp_replace(issns_text, r'^\[|\]$', '')
    unquoted = regexp_replace(unbracketed, r'["]', '')

    # NOTE(review): a literal '[]' becomes '' after the replacements and
    # splitting '' yields [''] (one empty string), not an empty array.
    issns_array = when(issns_text.isNull(), None).otherwise(
        split(unquoted, r',\s*')
    )

    postgres_sources = spark.table("openalex.sources.sources_from_postgres")
    return postgres_sources.drop("is_in_doaj").withColumn('issns', issns_array)

@dlt.table(
    name="crossref_journals_unmatched",
    comment="Crossref journals that have NO matching ISSNs with existing base sources"
)
def crossref_journals_unmatched():
    """Return Crossref journals that share no ISSN with ``base_sources``.

    A Crossref journal (keyed by ``issns_concat_id``) is excluded as soon as
    any one of its ISSNs appears in ``base_sources``; such journals are
    ignored for now rather than merged into the existing source. The
    remaining journals are reshaped into source-like rows with the columns
    ``display_name``, ``issns``, ``publisher``, ``is_oa`` (always False),
    ``type`` (always "journal"), ``id`` and ``issn`` (first element of
    ``issns``, or null when the array has no elements).
    """
    # New ids are generated above this value.
    id_offset = 4500000000
    gold_table = "openalex.sources.crossref_journals_gold"

    # Every distinct ISSN already attached to a base source.
    known_issns = (
        dlt.read("base_sources")
        .select(explode("issns").alias("issn"))
        .distinct()
    )

    # Keys of Crossref journals with at least one ISSN already known.
    matched_keys = (
        spark.table(gold_table)
        .select("*", explode("issns").alias("issn"))
        .join(known_issns, "issn", "inner")
        .select("issns_concat_id")
        .distinct()
    )

    # Anti-join keeps only journals with no matching ISSN at all.
    unmatched = spark.table(gold_table).join(
        matched_keys, "issns_concat_id", "left_anti"
    )

    first_issn = when(size(col("issns")) > 0, col("issns")[0]).otherwise(lit(None))

    # NOTE(review): monotonically_increasing_id() is unique but not
    # consecutive and depends on partitioning, so the generated ids are not
    # guaranteed to be stable between pipeline refreshes -- confirm that
    # downstream consumers tolerate this.
    generated_id = monotonically_increasing_id() + id_offset + 1

    return unmatched.select(
        col("title").alias("display_name"),
        col("issns"),
        col("publisher"),
        lit(False).alias("is_oa"),
        lit("journal").alias("type"),
        generated_id.alias("id"),
        first_issn.alias("issn"),
    )

@dlt.table(
    name="sources",
    comment=f"Combined sources with DOAJ status and sample PMH records in {ENV.upper()}"
)
def sources():
    """Build the final sources table with OA flags and sample PMH records.

    Pipeline:
      1. Union ``base_sources`` with ``crossref_journals_unmatched``.
      2. For sources that have ISSNs, flag DOAJ membership and "high OA
         rate" membership per ISSN (curation requests override the
         high-OA-rate table), then collapse back to one row per source with
         a sorted, de-duplicated ``issns`` array.
      3. De-duplicate sources that share the same ``issn``, preferring the
         row with more ISSNs, then the longer ``display_name``, then the
         lower ``id``.
      4. Pass sources without ISSNs through with both flags False.
      5. Force ``is_oa`` to True when either flag is set.
      6. Attach ``sample_pmh_record`` from the endpoint mapping and drop the
         upstream "Deleted Journal" placeholder.

    Returns:
        One row per source ``id`` with the added columns ``is_in_doaj``,
        ``is_oa_high_oa_rate``, ``is_in_doaj_start_year``,
        ``high_oa_rate_start_year`` and ``sample_pmh_record``.
    """
    # Upstream "Deleted Journal" placeholder source, excluded from the output.
    deleted_journal_source_id = 4317411217

    # Combine base sources with unmatched crossref journals.
    base_combined = (
        dlt.read("base_sources")
        .unionByName(
            dlt.read("crossref_journals_unmatched"),
            allowMissingColumns=True
        )
    )

    # DOAJ ISSNs, one row per (ISSN, OA start year).
    doaj = (
        spark.table("openalex.sources.doaj_from_csv")
        .selectExpr("explode(issns) as doaj_issn", "oa_start_year")
        .distinct()
    )

    # Approved curation requests, reshaped to the (issn_l, oa_year) layout of
    # the high-OA-rate table. A missing date falls back to the year 1800.
    # NOTE(review): new_oa_date is cast straight to int, which only yields a
    # year if the column holds a bare year -- confirm against the table.
    curation_requests = (
        spark.table("openalex.unpaywall.journal_curation_requests")
        .filter(col("new_is_oa") == True)
        .select("issn", "new_oa_date")
        .withColumnRenamed("issn", "issn_l")
        .withColumn("oa_year",
            when(col("new_oa_date").isNull(), lit(1800))
            .otherwise(col("new_oa_date").cast("int"))
        )
        .drop("new_oa_date")
        .distinct()
    )

    base_high_oa_rate_issns = (
        spark.table("openalex.sources.high_oa_rate_issns")
        .select("issn_l", "oa_year")
        .distinct()
    )

    # On conflict the curation request wins: remove curated ISSNs from the
    # base table first, then append the curated rows.
    high_oa_rate_issns = (
        base_high_oa_rate_issns
        .join(curation_requests, on="issn_l", how="left_anti")
        .unionByName(curation_requests)
        .distinct()
    )

    # A source "has ISSNs" only when the array is non-null AND non-empty.
    # explode() emits no rows for an empty array, so routing empty arrays
    # through the ISSN branch would silently drop those sources.
    has_issns = col("issns").isNotNull() & (size(col("issns")) > 0)

    # Every original column except the key and the array that gets rebuilt.
    passthrough_columns = [
        c for c in base_combined.columns if c not in ("id", "issns")
    ]

    # Final `id` key makes the choice among otherwise tied rows reproducible.
    issn_dedup_window = Window.partitionBy("issn").orderBy(
        size("issns").desc(),
        length("display_name").desc(),
        col("id").asc()
    )

    sources_with_issns = (
        base_combined
        .where(has_issns)
        .withColumn("exploded_issn", explode(col("issns")))
        .join(doaj, col("exploded_issn") == doaj["doaj_issn"], "left")
        .join(high_oa_rate_issns, col("exploded_issn") == high_oa_rate_issns["issn_l"], "left")
        .withColumn("is_in_doaj", doaj["doaj_issn"].isNotNull())
        .withColumn("is_oa_high_oa_rate", high_oa_rate_issns["issn_l"].isNotNull())
        .drop("doaj_issn", "issn_l")
        .groupBy("id", *passthrough_columns)
        # `max` here is the Spark aggregate brought in by the wildcard
        # import, not the Python builtin. Sorting the collected set keeps
        # the rebuilt issns array deterministic.
        .agg(
            array_sort(collect_set("exploded_issn")).alias("issns"),
            max("is_in_doaj").alias("is_in_doaj"),
            max("is_oa_high_oa_rate").alias("is_oa_high_oa_rate"),
            max("oa_start_year").alias("oa_start_year"),
            max("oa_year").alias("high_oa_rate_start_year")
        )
        .withColumnRenamed("oa_start_year", "is_in_doaj_start_year")
        # De-duplicate by ISSN; rows with a null `issn` are all kept.
        .withColumn("rank", row_number().over(issn_dedup_window))
        .filter((col("issn").isNull()) | (col("rank") == 1))
        .drop("rank")
    )

    # Sources with null or empty `issns` cannot match DOAJ or the
    # high-OA-rate list.
    sources_without_issns = (
        base_combined
        .where(~has_issns)
        .withColumn("is_in_doaj", lit(False))
        .withColumn("is_oa_high_oa_rate", lit(False))
        .withColumn("is_in_doaj_start_year", lit(None).cast("int"))
        .withColumn("high_oa_rate_start_year", lit(None).cast("int"))
        .dropDuplicates(["id"])
    )

    # Combine ISSN and non-ISSN records.
    sources_with_doaj_and_oa = (
        sources_with_issns
        .unionByName(sources_without_issns)
        .dropDuplicates(["id"])
    )

    # Either OA signal forces is_oa to True; otherwise keep the incoming value.
    sources_with_updated_oa = (
        sources_with_doaj_and_oa
        .withColumn("is_oa",
                    when(col("is_in_doaj") | col("is_oa_high_oa_rate"), lit(True))
                    .otherwise(col("is_oa")))
    )

    # NOTE(review): assumes endpoint_id is unique in endpoint_mapping;
    # duplicates there would duplicate source rows -- confirm.
    endpoints_table = (
        spark.table("openalex.sources.endpoint_mapping")
        .select("endpoint_id", "sample_pmh_record")
    )

    final_result = (
        sources_with_updated_oa
        .join(
            endpoints_table,
            sources_with_updated_oa["repository_id"] == endpoints_table["endpoint_id"],
            "left"
        )
        # Remove the "Deleted Journal" source record from upstream.
        .filter(col("id") != deleted_journal_source_id)
        .drop("endpoint_id")
    )
    return final_result