In [0]:
import dlt
import pyspark.sql.functions as F

@dlt.table(name="crossref_grants")
def crossref_grants():
    """Return the Crossref items whose ``type`` is ``"grant"``.

    Reads ``openalex.crossref.crossref_items``, expands its ``items`` array
    into one row per element, and keeps only the grant rows.
    """
    crossref_items = spark.read.table("openalex.crossref.crossref_items")

    # inline() turns each struct in the `items` array into its own row, with
    # one top-level column per struct field.
    exploded_items = crossref_items.select(F.inline("items"))

    is_grant = F.col("type") == "grant"
    return exploded_items.where(is_grant)


@dlt.table(name="crossref_grants_deduplicated")
def crossref_grants_deduplicated():
    """Return ``crossref_grants`` reduced to a single row per ``DOI``.

    Within each DOI the rows are ranked by ``indexed.timestamp`` in
    descending order and only the first-ranked row is kept. ``row_number``
    gives tied rows distinct ranks, so exactly one row per DOI survives;
    which of several tied rows wins is not determined by this code.
    """
    from pyspark.sql.window import Window

    newest_first = F.col("indexed.timestamp").desc()
    per_doi = Window.partitionBy("DOI").orderBy(newest_first)

    ranked = dlt.read("crossref_grants").withColumn(
        "row_num", F.row_number().over(per_doi)
    )

    # The helper ranking column is dropped so the output schema matches the
    # input table.
    return ranked.where(F.col("row_num") == 1).drop("row_num")


def _date_from_parts(parts):
    """Build a date Column from an array Column of ``[year, month, day]``.

    The result is NULL unless the third element (the day) is non-null.
    """
    # NOTE(review): for partial dates such as [year] or [year, month] this
    # relies on out-of-range array indexing yielding NULL rather than raising;
    # that depends on the session's ANSI settings -- confirm.
    return F.when(
        parts[2].isNotNull(),
        F.make_date(parts[0], parts[1], parts[2]),
    )


def _affiliation_struct(affiliation):
    """Map one affiliation struct Column to ``(name, country, ids)``.

    ``ids`` is the affiliation's ``id`` array with every element reshaped to
    ``(id, type, asserted_by)``.
    """
    return F.struct(
        affiliation["name"].alias("name"),
        affiliation["country"].alias("country"),
        F.transform(
            affiliation["id"],
            lambda x: F.struct(
                x["id"].alias("id"),
                x["id-type"].alias("type"),
                x["asserted-by"].alias("asserted_by"),
            ),
        ).alias("ids"),
    )


def _investigator_struct(inv):
    """Map one investigator struct Column to the output investigator shape.

    Output fields: ``given_name``, ``family_name``, ``orcid``, ``role_start``
    (a date, NULL when the day part is missing) and ``affiliation`` (built
    from the first affiliation only, NULL when the affiliation array does not
    have at least one element).
    """
    affiliations = inv["affiliation"]
    return F.struct(
        inv["given"].alias("given_name"),
        inv["family"].alias("family_name"),
        inv["ORCID"].alias("orcid"),
        _date_from_parts(inv["role-start"]["date-parts"][0]).alias("role_start"),
        F.when(
            F.size(affiliations) > 0,
            _affiliation_struct(affiliations[0]),
        ).alias("affiliation"),
    )


def _first_investigator(column_name):
    """Return the investigator struct for element 0 of an array column.

    The result is NULL when the first element of ``column_name`` is NULL.
    """
    first = F.col(column_name)[0]
    return F.when(first.isNotNull(), _investigator_struct(first))


@dlt.table(name="crossref_awards")
def crossref_awards():
    """Flatten deduplicated Crossref grants into one award record per grant.

    Only the first ``project`` of each grant is used, and within it only the
    first ``funding`` entry and that entry's first funder identifier. The
    funder identifier is matched against ``openalex.common.funder`` by DOI or
    ROR with a left join, so grants without a matching funder are kept.

    Output columns: ``id``, ``display_name``, ``description``,
    ``funder_award_id``, ``amount``, ``currency``, ``funder``,
    ``funding_type``, ``funder_scheme``, ``provenance``, ``start_date``,
    ``end_date``, ``start_year``, ``end_year``, ``lead_investigator``,
    ``co_lead_investigator``, ``investigators``, ``landing_page_url``,
    ``doi``, ``created_date``, ``updated_date``.

    ``funder`` is NULL when no funder row matched; the raw Crossref funder
    name is only used as a fallback for a matched funder whose
    ``display_name`` is NULL.
    """
    df_grants = dlt.read("crossref_grants_deduplicated")
    df_funders = spark.read.table('openalex.common.funder')

    # Funder columns are prefixed with `f_` so they cannot collide with the
    # grant-side columns after the join.
    funder_lookup = df_funders.select(
        F.col("funder_id").alias("f_funder_id"),
        F.col("display_name").alias("f_display_name"),
        F.col("ror_id").alias("f_ror_id"),
        F.col("doi").alias("f_doi"),
    )

    # NOTE(review): this OR condition is not an equi-join, and a grant whose
    # identifier matches more than one funder row would be emitted once per
    # match -- confirm the funder table is unique on `doi` and `ror_id`.
    funder_match = (
        (F.col("funder_doi") == F.col("f_doi"))
        | (F.col("funder_ror_id") == F.col("f_ror_id"))
    )

    return (
        df_grants
        # From here on `project` is the first project struct, not the array.
        .withColumn("project", F.col("project").getItem(0))
        .withColumn("funding", F.col("project.funding")[0])
        .withColumn("start_parts", F.col("project.award-start.date-parts")[0])
        .withColumn("end_parts", F.col("project.award-end.date-parts")[0])
        # extract funder ID struct
        .withColumn("funder_id_struct", F.col("funding.funder.id")[0])
        .withColumn("funder_id_type", F.col("funder_id_struct")["id-type"])
        .withColumn("funder_id_value", F.col("funder_id_struct")["id"])
        # Only the column matching the identifier's type is populated; the
        # other stays NULL and therefore never satisfies the join condition.
        .withColumn(
            "funder_ror_id",
            F.when(F.col("funder_id_type") == "ROR", F.col("funder_id_value")),
        )
        .withColumn(
            "funder_doi",
            F.when(F.col("funder_id_type") == "DOI", F.col("funder_id_value")),
        )
        .withColumn("funder_name_raw", F.col("funding.funder.name"))
        # funding fields
        .withColumn("amount", F.col("funding")["award-amount"]["amount"])
        .withColumn("currency", F.col("funding")["award-amount"]["currency"])
        .withColumn("funding_type", F.col("funding")["type"])
        .withColumn("funder_scheme", F.col("funding")["scheme"])
        # dates and years
        .withColumn("start_year", F.col("start_parts")[0])
        .withColumn("end_year", F.col("end_parts")[0])
        .withColumn("start_date", _date_from_parts(F.col("start_parts")))
        .withColumn("end_date", _date_from_parts(F.col("end_parts")))
        # investigators
        .withColumn(
            "lead_investigator",
            _first_investigator("project.lead-investigator"),
        )
        .withColumn(
            "co_lead_investigator",
            _first_investigator("project.co-lead-investigator"),
        )
        .withColumn(
            "investigators",
            F.transform(F.col("project.investigator"), _investigator_struct),
        )
        .join(funder_lookup, funder_match, "left")
        .select(
            F.col("DOI").alias("id"),
            F.col("project.project-title")[0]["title"].alias("display_name"),
            F.col("project.project-description")[0]["description"].alias("description"),
            F.col("award").alias("funder_award_id"),
            "amount",
            "currency",
            F.when(
                F.col("f_funder_id").isNotNull(),
                F.struct(
                    F.concat(F.lit("https://openalex.org/F"), F.col("f_funder_id")).alias("id"),
                    F.coalesce(F.col("f_display_name"), F.col("funder_name_raw")).alias("display_name"),
                    F.col("f_ror_id").alias("ror_id"),
                    F.col("f_doi").alias("doi"),
                ),
            ).alias("funder"),
            "funding_type",
            "funder_scheme",
            F.lit("crossref").alias("provenance"),
            "start_date",
            "end_date",
            "start_year",
            "end_year",
            "lead_investigator",
            "co_lead_investigator",
            "investigators",
            F.col("resource.primary.URL").alias("landing_page_url"),
            F.col("URL").alias("doi"),
            F.to_timestamp(F.col("created.date-time")).alias("created_date"),
            F.to_timestamp(F.col("indexed.date-time")).alias("updated_date"),
        )
    )