### Install the utils library, then import and register the UDFs

In [0]:
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.3-py3-none-any.whl
# The %pip line above installs the pinned openalex_dlt_utils wheel (0.2.3) from a /Volumes path.
# NOTE(review): HumanName is not referenced directly in this cell; presumably it is
# imported because the UDFs rely on nameparser being importable - confirm before removing.
from nameparser import HumanName # Will be installed via pipeline libraries
from openalex.dlt.normalize import normalize_title_udf, udf_last_name_only
# Register both UDFs under SQL-callable names; the %sql UPDATE cells further down
# invoke them as normalize_title_udf(...) and udf_last_name_only(...).
spark.udf.register("normalize_title_udf", normalize_title_udf)
spark.udf.register("udf_last_name_only", udf_last_name_only)

### Sync Tables from PROD

In [0]:
%sql
-- Incremental pull of work rows from prod into openalex.mid.work.
-- Two watermarks are read from the local copy: the highest paper_id and the
-- latest full_updated_date. Each DEFAULT is only a placeholder; the SET that
-- follows it overwrites the value.
-- NOTE: on an empty local table both MAX() calls yield NULL, so the filter
-- below selects no rows.
DECLARE OR REPLACE VARIABLE walden_paper_id BIGINT DEFAULT 9999999999;
SET VARIABLE walden_paper_id = (
  SELECT MAX(paper_id)
  FROM openalex.mid.work
);

DECLARE OR REPLACE VARIABLE walden_full_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_full_updated_date = (
  SELECT MAX(full_updated_date)
  FROM openalex.mid.work
);

-- Upsert by paper_id: rows that are new (id above the watermark) or touched
-- since the last sync (full_updated_date above the watermark).
MERGE INTO openalex.mid.work AS tgt
USING (
  SELECT *
  FROM openalex_postgres.mid.work  -- federated foreign table
  WHERE full_updated_date > walden_full_updated_date
     OR paper_id > walden_paper_id
) AS src
  ON tgt.paper_id = src.paper_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

In [0]:
%sql
-- Incremental sync of openalex.mid.affiliation from the federated prod table.
-- The watermark is the latest updated_date already present locally; the DEFAULT
-- is a placeholder that the SET below overwrites.
DECLARE OR REPLACE VARIABLE walden_affiliation_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_affiliation_updated_date = (SELECT max(updated_date) from openalex.mid.affiliation);

MERGE INTO openalex.mid.affiliation AS target
USING (
      SELECT *
      FROM openalex_postgres.mid.affiliation -- federated foreign table
      WHERE updated_date > walden_affiliation_updated_date
        AND paper_id IS NOT NULL AND author_id IS NOT NULL
      -- Keep only the newest row per (paper, author, affiliation) so that the
      -- MERGE never sees two source rows for the same target key.
      QUALIFY row_number() OVER (PARTITION BY paper_id, author_id, affiliation_id ORDER BY updated_date DESC) = 1
) AS source
ON target.paper_id = source.paper_id
  AND target.author_id = source.author_id
  -- affiliation_id is not filtered for NULL above, and the PARTITION BY groups
  -- NULLs together. A plain `=` would never match those rows, so every refresh
  -- would INSERT a duplicate instead of updating. `<=>` is the null-safe equality.
  AND target.affiliation_id <=> source.affiliation_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;

In [0]:
%sql
-- Incremental pull of author rows from prod into openalex.mid.author.
-- Watermark: the latest full_updated_date already stored locally (the DEFAULT
-- is a placeholder that the SET overwrites).
DECLARE OR REPLACE VARIABLE walden_author_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_author_updated_date = (
  SELECT MAX(full_updated_date)
  FROM openalex.mid.author
);

-- Upsert by author_id.
MERGE INTO openalex.mid.author AS tgt
USING (
  SELECT *
  FROM openalex_postgres.mid.author  -- federated foreign table
  WHERE author_id IS NOT NULL
    -- TODO: check with Casey which range of authors is to be ignored
    -- (too many in the table - 400M+)
    AND full_updated_date > walden_author_updated_date
) AS src
  ON tgt.author_id = src.author_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

In [0]:
%sql
-- Incremental pull of extra work identifiers into openalex.mid.work_extra_ids.
-- Watermark: the highest paper_id already stored locally (the DEFAULT is a
-- placeholder that the SET overwrites). Only rows for paper_ids above that
-- watermark are read from prod.
DECLARE OR REPLACE VARIABLE walden_work_extra_id BIGINT DEFAULT 9999999999;
SET VARIABLE walden_work_extra_id = (
  SELECT MAX(paper_id)
  FROM openalex.mid.work_extra_ids
);

-- Upsert keyed on (paper_id, attribute_type); a match refreshes only the value.
MERGE INTO openalex.mid.work_extra_ids AS tgt
USING (
  SELECT DISTINCT *
  FROM openalex_postgres.mid.work_extra_ids  -- federated foreign table
  WHERE paper_id IS NOT NULL
    AND paper_id > walden_work_extra_id
) AS src
  ON  tgt.paper_id = src.paper_id
  AND tgt.attribute_type = src.attribute_type
WHEN MATCHED THEN
  UPDATE SET tgt.attribute_value = src.attribute_value
WHEN NOT MATCHED THEN
  INSERT *;

### Update missing `normalized_title` and `match_author`

In [0]:
%sql
-- Fill in match_author wherever it is still missing. The UDF takes an array of
-- {name} structs; we wrap the single original_author and read author_key from
-- the first element of its result.
UPDATE openalex.mid.affiliation
   SET match_author =
         udf_last_name_only(
           array(named_struct('name', original_author))
         )[0].author_key
 WHERE match_author IS NULL;

In [0]:
%sql
-- Fill in normalized_title wherever it is still missing, derived from
-- original_title via the registered UDF.
UPDATE openalex.mid.work
   SET normalized_title = normalize_title_udf(original_title)
 WHERE normalized_title IS NULL;

### Update or insert fresh prod data into `work_id_map` by `paper_id`.
If nothing has changed in prod since the last run, the merge performs 0 inserts and 0 updates.

In [0]:
%sql
-- Upsert one row per paper_id into the environment-specific work_id_map.
-- The source collapses work + first-author affiliation + extra ids
-- (attribute_type = 2, stored as pmid) to a single row per paper via MAX().
MERGE INTO identifier('openalex' || :env_suffix || '.works.work_id_map') AS tgt
USING (
  SELECT
    wk.paper_id,
    -- IMPORTANT - we store clean DOI in the map
    MAX(wk.doi_lower)       AS doi,
    MAX(xid.attribute_value) AS pmid,
    MAX(wk.arxiv_id)        AS arxiv,
    -- normalized title, suffixed with the first author's key when one exists
    MAX(
      CASE
        WHEN aff.match_author IS NOT NULL
          THEN CONCAT(wk.normalized_title, '_', aff.match_author)
        ELSE wk.normalized_title
      END
    ) AS title_author,
    MAX(to_date(wk.created_date))          AS created_dt,
    MAX(try_to_timestamp(wk.updated_date)) AS updated_dt
  FROM openalex.mid.work AS wk
  LEFT JOIN openalex.mid.affiliation AS aff
         ON wk.paper_id = aff.paper_id
        AND aff.author_sequence_number = 1
  LEFT JOIN openalex.mid.work_extra_ids AS xid
         ON wk.paper_id = xid.paper_id
        AND xid.attribute_type = 2
  GROUP BY wk.paper_id
) AS src
  ON tgt.paper_id = src.paper_id
-- Only touch a matched row when prod supplies a non-null identifier that
-- differs from what the map already holds.
WHEN MATCHED AND (
     (src.doi          IS NOT NULL AND tgt.doi          IS DISTINCT FROM src.doi)
  OR (src.pmid         IS NOT NULL AND tgt.pmid         IS DISTINCT FROM src.pmid)
  OR (src.arxiv        IS NOT NULL AND tgt.arxiv        IS DISTINCT FROM src.arxiv)
  OR (src.title_author IS NOT NULL AND tgt.title_author IS DISTINCT FROM src.title_author)
)
THEN UPDATE SET
  tgt.paper_id            = src.paper_id,
  -- never overwrite an existing identifier with NULL
  tgt.doi                 = COALESCE(src.doi, tgt.doi),
  tgt.pmid                = COALESCE(src.pmid, tgt.pmid),
  tgt.arxiv               = COALESCE(src.arxiv, tgt.arxiv),
  tgt.title_author        = COALESCE(src.title_author, tgt.title_author),
  tgt.openalex_created_dt = src.created_dt,
  tgt.openalex_updated_dt = src.updated_dt
WHEN NOT MATCHED THEN INSERT
  (paper_id, doi, pmid, arxiv, title_author, openalex_created_dt, openalex_updated_dt)
VALUES
  (src.paper_id, src.doi, src.pmid, src.arxiv,
   src.title_author, src.created_dt, src.updated_dt);

### Rebuild `work_authorships_backfill` once `affiliation` and `author` are refreshed

In [0]:
%sql
-- Rebuilds openalex.authors.work_authorships_backfill from scratch: one row per
-- paper_id with an ordered `authorships` array plus the ids of corresponding
-- authors and their institutions.
CREATE OR REPLACE TABLE openalex.authors.work_authorships_backfill CLUSTER BY AUTO
AS
-- Get lineage for each institution (keep raw IDs)
WITH institution_lineage AS (
  SELECT
    ia.institution_id,
    COLLECT_LIST(ia.ancestor_id) AS lineage_ids
  FROM openalex.mid.institution_ancestors ia
  GROUP BY ia.institution_id
),
-- One row per (paper, author position), aggregating every affiliation row of
-- that author on that paper.
-- NOTE(review): the LEFT JOINs to author_orcid and ror will duplicate entries in
-- the COLLECT_LIST outputs if either table holds more than one row per key -
-- confirm uniqueness.
author_affiliation_agg AS (
  SELECT
    a.paper_id,
    a.author_sequence_number,
    -- Author details (display name / id come from the canonical, i.e. merged-into, author)
    MAX(au_canonical.display_name) AS author_display_name,
    MAX(au_canonical.author_id) AS author_openalex_id,
    -- NOTE(review): ORCID is looked up by the pre-merge author id, not the canonical one - confirm intent.
    MAX(ao.orcid) AS author_orcid,
    MAX(a.original_author) AS raw_author_name,
    -- Affiliations data
    -- STRUCT(...) is never NULL, so entries are filtered on their content:
    -- an affiliation is kept when it has a raw string or a matched institution.
    FILTER(
      COLLECT_LIST(
        STRUCT(
          ARRAY(CASE WHEN i.affiliation_id IS NOT NULL THEN CONCAT('https://openalex.org/I', CAST(i.affiliation_id AS STRING)) END) AS institution_ids,
          a.original_affiliation AS raw_affiliation_string
        )
      ), x -> x.raw_affiliation_string IS NOT NULL OR x.institution_ids[0] IS NOT NULL
    ) AS affiliations_array,
    FILTER(COLLECT_SET(i.iso3166_code), x -> x IS NOT NULL) AS countries_set,
    -- Same reasoning as above: drop the all-NULL placeholder struct produced
    -- when the LEFT JOIN to institution finds no match (id is NULL exactly then).
    FILTER(
      COLLECT_LIST(
        STRUCT(
          i.iso3166_code AS country_code, i.display_name AS display_name,
          CASE WHEN i.affiliation_id IS NOT NULL THEN CONCAT('https://openalex.org/I', CAST(i.affiliation_id AS STRING)) END AS id,
          ARRAY_SORT(TRANSFORM(il.lineage_ids, id -> CONCAT('https://openalex.org/I', id))) AS lineage,
          CASE WHEN i.ror_id IS NOT NULL THEN CONCAT('https://ror.org/', CAST(i.ror_id AS STRING)) END AS ror,
          -- Primary type: the only type if there is one, otherwise the first non-'funder' type.
          -- try_element_at yields NULL (instead of raising under ANSI mode) when
          -- every entry is 'funder' and the filtered array is empty.
          CASE
            WHEN r.types IS NULL OR size(r.types) = 0 THEN NULL
            WHEN size(r.types) = 1 THEN r.types[0]
            ELSE try_element_at(FILTER(r.types, x -> lower(x) <> 'funder'), 1)
          END AS type,
          COALESCE(r.types, ARRAY()) AS type_list
        )
      ), x -> x.id IS NOT NULL
    ) AS institutions_list,
    ARRAY_SORT(FILTER(COLLECT_LIST(a.original_affiliation), x -> x IS NOT NULL)) AS raw_affiliation_strings_list,
    -- TRUE if any affiliation row of this author flags them as corresponding
    COALESCE(MAX(CASE WHEN a.is_corresponding_author THEN TRUE ELSE FALSE END), FALSE) AS is_corresponding_author_flag
  FROM openalex.mid.affiliation a
  LEFT JOIN openalex.mid.institution i ON a.affiliation_id = i.affiliation_id
  LEFT JOIN openalex.mid.author au ON a.author_id = au.author_id
  LEFT JOIN openalex.mid.author au_canonical ON COALESCE(au.merge_into_id, au.author_id) = au_canonical.author_id
  LEFT JOIN openalex.mid.author_orcid ao ON au.author_id = ao.author_id
  LEFT JOIN openalex.institutions.ror r ON CONCAT('https://ror.org/', i.ror_id) = r.id
  LEFT JOIN institution_lineage il ON i.affiliation_id = il.institution_id
  GROUP BY a.paper_id, a.author_sequence_number
),
-- Number of distinct author positions per paper
paper_author_counts AS (
  SELECT
    paper_id,
    COUNT(*) as total_authors
  FROM author_affiliation_agg
  GROUP BY paper_id
),
-- Assemble the per-paper authorships array, ordered by author position
final_authorships AS (
  SELECT
    paa.paper_id,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          paa.affiliations_array AS affiliations,
          STRUCT(
            paa.author_display_name AS display_name,
            CASE WHEN paa.author_openalex_id IS NOT NULL THEN CONCAT('https://openalex.org/A', paa.author_openalex_id) END AS id,
            CASE WHEN paa.author_orcid IS NOT NULL THEN CONCAT('https://orcid.org/', paa.author_orcid) END AS orcid
          ) AS author,
          -- NOTE(review): 'last' is detected by sequence number = author count,
          -- which presumably relies on sequence numbers being contiguous from 1 - confirm.
          CASE
            WHEN paa.author_sequence_number = 1 THEN 'first'
            WHEN paa.author_sequence_number = pac.total_authors THEN 'last'
            ELSE 'middle'
          END AS author_position,
          paa.author_sequence_number AS author_order_number,
          COALESCE(ARRAY_SORT(paa.countries_set), ARRAY()) AS countries,
          COALESCE(ARRAY_SORT(paa.institutions_list), ARRAY()) AS institutions,
          -- override - single author is always corresponding
          (paa.is_corresponding_author_flag OR pac.total_authors = 1) AS is_corresponding,
          COALESCE(paa.raw_affiliation_strings_list, ARRAY()) AS raw_affiliation_strings,
          paa.raw_author_name AS raw_author_name
        )
      ),
      -- explicit comparator: order authorships by author_order_number ascending
      (left, right) -> CASE
        WHEN left.author_order_number < right.author_order_number THEN -1
        WHEN left.author_order_number > right.author_order_number THEN 1
        ELSE 0
      END
    ) as authorships
  FROM
    author_affiliation_agg paa
  JOIN
    paper_author_counts pac ON paa.paper_id = pac.paper_id
  GROUP BY
    paa.paper_id
)
SELECT
  paper_id,
  authorships,
  -- ids of all corresponding authors
  -- NOTE(review): entries can be NULL when a corresponding author has no OpenAlex id,
  -- unlike corresponding_institution_ids below, which drops NULL ids - confirm intent.
  FILTER(authorships, auth -> 
    auth is not null 
    and auth.author IS NOT NULL 
    and auth.is_corresponding
  ).author.id AS corresponding_author_ids,
  -- distinct, sorted, non-NULL institution ids across all corresponding authors
  COALESCE(
    ARRAY_SORT(ARRAY_DISTINCT(
      FILTER(
        FLATTEN(
          TRANSFORM(
            FILTER(authorships, auth -> auth IS NOT NULL 
              AND auth.author IS NOT NULL
              AND auth.is_corresponding),
            auth -> COALESCE(auth.institutions, ARRAY())
          )
        ).id,
        id -> id IS NOT NULL
      )
    )),
    ARRAY()
  ) AS corresponding_institution_ids
FROM final_authorships;