### Install utils libraries, import and register UDFs

In [0]:
%python
%pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.3-py3-none-any.whl
# Setup cell: installs the pinned utils wheel (line above), then imports and registers the UDFs.
# HumanName is not referenced anywhere else in the visible notebook; presumably it is imported
# so that a missing `nameparser` dependency fails here rather than inside a UDF - TODO confirm.
from nameparser import HumanName # Will be installed via pipeline libraries
# The two UDFs below are presumably shipped in the wheel installed above - TODO confirm.
from openalex.dlt.normalize import normalize_title_udf, udf_last_name_only
# Register both UDFs under the names that the later UPDATE cells call:
# normalize_title_udf(...) and udf_last_name_only(...).
spark.udf.register("normalize_title_udf", normalize_title_udf)
spark.udf.register("udf_last_name_only", udf_last_name_only)

### Sync Tables from PROD

#### `mid.work`

In [0]:
%sql
-- Incremental sync of openalex.mid.work from the federated Postgres table.
-- Two watermarks are read from the local copy: the highest paper_id and the
-- latest full_updated_date already present.
-- Note: both SET statements always overwrite the declared DEFAULT; if the local
-- table is empty, MAX() yields NULL and the filter below selects no rows.
DECLARE OR REPLACE VARIABLE walden_paper_id BIGINT DEFAULT 9999999999;
SET VARIABLE walden_paper_id = (SELECT MAX(paper_id) FROM openalex.mid.work);

DECLARE OR REPLACE VARIABLE walden_full_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_full_updated_date = (SELECT MAX(full_updated_date) FROM openalex.mid.work);

-- Upsert every remote row that is modified (full_updated_date above the watermark)
-- or new (paper_id above the watermark).
MERGE INTO openalex.mid.work AS tgt
USING (
  SELECT *
  FROM openalex_postgres.mid.work  -- federated foreign table
  WHERE full_updated_date > walden_full_updated_date
     OR paper_id > walden_paper_id
) AS src
  ON tgt.paper_id = src.paper_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

#### `mid.affiliation`

In [0]:
%sql
-- Incremental sync of openalex.mid.affiliation from the federated Postgres table.
-- Watermark: the latest updated_date already present in the local copy.
DECLARE OR REPLACE VARIABLE walden_affiliation_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_affiliation_updated_date = (SELECT max(updated_date) from openalex.mid.affiliation);

MERGE INTO openalex.mid.affiliation AS target
USING (
      SELECT *
      FROM openalex_postgres.mid.affiliation -- federated foreign table
      WHERE updated_date > walden_affiliation_updated_date
        AND paper_id IS NOT NULL AND author_id IS NOT NULL
      -- Keep only the most recent row per merge key so that each target row is
      -- matched by at most one source row (PARTITION BY groups NULL affiliation_ids together).
      QUALIFY row_number() OVER (PARTITION BY paper_id, author_id, affiliation_id ORDER BY updated_date DESC) = 1
) AS source
ON target.paper_id = source.paper_id
  AND target.author_id = source.author_id
  -- Null-safe comparison: affiliation_id is not filtered for NULL above, and with a plain `=`
  -- a NULL affiliation_id never matches, so every updated row without an institution would be
  -- inserted again as a duplicate instead of updating the existing row.
  AND target.affiliation_id <=> source.affiliation_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;

#### `mid.author`

In [0]:
%sql
-- Incremental sync of openalex.mid.author from the federated Postgres table,
-- followed by a backfill of normalized_name for rows that do not have one yet.
-- Watermark: the latest updated_date already present in the local copy.
DECLARE OR REPLACE VARIABLE walden_author_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_author_updated_date = (SELECT MAX(updated_date) FROM openalex.mid.author);
SELECT walden_author_updated_date;

-- Upsert remote authors changed after the watermark, skipping one hard-coded author id.
MERGE INTO openalex.mid.author AS tgt
USING (
  SELECT *
  FROM openalex_postgres.mid.author  -- federated foreign table
  WHERE author_id != 5317838346      -- deleted author
    AND updated_date > walden_author_updated_date
) AS src
  ON tgt.author_id = src.author_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

-- Fill in normalized_name where it is still missing (derived from display_name).
UPDATE openalex.mid.author
SET normalized_name = openalex.works.normalize_affiliation_string(display_name)
WHERE normalized_name IS NULL;

#### `mid.author_orcid`

In [0]:
-- Ad-hoc sanity check: full row count of the federated mid.author table.
-- NOTE(review): this cell sits under the `mid.author_orcid` heading but counts mid.author - confirm the intended table.
SELECT count(*) FROM openalex_postgres.mid.author --444347806 in Walden -- 444,381,546 in PROD - check later (or do full refresh)

In [0]:
%sql
-- Incremental sync of openalex.mid.author_orcid from the federated Postgres table.
-- Watermark: the latest `updated` value already present in the local copy.
-- The variable has a table-specific name so it no longer overwrites
-- walden_author_updated_date, the watermark variable used for mid.author.
DECLARE OR REPLACE VARIABLE walden_author_orcid_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_author_orcid_updated_date = (SELECT max(updated) from openalex.mid.author_orcid);

MERGE INTO openalex.mid.author_orcid AS target
USING (
      SELECT *
      FROM openalex_postgres.mid.author_orcid -- federated foreign table
      WHERE updated > walden_author_orcid_updated_date
        AND author_id IS NOT NULL -- check with Casey which range of authors is to be ignored (too many in the table - 400M+)
) AS source
-- NOTE(review): assumes at most one changed source row per author_id; MERGE raises an
-- error if several source rows match the same target row - confirm against the source data.
ON target.author_id = source.author_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;

#### `mid.work_extra_ids`

In [0]:
%sql
-- Incremental sync of openalex.mid.work_extra_ids from the federated Postgres table.
-- Watermark: the highest paper_id already present in the local copy.
DECLARE OR REPLACE VARIABLE walden_work_extra_id BIGINT DEFAULT 9999999999;
SET VARIABLE walden_work_extra_id = (SELECT MAX(paper_id) FROM openalex.mid.work_extra_ids);

-- Only rows for paper_ids above the watermark are pulled, so with this filter every source
-- row is newer than anything in the target and the MATCHED branch is not expected to fire.
-- NOTE(review): extra ids added later to an already-synced paper_id are not picked up here.
MERGE INTO openalex.mid.work_extra_ids AS tgt
USING (
  SELECT DISTINCT *
  FROM openalex_postgres.mid.work_extra_ids  -- federated foreign table
  WHERE paper_id IS NOT NULL
    AND paper_id > walden_work_extra_id
) AS src
  ON tgt.paper_id = src.paper_id
 AND tgt.attribute_type = src.attribute_type
WHEN MATCHED THEN
  UPDATE SET tgt.attribute_value = src.attribute_value
WHEN NOT MATCHED THEN
  INSERT *;

#### `mid.journal`

In [0]:
%sql
-- Incremental sync of openalex.mid.journal from the federated Postgres table.
-- Watermark: the latest full_updated_date already present in the local copy.
DECLARE OR REPLACE VARIABLE walden_journal_updated_date TIMESTAMP DEFAULT current_timestamp();
SET VARIABLE walden_journal_updated_date = (SELECT MAX(full_updated_date) FROM openalex.mid.journal);

-- Upsert journals changed after the watermark, keyed by journal_id.
MERGE INTO openalex.mid.journal AS tgt
USING (
  SELECT *
  FROM openalex_postgres.mid.journal  -- federated foreign table
  WHERE full_updated_date > walden_journal_updated_date
) AS src
  ON tgt.journal_id = src.journal_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *;

### Update missing `normalized_title` and `match_author`

In [0]:
%sql
-- Backfill match_author on affiliation rows that do not have it yet.
-- The registered UDF receives a one-element array of {name: original_author};
-- the author_key of its first result element is stored.
UPDATE openalex.mid.affiliation
SET
  match_author = udf_last_name_only(
    array(named_struct('name', original_author))
  )[0].author_key
WHERE
  match_author IS NULL;

In [0]:
%sql
-- Backfill normalized_title on works that do not have it yet,
-- using the registered normalize_title_udf on original_title.
UPDATE openalex.mid.work
SET
  normalized_title = normalize_title_udf(original_title)
WHERE
  normalized_title IS NULL;

### Update or insert fresh prod data into `work_id_map` by `paper_id`.
When nothing has changed upstream, the merge performs 0 inserts and 0 updates.

In [0]:
%sql
-- Build one identifier row per paper_id from the synced mid tables and upsert it into
-- the env-specific works.work_id_map table (catalog name = 'openalex' || :env_suffix).
WITH prod_id_data AS (
  SELECT
    wk.paper_id,
    -- IMPORTANT - we store clean DOI in the map
    MAX(wk.doi_lower)                       AS doi,
    MAX(xid.attribute_value)                AS pmid,
    MAX(wk.arxiv_id)                        AS arxiv,
    -- normalized title, suffixed with the first author's match key when one exists
    MAX(
      CASE
        WHEN aff.match_author IS NULL THEN wk.normalized_title
        ELSE CONCAT(wk.normalized_title, '_', aff.match_author)
      END
    )                                       AS title_author,
    MIN(to_date(publication_date))          AS publication_date,
    MAX(to_date(wk.created_date))           AS created_dt,
    MAX(try_to_timestamp(wk.updated_date))  AS updated_dt
  FROM openalex.mid.work AS wk
  -- first-author affiliation rows only
  LEFT JOIN openalex.mid.affiliation AS aff
    ON wk.paper_id = aff.paper_id
   AND aff.author_sequence_number = 1
  -- extra ids restricted to attribute_type 2 (exposed as pmid)
  LEFT JOIN openalex.mid.work_extra_ids AS xid
    ON wk.paper_id = xid.paper_id
   AND xid.attribute_type = 2
  GROUP BY wk.paper_id
)
MERGE INTO identifier('openalex' || :env_suffix || '.works.work_id_map') AS tgt
USING prod_id_data AS src
  ON tgt.paper_id = src.paper_id
-- Update only when at least one identifier has a non-null source value that differs from the target.
WHEN MATCHED AND (
     (src.doi          IS NOT NULL AND tgt.doi          IS DISTINCT FROM src.doi)
  OR (src.pmid         IS NOT NULL AND tgt.pmid         IS DISTINCT FROM src.pmid)
  OR (src.arxiv        IS NOT NULL AND tgt.arxiv        IS DISTINCT FROM src.arxiv)
  OR (src.title_author IS NOT NULL AND tgt.title_author IS DISTINCT FROM src.title_author)
) THEN
  UPDATE SET
    tgt.paper_id            = src.paper_id,
    -- a NULL source value never erases an existing identifier
    tgt.doi                 = COALESCE(src.doi, tgt.doi),
    tgt.pmid                = COALESCE(src.pmid, tgt.pmid),
    tgt.arxiv               = COALESCE(src.arxiv, tgt.arxiv),
    tgt.title_author        = COALESCE(src.title_author, tgt.title_author),
    -- keep the earlier of the two publication dates
    tgt.published_date      = LEAST(tgt.published_date, src.publication_date),
    tgt.openalex_created_dt = src.created_dt,
    tgt.openalex_updated_dt = src.updated_dt
WHEN NOT MATCHED THEN
  INSERT (
    paper_id, doi, pmid, arxiv,
    title_author, published_date, openalex_created_dt, openalex_updated_dt
  )
  VALUES (
    src.paper_id, src.doi, src.pmid, src.arxiv,
    src.title_author, src.publication_date, src.created_dt, src.updated_dt
  );

### Update `authorships_backfill` once `affiliation` and `author` are refreshed

In [0]:
%run ../utils/variables

In [0]:
%sql
-- Rebuilds openalex.authors.work_authorships_backfill from scratch: one row per paper_id with
-- an `authorships` array (one struct per author_sequence_number) plus the ids of the
-- corresponding authors and of their institutions.
-- SUPER_SYSTEM_INSTITUTIONS is not defined in this cell; presumably it is provided by the
-- preceding `%run ../utils/variables` cell - TODO confirm.
CREATE OR REPLACE TABLE openalex.authors.work_authorships_backfill CLUSTER BY AUTO
AS
-- Get lineage for each institution (keep raw IDs)
WITH institution_lineage AS (
  SELECT
    institution_id,
    COLLECT_LIST(ancestor_id) AS lineage_ids
  FROM openalex.mid.institution_ancestors
  WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
  GROUP BY institution_id
),
-- Pre-aggregate institutions per paper, author, raw_affiliation_string
author_affiliations AS (
  SELECT
    a.paper_id,
    a.author_sequence_number,
    a.original_affiliation AS raw_affiliation_string,
    -- Sorted, de-duplicated list of institution URLs matched for this raw string
    ARRAY_SORT(
      ARRAY_DISTINCT(
        FILTER(
          COLLECT_LIST(
            CASE
              WHEN i.affiliation_id IS NOT NULL
                THEN CONCAT('https://openalex.org/I', CAST(i.affiliation_id AS STRING))
            END
          ),
          x -> x IS NOT NULL
        )
      )
    ) AS institution_ids
  FROM openalex.mid.affiliation a
  LEFT JOIN openalex.mid.institution i ON a.affiliation_id = i.affiliation_id
  GROUP BY
    a.paper_id,
    a.author_sequence_number,
    a.original_affiliation
),
-- One row per (paper_id, author_sequence_number) with author details and collected affiliations
author_affiliation_agg AS (
  SELECT
    a.paper_id,
    a.author_sequence_number,
    -- Author details
    MAX(au_canonical.display_name) AS author_display_name,
    MAX(au_canonical.author_id) AS author_openalex_id,
    MAX(ao.orcid) AS author_orcid,
    MAX(a.original_author) AS raw_author_name,
    -- Affiliations data: one struct per raw_affiliation_string, with merged institution_ids
    ARRAY_SORT(
      ARRAY_DISTINCT(
        FILTER(
          COLLECT_LIST(
            NAMED_STRUCT(
              'raw_affiliation_string', aa.raw_affiliation_string,
              'institution_ids', aa.institution_ids
            )
          ),
          x -> x.raw_affiliation_string IS NOT NULL
        )
      )
    ) AS affiliations_array,
    FILTER(COLLECT_SET(i.iso3166_code), x -> x IS NOT NULL) AS countries_set,
    -- Sorted, de-duplicated institution structs; entries without an id are dropped
    ARRAY_SORT(
      ARRAY_DISTINCT(
        FILTER(
          COLLECT_LIST(
            NAMED_STRUCT(
              'country_code', i.iso3166_code,
              'display_name', i.display_name,
              'id',
                CASE
                  WHEN i.affiliation_id IS NOT NULL
                    THEN CONCAT('https://openalex.org/I', CAST(i.affiliation_id AS STRING))
                END,
              -- lineage = the institution itself plus its ancestors, as sorted URLs
              'lineage',
                ARRAY_SORT(
                  TRANSFORM(
                    ARRAY_COMPACT(CONCAT(ARRAY(i.affiliation_id), COALESCE(il.lineage_ids, ARRAY()))),
                    id -> CONCAT('https://openalex.org/I', id)
                  )
                ),
              -- ROR as a full URL, whether or not ror_id is already prefixed
              'ror',
                CASE
                  WHEN i.ror_id IS NULL THEN NULL
                  WHEN i.ror_id LIKE 'https://ror.org/%' THEN i.ror_id
                  ELSE CONCAT('https://ror.org/', CAST(i.ror_id AS STRING))
                END,
              -- single type: the only one if there is just one, otherwise the first non-'funder' type
              'type',
                CASE
                  WHEN r.types IS NULL OR size(r.types) = 0 THEN NULL
                  WHEN size(r.types) = 1 THEN r.types[0]
                  ELSE element_at(FILTER(r.types, x -> lower(x) <> 'funder'), 1)
                END
            )
          ),
          x -> x.id IS NOT NULL
        )
      )
    ) AS institutions_list,
    ARRAY_SORT(FILTER(COLLECT_LIST(a.original_affiliation), x -> x IS NOT NULL)) AS raw_affiliation_strings_list,
    -- TRUE if any affiliation row of this author is flagged as corresponding
    COALESCE(MAX(CASE WHEN a.is_corresponding_author THEN TRUE ELSE FALSE END), FALSE) AS is_corresponding_author_flag
  FROM openalex.mid.affiliation a
  LEFT JOIN openalex.mid.institution i ON a.affiliation_id = i.affiliation_id
  LEFT JOIN openalex.mid.author au ON a.author_id = au.author_id
  -- Resolve merged authors: follow merge_into_id when it is set, otherwise use the author itself
  LEFT JOIN openalex.mid.author au_canonical ON COALESCE(au.merge_into_id, au.author_id) = au_canonical.author_id
  -- NOTE(review): ORCID is looked up by the original author row (au), while display name and id
  -- come from the canonical row (au_canonical) - confirm this is intended.
  LEFT JOIN openalex.mid.author_orcid ao ON au.author_id = ao.author_id
  -- NOTE(review): this join always prefixes ror_id, whereas the 'ror' field above also accepts
  -- an already-prefixed ror_id; such rows would not match here - confirm the stored format.
  LEFT JOIN openalex.institutions.ror r ON CONCAT('https://ror.org/', i.ror_id) = r.id
  LEFT JOIN institution_lineage il ON i.affiliation_id = il.institution_id
  LEFT JOIN author_affiliations aa 
    ON a.paper_id = aa.paper_id
    AND a.author_sequence_number = aa.author_sequence_number
    AND a.original_affiliation = aa.raw_affiliation_string
  GROUP BY a.paper_id, a.author_sequence_number
),
-- Number of distinct author_sequence_number values per paper
paper_author_counts AS (
  SELECT
    paper_id,
    COUNT(*) as total_authors
  FROM author_affiliation_agg
  GROUP BY paper_id
),
-- Assemble the per-paper authorships array, ordered by author_order_number
final_authorships AS (
  SELECT
    paa.paper_id,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          paa.affiliations_array AS affiliations,
          STRUCT(
            paa.author_display_name AS display_name,
            CASE WHEN paa.author_openalex_id IS NOT NULL THEN CONCAT('https://openalex.org/A', paa.author_openalex_id) END AS id,
            CASE 
              WHEN paa.author_orcid IS NULL THEN NULL
              WHEN paa.author_orcid LIKE 'https://orcid.org/%' THEN paa.author_orcid
              ELSE CONCAT('https://orcid.org/', paa.author_orcid) 
            END AS orcid
          ) AS author,
          -- NOTE(review): 'last' compares the sequence number with the count of distinct sequence
          -- numbers, which assumes numbering is contiguous starting at 1 - confirm.
          CASE
            WHEN paa.author_sequence_number = 1 THEN 'first'
            WHEN paa.author_sequence_number = pac.total_authors THEN 'last'
            ELSE 'middle'
          END AS author_position,
          paa.author_sequence_number AS author_order_number,
          COALESCE(ARRAY_SORT(paa.countries_set), ARRAY()) AS countries,
          COALESCE(ARRAY_SORT(paa.institutions_list), ARRAY()) AS institutions,
          -- override - single author is always corresponding
          (paa.is_corresponding_author_flag OR pac.total_authors = 1) AS is_corresponding,
          ARRAY_DISTINCT(COALESCE(paa.raw_affiliation_strings_list, ARRAY())) AS raw_affiliation_strings,
          paa.raw_author_name AS raw_author_name
        )
      ),
      -- comparator: ascending author_order_number
      (left, right) -> CASE
        WHEN left.author_order_number < right.author_order_number THEN -1
        WHEN left.author_order_number > right.author_order_number THEN 1
        ELSE 0
      END
    ) as authorships
  FROM
    author_affiliation_agg paa
  JOIN
    paper_author_counts pac ON paa.paper_id = pac.paper_id
  GROUP BY
    paa.paper_id
)
SELECT
  paper_id,
  authorships,
  -- ids of the authors flagged as corresponding
  FILTER(authorships, auth -> 
    auth is not null 
    and auth.author IS NOT NULL 
    and auth.is_corresponding
  ).author.id AS corresponding_author_ids,
  -- sorted, de-duplicated institution ids of the corresponding authors (empty array if none)
  COALESCE(
    ARRAY_SORT(ARRAY_DISTINCT(
      FILTER(
        FLATTEN(
          TRANSFORM(
            FILTER(authorships, auth -> auth IS NOT NULL 
              AND auth.author IS NOT NULL
              AND auth.is_corresponding),
            auth -> COALESCE(auth.institutions, ARRAY())
          )
        ).id,
        id -> id IS NOT NULL
      )
    )),
    ARRAY()
  ) AS corresponding_institution_ids
FROM final_authorships;

### Apply Works Magnet moderation

In [0]:
-- Rebuilds openalex.authors.work_authorships_backfill_moderated: the backfilled authorships with
-- institution matches overridden by openalex.works.magnet_moderation wherever the normalized
-- raw affiliation string has a moderation entry.
-- SUPER_SYSTEM_INSTITUTIONS is not defined in this cell; presumably it is provided by the
-- `%run ../utils/variables` cell - TODO confirm.
CREATE OR REPLACE TABLE openalex.authors.work_authorships_backfill_moderated 
CLUSTER BY (paper_id)
AS
WITH 
-- Get lineage for institutions
institution_lineage AS (
  SELECT
    institution_id,
    COLLECT_LIST(ancestor_id) AS lineage_ids
  FROM openalex.mid.institution_ancestors
  WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
  GROUP BY institution_id
),

-- Step 1a: Explode authorships
authorships_exploded AS (
  SELECT
    paper_id,
    EXPLODE(authorships) AS auth
  FROM openalex.authors.work_authorships_backfill
),

-- Step 1b: Explode affiliations
-- EXPLODE_OUTER keeps authors that have no affiliations (aff is NULL for them)
backfill_exploded AS (
  SELECT
    paper_id,
    auth.author,
    auth.author_position,
    auth.author_order_number,
    auth.is_corresponding,
    auth.raw_author_name,
    auth.institutions AS original_institutions,
    auth.countries AS original_countries,
    EXPLODE_OUTER(auth.affiliations) AS aff
  FROM authorships_exploded
),

-- Step 1c: Extract affiliation fields and normalize
backfill_flattened AS (
  SELECT
    paper_id,
    author,
    author_position,
    author_order_number,
    is_corresponding,
    raw_author_name,
    original_institutions,
    original_countries,
    aff.raw_affiliation_string,
    openalex.works.normalize_affiliation_string(aff.raw_affiliation_string) AS normalized_affiliation,
    aff.institution_ids AS original_institution_ids
  FROM backfill_exploded
),

-- Step 2: Explode moderation new_rors (these are ROR IDs, not institution IDs!)
moderation_exploded AS (
  SELECT
    openalex.works.normalize_affiliation_string(raw_affiliation_string) AS normalized_affiliation,
    EXPLODE(new_rors) AS new_ror_id  -- This is a ROR ID string, not institution ID
  FROM openalex.works.magnet_moderation
  WHERE new_rors IS NOT NULL AND SIZE(new_rors) > 0
),

-- Step 3: Build institutions object for each normalized string in moderation
moderation_with_institutions AS (
  SELECT
    me.normalized_affiliation,
    -- Sorted and de-duplicated, like institution_ids in the backfill table: several moderation
    -- rows can normalize to the same string or repeat a ROR, which would otherwise leave
    -- duplicate ids in affiliations.institution_ids.
    ARRAY_SORT(
      ARRAY_DISTINCT(
        COLLECT_LIST(CONCAT('https://openalex.org/I', i.affiliation_id))
      )
    ) AS new_institution_ids,
    -- Institution structs are de-duplicated later, in Step 5
    COLLECT_LIST(
      STRUCT(
        i.iso3166_code AS country_code,
        i.display_name AS display_name,
        CONCAT('https://openalex.org/I', i.affiliation_id) AS id,
        ARRAY_SORT(
          TRANSFORM(
            ARRAY_COMPACT(CONCAT(ARRAY(i.affiliation_id), COALESCE(il.lineage_ids, ARRAY()))),
            lid -> CONCAT('https://openalex.org/I', lid)
          )
        ) AS lineage,
        CASE 
          WHEN i.ror_id IS NULL THEN NULL
          WHEN i.ror_id LIKE 'https://ror.org/%' THEN i.ror_id
          ELSE CONCAT('https://ror.org/', i.ror_id) 
        END AS ror,
        CASE
          WHEN r.types IS NULL OR SIZE(r.types) = 0 THEN NULL
          WHEN SIZE(r.types) = 1 THEN r.types[0]
          ELSE ELEMENT_AT(FILTER(r.types, x -> LOWER(x) <> 'funder'), 1)
        END AS type
        --COALESCE(r.types, ARRAY()) AS type_list
      )
    ) AS new_institutions
  FROM moderation_exploded me
  -- Join on ROR ID (no URL prefix in mid.institution)
  INNER JOIN openalex.mid.institution i 
    ON me.new_ror_id = i.ror_id
  LEFT JOIN institution_lineage il ON i.affiliation_id = il.institution_id
  LEFT JOIN openalex.institutions.ror r ON CONCAT('https://ror.org/', i.ror_id) = r.id
  GROUP BY me.normalized_affiliation
),

-- Step 4: Join moderation to exploded backfill on normalized affiliation
backfill_with_moderation AS (
  SELECT
    bf.paper_id,
    bf.author,
    bf.author_position,
    bf.author_order_number,
    bf.is_corresponding,
    bf.raw_author_name,
    bf.raw_affiliation_string,
    bf.normalized_affiliation,
    -- Use moderated institutions if available, otherwise keep original
    COALESCE(mw.new_institution_ids, bf.original_institution_ids) AS institution_ids,
    -- NOTE(review): the fallback is the author's full institution list (author level), not just
    -- the institutions of this affiliation string, so an unmoderated affiliation of the same
    -- author re-adds all original institutions - confirm this is intended.
    COALESCE(mw.new_institutions, bf.original_institutions) AS institutions
  FROM backfill_flattened bf
  LEFT JOIN moderation_with_institutions mw 
    ON bf.normalized_affiliation = mw.normalized_affiliation
),

-- Step 5: Reconstruct authorships by grouping back at author level
authorships_reconstructed AS (
  SELECT
    paper_id,
    author,
    author_position,
    author_order_number,
    is_corresponding,
    raw_author_name,
    -- Collect affiliations
    FILTER(
      COLLECT_LIST(
        STRUCT(
          institution_ids,
          raw_affiliation_string
        )
      ),
      aff -> aff.raw_affiliation_string IS NOT NULL
    ) AS affiliations,
    -- Get unique institutions across all affiliations for this author
    ARRAY_SORT(ARRAY_DISTINCT(FILTER(
      FLATTEN(COLLECT_LIST(institutions)),
      inst -> inst IS NOT NULL and inst.id IS NOT NULL
    ))) AS institutions,
    -- Get unique countries
    ARRAY_SORT(ARRAY_DISTINCT(FILTER(
      FLATTEN(COLLECT_LIST(institutions)).country_code,
      cc -> cc IS NOT NULL
    ))) AS countries,
    -- Collect raw affiliation strings
    ARRAY_DISTINCT(ARRAY_SORT(FILTER(COLLECT_LIST(raw_affiliation_string), x -> x IS NOT NULL))) AS raw_affiliation_strings
  FROM backfill_with_moderation
  GROUP BY paper_id, author, author_position, author_order_number, is_corresponding, raw_author_name
)

-- Step 6: Final - Reconstruct paper-level authorships array
SELECT
  paper_id,
  ARRAY_SORT(
    COLLECT_LIST(
      STRUCT(
        affiliations,
        author,
        author_position,
        author_order_number,
        countries,
        institutions,
        is_corresponding,
        -- parsed_name is emitted as a typed NULL placeholder
        CAST(NULL AS STRUCT<title:STRING,first:STRING,middle:STRING,last:STRING,suffix:STRING,nickname:STRING>) AS parsed_name,
        raw_affiliation_strings,
        raw_author_name
      )
    ),
    -- comparator: ascending author_order_number
    (left, right) -> CASE
      WHEN left.author_order_number < right.author_order_number THEN -1
      WHEN left.author_order_number > right.author_order_number THEN 1
      ELSE 0
    END
  ) AS authorships,
  -- Corresponding author IDs
  FILTER(
    COLLECT_LIST(CASE WHEN is_corresponding THEN author.id END),
    id -> id IS NOT NULL
  ) AS corresponding_author_ids,
  -- Corresponding institution IDs
  COALESCE(
    ARRAY_SORT(ARRAY_DISTINCT(
      FILTER(
        FLATTEN(
          COLLECT_LIST(
            CASE WHEN is_corresponding THEN TRANSFORM(institutions, inst -> inst.id) END
          )
        ),
        id -> id IS NOT NULL
      )
    )),
    ARRAY()
  ) AS corresponding_institution_ids
FROM authorships_reconstructed
GROUP BY paper_id;

In [0]:
-- Run OPTIMIZE with the FULL option on the moderated authorships table
-- (created with CLUSTER BY (paper_id) in the cell above).
OPTIMIZE openalex.authors.work_authorships_backfill_moderated FULL;