### Creates `openalex.works.authors_and_affiliations` in Walden End to End workflow

In [0]:
-- Incremental watermark: the newest updated_datetime already stored in the final
-- authors_and_affiliations table. It starts at a 1900-01-01 floor so the variable
-- is always defined.
DECLARE OR REPLACE VARIABLE max_updated_date TIMESTAMP
    DEFAULT CAST('1900-01-01' AS TIMESTAMP);

-- MAX() over an empty table is NULL; COALESCE keeps the 1900-01-01 floor in that case.
SET VARIABLE max_updated_date = COALESCE(
    (
        SELECT MAX(updated_datetime)
        FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations')
    ),
    CAST('1900-01-01' AS TIMESTAMP)
);

-- Echo the watermark so it shows up in the cell output.
SELECT max_updated_date;

### Step 1: Populate `openalex.authors.author_institutions` mapping

In [0]:
-- Step 1 load: append (work, author position, institution, raw affiliation string)
-- rows for works updated after max_updated_date. The MERGE is insert-only: rows
-- already present in the target are neither updated nor deleted.
-- NOTE(review): because nothing is deleted, rows written for an earlier version of a
-- work stay in the table after its authorships change - confirm that is intended.
MERGE INTO identifier('openalex' || :env_suffix || '.authors.author_institutions') AS target
USING (
    WITH exploded_authors AS (
        -- One row per authorship; author_sequence is the 0-based array position.
        SELECT
            id AS work_id,
            POSEXPLODE(authorships) AS (author_sequence, authorship)
        FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
        WHERE updated_date > max_updated_date
          AND authorships IS NOT NULL
          AND SIZE(authorships) > 0
    ),
    exploded_affiliations AS (
        -- One row per raw affiliation string; EXPLODE_OUTER keeps authorships
        -- that have no affiliation strings (with a NULL string).
        SELECT
            work_id,
            author_sequence,
            authorship.raw_author_name,
            EXPLODE_OUTER(authorship.raw_affiliation_strings) AS raw_affiliation_string
        FROM exploded_authors
    ),
    with_institutions AS (
        -- Resolve each affiliation string to institution ids via the lookup table.
        -- Precedence: manual override first, then the regular ids unless that array
        -- is exactly [-1]; everything else resolves to NULL.
        -- NOTE(review): the lookup table is read from the unsuffixed `openalex`
        -- catalog regardless of :env_suffix - presumably a shared reference table.
        SELECT
            ea.work_id,
            ea.author_sequence,
            ea.raw_author_name,
            ea.raw_affiliation_string,
            asl.countries AS raw_countries,
            CASE
                WHEN ea.raw_affiliation_string IS NULL THEN NULL
                WHEN asl.institution_ids_override IS NOT NULL AND SIZE(asl.institution_ids_override) > 0
                    THEN asl.institution_ids_override
                WHEN asl.institution_ids IS NOT NULL AND SIZE(asl.institution_ids) > 0
                    AND NOT (SIZE(asl.institution_ids) = 1 AND asl.institution_ids[0] = -1)
                    THEN asl.institution_ids
                ELSE NULL
            END AS institution_ids
        FROM exploded_affiliations ea
        LEFT JOIN openalex.institutions.affiliation_strings_lookup asl
            ON ea.raw_affiliation_string = asl.raw_affiliation_string
            AND ea.raw_affiliation_string IS NOT NULL
    ),
    exploded_institutions AS (
        -- One row per resolved institution id; unresolved affiliations are dropped.
        SELECT
            work_id,
            author_sequence,
            EXPLODE_OUTER(institution_ids) AS institution_id,
            raw_author_name,
            raw_affiliation_string,
            raw_countries
        FROM with_institutions
        WHERE institution_ids IS NOT NULL AND SIZE(institution_ids) > 0
    )
    -- De-duplicate before merging. With only a WHEN NOT MATCHED clause, identical
    -- source rows (a repeated affiliation string on one authorship, a repeated id in
    -- institution_ids, duplicate lookup rows) would otherwise ALL be inserted the
    -- first time they are seen.
    SELECT DISTINCT
        work_id,
        author_sequence,
        institution_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    FROM exploded_institutions
) AS source
-- NULLs are mapped to sentinels (-1 / '') so that NULL keys compare as equal.
ON target.work_id = source.work_id
   AND target.author_sequence = source.author_sequence
   AND COALESCE(target.institution_id, -1) = COALESCE(source.institution_id, -1)
   AND COALESCE(target.raw_affiliation_string, '') = COALESCE(source.raw_affiliation_string, '')
WHEN NOT MATCHED THEN
    INSERT (work_id, author_sequence, institution_id, raw_author_name, raw_affiliation_string, raw_countries)
    VALUES (source.work_id, source.author_sequence, source.institution_id,
            source.raw_author_name, source.raw_affiliation_string, source.raw_countries);

### Step 2: Create enriched authorships with parsed names and institution details

In [0]:
%run ../utils/variables

In [0]:
-- Step 2 staging build: for every work updated after max_updated_date, rebuild its
-- authorships array with parsed author names, institution details, countries and a
-- raw-affiliation-string -> institution-id mapping. The staging table is fully
-- replaced on every run.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
CLUSTER BY (work_id) AS (
WITH base_works AS (
    -- Read raw authorships from openalex_works_base (incremental)
    SELECT
        id AS work_id,
        authorships,
        updated_date AS updated_datetime
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    WHERE updated_date > max_updated_date
      AND authorships IS NOT NULL
      AND SIZE(authorships) > 0
),
-- author_institutions rows for the works in this batch only. The aggregations below
-- are joined back to base_works in the final SELECT, so rows for any other work
-- could never reach the output; filtering them here avoids grouping the whole table.
batch_author_institutions AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        ai.institution_id,
        ai.raw_affiliation_string,
        ai.raw_countries
    FROM identifier('openalex' || :env_suffix || '.authors.author_institutions') ai
    LEFT SEMI JOIN base_works bw
        ON ai.work_id = bw.work_id
    WHERE ai.institution_id IS NOT NULL
),
-- Ancestor ids per institution, excluding "super system" institutions.
-- SUPER_SYSTEM_INSTITUTIONS is not defined in this cell; presumably it is provided
-- by the preceding `%run ../utils/variables` - confirm.
institution_lineage AS (
    SELECT
        institution_id,
        COLLECT_LIST(ancestor_id) AS lineage_ids
    FROM openalex.mid.institution_ancestors
    WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
    GROUP BY institution_id
),
-- Per (work_id, author_sequence): the distinct institution detail structs and the
-- union of the countries recorded for the author's affiliation strings.
author_institutions_with_details AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        ARRAY_DISTINCT(FLATTEN(COLLECT_SET(ai.raw_countries))) AS raw_parsed_countries,
        COLLECT_SET(
            STRUCT(
                inst.iso3166_code AS country_code,
                inst.display_name,
                CONCAT('https://openalex.org/I', ai.institution_id) AS id,
                -- lineage = the institution itself plus its ancestors, as sorted URLs
                ARRAY_SORT(
                    TRANSFORM(
                        ARRAY_COMPACT(CONCAT(ARRAY(ai.institution_id), COALESCE(il.lineage_ids, ARRAY()))),
                        id -> CONCAT('https://openalex.org/I', id)
                    )
                ) AS lineage,
                -- Normalise ROR ids to full URLs; leave values that already are URLs.
                CASE
                    WHEN inst.ror_id IS NULL THEN NULL
                    WHEN inst.ror_id LIKE 'https://ror.org/%' THEN inst.ror_id
                    ELSE CONCAT('https://ror.org/', inst.ror_id)
                END AS ror,
                inst.type
            )
        ) AS institutions
    FROM batch_author_institutions ai
    LEFT JOIN openalex.institutions.institutions inst
        ON inst.id = ai.institution_id
    LEFT JOIN institution_lineage il
        ON il.institution_id = ai.institution_id
    GROUP BY ai.work_id, ai.author_sequence
),
-- Map raw_affiliation_string -> institution_ids per (work_id, author_sequence)
affiliations_map_ids AS (
    SELECT
        work_id,
        author_sequence,
        raw_affiliation_string,
        ARRAY_DISTINCT(
            ARRAY_COMPACT(
                COLLECT_LIST(CONCAT('https://openalex.org/I', institution_id))
            )
        ) AS institution_ids
    FROM batch_author_institutions
    WHERE raw_affiliation_string IS NOT NULL
    GROUP BY work_id, author_sequence, raw_affiliation_string
),
-- Fold the rows above into one map per author. Keys are unique because the previous
-- CTE groups by raw_affiliation_string.
affiliations_map AS (
    SELECT
        work_id,
        author_sequence,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(NAMED_STRUCT('key', raw_affiliation_string, 'value', institution_ids))
        ) AS aff_map
    FROM affiliations_map_ids
    GROUP BY work_id, author_sequence
),
-- Pre-aggregate institution details per work_id as a map[author_sequence -> details]
author_institution_lookup AS (
    SELECT
        work_id,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(
                STRUCT(
                    author_sequence,
                    STRUCT(
                        institutions,
                        raw_parsed_countries,
                        aff_map
                    )
                )
            )
        ) AS author_lookup
    FROM (
        SELECT
            aid.work_id,
            aid.author_sequence,
            aid.institutions,
            aid.raw_parsed_countries,
            am.aff_map
        FROM author_institutions_with_details aid
        LEFT JOIN affiliations_map am
            ON aid.work_id = am.work_id
            AND aid.author_sequence = am.author_sequence
    )
    GROUP BY work_id
),
-- Explode authorships to join with parsed names lookup
exploded_for_parsed_names AS (
    SELECT
        work_id,
        updated_datetime,
        POSEXPLODE(authorships) AS (author_idx, authorship)
    FROM base_works
),
-- Join with parsed names lookup
-- NOTE(review): if parsed_names_lookup can hold more than one row per
-- raw_author_name, this join duplicates authorships and shifts the array positions
-- used for the institution lookup in the final SELECT - confirm the key is unique.
with_parsed_names AS (
    SELECT
        e.work_id,
        e.updated_datetime,
        e.author_idx,
        e.authorship,
        pn.parsed_name
    FROM exploded_for_parsed_names e
    LEFT JOIN identifier('openalex' || :env_suffix || '.authors.parsed_names_lookup') pn
        ON TRIM(e.authorship.raw_author_name) = pn.raw_author_name
),
-- Reassemble authorships with parsed names
authorships_with_parsed_names AS (
    SELECT
        work_id,
        updated_datetime,
        -- COLLECT_LIST does not guarantee order, so sort by the original position
        -- (author_idx) and then strip the index again.
        TRANSFORM(
            ARRAY_SORT(
                COLLECT_LIST(
                    STRUCT(
                        author_idx,
                        STRUCT(
                            authorship.affiliations AS affiliations,
                            authorship.author AS author,
                            authorship.author_position AS author_position,
                            authorship.author_order_number AS author_order_number,
                            authorship.countries AS countries,
                            authorship.institutions AS institutions,
                            authorship.is_corresponding AS is_corresponding,
                            authorship.raw_affiliation_strings AS raw_affiliation_strings,
                            authorship.raw_author_name AS raw_author_name,
                            parsed_name AS parsed_name
                        ) AS authorship
                    )
                ),
                (left, right) -> CASE
                    WHEN left.author_idx < right.author_idx THEN -1
                    WHEN left.author_idx > right.author_idx THEN 1
                    ELSE 0
                END
            ),
            x -> x.authorship
        ) AS authorships
    FROM with_parsed_names
    GROUP BY work_id, updated_datetime
)
-- Final enrichment: add institution details, countries, affiliations mapping.
-- `idx` is the 0-based array position supplied by TRANSFORM; it is used as the key
-- into author_lookup, whose keys are author_sequence values (0-based POSEXPLODE
-- positions written in Step 1).
SELECT
    ba.work_id,
    ba.updated_datetime,
    TRANSFORM(
        ba.authorships,
        (auth, idx) -> STRUCT(
            -- affiliations: map raw_affiliation_strings to institution_ids
            -- (empty array when the string has no resolved institution)
            TRANSFORM(
                COALESCE(auth.raw_affiliation_strings, ARRAY()),
                s -> STRUCT(
                    COALESCE(ELEMENT_AT(ELEMENT_AT(ail.author_lookup, idx).aff_map, s), ARRAY()) AS institution_ids,
                    s AS raw_affiliation_string
                )
            ) AS affiliations,
            -- Preserve author field (id still NULL, will be assigned later)
            auth.author,
            -- All other fields
            auth.author_position,
            auth.author_order_number,
            -- countries from institutions (fallback to raw parsed countries)
            CASE
                WHEN ELEMENT_AT(ail.author_lookup, idx).institutions IS NOT NULL
                     AND SIZE(FILTER(ELEMENT_AT(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')) > 0
                    THEN ARRAY_SORT(ARRAY_DISTINCT(FILTER(ELEMENT_AT(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')))
                WHEN ELEMENT_AT(ail.author_lookup, idx).raw_parsed_countries IS NOT NULL
                    THEN ELEMENT_AT(ail.author_lookup, idx).raw_parsed_countries
                ELSE ARRAY()
            END AS countries,
            -- institutions with full details
            COALESCE(ELEMENT_AT(ail.author_lookup, idx).institutions, ARRAY()) AS institutions,
            auth.is_corresponding,
            auth.raw_affiliation_strings,
            auth.raw_author_name,
            auth.parsed_name
        )
    ) AS authorships
FROM authorships_with_parsed_names ba
LEFT JOIN author_institution_lookup ail ON ba.work_id = ail.work_id);

### Step 3: Match authors - testing

In [0]:
-------------------------------------------------------------------------------
-- AUTHOR MATCHING ALGORITHM - DIAGNOSTIC TABLE
-------------------------------------------------------------------------------
-- For each authorship in the staged updates, collect the existing authors that share
-- its block key (first initial + last name), evaluate eight name patterns plus
-- institution / topic / source overlap signals, and record which strategy (if any)
-- yields exactly one candidate, together with a failure reason when none does.
--
-- The staged updates are read from, and the diagnostics written to, the
-- :env_suffix catalog so this cell sees the table built in Step 2 and never writes
-- a non-production run into the production catalog.
-- NOTE(review): work_topics_frontfill, openalex_works and author_lookup_mapping are
-- still read from the unsuffixed `openalex` catalog - confirm they are shared.

CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.authors.author_matching_diagnostics') AS

WITH with_work_data AS (
  -- Attach work-level signals (topics, source ids) to each staged work.
  SELECT
    aa.work_id,
    aa.authorships,
    COALESCE(wtf.topics, ARRAY()) AS topics,
    -- COALESCE keeps the array non-NULL when the work or its locations are missing,
    -- so work_has_source below evaluates to FALSE instead of NULL.
    COALESCE(
      ARRAY_DISTINCT(
        TRANSFORM(
          FILTER(w.locations, x -> x.source.id IS NOT NULL),
          x -> x.source.id
        )
      ),
      ARRAY()
    ) AS work_source_ids
  FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates') aa
  LEFT JOIN openalex.works.work_topics_frontfill wtf
    ON aa.work_id = wtf.work_id
  LEFT JOIN openalex.works.openalex_works w
    ON aa.work_id = w.id
),

authors_exploded AS (
  -- One row per authorship with its block key and signal arrays.
  SELECT
    work_id,
    authorship.author.display_name,
    authorship.parsed_name,
    LOWER(CONCAT(SUBSTRING(authorship.parsed_name.first, 1, 1), ' ', authorship.parsed_name.last)) AS block_key,
    -- institution ids plus every id in their lineage
    ARRAY_DISTINCT(
      CONCAT(
        TRANSFORM(authorship.institutions, i -> i.id),
        FLATTEN(TRANSFORM(authorship.institutions, i -> i.lineage))
      )
    ) AS institution_ids,
    TRANSFORM(topics, t -> t.id) AS topic_ids,
    work_source_ids
  FROM with_work_data
  LATERAL VIEW EXPLODE(authorships) AS authorship
  -- NOTE(review): hard-coded work_id floor; looks like test scoping - confirm
  -- before this is used outside of testing.
  WHERE work_id > 7000000000
),

blocked_candidates AS (
  -- LEFT JOIN so authorships without any candidate in their block (or with a NULL
  -- block_key) are kept; they surface as NO_CANDIDATES in the final SELECT.
  SELECT
    e.work_id,
    e.display_name,
    e.parsed_name,
    e.block_key,
    e.institution_ids,
    e.topic_ids,
    e.work_source_ids,
    alm.author_id,
    alm.parsed_longest_name,
    alm.institution_ids as candidate_institution_ids,
    alm.topic_ids as candidate_topic_ids,
    alm.source_ids AS candidate_source_ids,
    alm.works_count
  FROM authors_exploded e
  LEFT JOIN openalex.authors.author_lookup_mapping alm
    ON alm.block_key = e.block_key
),

with_match_signals AS (
  -- Per (authorship, candidate): helper lengths / middle names and overlap flags.
  SELECT
    *,
    NAMED_STRUCT(
      'id', author_id,
      'display_name', CONCAT(parsed_longest_name.first, ' ', parsed_longest_name.last),
      'parsed_name', parsed_longest_name
    ) AS candidate_obj,
    length(parsed_name.first) as pn_first_len,
    length(parsed_longest_name.first) as cand_first_len,
    coalesce(parsed_name.middle, '') as pn_middle,
    coalesce(parsed_longest_name.middle, '') as cand_middle,
    
    (size(institution_ids) > 0 AND size(candidate_institution_ids) > 0 
     AND arrays_overlap(candidate_institution_ids, institution_ids)) as has_inst,
    
    (size(topic_ids) > 0 AND size(candidate_topic_ids) > 0 
     AND arrays_overlap(candidate_topic_ids, topic_ids)) as has_topic,

     (SIZE(work_source_ids) > 0 AND SIZE(candidate_source_ids) > 0
     AND ARRAYS_OVERLAP(candidate_source_ids, work_source_ids)) AS has_source
  FROM blocked_candidates
),

with_name_matches AS (
  -- Eight name patterns. pn_* describes the authorship's name, cand_* the
  -- candidate's. Every pattern tests cand_first_len, so a row without a candidate
  -- (all candidate columns NULL) never evaluates to TRUE.
  SELECT
    *,
    -- 1: Exact Full Name
    (pn_first_len > 1 AND length(pn_middle) > 1 AND cand_first_len > 1 AND length(cand_middle) > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(pn_middle) = lower(cand_middle)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_1_exact_full,

    -- 2: Exact First, Middle Initial match
    (pn_first_len > 1 AND length(pn_middle) = 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND (cand_middle = '' OR lower(pn_middle) = lower(substring(cand_middle, 1, 1)))
    ) as pattern_2_exact_first_mid_init,

    -- 3: Initials match to Full
    (pn_first_len = 1 AND pn_middle != '' AND cand_first_len > 1 AND cand_middle != ''
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_3_init_mid_init,

    -- 4: First Initial, Middle Initial match
    (pn_first_len = 1 AND cand_first_len = 1 AND pn_middle != '' AND cand_middle != ''
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_4_first_init_mid_init,

    -- 5: Exact First, Exact Last
    (pn_first_len > 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = ''
    ) as pattern_5_exact_first_last,

    -- 6: First Initial Only to Full
    (pn_first_len = 1 AND pn_middle = '' AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_6_first_init_to_full,

    -- 7: First Initial Only
    (pn_first_len = 1 AND cand_first_len = 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = '' AND cand_middle = ''
    ) as pattern_7_first_init_last,

    -- 8: Full Name to Initial
    (pn_first_len > 1 AND cand_first_len = 1
     AND lower(substring(parsed_name.first, 1, 1)) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_8_full_to_init

  FROM with_match_signals
),

with_any_name_match AS (
  SELECT
    *,
    (pattern_1_exact_full OR pattern_2_exact_first_mid_init OR pattern_3_init_mid_init OR 
     pattern_4_first_init_mid_init OR pattern_5_exact_first_last OR pattern_6_first_init_to_full OR 
     pattern_7_first_init_last OR pattern_8_full_to_init) as any_name_match
  FROM with_name_matches
),

aggregated_counts AS (
  -- Collapse candidates to one row per (work_id, display_name, block_key).
  -- sX_nY = number of candidates satisfying name pattern Y under strategy X; a
  -- count of exactly 1 is treated as a match, and the MAX(CASE ...) "capture"
  -- columns then hold that single candidate.
  -- NOTE(review): two authorships on the same work with the same display_name and
  -- block_key fall into one group and their candidate counts add up - confirm
  -- whether an author position should be part of the grouping key.
  SELECT
    work_id,
    display_name,
    block_key,
    
    -- STRATEGY 1: Name Only (Unique)
    count_if(pattern_1_exact_full) AS s1_n1, count_if(pattern_2_exact_first_mid_init) AS s1_n2,
    count_if(pattern_3_init_mid_init) AS s1_n3, count_if(pattern_4_first_init_mid_init) AS s1_n4,
    count_if(pattern_5_exact_first_last) AS s1_n5, count_if(pattern_6_first_init_to_full) AS s1_n6,
    count_if(pattern_7_first_init_last) AS s1_n7, count_if(pattern_8_full_to_init) AS s1_n8,
    
    -- STRATEGY 2: Name + Institution (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst) AS s2_n1, count_if(pattern_2_exact_first_mid_init AND has_inst) AS s2_n2,
    count_if(pattern_3_init_mid_init AND has_inst) AS s2_n3, count_if(pattern_4_first_init_mid_init AND has_inst) AS s2_n4,
    count_if(pattern_5_exact_first_last AND has_inst) AS s2_n5, count_if(pattern_6_first_init_to_full AND has_inst) AS s2_n6,
    count_if(pattern_7_first_init_last AND has_inst) AS s2_n7, count_if(pattern_8_full_to_init AND has_inst) AS s2_n8,

    -- STRATEGY 6: Name + Inst + Source (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst AND has_source) AS s6_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_source) AS s6_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_source) AS s6_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_source) AS s6_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_source) AS s6_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_source) AS s6_n8,

    -- STRATEGY 4: Name + Inst + Topic (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst AND has_topic) AS s4_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_topic) AS s4_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_topic) AS s4_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_topic) AS s4_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_topic) AS s4_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_topic) AS s4_n8,

    -- STRATEGY 5: Name + Source (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_source) AS s5_n1,
    count_if(pattern_2_exact_first_mid_init AND has_source) AS s5_n2,
    count_if(pattern_5_exact_first_last AND has_source) AS s5_n5,
    count_if(pattern_6_first_init_to_full AND has_source) AS s5_n6,
    count_if(pattern_7_first_init_last AND has_source) AS s5_n7,
    count_if(pattern_8_full_to_init AND has_source) AS s5_n8,

    -- STRATEGY 3: Name + Topic (STRICT - NO P6 or P8)
    count_if(pattern_1_exact_full AND has_topic) AS s3_n1,
    count_if(pattern_2_exact_first_mid_init AND has_topic) AS s3_n2,
    count_if(pattern_5_exact_first_last AND has_topic) AS s3_n5,
    
    -- CAPTURE OBJECTS --------------------------------------------------------
    MAX(CASE WHEN pattern_1_exact_full THEN candidate_obj END) AS match_s1_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init THEN candidate_obj END) AS match_s1_n2,
    MAX(CASE WHEN pattern_5_exact_first_last THEN candidate_obj END) AS match_s1_n5,

    -- S2 Capture (Inst)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst THEN candidate_obj END) AS match_s2_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst THEN candidate_obj END) AS match_s2_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst THEN candidate_obj END) AS match_s2_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst THEN candidate_obj END) AS match_s2_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst THEN candidate_obj END) AS match_s2_n8,

    -- S6 Capture (Inst + Source)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n8,

    -- S5 Capture (Source)
    MAX(CASE WHEN pattern_1_exact_full AND has_source THEN candidate_obj END) AS match_s5_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_source THEN candidate_obj END) AS match_s5_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_source THEN candidate_obj END) AS match_s5_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_source THEN candidate_obj END) AS match_s5_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_source THEN candidate_obj END) AS match_s5_n8,
    
    -- S4 Capture (Inst + Topic)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n8,

    -- S3 Capture (Topic - STRICT)
    MAX(CASE WHEN pattern_1_exact_full AND has_topic THEN candidate_obj END) AS match_s3_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_topic THEN candidate_obj END) AS match_s3_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_topic THEN candidate_obj END) AS match_s3_n5,
    
    -- Diagnostics
    -- COUNT(author_id), not COUNT(*): the LEFT JOIN yields one all-NULL candidate row
    -- for an authorship without candidates, which must count as zero.
    COUNT(author_id) AS total_candidates_in_block,
    COUNT_IF(any_name_match) AS total_name_matches,
    
    MAX(SIZE(institution_ids)) > 0 AS work_has_inst,
    MAX(SIZE(work_source_ids)) > 0 AS work_has_source,
    MAX(SIZE(topic_ids)) > 0 AS work_has_topic,
    
    -- Up to 10 name-matching candidates (COLLECT_LIST skips the NULLs produced for
    -- candidates that fail the name check).
    SLICE(COLLECT_LIST(
      CASE WHEN any_name_match THEN
        NAMED_STRUCT(
          'author_id', author_id,
          'name', parsed_longest_name.first || ' ' || parsed_longest_name.last,
          'has_inst', has_inst,
          'has_topic', has_topic,
          'has_source', has_source
        )
      END
    ), 1, 10) AS candidates_passing_name_check

  FROM with_any_name_match
  GROUP BY work_id, display_name, block_key
)

-- Strategy precedence (first WHEN that hits wins):
-- name only (S1) -> inst + source (S6) -> inst (S2) -> inst + topic (S4)
-- -> source (S5) -> topic (S3).
-- The three CASE expressions below must keep the same order and the same set of
-- counters so method, matched object and outcome stay in sync.
SELECT
  work_id,
  display_name AS work_author_name,
  block_key,
  
  -- Match Method
  CASE 
    WHEN s1_n1 = 1 THEN 'unique_exact_full_name'
    WHEN s1_n2 = 1 THEN 'unique_exact_first_mid_init'
    WHEN s1_n5 = 1 THEN 'unique_exact_first_last'
    
    WHEN s6_n1 = 1 THEN 'exact_full_resolved_by_inst_and_source'
    WHEN s6_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst_and_source'
    WHEN s6_n5 = 1 THEN 'exact_first_last_resolved_by_inst_and_source'
    WHEN s6_n6 = 1 THEN 'init_to_full_resolved_by_inst_and_source'
    WHEN s6_n8 = 1 THEN 'full_to_init_resolved_by_inst_and_source'

    WHEN s2_n1 = 1 THEN 'exact_full_resolved_by_inst'
    WHEN s2_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst'
    WHEN s2_n5 = 1 THEN 'exact_first_last_resolved_by_inst'
    WHEN s2_n6 = 1 THEN 'init_to_full_resolved_by_inst'
    WHEN s2_n8 = 1 THEN 'full_to_init_resolved_by_inst'

    WHEN s4_n1 = 1 THEN 'exact_full_resolved_by_inst_and_topic'
    WHEN s4_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst_and_topic'
    WHEN s4_n5 = 1 THEN 'exact_first_last_resolved_by_inst_and_topic'
    WHEN s4_n6 = 1 THEN 'init_to_full_resolved_by_inst_and_topic'
    WHEN s4_n8 = 1 THEN 'full_to_init_resolved_by_inst_and_topic'
    
    WHEN s5_n1 = 1 THEN 'exact_full_resolved_by_source'
    WHEN s5_n2 = 1 THEN 'exact_first_mid_init_resolved_by_source'
    WHEN s5_n5 = 1 THEN 'exact_first_last_resolved_by_source'
    WHEN s5_n6 = 1 THEN 'init_to_full_resolved_by_source'
    WHEN s5_n8 = 1 THEN 'full_to_init_resolved_by_source'
    
    WHEN s3_n1 = 1 THEN 'exact_full_resolved_by_topic'
    WHEN s3_n2 = 1 THEN 'exact_first_mid_init_resolved_by_topic'
    WHEN s3_n5 = 1 THEN 'exact_first_last_resolved_by_topic'
    
    ELSE NULL
  END AS match_method,

  -- Matched Author Object
  CASE 
    WHEN s1_n1 = 1 THEN match_s1_n1
    WHEN s1_n2 = 1 THEN match_s1_n2
    WHEN s1_n5 = 1 THEN match_s1_n5
    
    WHEN s6_n1 = 1 THEN match_s6_n1
    WHEN s6_n2 = 1 THEN match_s6_n2
    WHEN s6_n5 = 1 THEN match_s6_n5
    WHEN s6_n6 = 1 THEN match_s6_n6
    WHEN s6_n8 = 1 THEN match_s6_n8

    WHEN s2_n1 = 1 THEN match_s2_n1
    WHEN s2_n2 = 1 THEN match_s2_n2
    WHEN s2_n5 = 1 THEN match_s2_n5
    WHEN s2_n6 = 1 THEN match_s2_n6
    WHEN s2_n8 = 1 THEN match_s2_n8

    WHEN s4_n1 = 1 THEN match_s4_n1
    WHEN s4_n2 = 1 THEN match_s4_n2
    WHEN s4_n5 = 1 THEN match_s4_n5
    WHEN s4_n6 = 1 THEN match_s4_n6
    WHEN s4_n8 = 1 THEN match_s4_n8

    WHEN s5_n1 = 1 THEN match_s5_n1
    WHEN s5_n2 = 1 THEN match_s5_n2
    WHEN s5_n5 = 1 THEN match_s5_n5
    WHEN s5_n6 = 1 THEN match_s5_n6
    WHEN s5_n8 = 1 THEN match_s5_n8

    WHEN s3_n1 = 1 THEN match_s3_n1
    WHEN s3_n2 = 1 THEN match_s3_n2
    WHEN s3_n5 = 1 THEN match_s3_n5

    ELSE NULL
  END AS matched_author,

  -- Match Outcome
  CASE 
    WHEN (
      s1_n1=1 OR s1_n2=1 OR s1_n5=1 OR 
      s6_n1=1 OR s6_n2=1 OR s6_n5=1 OR s6_n6=1 OR s6_n8=1 OR
      s2_n1=1 OR s2_n2=1 OR s2_n5=1 OR s2_n6=1 OR s2_n8=1 OR
      s4_n1=1 OR s4_n2=1 OR s4_n5=1 OR s4_n6=1 OR s4_n8=1 OR
      s5_n1=1 OR s5_n2=1 OR s5_n5=1 OR s5_n6=1 OR s5_n8=1 OR
      s3_n1=1 OR s3_n2=1 OR s3_n5=1
    ) THEN 'MATCHED'
    WHEN total_candidates_in_block = 0 THEN 'NO_CANDIDATES'
    ELSE 'AMBIGUOUS'
  END AS match_outcome,

  -- Detailed Failure Reason (NULL when matched)
  CASE
    WHEN (
      s1_n1=1 OR s1_n2=1 OR s1_n5=1 OR 
      s6_n1=1 OR s6_n2=1 OR s6_n5=1 OR s6_n6=1 OR s6_n8=1 OR
      s2_n1=1 OR s2_n2=1 OR s2_n5=1 OR s2_n6=1 OR s2_n8=1 OR
      s4_n1=1 OR s4_n2=1 OR s4_n5=1 OR s4_n6=1 OR s4_n8=1 OR
      s5_n1=1 OR s5_n2=1 OR s5_n5=1 OR s5_n6=1 OR s5_n8=1 OR
      s3_n1=1 OR s3_n2=1 OR s3_n5=1
    ) THEN NULL
    
    WHEN total_candidates_in_block = 0 THEN 'NO_BLOCK_CANDIDATES'
    
    WHEN total_name_matches = 0 THEN 'no_name_pattern_matched'

    WHEN (
      (s2_n1 > 1 OR s2_n2 > 1 OR s2_n5 > 1 OR s2_n6 > 1 OR s2_n8 > 1) OR -- Inst
      (s6_n1 > 1 OR s6_n2 > 1 OR s6_n5 > 1 OR s6_n6 > 1 OR s6_n8 > 1) OR -- Inst + Source
      (s4_n1 > 1 OR s4_n2 > 1 OR s4_n5 > 1 OR s4_n6 > 1 OR s4_n8 > 1)    -- Inst + Topic
    ) THEN 'ambiguous_name_multiple_candidates_have_inst'

    WHEN (s5_n1 > 1 OR s5_n2 > 1 OR s5_n5 > 1 OR s5_n6 > 1 OR s5_n8 > 1) 
      THEN 'ambiguous_name_multiple_candidates_have_source'

    WHEN (s3_n1 > 1 OR s3_n2 > 1 OR s3_n5 > 1) 
      THEN 'ambiguous_name_multiple_candidates_have_topic'

    WHEN (NOT work_has_inst AND NOT work_has_source AND NOT work_has_topic) 
      THEN 'ambiguous_name_work_has_no_signals'

    WHEN (work_has_inst OR work_has_source OR work_has_topic) 
      THEN 'ambiguous_name_no_signal_overlap'
    
    ELSE 'ambiguous_other'

  END AS failure_reason,
  
  total_candidates_in_block,
  total_name_matches,
  candidates_passing_name_check

FROM aggregated_counts;

In [0]:
-- TODO: use the author assignments to either
--   1) update openalex.works.work_authorships_map with the matched author id, or
--   2) create a new author and then update openalex.works.work_authorships_map with the new author id.
-- Implementation to be added here.

### Step 4: Merge enriched updates into final `authors_and_affiliations` table

In [0]:
-- Step 4: upsert the staged batch into the final table, one row per work_id.
MERGE INTO identifier('openalex' || :env_suffix || '.works.authors_and_affiliations') AS tgt
USING identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates') AS src
    ON src.work_id = tgt.work_id
-- Work already present: overwrite its authorships and timestamp with the staged values.
WHEN MATCHED THEN
    UPDATE SET
        tgt.updated_datetime = src.updated_datetime,
        tgt.authorships = src.authorships
-- Work not present yet: insert it.
WHEN NOT MATCHED THEN
    INSERT (work_id, updated_datetime, authorships)
    VALUES (src.work_id, src.updated_datetime, src.authorships);