### Update `openalex.works.work_authors` table using author name disambiguation (AND) and raw affiliation string lookup

In [0]:
-- Incremental watermark: works whose updated_date is later than this value are
-- picked up by Step 1. Starts at 1900-01-01 so the variable always has a value.
DECLARE OR REPLACE VARIABLE max_updated_date TIMESTAMP
    DEFAULT to_timestamp('1900-01-01');

-- Latest updated_at already written to work_authors, minus one second.
-- MAX over an empty table is NULL, in which case the 1900-01-01 floor is used.
SET VARIABLE max_updated_date = (
    SELECT
        COALESCE(
            MAX(updated_at) - INTERVAL 1 SECOND,
            to_timestamp('1900-01-01')
        )
    FROM identifier('openalex' || :env_suffix || '.works.work_authors')
);

-- Manual override (uncomment to reprocess from a fixed point in time):
-- SET VARIABLE max_updated_date = to_timestamp('2025-12-20');

-- Show the watermark in the cell output.
SELECT max_updated_date;

### Step 1: Get updated works for matching

In [0]:
%run ../utils/variables

In [0]:
-- STEP 1: Create Staging Table for Batch Works
-- Produces one row per (work_id, author_sequence) for every work whose
-- updated_date is later than the max_updated_date session variable.
CREATE OR REPLACE TABLE openalex.authors.author_matching_batch AS
WITH
-- One row per authorship; author_sequence is the position within authorships.
authorship_rows AS (
    SELECT
        id AS work_id,
        updated_date,
        POSEXPLODE(authorships) AS (author_sequence, authorship)
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    WHERE authorships IS NOT NULL
      AND SIZE(authorships) > 0
      AND updated_date > max_updated_date
),

-- One row per (authorship, raw affiliation string). EXPLODE_OUTER keeps
-- authorships that have no affiliation strings (the string is then NULL).
affiliation_rows AS (
    SELECT
        work_id,
        author_sequence,
        authorship,
        authorship.raw_author_name,
        EXPLODE_OUTER(authorship.raw_affiliation_strings) AS raw_affiliation_string
    FROM authorship_rows
),

-- 1. Resolve Direct IDs
-- Looks each raw string up in the affiliation-string lookup. direct_ids is
-- NULL when there is no string, no usable ID list, or the list is exactly [-1].
affiliation_lookup AS (
    SELECT
        aff.work_id,
        aff.author_sequence,
        aff.authorship,
        aff.raw_author_name,
        aff.raw_affiliation_string,
        CASE
            WHEN aff.raw_affiliation_string IS NOT NULL
                AND lkp.institution_ids IS NOT NULL
                AND SIZE(lkp.institution_ids) > 0
                AND NOT (SIZE(lkp.institution_ids) = 1 AND lkp.institution_ids[0] = -1)
                THEN lkp.institution_ids
        END AS direct_ids,
        lkp.countries AS raw_countries
    FROM affiliation_rows AS aff
    LEFT JOIN openalex.institutions.raw_affiliation_strings_institutions_mv AS lkp
        ON lkp.raw_affiliation_string = aff.raw_affiliation_string
),

-- 2. Expand Lineage (FOR MATCHING ONLY)
-- 2a. One row per direct institution ID (a single NULL row when there are none).
direct_id_rows AS (
    SELECT
        work_id,
        author_sequence,
        EXPLODE_OUTER(direct_ids) AS inst_id_scalar
    FROM affiliation_lookup
),

-- 2b. Ancestor list for each institution.
institution_lineage AS (
    SELECT
        institution_id,
        lineage_ids AS ancestors
    FROM openalex.institutions.institution_ancestors
),

-- 2c. Direct IDs plus their ancestors, de-duplicated per author position.
--     This wider list is what the matching step compares against.
matching_ids AS (
    SELECT
        d.work_id,
        d.author_sequence,
        ARRAY_DISTINCT(FLATTEN(COLLECT_LIST(
            FLATTEN(ARRAY(
                FILTER(ARRAY(d.inst_id_scalar), x -> x IS NOT NULL),
                COALESCE(lin.ancestors, ARRAY())
            ))
        ))) AS matching_institution_ids
    FROM direct_id_rows AS d
    LEFT JOIN institution_lineage AS lin
        ON CAST(d.inst_id_scalar AS BIGINT) = lin.institution_id
    GROUP BY d.work_id, d.author_sequence
),

-- 3. Group for Storage (DIRECT IDs ONLY)
-- 3a. Deduplicate so each raw affiliation string appears once per author position.
distinct_affiliations AS (
    SELECT
        work_id,
        author_sequence,
        raw_affiliation_string,
        FIRST(direct_ids) AS direct_ids
    FROM affiliation_lookup
    GROUP BY work_id, author_sequence, raw_affiliation_string
),

-- 3b. Pack (raw_string, ids) pairs into one array per author position.
storage_structs AS (
    SELECT
        work_id,
        author_sequence,
        COLLECT_LIST(
            NAMED_STRUCT(
                'raw_string', raw_affiliation_string,
                'ids', COALESCE(direct_ids, ARRAY())
            )
        ) AS affiliation_structs
    FROM distinct_affiliations
    GROUP BY work_id, author_sequence
)

-- 4. Final Selection
-- Collapses the per-affiliation rows back to one row per author position.
-- NOTE(review): raw_countries comes from whichever affiliation row FIRST
-- returns; no ordering is imposed here.
SELECT
    res.work_id,
    res.author_sequence,
    FIRST(stored.affiliation_structs) AS affiliation_structs,
    COALESCE(FIRST(matched.matching_institution_ids), ARRAY()) AS all_institution_ids,
    FIRST(res.authorship) AS authorship_struct,
    FIRST(res.raw_author_name) AS raw_author_name,
    FIRST(res.raw_countries) AS raw_countries
FROM affiliation_lookup AS res
LEFT JOIN storage_structs AS stored
    ON stored.work_id = res.work_id
    AND stored.author_sequence = res.author_sequence
LEFT JOIN matching_ids AS matched
    ON matched.work_id = res.work_id
    AND matched.author_sequence = res.author_sequence
GROUP BY res.work_id, res.author_sequence;

### Step 2: Run Matching Algorithm Over Updated Works with ID over 7000000000

In [0]:
-- STEP 2: For each staged (work_id, author_sequence) row, look up existing authors that
-- share the same block key, test a set of name patterns combined with institution /
-- topic / source overlap, and write one row per author position with a match_outcome
-- ('MATCHED', 'NO_CANDIDATES' or 'AMBIGUOUS') and, when matched, existing_author_id.
CREATE OR REPLACE TABLE openalex.authors.pending_author_assignments AS
WITH 
-- 1. ENRICH BATCH DATA
-- We need to add Signals (Topics, Sources) and parsed names to the staged batch data
enriched_batch AS (
  SELECT
    b.work_id,
    b.author_sequence,
    b.raw_author_name,
    b.all_institution_ids,
    -- Institution IDs rewritten as 'https://openalex.org/I<id>' strings.
    -- NOTE(review): presumably this matches the format of alm.institution_ids — confirm.
    TRANSFORM(b.all_institution_ids, x -> CONCAT('https://openalex.org/I', CAST(x AS STRING))) AS institution_ids,
    
    -- Fall back to an all-empty parsed name when the lookup has no entry.
    COALESCE(pn.parsed_name, 
             named_struct('title', '', 'first', '', 'middle', '', 'last', '', 'suffix', '', 'nickname', '')
    ) AS parsed_name,
    
    COALESCE(wtf.topics, ARRAY()) AS topics,
    
    -- Distinct non-null source IDs taken from the work's locations.
    ARRAY_DISTINCT(
      TRANSFORM(
        FILTER(w.locations, x -> x.source.id IS NOT NULL),
        x -> x.source.id
      )
    ) AS work_source_ids
    
  FROM openalex.authors.author_matching_batch b
  LEFT JOIN openalex.authors.parsed_names_lookup pn 
    ON TRIM(b.raw_author_name) = pn.raw_author_name
  LEFT JOIN openalex.works.work_topics_frontfill wtf 
    ON b.work_id = wtf.work_id
  LEFT JOIN openalex.works.openalex_works w 
    ON b.work_id = w.id
),

-- 2. PREPARE MATCHING INPUTS
-- Calculate Block Keys and ID arrays
authors_prepared AS (
  SELECT
    work_id,
    author_sequence,
    raw_author_name,
    parsed_name,
    -- Block Key Generation: lower-cased '<first initial> <last name>'
    LOWER(CONCAT(SUBSTRING(parsed_name.first, 1, 1), ' ', parsed_name.last)) AS block_key,
    institution_ids,
    -- Extract Topic IDs
    TRANSFORM(topics, t -> t.id) AS topic_ids,
    work_source_ids
  FROM enriched_batch
  -- Rows without a raw author name are dropped here and therefore never appear
  -- in pending_author_assignments.
  WHERE raw_author_name IS NOT NULL
),

-- 3. CANDIDATE BLOCKING
-- One row per (incoming author, candidate author sharing the block key). The LEFT JOIN
-- keeps incoming authors with no candidate; their alm.* columns are NULL.
blocked_candidates AS (
  SELECT 
    e.work_id,
    e.author_sequence,
    e.raw_author_name,
    e.parsed_name,
    e.block_key,
    e.institution_ids,
    e.topic_ids,
    e.work_source_ids,
    alm.author_id,
    alm.parsed_longest_name,
    alm.institution_ids as candidate_institution_ids,
    alm.topic_ids as candidate_topic_ids,
    alm.source_ids AS candidate_source_ids,
    alm.works_count
  FROM authors_prepared e
  LEFT JOIN openalex.authors.author_lookup_mapping alm
    ON alm.block_key = e.block_key
    -- Because this predicate is in the ON clause, works at or below the threshold are
    -- kept but receive no candidates.
    AND e.work_id > 7000000000  -- only attempt matching on new records, less than this will have no candidates right away
),

-- 4. MATCH SIGNALS
-- Adds the candidate as a struct, name-length helpers, and overlap flags.
-- Prefix convention below: pn_* = incoming parsed name, cand_* = candidate's parsed_longest_name.
with_match_signals AS (
  SELECT
    *,
    NAMED_STRUCT(
      'id', author_id,
      'display_name', CONCAT(parsed_longest_name.first, ' ', parsed_longest_name.last),
      'parsed_name', parsed_longest_name
    ) AS candidate_obj,
    length(parsed_name.first) as pn_first_len,
    length(parsed_longest_name.first) as cand_first_len,
    coalesce(parsed_name.middle, '') as pn_middle,
    coalesce(parsed_longest_name.middle, '') as cand_middle,
    
    -- TRUE when both sides have institutions and share at least one.
    (size(institution_ids) > 0 AND size(candidate_institution_ids) > 0 
     AND arrays_overlap(candidate_institution_ids, institution_ids)) as has_inst,
    
    -- TRUE when both sides have topics and share at least one.
    (size(topic_ids) > 0 AND size(candidate_topic_ids) > 0 
     AND arrays_overlap(candidate_topic_ids, topic_ids)) as has_topic,

     -- TRUE when both sides have sources and share at least one.
     (SIZE(work_source_ids) > 0 AND SIZE(candidate_source_ids) > 0
     AND ARRAYS_OVERLAP(candidate_source_ids, work_source_ids)) AS has_source
  FROM blocked_candidates
),

-- 5. NAME PATTERNS
-- Eight boolean flags describing how the incoming name relates to the candidate name.
-- All comparisons are case-insensitive and every pattern requires equal last names.
with_name_matches AS (
  SELECT
    *,
    -- 1: Exact Full Name (both first and middle are longer than one character on both sides)
    (pn_first_len > 1 AND length(pn_middle) > 1 AND cand_first_len > 1 AND length(cand_middle) > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(pn_middle) = lower(cand_middle)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_1_exact_full,

    -- 2: Exact First, Middle Initial match
    --    (incoming middle is one character; candidate middle is empty or starts with it)
    (pn_first_len > 1 AND length(pn_middle) = 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND (cand_middle = '' OR lower(pn_middle) = lower(substring(cand_middle, 1, 1)))
    ) as pattern_2_exact_first_mid_init,

    -- 3: Initials match to Full
    --    (incoming first is an initial; candidate has a full first name; middle initials agree)
    (pn_first_len = 1 AND pn_middle != '' AND cand_first_len > 1 AND cand_middle != ''
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_3_init_mid_init,

    -- 4: First Initial, Middle Initial match (both sides have only a first initial)
    (pn_first_len = 1 AND cand_first_len = 1 AND pn_middle != '' AND cand_middle != ''
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_4_first_init_mid_init,

    -- 5: Exact First, Exact Last (incoming name has no middle)
    (pn_first_len > 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = ''
    ) as pattern_5_exact_first_last,

    -- 6: First Initial Only to Full (incoming initial vs candidate full first name)
    (pn_first_len = 1 AND pn_middle = '' AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_6_first_init_to_full,

    -- 7: First Initial Only (both sides: single initial, no middle)
    (pn_first_len = 1 AND cand_first_len = 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = '' AND cand_middle = ''
    ) as pattern_7_first_init_last,

    -- 8: Full Name to Initial (incoming full first name vs candidate initial)
    (pn_first_len > 1 AND cand_first_len = 1
     AND lower(substring(parsed_name.first, 1, 1)) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) as pattern_8_full_to_init

  FROM with_match_signals
),

-- 6. TRUE when at least one of the eight patterns holds for this candidate.
with_any_name_match AS (
  SELECT
    *,
    (pattern_1_exact_full OR pattern_2_exact_first_mid_init OR pattern_3_init_mid_init OR 
     pattern_4_first_init_mid_init OR pattern_5_exact_first_last OR pattern_6_first_init_to_full OR 
     pattern_7_first_init_last OR pattern_8_full_to_init) as any_name_match
  FROM with_name_matches
),

-- 7. AGGREGATE PER INCOMING AUTHOR
-- Column naming: s<strategy>_n<pattern> = number of candidates satisfying that pattern
-- under that strategy; match_s<strategy>_n<pattern> = a candidate struct for the same
-- condition (MAX returns the only non-NULL value when the corresponding count is 1).
-- NOTE(review): s1_n3, s1_n4, s1_n6, s1_n7, s1_n8, s2_n3, s2_n4, s2_n7, s4_n7, s5_n7,
-- s6_n7 and total_name_matches are computed here but not read by final_decision.
aggregated_counts AS (
  SELECT
    work_id,
    author_sequence,
    raw_author_name,
    block_key,
    institution_ids,
    parsed_name,
    work_source_ids,
    
    -- STRATEGY 1: Name Only (Unique)
    count_if(pattern_1_exact_full) AS s1_n1, count_if(pattern_2_exact_first_mid_init) AS s1_n2,
    count_if(pattern_3_init_mid_init) AS s1_n3, count_if(pattern_4_first_init_mid_init) AS s1_n4,
    count_if(pattern_5_exact_first_last) AS s1_n5, count_if(pattern_6_first_init_to_full) AS s1_n6,
    count_if(pattern_7_first_init_last) AS s1_n7, count_if(pattern_8_full_to_init) AS s1_n8,
    
    -- STRATEGY 2: Name + Institution
    count_if(pattern_1_exact_full AND has_inst) AS s2_n1, count_if(pattern_2_exact_first_mid_init AND has_inst) AS s2_n2,
    count_if(pattern_3_init_mid_init AND has_inst) AS s2_n3, count_if(pattern_4_first_init_mid_init AND has_inst) AS s2_n4,
    count_if(pattern_5_exact_first_last AND has_inst) AS s2_n5, count_if(pattern_6_first_init_to_full AND has_inst) AS s2_n6,
    count_if(pattern_7_first_init_last AND has_inst) AS s2_n7, count_if(pattern_8_full_to_init AND has_inst) AS s2_n8,

    -- STRATEGY 6: Name + Inst + Source
    count_if(pattern_1_exact_full AND has_inst AND has_source) AS s6_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_source) AS s6_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_source) AS s6_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_source) AS s6_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_source) AS s6_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_source) AS s6_n8,

    -- STRATEGY 4: Name + Inst + Topic
    count_if(pattern_1_exact_full AND has_inst AND has_topic) AS s4_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_topic) AS s4_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_topic) AS s4_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_topic) AS s4_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_topic) AS s4_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_topic) AS s4_n8,

    -- STRATEGY 5: Name + Source
    count_if(pattern_1_exact_full AND has_source) AS s5_n1,
    count_if(pattern_2_exact_first_mid_init AND has_source) AS s5_n2,
    count_if(pattern_5_exact_first_last AND has_source) AS s5_n5,
    count_if(pattern_6_first_init_to_full AND has_source) AS s5_n6,
    count_if(pattern_7_first_init_last AND has_source) AS s5_n7,
    count_if(pattern_8_full_to_init AND has_source) AS s5_n8,

    -- STRATEGY 3: Name + Topic
    count_if(pattern_1_exact_full AND has_topic) AS s3_n1,
    count_if(pattern_2_exact_first_mid_init AND has_topic) AS s3_n2,
    count_if(pattern_5_exact_first_last AND has_topic) AS s3_n5,
    
    -- CAPTURE MATCHED OBJECTS: one candidate struct per (strategy, pattern) condition,
    -- using the same predicates as the counts above.
    MAX(CASE WHEN pattern_1_exact_full THEN candidate_obj END) AS match_s1_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init THEN candidate_obj END) AS match_s1_n2,
    MAX(CASE WHEN pattern_5_exact_first_last THEN candidate_obj END) AS match_s1_n5,

    MAX(CASE WHEN pattern_1_exact_full AND has_inst THEN candidate_obj END) AS match_s2_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst THEN candidate_obj END) AS match_s2_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst THEN candidate_obj END) AS match_s2_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst THEN candidate_obj END) AS match_s2_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst THEN candidate_obj END) AS match_s2_n8,

    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n8,

    MAX(CASE WHEN pattern_1_exact_full AND has_source THEN candidate_obj END) AS match_s5_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_source THEN candidate_obj END) AS match_s5_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_source THEN candidate_obj END) AS match_s5_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_source THEN candidate_obj END) AS match_s5_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_source THEN candidate_obj END) AS match_s5_n8,
    
    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n8,

    MAX(CASE WHEN pattern_1_exact_full AND has_topic THEN candidate_obj END) AS match_s3_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_topic THEN candidate_obj END) AS match_s3_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_topic THEN candidate_obj END) AS match_s3_n5,
    
    -- COUNT(author_id) ignores the NULL row produced when the block had no candidates,
    -- so it is 0 in that case.
    COUNT(author_id) AS total_candidates_in_block,
    COUNT_IF(any_name_match) AS total_name_matches

  FROM with_any_name_match
  GROUP BY work_id, author_sequence, raw_author_name, block_key, institution_ids, parsed_name, work_source_ids
),

-- 8. DECISION
-- A (strategy, pattern) combination only counts as a match when exactly one candidate
-- satisfies it (count = 1).
final_decision AS (
SELECT
  work_id,
  author_sequence,
  block_key,
  raw_author_name,
  institution_ids,
  parsed_name,
  work_source_ids,
  
  -- MATCH OUTCOME
  -- 'MATCHED' if any accepted combination has exactly one candidate; otherwise
  -- 'NO_CANDIDATES' when the block was empty, else 'AMBIGUOUS'.
  CASE 
    WHEN (
      s1_n1=1 OR s1_n2=1 OR s1_n5=1 OR 
      s6_n1=1 OR s6_n2=1 OR s6_n5=1 OR s6_n6=1 OR s6_n8=1 OR
      s2_n1=1 OR s2_n2=1 OR s2_n5=1 OR s2_n6=1 OR s2_n8=1 OR
      s4_n1=1 OR s4_n2=1 OR s4_n5=1 OR s4_n6=1 OR s4_n8=1 OR
      s5_n1=1 OR s5_n2=1 OR s5_n5=1 OR s5_n6=1 OR s5_n8=1 OR
      s3_n1=1 OR s3_n2=1 OR s3_n5=1
    ) THEN 'MATCHED'
    WHEN total_candidates_in_block = 0 THEN 'NO_CANDIDATES'
    ELSE 'AMBIGUOUS'
  END AS match_outcome,

  -- MATCHED AUTHOR ID
  -- First WHEN that fires wins, so the order below is the priority order:
  -- strategy 1, then 6, 2, 4, 5, 3; within a strategy, patterns in ascending number.
  CASE 
    WHEN s1_n1 = 1 THEN match_s1_n1.id
    WHEN s1_n2 = 1 THEN match_s1_n2.id
    WHEN s1_n5 = 1 THEN match_s1_n5.id
    
    WHEN s6_n1 = 1 THEN match_s6_n1.id
    WHEN s6_n2 = 1 THEN match_s6_n2.id
    WHEN s6_n5 = 1 THEN match_s6_n5.id
    WHEN s6_n6 = 1 THEN match_s6_n6.id
    WHEN s6_n8 = 1 THEN match_s6_n8.id

    WHEN s2_n1 = 1 THEN match_s2_n1.id
    WHEN s2_n2 = 1 THEN match_s2_n2.id
    WHEN s2_n5 = 1 THEN match_s2_n5.id
    WHEN s2_n6 = 1 THEN match_s2_n6.id
    WHEN s2_n8 = 1 THEN match_s2_n8.id

    WHEN s4_n1 = 1 THEN match_s4_n1.id
    WHEN s4_n2 = 1 THEN match_s4_n2.id
    WHEN s4_n5 = 1 THEN match_s4_n5.id
    WHEN s4_n6 = 1 THEN match_s4_n6.id
    WHEN s4_n8 = 1 THEN match_s4_n8.id

    WHEN s5_n1 = 1 THEN match_s5_n1.id
    WHEN s5_n2 = 1 THEN match_s5_n2.id
    WHEN s5_n5 = 1 THEN match_s5_n5.id
    WHEN s5_n6 = 1 THEN match_s5_n6.id
    WHEN s5_n8 = 1 THEN match_s5_n8.id

    WHEN s3_n1 = 1 THEN match_s3_n1.id
    WHEN s3_n2 = 1 THEN match_s3_n2.id
    WHEN s3_n5 = 1 THEN match_s3_n5.id

    ELSE NULL
  END AS existing_author_id

FROM aggregated_counts
)
SELECT * FROM final_decision;

### Step 3: Cluster Unmatched & Mint New IDs

In [0]:
-- A. Get the current High Water Mark of author IDs already in the registry.
-- NOTE(review): MAX(id) is NULL when author_registry is empty, which would make
-- every new_author_id below NULL — confirm the registry is always seeded.
DECLARE OR REPLACE VARIABLE max_id BIGINT;
SET VARIABLE max_id = (SELECT MAX(id) FROM openalex.authors.author_registry);

-- B. Cluster and Mint
-- Groups unmatched author positions into clusters by hash and assigns each
-- cluster the next free author ID. Output: cluster_hash, display_name, new_author_id.
CREATE OR REPLACE TABLE openalex.authors.author_matching_new_author_queue AS
WITH unmatched_with_hash AS (
    SELECT
        pa.work_id,
        pa.author_sequence,
        pa.raw_author_name,

        -- The same hash expression is recomputed in Step 5 to join decisions back
        -- to this queue; the two must stay identical.
        xxhash64(
            -- 1. NAME PART: Parsed if available, else Raw
            CASE
                WHEN pa.parsed_name.first IS NOT NULL AND pa.parsed_name.first <> ''
                     AND pa.parsed_name.last IS NOT NULL AND pa.parsed_name.last <> ''
                THEN LOWER(CONCAT(pa.parsed_name.first, ' ', pa.parsed_name.last))
                ELSE LOWER(TRIM(pa.raw_author_name))
            END,
            -- 2. SIGNAL PART: Institutions when present, otherwise Sources
            CASE
                WHEN SIZE(b.all_institution_ids) > 0
                THEN concat_ws('|', sort_array(b.all_institution_ids))
                ELSE concat_ws('|', sort_array(pa.work_source_ids))
            END
        ) AS cluster_hash

    FROM openalex.authors.pending_author_assignments pa
    --  Join Batch to get 'all_institution_ids'
    LEFT JOIN openalex.authors.author_matching_batch b
        ON pa.work_id = b.work_id AND pa.author_sequence = b.author_sequence
    -- Written as INNER JOIN: the created_date filter below already discarded rows
    -- with no matching work, so the result is the same as the former LEFT JOIN.
    INNER JOIN openalex.works.openalex_works_base w
        ON pa.work_id = w.id
    LEFT JOIN openalex.works.work_authors existing
        ON pa.work_id = existing.work_id
        AND pa.author_sequence = existing.author_sequence
    WHERE
        -- Only unmatched records
        pa.match_outcome <> 'MATCHED'
        -- ensure we haven't already assigned an ID in a previous run
        AND existing.author_id IS NULL
        -- Only mint IDs for new works, after the author start assignment date
        AND pa.work_id > 7000000000
        AND w.created_date >= to_timestamp('2025-12-20')
        -- Safety: Ensure we actually have a name string to hash
        AND pa.raw_author_name IS NOT NULL
        AND TRIM(pa.raw_author_name) <> ''
),
unique_clusters AS (
    -- One row per cluster; the longest raw name in the cluster becomes its display name.
    SELECT
        cluster_hash,
        MAX_BY(raw_author_name, length(raw_author_name)) AS raw_display_name
    FROM unmatched_with_hash
    GROUP BY cluster_hash
)
SELECT
    uc.cluster_hash,
    -- A name with exactly one comma is treated as 'Last, First' and flipped to 'First Last'.
    CASE
        WHEN SIZE(SPLIT(uc.raw_display_name, ',')) = 2 THEN
            TRIM(SPLIT(uc.raw_display_name, ',')[1]) || ' ' || TRIM(SPLIT(uc.raw_display_name, ',')[0])
        ELSE
            uc.raw_display_name
    END AS display_name,
    -- cluster_hash is unique per row here, so ordering by it gives a reproducible
    -- cluster -> ID assignment for a given input (the previous ordering key was a
    -- non-deterministic generated row id).
    max_id + ROW_NUMBER() OVER (ORDER BY uc.cluster_hash) AS new_author_id
FROM unique_clusters uc;

In [0]:
-- logging: review match rates
-- Read-only summary; nothing is written by this cell.
WITH outcome_counts AS (
    -- 1. MATCH OUTCOME STATS
    -- Rows per outcome, limited to the works eligible for new author assignment.
    SELECT
        pa.match_outcome,
        COUNT(*) AS outcome_count
    FROM openalex.authors.pending_author_assignments AS pa
    INNER JOIN openalex.works.openalex_works_base AS w
        ON w.id = pa.work_id
    WHERE w.created_date >= to_timestamp('2025-12-20')
      AND pa.work_id > 7000000000
    GROUP BY pa.match_outcome
)
SELECT
    match_outcome,
    outcome_count AS count,
    -- Share of all counted rows, as a percentage with two decimals.
    ROUND(outcome_count * 100.0 / SUM(outcome_count) OVER (), 2) AS percentage
FROM outcome_counts

UNION ALL

-- 2. MINTING STATS
-- Number of new author IDs queued in Step 3 (no percentage for this row).
SELECT
    'NEW_AUTHORS_TO_CREATE' AS match_outcome,
    COUNT(*) AS count,
    NULL AS percentage
FROM openalex.authors.author_matching_new_author_queue;

### Step 4: Write New Authors to Registry

In [0]:
-- Append the newly minted authors from the Step 3 queue to the registry.
-- merge_into_* are left NULL; created_date and updated_date are both set to the
-- insert time.
INSERT INTO openalex.authors.author_registry
    (id, display_name, merge_into_id, merge_into_date, created_date, updated_date)
SELECT
    q.new_author_id AS id,
    q.display_name,
    NULL AS merge_into_id,
    NULL AS merge_into_date,
    current_timestamp() AS created_date,
    current_timestamp() AS updated_date
FROM openalex.authors.author_matching_new_author_queue AS q
-- Idempotency guard: skip IDs that are already registered, so re-running this
-- cell against the same queue does not insert duplicate registry rows.
WHERE NOT EXISTS (
    SELECT 1
    FROM openalex.authors.author_registry AS r
    WHERE r.id = q.new_author_id
);

### Step 5: Consolidate Decisions

In [None]:
-- Consolidates, for every staged author position, the author ID to write:
--   * works created on/after 2025-12-20 with work_id > 7000000000:
--       the matched existing author, else the newly minted ID for its cluster
--   * all other works: the author_id from the legacy work_authors table
CREATE OR REPLACE TEMPORARY VIEW batch_author_decisions AS
WITH batch_with_hash AS (
    SELECT
        b.work_id,
        b.author_sequence,
        b.raw_author_name,
        b.affiliation_structs,
        pa.match_outcome,
        pa.existing_author_id,
        w.created_date,
        lwa.author_id AS legacy_author_id,
        -- Cluster hash; must be the same expression Step 3 used to build the queue.
        xxhash64(
            -- Name part: parsed 'first last' when both are present, else the raw name.
            CASE
                WHEN pa.parsed_name.first IS NOT NULL AND pa.parsed_name.first <> ''
                     AND pa.parsed_name.last IS NOT NULL AND pa.parsed_name.last <> ''
                THEN LOWER(CONCAT(pa.parsed_name.first, ' ', pa.parsed_name.last))
                ELSE LOWER(TRIM(b.raw_author_name))
            END,
            -- Signal part: institutions when present, otherwise sources.
            CASE
                WHEN SIZE(b.all_institution_ids) > 0
                THEN concat_ws('|', sort_array(b.all_institution_ids))
                ELSE concat_ws('|', sort_array(pa.work_source_ids))
            END
        ) AS cluster_hash
    FROM openalex.authors.author_matching_batch AS b
    LEFT JOIN openalex.authors.pending_author_assignments AS pa
        ON pa.work_id = b.work_id
        AND pa.author_sequence = b.author_sequence
    LEFT JOIN openalex.works.openalex_works_base AS w
        ON w.id = b.work_id
    LEFT JOIN openalex.works_legacy.work_authors AS lwa
        ON lwa.work_id = b.work_id
        AND lwa.author_sequence = b.author_sequence
)
SELECT
    h.work_id,
    h.author_sequence,
    h.raw_author_name,
    h.affiliation_structs,
    CASE
        WHEN h.created_date >= to_timestamp('2025-12-20')
            AND h.work_id > 7000000000
        THEN COALESCE(h.existing_author_id, q.new_author_id)
        ELSE h.legacy_author_id  -- Use legacy author_id for older works
    END AS final_author_id
FROM batch_with_hash AS h
-- Only rows that were not matched to an existing author look up a minted ID.
LEFT JOIN openalex.authors.author_matching_new_author_queue AS q
    ON (h.match_outcome IS NULL OR h.match_outcome <> 'MATCHED')
    AND h.cluster_hash = q.cluster_hash;

### Step 6: Update work_authors Table and refresh related view

In [None]:
-- Upsert one row per (work_id, author_sequence) into work_authors from the
-- consolidated decisions in batch_author_decisions.
MERGE INTO openalex.works.work_authors AS target
USING (
    WITH decision_affiliations AS (
        -- One row per (author position, affiliation struct); OUTER keeps author
        -- positions that have no affiliation structs.
        SELECT
            work_id,
            author_sequence,
            final_author_id,
            raw_author_name,
            aff
        FROM batch_author_decisions
        LATERAL VIEW OUTER EXPLODE(affiliation_structs) exploded AS aff
    ),
    authorship_flags AS (
        -- is_corresponding per author position, read from the works base table.
        SELECT
            id AS work_id,
            author_sequence,
            authorship.is_corresponding
        FROM openalex.works.openalex_works_base
        LATERAL VIEW POSEXPLODE(authorships) exploded AS author_sequence, authorship
    )
    SELECT
        d.work_id,
        d.author_sequence,
        d.final_author_id,
        d.raw_author_name,
        -- Distinct, non-null raw affiliation strings for this author position.
        ARRAY_COMPACT(ARRAY_DISTINCT(COLLECT_LIST(d.aff.raw_string))) AS raw_affiliation_strings,
        MAX(f.is_corresponding) AS is_corresponding,
        MAX(wb.updated_date) AS source_updated_date
    FROM decision_affiliations AS d
    LEFT JOIN authorship_flags AS f
        ON f.work_id = d.work_id
        AND f.author_sequence = d.author_sequence
    LEFT JOIN openalex.works.openalex_works_base AS wb
        ON wb.id = d.work_id
    GROUP BY
        d.work_id,
        d.author_sequence,
        d.final_author_id,
        d.raw_author_name
) AS source
ON target.work_id = source.work_id
   AND target.author_sequence = source.author_sequence

-- Existing row: refresh descriptive fields, but never overwrite an author_id
-- that is already set.
WHEN MATCHED THEN
    UPDATE SET
        target.author_id = COALESCE(target.author_id, source.final_author_id),
        target.raw_author_name = source.raw_author_name,
        target.raw_affiliation_strings = source.raw_affiliation_strings,
        target.is_corresponding = source.is_corresponding,
        target.updated_at = source.source_updated_date

-- New row: created_at is the merge time; updated_at carries the work's updated_date.
WHEN NOT MATCHED THEN
    INSERT (
        work_id, author_sequence, author_id,
        raw_author_name, raw_affiliation_strings, is_corresponding,
        created_at, updated_at
    )
    VALUES (
        source.work_id, source.author_sequence, source.final_author_id,
        source.raw_author_name, source.raw_affiliation_strings, source.is_corresponding,
        current_timestamp(), source.source_updated_date
    );