In [0]:
%sql
-------------------------------------------------------------------------------
-- AUTHOR MATCHING ALGORITHM - CREATE DIAGNOSTIC TABLE attempt 1
-------------------------------------------------------------------------------

CREATE OR REPLACE TABLE openalex.authors.author_matching_diagnostics AS

-- with_topics: one row per row of authors_and_affiliations_updates, carrying the
-- work's authorships array together with its topics. Works without a row in
-- work_topics_frontfill receive an empty topics array rather than NULL.
-- NOTE(review): presumably work_topics_frontfill holds at most one row per
-- work_id; if not, this join duplicates works — confirm.
WITH with_topics AS (
  SELECT
    upd.work_id,
    upd.authorships,
    coalesce(frontfill.topics, array()) AS topics
  FROM openalex.works.authors_and_affiliations_updates AS upd
  LEFT JOIN openalex.works.work_topics_frontfill AS frontfill
    ON frontfill.work_id = upd.work_id
),

-- authors_exploded: one row per (work, authorship) with the signals used for
-- matching: a name blocking key, the authorship's institution ids (direct ids
-- plus every lineage id, de-duplicated), and the work's topic / subfield ids.
authors_exploded AS (
  SELECT
    work_id,
    authorship.author.display_name AS display_name,
    authorship.parsed_name AS parsed_name,
    -- Blocking key: lower-cased "<first initial> <last name>".
    lower(concat(substring(authorship.parsed_name.first, 1, 1), ' ', authorship.parsed_name.last)) AS block_key,
    -- Null-safe on purpose: concat() yields NULL if either argument is NULL and
    -- flatten() yields NULL if any nested array is NULL, so one institution with
    -- a NULL lineage (or a NULL institutions array) would otherwise turn the
    -- whole list into NULL and throw away the direct institution ids as well.
    array_distinct(
      concat(
        transform(coalesce(authorship.institutions, array()), inst -> inst.id),
        flatten(
          transform(
            coalesce(authorship.institutions, array()),
            inst -> coalesce(inst.lineage, array())
          )
        )
      )
    ) AS institution_ids,
    transform(topics, t -> t.id) AS topic_ids,
    transform(topics, t -> t.subfield.id) AS subfield_ids
  FROM with_topics
  LATERAL VIEW explode(authorships) AS authorship
  -- NOTE(review): hard-coded work_id floor; presumably restricts the run to a
  -- recent id range — confirm the intended cutoff.
  WHERE work_id > 7000000000
),

-- blocked_candidates: pairs every exploded authorship with every lookup author
-- sharing its block_key. This is a LEFT JOIN so that authorships whose block has
-- no candidate at all (or whose block_key is NULL) still reach the output; with
-- an inner join those rows vanish and the "no candidates" diagnostics downstream
-- can never fire. has_candidate distinguishes real pairs from the single
-- NULL-padded placeholder row a candidate-less authorship gets.
blocked_candidates AS (
  SELECT
    e.work_id,
    e.display_name,
    e.parsed_name,
    e.block_key,
    e.institution_ids,
    e.topic_ids,
    e.subfield_ids,
    alm.author_id,
    alm.parsed_longest_name,
    alm.institution_ids AS candidate_institution_ids,
    alm.topic_ids AS candidate_topic_ids,
    alm.subfield_ids AS candidate_subfield_ids,
    alm.works_count,
    -- The join key is non-NULL on every matched row, so this is TRUE exactly
    -- when a lookup row was joined.
    (alm.block_key IS NOT NULL) AS has_candidate
  FROM authors_exploded AS e
  LEFT JOIN openalex.authors.author_lookup_mapping AS alm
    ON alm.block_key = e.block_key
),

-- with_match_signals: adds name-length helpers, NULL-free middle names, and one
-- boolean per supporting signal (shared institution / topic / subfield). A
-- signal requires both sides to be non-empty and to overlap.
with_match_signals AS (
  SELECT
    *,

    length(parsed_name.first) AS pn_first_len,
    length(parsed_longest_name.first) AS cand_first_len,
    coalesce(parsed_name.middle, '') AS pn_middle,
    coalesce(parsed_longest_name.middle, '') AS cand_middle,

    (size(institution_ids) > 0
     AND size(candidate_institution_ids) > 0
     AND arrays_overlap(candidate_institution_ids, institution_ids)
    ) AS has_inst,

    (size(topic_ids) > 0
     AND size(candidate_topic_ids) > 0
     AND arrays_overlap(candidate_topic_ids, topic_ids)
    ) AS has_topic,

    (size(subfield_ids) > 0
     AND size(candidate_subfield_ids) > 0
     AND arrays_overlap(candidate_subfield_ids, subfield_ids)
    ) AS has_subfield

  FROM blocked_candidates
),

-- with_name_matches: evaluates eight name-compatibility patterns between the
-- work-side name (parsed_name) and the candidate's parsed_longest_name. All
-- comparisons are case-insensitive and every pattern requires equal last names.
-- On a placeholder row (no candidate) every pattern is FALSE or NULL, never TRUE.
with_name_matches AS (
  SELECT
    *,

    -- 1: initial + last on both sides, neither side has a middle name.
    (pn_first_len = 1 AND cand_first_len = 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = '' AND cand_middle = ''
    ) AS matches_1,

    -- 2: initial + last on both sides, both have a middle name with equal initials.
    (pn_first_len = 1 AND cand_first_len = 1
     AND pn_middle != '' AND cand_middle != ''
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS matches_2,

    -- 3: full first + last equal; the work side has no middle name.
    (pn_first_len > 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = ''
    ) AS matches_3,

    -- 4: full first + last equal; the work side has a one-character middle that
    --    equals the candidate's middle initial, or the candidate has no middle.
    (pn_first_len > 1 AND length(pn_middle) = 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND (cand_middle = '' OR lower(pn_middle) = lower(substring(cand_middle, 1, 1)))
    ) AS matches_4,

    -- 5: full first, full middle and last all equal.
    (pn_first_len > 1 AND length(pn_middle) > 1
     AND cand_first_len > 1 AND length(cand_middle) > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(pn_middle) = lower(cand_middle)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS matches_5,

    -- 6: work side has a full first name, candidate only has the matching initial.
    (pn_first_len > 1 AND cand_first_len = 1
     AND lower(substring(parsed_name.first, 1, 1)) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS matches_6,

    -- 7: work side has initial only and no middle; candidate has a full first name.
    (pn_first_len = 1 AND pn_middle = '' AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS matches_7,

    -- 8: work side has initial + middle; candidate has full first + middle with
    --    equal middle initials.
    (pn_first_len = 1 AND pn_middle != ''
     AND cand_first_len > 1 AND cand_middle != ''
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS matches_8

  FROM with_match_signals
),

-- with_any_name_match: TRUE when at least one of the eight patterns holds.
-- Coalesced to FALSE so that a NULL (e.g. a candidate with a NULL name part)
-- counts as "did not match" instead of silently falling out of both the
-- matching and the non-matching candidate lists.
with_any_name_match AS (
  SELECT
    *,
    coalesce(
      matches_1 OR matches_2 OR matches_3 OR matches_4 OR
      matches_5 OR matches_6 OR matches_7 OR matches_8,
      false
    ) AS any_name_match
  FROM with_name_matches
),

-- match_counts: collapses the candidate pairs back to one row per
-- (work_id, display_name, parsed_name, block_key) and records, for every name
-- pattern k (1-8) and every round r:
--   n<r>_<k>  number of candidates satisfying pattern k plus the round's signal
--   a<r>_<k>  an author_id among those candidates (max(); it identifies the
--             match only when the corresponding n<r>_<k> is exactly 1)
-- Rounds: 1 = name only, 2 = name + institution, 3 = name + topic,
--         4 = name + institution + topic, 5 = name + subfield.
-- NOTE(review): two authorships of one work with identical display_name and
-- parsed_name share a group here, which doubles their counts — confirm whether
-- that can occur in the input.
match_counts AS (
  SELECT
    work_id,
    display_name,
    parsed_name,
    block_key,

    -- Round 1: name pattern only
    count_if(matches_1) AS n1_1, count_if(matches_2) AS n1_2,
    count_if(matches_3) AS n1_3, count_if(matches_4) AS n1_4,
    count_if(matches_5) AS n1_5, count_if(matches_6) AS n1_6,
    count_if(matches_7) AS n1_7, count_if(matches_8) AS n1_8,

    -- Round 2: name pattern + shared institution
    count_if(matches_1 AND has_inst) AS n2_1, count_if(matches_2 AND has_inst) AS n2_2,
    count_if(matches_3 AND has_inst) AS n2_3, count_if(matches_4 AND has_inst) AS n2_4,
    count_if(matches_5 AND has_inst) AS n2_5, count_if(matches_6 AND has_inst) AS n2_6,
    count_if(matches_7 AND has_inst) AS n2_7, count_if(matches_8 AND has_inst) AS n2_8,

    -- Round 3: name pattern + shared topic
    count_if(matches_1 AND has_topic) AS n3_1, count_if(matches_2 AND has_topic) AS n3_2,
    count_if(matches_3 AND has_topic) AS n3_3, count_if(matches_4 AND has_topic) AS n3_4,
    count_if(matches_5 AND has_topic) AS n3_5, count_if(matches_6 AND has_topic) AS n3_6,
    count_if(matches_7 AND has_topic) AS n3_7, count_if(matches_8 AND has_topic) AS n3_8,

    -- Round 4: name pattern + shared institution + shared topic
    count_if(matches_1 AND has_inst AND has_topic) AS n4_1, count_if(matches_2 AND has_inst AND has_topic) AS n4_2,
    count_if(matches_3 AND has_inst AND has_topic) AS n4_3, count_if(matches_4 AND has_inst AND has_topic) AS n4_4,
    count_if(matches_5 AND has_inst AND has_topic) AS n4_5, count_if(matches_6 AND has_inst AND has_topic) AS n4_6,
    count_if(matches_7 AND has_inst AND has_topic) AS n4_7, count_if(matches_8 AND has_inst AND has_topic) AS n4_8,

    -- Round 5: name pattern + shared subfield
    count_if(matches_1 AND has_subfield) AS n5_1, count_if(matches_2 AND has_subfield) AS n5_2,
    count_if(matches_3 AND has_subfield) AS n5_3, count_if(matches_4 AND has_subfield) AS n5_4,
    count_if(matches_5 AND has_subfield) AS n5_5, count_if(matches_6 AND has_subfield) AS n5_6,
    count_if(matches_7 AND has_subfield) AS n5_7, count_if(matches_8 AND has_subfield) AS n5_8,

    -- Capture the matched author_id for each strategy
    max(CASE WHEN matches_1 THEN author_id END) AS a1_1,
    max(CASE WHEN matches_2 THEN author_id END) AS a1_2,
    max(CASE WHEN matches_3 THEN author_id END) AS a1_3,
    max(CASE WHEN matches_4 THEN author_id END) AS a1_4,
    max(CASE WHEN matches_5 THEN author_id END) AS a1_5,
    max(CASE WHEN matches_6 THEN author_id END) AS a1_6,
    max(CASE WHEN matches_7 THEN author_id END) AS a1_7,
    max(CASE WHEN matches_8 THEN author_id END) AS a1_8,

    max(CASE WHEN matches_1 AND has_inst THEN author_id END) AS a2_1,
    max(CASE WHEN matches_2 AND has_inst THEN author_id END) AS a2_2,
    max(CASE WHEN matches_3 AND has_inst THEN author_id END) AS a2_3,
    max(CASE WHEN matches_4 AND has_inst THEN author_id END) AS a2_4,
    max(CASE WHEN matches_5 AND has_inst THEN author_id END) AS a2_5,
    max(CASE WHEN matches_6 AND has_inst THEN author_id END) AS a2_6,
    max(CASE WHEN matches_7 AND has_inst THEN author_id END) AS a2_7,
    max(CASE WHEN matches_8 AND has_inst THEN author_id END) AS a2_8,

    max(CASE WHEN matches_1 AND has_topic THEN author_id END) AS a3_1,
    max(CASE WHEN matches_2 AND has_topic THEN author_id END) AS a3_2,
    max(CASE WHEN matches_3 AND has_topic THEN author_id END) AS a3_3,
    max(CASE WHEN matches_4 AND has_topic THEN author_id END) AS a3_4,
    max(CASE WHEN matches_5 AND has_topic THEN author_id END) AS a3_5,
    max(CASE WHEN matches_6 AND has_topic THEN author_id END) AS a3_6,
    max(CASE WHEN matches_7 AND has_topic THEN author_id END) AS a3_7,
    max(CASE WHEN matches_8 AND has_topic THEN author_id END) AS a3_8,

    max(CASE WHEN matches_1 AND has_inst AND has_topic THEN author_id END) AS a4_1,
    max(CASE WHEN matches_2 AND has_inst AND has_topic THEN author_id END) AS a4_2,
    max(CASE WHEN matches_3 AND has_inst AND has_topic THEN author_id END) AS a4_3,
    max(CASE WHEN matches_4 AND has_inst AND has_topic THEN author_id END) AS a4_4,
    max(CASE WHEN matches_5 AND has_inst AND has_topic THEN author_id END) AS a4_5,
    max(CASE WHEN matches_6 AND has_inst AND has_topic THEN author_id END) AS a4_6,
    max(CASE WHEN matches_7 AND has_inst AND has_topic THEN author_id END) AS a4_7,
    max(CASE WHEN matches_8 AND has_inst AND has_topic THEN author_id END) AS a4_8,

    max(CASE WHEN matches_1 AND has_subfield THEN author_id END) AS a5_1,
    max(CASE WHEN matches_2 AND has_subfield THEN author_id END) AS a5_2,
    max(CASE WHEN matches_3 AND has_subfield THEN author_id END) AS a5_3,
    max(CASE WHEN matches_4 AND has_subfield THEN author_id END) AS a5_4,
    max(CASE WHEN matches_5 AND has_subfield THEN author_id END) AS a5_5,
    max(CASE WHEN matches_6 AND has_subfield THEN author_id END) AS a5_6,
    max(CASE WHEN matches_7 AND has_subfield THEN author_id END) AS a5_7,
    max(CASE WHEN matches_8 AND has_subfield THEN author_id END) AS a5_8,

    -- Count real candidates only: the placeholder row of a candidate-less
    -- authorship must yield 0 here, which count(*) would report as 1.
    count_if(has_candidate) AS total_candidates_in_block,
    count_if(any_name_match) AS total_name_matches,
    count_if(has_inst) AS candidates_with_inst,
    count_if(has_topic) AS candidates_with_topic,
    count_if(has_subfield) AS candidates_with_subfield,

    -- Largest work-side array size seen in the group; the final SELECT turns
    -- these into booleans via "> 0".
    max(size(institution_ids)) AS work_has_inst,
    max(size(topic_ids)) AS work_has_topic,
    max(size(subfield_ids)) AS work_has_subfield,

    -- Up to 10 name-matching candidates. The CASE yields NULL for the other
    -- rows and collect_list skips NULLs. Element order is not deterministic.
    slice(collect_list(
      CASE WHEN any_name_match THEN
        named_struct(
          'author_id', author_id,
          'cand_first', parsed_longest_name.first,
          'cand_middle', parsed_longest_name.middle,
          'cand_last', parsed_longest_name.last,
          'works_count', works_count,
          'has_inst', has_inst,
          'has_topic', has_topic,
          'has_subfield', has_subfield
        )
      END
    ), 1, 10) AS matching_candidates,

    -- Up to 5 candidates from the block that failed every name pattern.
    -- has_candidate keeps the placeholder row from showing up as an all-NULL
    -- "candidate".
    slice(collect_list(
      CASE WHEN has_candidate AND NOT any_name_match THEN
        named_struct(
          'author_id', author_id,
          'cand_first', parsed_longest_name.first,
          'cand_middle', parsed_longest_name.middle,
          'cand_last', parsed_longest_name.last,
          'works_count', works_count
        )
      END
    ), 1, 5) AS non_matching_candidates

  FROM with_any_name_match
  GROUP BY work_id, display_name, parsed_name, block_key
)

-- Final projection: one diagnostic row per group produced by match_counts.
--
-- The winning strategy is resolved once, in the derived table below, as a
-- struct (strategy label + author_id) so that match_strategy,
-- matched_author_id, match_outcome and no_match_reason can never disagree.
-- A strategy wins when exactly one candidate satisfies it (n<r>_<k> = 1).
-- Rounds are tried in order r1 (name only), r2 (+institution), r3 (+topic),
-- r4 (+institution and topic), r5 (+subfield); within a round the name
-- patterns are tried in the order 5, 4, 8, 2, 3, 7, 1, 6.
SELECT
  work_id,
  display_name,
  parsed_name.first AS work_first,
  parsed_name.middle AS work_middle,
  parsed_name.last AS work_last,
  block_key,

  best_match.strategy AS match_strategy,

  -- Matched author_id
  best_match.author_id AS matched_author_id,

  CASE
    WHEN best_match.strategy IS NOT NULL THEN 'matched'
    WHEN total_candidates_in_block = 0 THEN 'no_candidates'
    ELSE 'ambiguous'
  END AS match_outcome,

  -- Describes the name-match cardinality only, independent of the outcome.
  CASE
    WHEN total_candidates_in_block = 0 THEN 'no_candidates_in_block'
    WHEN total_name_matches = 0 THEN 'no_name_match'
    WHEN total_name_matches = 1 THEN 'single_name_match'
    ELSE 'multiple_name_matches'
  END AS ambiguity_reason,

  CASE
    WHEN total_candidates_in_block = 0 THEN 'no_candidates_in_block'
    WHEN total_name_matches = 0 THEN 'no_name_pattern_matched'
    -- Keyed on the resolved match rather than on total_name_matches = 1: a row
    -- resolved by a unique pattern or signal among several name matches is
    -- matched too and must not be given a failure reason. A single name match
    -- always resolves in round 1, so this still covers that case.
    WHEN best_match.strategy IS NOT NULL THEN 'matched'
    WHEN work_has_inst = 0 AND work_has_topic = 0 AND work_has_subfield = 0
      THEN 'multiple_names_no_signals'
    WHEN work_has_inst > 0 AND candidates_with_inst = 0
      THEN 'multiple_names_no_inst_overlap'
    WHEN work_has_topic > 0 AND candidates_with_topic = 0 AND candidates_with_subfield = 0
      THEN 'multiple_names_no_topic_overlap'
    WHEN candidates_with_inst > 1 OR candidates_with_topic > 1 OR candidates_with_subfield > 1
      THEN 'multiple_names_multiple_signal_matches'
    ELSE 'multiple_names_other'
  END AS no_match_reason,

  total_candidates_in_block,
  total_name_matches,
  -- The aliases below reuse the input column names; the comparisons read the
  -- numeric columns coming from match_counts.
  work_has_inst > 0 AS work_has_inst,
  work_has_topic > 0 AS work_has_topic,
  work_has_subfield > 0 AS work_has_subfield,
  candidates_with_inst,
  candidates_with_topic,
  candidates_with_subfield,
  matching_candidates,
  non_matching_candidates

FROM (
  SELECT
    mc.*,
    -- First strategy with exactly one qualifying candidate; NULL if none.
    CASE
      WHEN n1_5 = 1 THEN named_struct('strategy', 'r1_first_mid_last', 'author_id', a1_5)
      WHEN n1_4 = 1 THEN named_struct('strategy', 'r1_first_midinit_last', 'author_id', a1_4)
      WHEN n1_8 = 1 THEN named_struct('strategy', 'r1_init_midinit_to_full', 'author_id', a1_8)
      WHEN n1_2 = 1 THEN named_struct('strategy', 'r1_firstinit_midinit_last', 'author_id', a1_2)
      WHEN n1_3 = 1 THEN named_struct('strategy', 'r1_first_last', 'author_id', a1_3)
      WHEN n1_7 = 1 THEN named_struct('strategy', 'r1_init_to_full', 'author_id', a1_7)
      WHEN n1_1 = 1 THEN named_struct('strategy', 'r1_firstinit_last', 'author_id', a1_1)
      WHEN n1_6 = 1 THEN named_struct('strategy', 'r1_full_to_init', 'author_id', a1_6)
      WHEN n2_5 = 1 THEN named_struct('strategy', 'r2_first_mid_last', 'author_id', a2_5)
      WHEN n2_4 = 1 THEN named_struct('strategy', 'r2_first_midinit_last', 'author_id', a2_4)
      WHEN n2_8 = 1 THEN named_struct('strategy', 'r2_init_midinit_to_full', 'author_id', a2_8)
      WHEN n2_2 = 1 THEN named_struct('strategy', 'r2_firstinit_midinit_last', 'author_id', a2_2)
      WHEN n2_3 = 1 THEN named_struct('strategy', 'r2_first_last', 'author_id', a2_3)
      WHEN n2_7 = 1 THEN named_struct('strategy', 'r2_init_to_full', 'author_id', a2_7)
      WHEN n2_1 = 1 THEN named_struct('strategy', 'r2_firstinit_last', 'author_id', a2_1)
      WHEN n2_6 = 1 THEN named_struct('strategy', 'r2_full_to_init', 'author_id', a2_6)
      WHEN n3_5 = 1 THEN named_struct('strategy', 'r3_first_mid_last', 'author_id', a3_5)
      WHEN n3_4 = 1 THEN named_struct('strategy', 'r3_first_midinit_last', 'author_id', a3_4)
      WHEN n3_8 = 1 THEN named_struct('strategy', 'r3_init_midinit_to_full', 'author_id', a3_8)
      WHEN n3_2 = 1 THEN named_struct('strategy', 'r3_firstinit_midinit_last', 'author_id', a3_2)
      WHEN n3_3 = 1 THEN named_struct('strategy', 'r3_first_last', 'author_id', a3_3)
      WHEN n3_7 = 1 THEN named_struct('strategy', 'r3_init_to_full', 'author_id', a3_7)
      WHEN n3_1 = 1 THEN named_struct('strategy', 'r3_firstinit_last', 'author_id', a3_1)
      WHEN n3_6 = 1 THEN named_struct('strategy', 'r3_full_to_init', 'author_id', a3_6)
      WHEN n4_5 = 1 THEN named_struct('strategy', 'r4_first_mid_last', 'author_id', a4_5)
      WHEN n4_4 = 1 THEN named_struct('strategy', 'r4_first_midinit_last', 'author_id', a4_4)
      WHEN n4_8 = 1 THEN named_struct('strategy', 'r4_init_midinit_to_full', 'author_id', a4_8)
      WHEN n4_2 = 1 THEN named_struct('strategy', 'r4_firstinit_midinit_last', 'author_id', a4_2)
      WHEN n4_3 = 1 THEN named_struct('strategy', 'r4_first_last', 'author_id', a4_3)
      WHEN n4_7 = 1 THEN named_struct('strategy', 'r4_init_to_full', 'author_id', a4_7)
      WHEN n4_1 = 1 THEN named_struct('strategy', 'r4_firstinit_last', 'author_id', a4_1)
      WHEN n4_6 = 1 THEN named_struct('strategy', 'r4_full_to_init', 'author_id', a4_6)
      WHEN n5_5 = 1 THEN named_struct('strategy', 'r5_first_mid_last', 'author_id', a5_5)
      WHEN n5_4 = 1 THEN named_struct('strategy', 'r5_first_midinit_last', 'author_id', a5_4)
      WHEN n5_8 = 1 THEN named_struct('strategy', 'r5_init_midinit_to_full', 'author_id', a5_8)
      WHEN n5_2 = 1 THEN named_struct('strategy', 'r5_firstinit_midinit_last', 'author_id', a5_2)
      WHEN n5_3 = 1 THEN named_struct('strategy', 'r5_first_last', 'author_id', a5_3)
      WHEN n5_7 = 1 THEN named_struct('strategy', 'r5_init_to_full', 'author_id', a5_7)
      WHEN n5_1 = 1 THEN named_struct('strategy', 'r5_firstinit_last', 'author_id', a5_1)
      WHEN n5_6 = 1 THEN named_struct('strategy', 'r5_full_to_init', 'author_id', a5_6)
      ELSE NULL
    END AS best_match
  FROM match_counts AS mc
) AS ranked;