In [0]:
-------------------------------------------------------------------------------
-- AUTHOR MATCHING ALGORITHM - DIAGNOSTIC TABLE
-------------------------------------------------------------------------------

-- Rebuilds the diagnostics table from scratch on every run (CREATE OR REPLACE).
CREATE OR REPLACE TABLE openalex.authors.author_matching_diagnostics AS

-- with_work_data: every row of authors_and_affiliations_updates, enriched with the
-- work-level signals used later to disambiguate candidates (topics, source ids).
-- Both lookups are LEFT JOINs, so a work with no topic row or no works row is kept.
-- NOTE(review): assumes at most one row per work_id in each joined table; a
-- one-to-many join here would duplicate authorships downstream -- confirm.
WITH with_work_data AS (
  SELECT
    aa.work_id,
    aa.authorships,
    COALESCE(wtf.topics, ARRAY()) AS topics,
    -- Distinct non-NULL source ids across the work's locations.
    -- Coalesced to an empty array (same treatment as topics) so that a missing
    -- works row or NULL locations yields "no sources" rather than a NULL that
    -- propagates into SIZE()/ARRAYS_OVERLAP() and the work_has_source flag.
    COALESCE(
      ARRAY_DISTINCT(
        TRANSFORM(
          FILTER(w.locations, x -> x.source.id IS NOT NULL),
          x -> x.source.id
        )
      ),
      ARRAY()
    ) AS work_source_ids
  FROM openalex.works.authors_and_affiliations_updates aa
  LEFT JOIN openalex.works.work_topics_frontfill wtf
    ON aa.work_id = wtf.work_id
  LEFT JOIN openalex.works.openalex_works w
    ON aa.work_id = w.id
),

-- authors_exploded: one row per element of a work's authorships array, carrying
-- the blocking key and the work-side signal arrays (institutions, topics, sources).
authors_exploded AS (
  SELECT
    work_id,
    authorship.author.display_name,
    authorship.parsed_name,
    -- Blocking key: lower-cased "<first initial> <last name>".
    LOWER(CONCAT(SUBSTRING(authorship.parsed_name.first, 1, 1), ' ', authorship.parsed_name.last)) AS block_key,
    -- Direct institution ids plus every id found in their lineage arrays.
    -- FLATTEN returns NULL if any nested array is NULL and array CONCAT returns
    -- NULL if either argument is NULL, so each lineage and the final result are
    -- coalesced to an empty array; otherwise a single institution without a
    -- lineage would discard every institution id for this author.
    COALESCE(
      ARRAY_DISTINCT(
        CONCAT(
          TRANSFORM(authorship.institutions, i -> i.id),
          FLATTEN(TRANSFORM(authorship.institutions, i -> COALESCE(i.lineage, ARRAY())))
        )
      ),
      ARRAY()
    ) AS institution_ids,
    TRANSFORM(topics, t -> t.id) AS topic_ids,
    work_source_ids
  FROM with_work_data
  LATERAL VIEW EXPLODE(authorships) AS authorship
  -- NOTE(review): hard-coded work_id threshold; presumably limits the run to a
  -- specific range of newer works -- confirm the intent and keep in sync with the
  -- production matcher.
  WHERE work_id > 7000000000
),

-- blocked_candidates: pairs every exploded work author with each existing author
-- profile that shares its blocking key.
-- LEFT JOIN (not inner) keeps work authors whose block is empty -- their candidate
-- columns are all NULL -- so they reach the final SELECT with
-- total_candidates_in_block = 0 instead of silently vanishing from the diagnostics.
blocked_candidates AS (
  SELECT
    e.work_id,
    e.display_name,
    e.parsed_name,
    e.block_key,
    e.institution_ids,
    e.topic_ids,
    e.work_source_ids,
    alm.author_id,
    alm.parsed_longest_name,
    alm.institution_ids AS candidate_institution_ids,
    alm.topic_ids AS candidate_topic_ids,
    alm.source_ids AS candidate_source_ids,
    alm.works_count
  FROM authors_exploded e
  LEFT JOIN openalex.authors.author_lookup_mapping alm
    ON alm.block_key = e.block_key
),

-- with_match_signals: per (work author, candidate) pair, pre-computes the helper
-- values for name comparison and the three disambiguation flags. Each flag is TRUE
-- only when both arrays are non-empty and share at least one element; for the
-- candidate-less rows produced by the LEFT JOIN the flags are never TRUE.
with_match_signals AS (
  SELECT
    *,
    -- Candidate packaged as a struct so it can be captured with MAX() later.
    NAMED_STRUCT(
      'id', author_id,
      'display_name', CONCAT(parsed_longest_name.first, ' ', parsed_longest_name.last),
      'parsed_name', parsed_longest_name
    ) AS candidate_obj,
    length(parsed_name.first) AS pn_first_len,
    length(parsed_longest_name.first) AS cand_first_len,
    -- Missing middle names are normalised to '' so they can be compared with = / !=.
    coalesce(parsed_name.middle, '') AS pn_middle,
    coalesce(parsed_longest_name.middle, '') AS cand_middle,

    (size(institution_ids) > 0 AND size(candidate_institution_ids) > 0
     AND arrays_overlap(candidate_institution_ids, institution_ids)) AS has_inst,

    (size(topic_ids) > 0 AND size(candidate_topic_ids) > 0
     AND arrays_overlap(candidate_topic_ids, topic_ids)) AS has_topic,

    (size(work_source_ids) > 0 AND size(candidate_source_ids) > 0
     AND arrays_overlap(candidate_source_ids, work_source_ids)) AS has_source
  FROM blocked_candidates
),

-- with_name_matches: evaluates the eight name-compatibility patterns between the
-- work author's parsed name (pn_*) and the candidate's longest parsed name (cand_*).
-- All comparisons are case-insensitive. Every pattern tests cand_first_len, so a
-- row without a candidate can never evaluate to TRUE.
with_name_matches AS (
  SELECT
    *,
    -- 1: Exact Full Name (both sides have a full first and a full middle name)
    (pn_first_len > 1 AND length(pn_middle) > 1 AND cand_first_len > 1 AND length(cand_middle) > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(pn_middle) = lower(cand_middle)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS pattern_1_exact_full,

    -- 2: Exact First, Middle Initial match (candidate middle may be absent)
    (pn_first_len > 1 AND length(pn_middle) = 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND (cand_middle = '' OR lower(pn_middle) = lower(substring(cand_middle, 1, 1)))
    ) AS pattern_2_exact_first_mid_init,

    -- 3: Initials match to Full (work has first initial + middle, candidate has full first)
    (pn_first_len = 1 AND pn_middle != '' AND cand_first_len > 1 AND cand_middle != ''
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS pattern_3_init_mid_init,

    -- 4: First Initial, Middle Initial match (both sides have only a first initial)
    (pn_first_len = 1 AND cand_first_len = 1 AND pn_middle != '' AND cand_middle != ''
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(substring(pn_middle, 1, 1)) = lower(substring(cand_middle, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS pattern_4_first_init_mid_init,

    -- 5: Exact First, Exact Last (work author has no middle name)
    (pn_first_len > 1 AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = ''
    ) AS pattern_5_exact_first_last,

    -- 6: First Initial Only to Full (work has a bare initial, candidate a full first name)
    (pn_first_len = 1 AND pn_middle = '' AND cand_first_len > 1
     AND lower(parsed_name.first) = lower(substring(parsed_longest_name.first, 1, 1))
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS pattern_6_first_init_to_full,

    -- 7: First Initial Only (both sides: single initial, no middle name)
    (pn_first_len = 1 AND cand_first_len = 1
     AND lower(parsed_name.first) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
     AND pn_middle = '' AND cand_middle = ''
    ) AS pattern_7_first_init_last,

    -- 8: Full Name to Initial (work has a full first name, candidate only an initial)
    (pn_first_len > 1 AND cand_first_len = 1
     AND lower(substring(parsed_name.first, 1, 1)) = lower(parsed_longest_name.first)
     AND lower(parsed_name.last) = lower(parsed_longest_name.last)
    ) AS pattern_8_full_to_init

  FROM with_match_signals
),

-- with_any_name_match: adds a single flag telling whether any pattern accepted
-- the candidate.
with_any_name_match AS (
  SELECT
    *,
    (pattern_1_exact_full OR pattern_2_exact_first_mid_init OR pattern_3_init_mid_init OR
     pattern_4_first_init_mid_init OR pattern_5_exact_first_last OR pattern_6_first_init_to_full OR
     pattern_7_first_init_last OR pattern_8_full_to_init) AS any_name_match
  FROM with_name_matches
),

-- aggregated_counts: collapses the candidate rows of each work author into one row.
-- Naming: sX_nY = number of candidates accepted by name pattern Y under strategy X;
-- match_sX_nY = a candidate accepted by that combination, which is the only such
-- candidate whenever the corresponding count equals 1.
-- count_if() counts TRUE only, so NULL pattern/signal results are never counted.
-- Not every counter below is consumed by the final SELECT.
-- NOTE(review): the grouping key is (work_id, display_name, block_key); two
-- authorships on the same work with an identical display_name would be merged and
-- their candidates counted together -- confirm this cannot occur or add an author
-- position to the key.
aggregated_counts AS (
  SELECT
    work_id,
    display_name,
    block_key,

    -- STRATEGY 1: Name Only (Unique)
    count_if(pattern_1_exact_full) AS s1_n1, count_if(pattern_2_exact_first_mid_init) AS s1_n2,
    count_if(pattern_3_init_mid_init) AS s1_n3, count_if(pattern_4_first_init_mid_init) AS s1_n4,
    count_if(pattern_5_exact_first_last) AS s1_n5, count_if(pattern_6_first_init_to_full) AS s1_n6,
    count_if(pattern_7_first_init_last) AS s1_n7, count_if(pattern_8_full_to_init) AS s1_n8,

    -- STRATEGY 2: Name + Institution (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst) AS s2_n1, count_if(pattern_2_exact_first_mid_init AND has_inst) AS s2_n2,
    count_if(pattern_3_init_mid_init AND has_inst) AS s2_n3, count_if(pattern_4_first_init_mid_init AND has_inst) AS s2_n4,
    count_if(pattern_5_exact_first_last AND has_inst) AS s2_n5, count_if(pattern_6_first_init_to_full AND has_inst) AS s2_n6,
    count_if(pattern_7_first_init_last AND has_inst) AS s2_n7, count_if(pattern_8_full_to_init AND has_inst) AS s2_n8,

    -- STRATEGY 6: Name + Inst + Source (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst AND has_source) AS s6_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_source) AS s6_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_source) AS s6_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_source) AS s6_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_source) AS s6_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_source) AS s6_n8,

    -- STRATEGY 4: Name + Inst + Topic (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_inst AND has_topic) AS s4_n1,
    count_if(pattern_2_exact_first_mid_init AND has_inst AND has_topic) AS s4_n2,
    count_if(pattern_5_exact_first_last AND has_inst AND has_topic) AS s4_n5,
    count_if(pattern_6_first_init_to_full AND has_inst AND has_topic) AS s4_n6,
    count_if(pattern_7_first_init_last AND has_inst AND has_topic) AS s4_n7,
    count_if(pattern_8_full_to_init AND has_inst AND has_topic) AS s4_n8,

    -- STRATEGY 5: Name + Source (NOW INCLUDES P6, P8)
    count_if(pattern_1_exact_full AND has_source) AS s5_n1,
    count_if(pattern_2_exact_first_mid_init AND has_source) AS s5_n2,
    count_if(pattern_5_exact_first_last AND has_source) AS s5_n5,
    count_if(pattern_6_first_init_to_full AND has_source) AS s5_n6,
    count_if(pattern_7_first_init_last AND has_source) AS s5_n7,
    count_if(pattern_8_full_to_init AND has_source) AS s5_n8,

    -- STRATEGY 3: Name + Topic (STRICT - NO P6 or P8)
    count_if(pattern_1_exact_full AND has_topic) AS s3_n1,
    count_if(pattern_2_exact_first_mid_init AND has_topic) AS s3_n2,
    count_if(pattern_5_exact_first_last AND has_topic) AS s3_n5,

    -- CAPTURE OBJECTS --------------------------------------------------------
    -- MAX() over the struct ignores the NULLs produced by non-matching rows.
    MAX(CASE WHEN pattern_1_exact_full THEN candidate_obj END) AS match_s1_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init THEN candidate_obj END) AS match_s1_n2,
    MAX(CASE WHEN pattern_5_exact_first_last THEN candidate_obj END) AS match_s1_n5,

    -- S2 Capture (Inst)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst THEN candidate_obj END) AS match_s2_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst THEN candidate_obj END) AS match_s2_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst THEN candidate_obj END) AS match_s2_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst THEN candidate_obj END) AS match_s2_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst THEN candidate_obj END) AS match_s2_n8,

    -- S6 Capture (Inst + Source)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_source THEN candidate_obj END) AS match_s6_n8,

    -- S5 Capture (Source)
    MAX(CASE WHEN pattern_1_exact_full AND has_source THEN candidate_obj END) AS match_s5_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_source THEN candidate_obj END) AS match_s5_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_source THEN candidate_obj END) AS match_s5_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_source THEN candidate_obj END) AS match_s5_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_source THEN candidate_obj END) AS match_s5_n8,

    -- S4 Capture (Inst + Topic)
    MAX(CASE WHEN pattern_1_exact_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n5,
    MAX(CASE WHEN pattern_6_first_init_to_full AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n6,
    MAX(CASE WHEN pattern_8_full_to_init AND has_inst AND has_topic THEN candidate_obj END) AS match_s4_n8,

    -- S3 Capture (Topic - STRICT)
    MAX(CASE WHEN pattern_1_exact_full AND has_topic THEN candidate_obj END) AS match_s3_n1,
    MAX(CASE WHEN pattern_2_exact_first_mid_init AND has_topic THEN candidate_obj END) AS match_s3_n2,
    MAX(CASE WHEN pattern_5_exact_first_last AND has_topic THEN candidate_obj END) AS match_s3_n5,

    -- Diagnostics
    -- Counts real candidates only: the placeholder row kept by the LEFT JOIN has a
    -- NULL author_id and contributes 0, which is what makes the
    -- total_candidates_in_block = 0 branches of the final SELECT reachable.
    -- Assumes author_id is never NULL in author_lookup_mapping -- TODO confirm.
    COUNT(author_id) AS total_candidates_in_block,
    COUNT_IF(any_name_match) AS total_name_matches,

    MAX(SIZE(institution_ids)) > 0 AS work_has_inst,
    MAX(SIZE(work_source_ids)) > 0 AS work_has_source,
    MAX(SIZE(topic_ids)) > 0 AS work_has_topic,

    -- Sample of up to 10 name-compatible candidates for manual inspection.
    -- COLLECT_LIST skips the NULLs from non-matching rows; its element order is
    -- not guaranteed, so which 10 are kept is arbitrary.
    SLICE(COLLECT_LIST(
      CASE WHEN any_name_match THEN
        NAMED_STRUCT(
          'author_id', author_id,
          'name', parsed_longest_name.first || ' ' || parsed_longest_name.last,
          'has_inst', has_inst,
          'has_topic', has_topic,
          'has_source', has_source
        )
      END
    ), 1, 10) AS candidates_passing_name_check

  FROM with_any_name_match
  GROUP BY work_id, display_name, block_key
)

-- Final projection: one diagnostics row per aggregated work author.
-- The inner query resolves the winning strategy once (match_method plus the
-- captured candidate); the outer query derives the outcome and failure reason
-- from it. Strategies are tried in priority order: unique name (S1), inst+source
-- (S6), inst (S2), inst+topic (S4), source (S5), topic (S3). A strategy/pattern
-- combination wins only when exactly one candidate satisfies it.
SELECT
  work_id,
  display_name AS work_author_name,
  block_key,
  match_method,
  matched_author,

  -- Match Outcome
  CASE
    WHEN match_method IS NOT NULL THEN 'MATCHED'
    WHEN total_candidates_in_block = 0 THEN 'NO_CANDIDATES'
    ELSE 'AMBIGUOUS'
  END AS match_outcome,

  -- Detailed Failure Reason (NULL for matched rows)
  CASE
    WHEN match_method IS NOT NULL THEN NULL

    WHEN total_candidates_in_block = 0 THEN 'NO_BLOCK_CANDIDATES'

    WHEN total_name_matches = 0 THEN 'no_name_pattern_matched'

    -- More than one candidate survives some institution-based strategy.
    WHEN GREATEST(
           s2_n1, s2_n2, s2_n5, s2_n6, s2_n8,   -- Inst
           s6_n1, s6_n2, s6_n5, s6_n6, s6_n8,   -- Inst + Source
           s4_n1, s4_n2, s4_n5, s4_n6, s4_n8    -- Inst + Topic
         ) > 1
      THEN 'ambiguous_name_multiple_candidates_have_inst'

    WHEN GREATEST(s5_n1, s5_n2, s5_n5, s5_n6, s5_n8) > 1
      THEN 'ambiguous_name_multiple_candidates_have_source'

    WHEN GREATEST(s3_n1, s3_n2, s3_n5) > 1
      THEN 'ambiguous_name_multiple_candidates_have_topic'

    WHEN NOT (work_has_inst OR work_has_source OR work_has_topic)
      THEN 'ambiguous_name_work_has_no_signals'

    WHEN (work_has_inst OR work_has_source OR work_has_topic)
      THEN 'ambiguous_name_no_signal_overlap'

    ELSE 'ambiguous_other'
  END AS failure_reason,

  total_candidates_in_block,
  total_name_matches,
  candidates_passing_name_check

FROM (
  SELECT
    ac.*,

    -- Label of the first strategy/pattern combination with exactly one candidate.
    CASE
      -- S1: name alone is unique
      WHEN ac.s1_n1 = 1 THEN 'unique_exact_full_name'
      WHEN ac.s1_n2 = 1 THEN 'unique_exact_first_mid_init'
      WHEN ac.s1_n5 = 1 THEN 'unique_exact_first_last'
      -- S6: institution + source
      WHEN ac.s6_n1 = 1 THEN 'exact_full_resolved_by_inst_and_source'
      WHEN ac.s6_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst_and_source'
      WHEN ac.s6_n5 = 1 THEN 'exact_first_last_resolved_by_inst_and_source'
      WHEN ac.s6_n6 = 1 THEN 'init_to_full_resolved_by_inst_and_source'
      WHEN ac.s6_n8 = 1 THEN 'full_to_init_resolved_by_inst_and_source'
      -- S2: institution
      WHEN ac.s2_n1 = 1 THEN 'exact_full_resolved_by_inst'
      WHEN ac.s2_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst'
      WHEN ac.s2_n5 = 1 THEN 'exact_first_last_resolved_by_inst'
      WHEN ac.s2_n6 = 1 THEN 'init_to_full_resolved_by_inst'
      WHEN ac.s2_n8 = 1 THEN 'full_to_init_resolved_by_inst'
      -- S4: institution + topic
      WHEN ac.s4_n1 = 1 THEN 'exact_full_resolved_by_inst_and_topic'
      WHEN ac.s4_n2 = 1 THEN 'exact_first_mid_init_resolved_by_inst_and_topic'
      WHEN ac.s4_n5 = 1 THEN 'exact_first_last_resolved_by_inst_and_topic'
      WHEN ac.s4_n6 = 1 THEN 'init_to_full_resolved_by_inst_and_topic'
      WHEN ac.s4_n8 = 1 THEN 'full_to_init_resolved_by_inst_and_topic'
      -- S5: source
      WHEN ac.s5_n1 = 1 THEN 'exact_full_resolved_by_source'
      WHEN ac.s5_n2 = 1 THEN 'exact_first_mid_init_resolved_by_source'
      WHEN ac.s5_n5 = 1 THEN 'exact_first_last_resolved_by_source'
      WHEN ac.s5_n6 = 1 THEN 'init_to_full_resolved_by_source'
      WHEN ac.s5_n8 = 1 THEN 'full_to_init_resolved_by_source'
      -- S3: topic (strict patterns only)
      WHEN ac.s3_n1 = 1 THEN 'exact_full_resolved_by_topic'
      WHEN ac.s3_n2 = 1 THEN 'exact_first_mid_init_resolved_by_topic'
      WHEN ac.s3_n5 = 1 THEN 'exact_first_last_resolved_by_topic'
      ELSE NULL
    END AS match_method,

    -- Candidate captured for that same combination (branch order mirrors match_method).
    CASE
      WHEN ac.s1_n1 = 1 THEN ac.match_s1_n1
      WHEN ac.s1_n2 = 1 THEN ac.match_s1_n2
      WHEN ac.s1_n5 = 1 THEN ac.match_s1_n5

      WHEN ac.s6_n1 = 1 THEN ac.match_s6_n1
      WHEN ac.s6_n2 = 1 THEN ac.match_s6_n2
      WHEN ac.s6_n5 = 1 THEN ac.match_s6_n5
      WHEN ac.s6_n6 = 1 THEN ac.match_s6_n6
      WHEN ac.s6_n8 = 1 THEN ac.match_s6_n8

      WHEN ac.s2_n1 = 1 THEN ac.match_s2_n1
      WHEN ac.s2_n2 = 1 THEN ac.match_s2_n2
      WHEN ac.s2_n5 = 1 THEN ac.match_s2_n5
      WHEN ac.s2_n6 = 1 THEN ac.match_s2_n6
      WHEN ac.s2_n8 = 1 THEN ac.match_s2_n8

      WHEN ac.s4_n1 = 1 THEN ac.match_s4_n1
      WHEN ac.s4_n2 = 1 THEN ac.match_s4_n2
      WHEN ac.s4_n5 = 1 THEN ac.match_s4_n5
      WHEN ac.s4_n6 = 1 THEN ac.match_s4_n6
      WHEN ac.s4_n8 = 1 THEN ac.match_s4_n8

      WHEN ac.s5_n1 = 1 THEN ac.match_s5_n1
      WHEN ac.s5_n2 = 1 THEN ac.match_s5_n2
      WHEN ac.s5_n5 = 1 THEN ac.match_s5_n5
      WHEN ac.s5_n6 = 1 THEN ac.match_s5_n6
      WHEN ac.s5_n8 = 1 THEN ac.match_s5_n8

      WHEN ac.s3_n1 = 1 THEN ac.match_s3_n1
      WHEN ac.s3_n2 = 1 THEN ac.match_s3_n2
      WHEN ac.s3_n5 = 1 THEN ac.match_s3_n5

      ELSE NULL
    END AS matched_author

  FROM aggregated_counts AS ac
) AS resolved;