## Build Authors from `openalex_works`

#### Create the base table `openalex.works.work_authorships` by exploding `authorships`, keeping only valid authors from `openalex.authors.author_registry`

In [0]:
%sql
-- Base table: one row per (work, authorship). Work-level attributes are
-- repeated on every authorship row of the same work. Only authorships whose
-- author id maps to a non-merged row of the author registry are kept.
CREATE OR REPLACE TABLE openalex.works.work_authorships
AS
WITH active_registry AS (
  -- registry rows that have not been merged into another author
  SELECT id
  FROM openalex.authors.author_registry
  WHERE merge_into_id IS NULL
),
authorship_rows AS (
  SELECT
    w.id,
    w.publication_year,
    w.publication_date,
    w.open_access,
    w.cited_by_count,
    w.topics,
    w.concepts,
    -- source of every location, keeping only sources that carry an id
    FILTER(
      TRANSFORM(w.locations, loc -> loc.source),
      s -> s.id IS NOT NULL
    ) AS work_sources,
    w.created_date,
    w.updated_date,
    a.author,
    a.raw_author_name,
    a.institutions,
    a.author_order_number,
    a.is_corresponding
  FROM openalex.works.openalex_works w
  -- plain EXPLODE: works without authorships would be removed by the
  -- author-id filter below anyway
  LATERAL VIEW EXPLODE(w.authorships) AS a
  WHERE a.author.id IS NOT NULL
    -- author ids that are explicitly excluded (reason not documented here)
    AND a.author.id NOT IN (
      'https://openalex.org/A9999999999',
      'https://openalex.org/A5317838346',
      'https://openalex.org/A5098778687',
      'https://openalex.org/A5040317105'
    )
)
SELECT
  ar.id                                                    AS work_id,
  COALESCE(ar.publication_year, YEAR(ar.publication_date)) AS pub_year,
  CAST(ar.open_access.is_oa AS BOOLEAN)                    AS is_oa,
  CAST(ar.cited_by_count AS INT)                           AS work_cited_by_count,
  ar.topics                                                AS work_topics,
  ar.concepts                                              AS work_concepts,
  ar.work_sources,
  ar.created_date,
  ar.updated_date,
  ar.author.id                                             AS author_id,
  ar.author.display_name                                   AS author_display_name,
  ar.author.orcid                                          AS author_orcid,
  ar.raw_author_name,
  COALESCE(ar.institutions, ARRAY())                       AS author_institutions,
  ar.author_order_number,
  ar.is_corresponding
FROM authorship_rows ar
INNER JOIN active_registry reg
  -- the registry key is the numeric part of the OpenAlex author URL
  ON reg.id = CAST(REPLACE(ar.author.id, 'https://openalex.org/A', '') AS BIGINT);

-- Co-locate rows by author and publication year; the aggregations built on
-- this table group by author_id (and pub_year).
OPTIMIZE openalex.works.work_authorships
ZORDER BY (author_id, pub_year);

-- Refresh table statistics for the query planner.
ANALYZE TABLE openalex.works.work_authorships COMPUTE STATISTICS;

#### Create `author_topics`

In [0]:
%sql
-- Per-author topic profile derived from openalex.works.work_authorships.
--   topics      : one entry per topic the author has works in, with the number
--                 of distinct works (count) and the highest score seen;
--                 sorted by count, descending.
--   topic_share : the author's distinct works in a topic divided by the distinct
--                 works in that topic over the whole input, rounded to 7
--                 decimals; sorted by value, descending.
-- Both sorts fall back to the topic id on ties: COLLECT_LIST has no defined
-- order, so without a tiebreak the array order could differ between reruns.
CREATE OR REPLACE TABLE openalex.authors.author_topics CLUSTER BY (author_id) AS
WITH awt AS (
  -- explode once; DISTINCT removes rows that are identical in every selected
  -- column (including score and the display attributes)
  SELECT DISTINCT
    author_id,
    work_id,
    CAST(t.id AS STRING) AS topic_id,
    t.display_name, t.subfield, t.field, t.domain, t.score
  FROM (
    SELECT author_id, work_id, EXPLODE_OUTER(work_topics) AS t
    FROM openalex.works.work_authorships
    WHERE work_id IS NOT NULL
  )
  WHERE t.id IS NOT NULL
),
counts AS (
  -- author-topic counts AND per-topic totals in one pass.
  -- Display attributes are taken from the highest-scoring occurrence.
  SELECT
    author_id,                 -- NULL on the per-topic total rows
    topic_id,
    MAX_BY(display_name, score) AS display_name,
    MAX_BY(subfield,     score) AS subfield,
    MAX_BY(field,        score) AS field,
    MAX_BY(domain,       score) AS domain,
    MAX(score)                 AS score,
    COUNT(DISTINCT work_id)    AS cnt
  FROM awt
  GROUP BY GROUPING SETS (
    (author_id, topic_id),
    (topic_id)
  )
),
with_totals AS (
  -- attach the topic total to every row BEFORE the total rows are filtered out
  SELECT
    author_id,
    topic_id,
    display_name, subfield, field, domain, score,
    cnt AS topic_count,
    MAX(CASE WHEN author_id IS NULL THEN cnt END)
      OVER (PARTITION BY topic_id) AS topic_total
  FROM counts
)
-- final: include ALL topics of the author; one output row per author
SELECT
  author_id,
  ARRAY_SORT(
    COLLECT_LIST(
      STRUCT(
        topic_id AS id,
        display_name,
        CAST(topic_count AS INT) AS count,
        score,
        subfield,
        field,
        domain
      )
    ),
    (l, r) -> CASE
      WHEN l.count > r.count THEN -1
      WHEN l.count < r.count THEN 1
      WHEN l.id < r.id THEN -1      -- deterministic tiebreak
      WHEN l.id > r.id THEN 1
      ELSE 0
    END
  ) AS topics,
  ARRAY_SORT(
    COLLECT_LIST(
      STRUCT(
        topic_id AS id,
        display_name,
        -- NULLIF guards the division; yields NULL instead of an error on 0
        ROUND(CAST(topic_count AS DOUBLE) / NULLIF(topic_total, 0), 7) AS value,
        subfield,
        field,
        domain
      )
    ),
    (l, r) -> CASE
      WHEN l.value > r.value THEN -1
      WHEN l.value < r.value THEN 1
      WHEN l.id < r.id THEN -1      -- deterministic tiebreak
      WHEN l.id > r.id THEN 1
      ELSE 0
    END
  ) AS topic_share
FROM with_totals
WHERE author_id IS NOT NULL   -- drop the per-topic total rows
GROUP BY author_id;

OPTIMIZE openalex.authors.author_topics FULL;

### Create the final table `openalex.authors.openalex_authors`

In [0]:
%sql
-- Final author table: one row per author_id present in
-- openalex.works.work_authorships, enriched with concepts, affiliations,
-- sources, yearly counts, last known institutions, topics and a parsed name.
CREATE OR REPLACE TABLE openalex.authors.openalex_authors
CLUSTER BY AUTO AS
WITH
-- concepts
-- Authorship rows per (author, concept id). NULL concepts are removed BEFORE
-- ranking so they cannot take one of the five slots, and the grouping key is
-- the concept id only: score is a field of the exploded struct, so grouping by
-- the whole struct would split a concept whenever its score differs between
-- works.
author_concept_counts AS (
  SELECT
    author_id,
    c.id                      AS concept_id,
    MAX(c.wikidata)           AS wikidata,
    MAX(c.display_name)       AS display_name,
    MAX(CAST(c.level AS INT)) AS level,
    MAX(c.score)              AS score,
    COUNT(*)                  AS concept_count
  FROM (
    SELECT author_id, EXPLODE(work_concepts) AS c
    FROM openalex.works.work_authorships
  )
  WHERE c.id IS NOT NULL
  GROUP BY author_id, c.id
),
-- top 5 concepts per author; ties broken by concept id
author_top_concepts AS (
  SELECT
    author_id,
    concept_id,
    wikidata,
    display_name,
    level,
    score,
    concept_count
  FROM author_concept_counts
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY author_id
    ORDER BY concept_count DESC, CAST(concept_id AS STRING) ASC
  ) <= 5
),
concepts_packed AS (
  SELECT
    author_id,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          concept_id AS id,
          wikidata,
          display_name,
          level,                  -- explicitly named; an unaliased CAST gets an auto-generated field name
          score,
          CAST(concept_count AS INT) AS count
        )
      ),
      -- same order as the ranking: count DESC, then concept id
      (l, r) -> CASE
        WHEN l.count > r.count THEN -1
        WHEN l.count < r.count THEN 1
        WHEN CAST(l.id AS STRING) < CAST(r.id AS STRING) THEN -1
        WHEN CAST(l.id AS STRING) > CAST(r.id AS STRING) THEN 1
        ELSE 0
      END
    ) AS x_concepts
  FROM author_top_concepts
  GROUP BY author_id
),

-- affiliations
-- One row per (author, institution) with the publication years in which the
-- author was affiliated with it, newest year first.
affiliations_exploded AS (
  SELECT
    author_id,
    i.id                             AS inst_id,
    ANY_VALUE(i.ror)                 AS ror,
    ANY_VALUE(i.display_name)        AS display_name,
    ANY_VALUE(i.country_code)        AS country_code,
    ANY_VALUE(i.type)                AS type,
    ANY_VALUE(i.lineage)             AS lineage,
    SORT_ARRAY(COLLECT_SET(pub_year), false) AS years
  FROM (
    SELECT author_id, pub_year, EXPLODE_OUTER(author_institutions) AS i
    FROM openalex.works.work_authorships
    WHERE pub_year IS NOT NULL
  )
  WHERE i.id IS NOT NULL
  GROUP BY author_id, i.id
),
affiliations_packed AS (
  SELECT
    author_id,
    COLLECT_LIST(
      STRUCT(
        STRUCT(inst_id AS id, ror, display_name, country_code, type, lineage) AS institution,
        years
      )
    ) AS affiliations
  FROM affiliations_exploded
  GROUP BY author_id
),

-- sources
sources_agg AS (
  SELECT
    author_id,
    -- COLLECT_SET de-duplicates on the whole struct, not on the id alone
    COLLECT_SET(src) AS sources
  FROM (
    SELECT author_id, EXPLODE(work_sources) AS src
    FROM openalex.works.work_authorships
    WHERE work_sources IS NOT NULL
  ) t
  WHERE src.id IS NOT NULL
  GROUP BY author_id
),

-- counts
counts_per_year AS (
  SELECT
    author_id,
    CAST(pub_year AS INT) AS year,
    CAST(COUNT(*) AS INT) AS works_count,
    CAST(SUM(IF(is_oa, 1, 0)) AS INT) AS oa_works_count,
    CAST(SUM(work_cited_by_count) AS INT) AS cited_by_count
  FROM openalex.works.work_authorships
  WHERE pub_year IS NOT NULL
  GROUP BY author_id, pub_year
),
counts_agg AS (
  SELECT
    author_id,
    -- structs sort by their first field, i.e. ascending by year
    SORT_ARRAY(
      COLLECT_LIST(
        STRUCT(
          year,
          works_count,
          oa_works_count,
          cited_by_count
        )
      ),
      true
    ) AS counts_by_year
  FROM counts_per_year
  GROUP BY author_id
),

-- last known institutions
-- Institutions on the author's works from their most recent publication year.
-- The year is the maximum over ALL of the author's rows, so an author whose
-- latest-year works carry no institution gets no row here (NULL after the
-- LEFT JOIN below).
last_inst_agg AS (
  SELECT
    wa.author_id,
    COLLECT_SET(STRUCT(i.id, i.ror, i.display_name, i.country_code, i.type, i.lineage)) AS last_known_institutions
  FROM openalex.works.work_authorships wa
  INNER JOIN (
      SELECT author_id, MAX(pub_year) AS max_year
      FROM openalex.works.work_authorships
      GROUP BY author_id
  ) my ON wa.author_id = my.author_id AND wa.pub_year = my.max_year
  LATERAL VIEW OUTER EXPLODE(author_institutions) t AS i
  WHERE i.id IS NOT NULL
  GROUP BY wa.author_id
),

-- main aggregation: one row per author
main_agg_pre AS (
  SELECT
    -- numeric part of the OpenAlex author URL
    CAST(REPLACE(b.author_id, 'https://openalex.org/A', '') AS BIGINT) AS id,
    b.author_id AS author_id_str,
    named_struct(
      'openalex', b.author_id,
      'orcid', MAX(b.author_orcid)
    ) AS ids,
    -- orcid / display_name come from the row with the latest pub_year and,
    -- within that year, the lowest author_order_number (negated so MAX_BY
    -- prefers it; a missing order number sorts last)
    MAX_BY(b.author_orcid,
           named_struct('y', b.pub_year, 'ord', -COALESCE(b.author_order_number, 999999999))) AS orcid,
    MAX_BY(b.author_display_name,
           named_struct('y', b.pub_year, 'ord', -COALESCE(b.author_order_number, 999999999))) AS display_name,
    -- up to 10 distinct name strings seen for the author (order not defined)
    SLICE(
      ARRAY_DISTINCT(
        ARRAY_COMPACT(
          CONCAT(COLLECT_LIST(b.author_display_name), COLLECT_LIST(b.raw_author_name))
        )
      ),
      1, 10
    ) AS display_name_alternatives,
    CAST(COUNT(*) AS INT) AS works_count,
    CAST(SUM(work_cited_by_count) AS INT) AS cited_by_count,
    -- mean citations of rows published in the current or the two previous years
    CAST(AVG(CASE WHEN b.pub_year >= YEAR(current_date()) - 2 THEN work_cited_by_count END) AS DOUBLE) AS two_year_mean,
    CAST(COUNT_IF(work_cited_by_count >= 10) AS INT) AS i10_index,
    -- citation counts, highest first; input to the h-index below
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(work_cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations,
    CONCAT('https://api.openalex.org/works?filter=author.id:',
           REGEXP_EXTRACT(b.author_id, '/(A[0-9]+)$', 1)) AS works_api_url,
    MAX(b.updated_date) AS updated_date,
    CAST(MIN(b.created_date) AS DATE) AS created_date
  FROM openalex.works.work_authorships b
  GROUP BY b.author_id
),
enriched_agg AS (
  SELECT
    m.*,
    -- longest of display_name and its alternatives: structs compare on their
    -- first field (len), so ARRAY_MAX picks the longest name
    ARRAY_MAX(
      TRANSFORM(
        ARRAY_UNION(ARRAY(m.display_name), COALESCE(m.display_name_alternatives, ARRAY())),
        x -> STRUCT(LENGTH(x) AS len, x AS name)
      )
    ).name AS longest_name,
    named_struct(
      '2yr_mean_citedness', COALESCE(two_year_mean, 0),
      -- h-index: the largest rank r (1-based, citations sorted descending)
      -- whose citation count is >= r
      'h_index',
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              sorted_citations,
              SEQUENCE(1, SIZE(sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
      'i10_index', COALESCE(i10_index, 0)
    ) AS summary_stats
  FROM main_agg_pre m
)

-- final select
SELECT
  m.id,
  m.display_name,
  m.display_name_alternatives,
  m.orcid,
  m.longest_name,
  pn.parsed_name AS parsed_longest_name,
  -- block key: "<first initial> <last name>"; last name alone when there is no
  -- first name; NULL when there is no last name
  CASE
    WHEN pn.parsed_name.last IS NULL OR pn.parsed_name.last = '' THEN NULL
    WHEN pn.parsed_name.first IS NULL OR pn.parsed_name.first = '' THEN pn.parsed_name.last
    ELSE CONCAT(SUBSTRING(pn.parsed_name.first, 1, 1), ' ', pn.parsed_name.last)
  END AS block_key,
  m.works_count,
  m.cited_by_count,
  m.summary_stats,
  m.ids,
  COALESCE(ap.affiliations, ARRAY()) AS affiliations,
  la.last_known_institutions,
  COALESCE(at.topics, ARRAY()) AS topics,
  COALESCE(at.topic_share, ARRAY()) AS topic_share,
  COALESCE(cp.x_concepts, ARRAY()) AS x_concepts,
  COALESCE(s.sources, ARRAY()) AS sources,
  cby.counts_by_year,
  m.works_api_url,
  m.updated_date,
  m.created_date
FROM enriched_agg m
LEFT JOIN openalex.authors.author_topics at ON m.author_id_str = at.author_id
LEFT JOIN concepts_packed cp ON m.author_id_str = cp.author_id
LEFT JOIN affiliations_packed ap ON m.author_id_str = ap.author_id
LEFT JOIN sources_agg s ON m.author_id_str = s.author_id
LEFT JOIN counts_agg cby ON m.author_id_str = cby.author_id
LEFT JOIN last_inst_agg la ON m.author_id_str = la.author_id
-- NOTE(review): this join duplicates authors if parsed_names_lookup holds more
-- than one row per raw_author_name; confirm that the key is unique.
LEFT JOIN openalex.authors.parsed_names_lookup pn ON TRIM(m.longest_name) = pn.raw_author_name;

### Create the lookup/mapping table for author name disambiguation (AND)

In [0]:
%sql
-- Candidate lookup table: one row per author of openalex.authors.openalex_authors
-- that has a block_key, clustered by that key. Nested author attributes are
-- flattened into plain id / name arrays.
CREATE OR REPLACE TABLE openalex.authors.author_lookup_mapping
CLUSTER BY (block_key)
AS
SELECT
  -- 1. Blocking Key
  block_key,

  -- 2. Candidate Identity
  id AS author_id,
  display_name,
  longest_name,
  parsed_longest_name,
  ids.orcid AS orcid,

  -- 3. Id arrays, each de-duplicated
  ARRAY_DISTINCT(
     TRANSFORM(
       FILTER(affiliations, x -> x.institution.id IS NOT NULL),
       x -> x.institution.id
     )
  ) AS institution_ids,

  ARRAY_DISTINCT(
     TRANSFORM(topics, x -> x.id)
  ) AS topic_ids,

  ARRAY_DISTINCT(
    TRANSFORM(topics, x -> x.subfield.id)
  ) AS subfield_ids,

  -- sources is de-duplicated on the whole struct upstream, so the same id can
  -- occur more than once; de-duplicate like the other id arrays
  ARRAY_DISTINCT(
    TRANSFORM(sources, x -> x.id)
  ) AS source_ids,

  -- 4. Lower-cased, trimmed display name plus its alternatives
  ARRAY_DISTINCT(
    TRANSFORM(
        ARRAY_UNION(ARRAY(display_name), COALESCE(display_name_alternatives, ARRAY())),
        n -> LOWER(TRIM(n))
    )
  ) AS name_variants,

  -- 5. Activity summary
  works_count,
  cited_by_count,
  COALESCE(summary_stats.h_index, 0) AS h_index,

  -- NULL when counts_by_year is NULL
  CAST(ARRAY_MIN(TRANSFORM(counts_by_year, x -> x.year)) AS INT) AS first_active_year,
  CAST(ARRAY_MAX(TRANSFORM(counts_by_year, x -> x.year)) AS INT) AS last_active_year

FROM openalex.authors.openalex_authors
WHERE block_key IS NOT NULL;

OPTIMIZE openalex.authors.author_lookup_mapping;