## Build Authors API objects from `openalex_works`

#### Create base table exploded from `authorships`

In [0]:
-- Base authorship table: one row per (work, authorship element).
-- Fully rebuilt on every run (CREATE OR REPLACE). Authorship elements without
-- an author id are discarded, so author_id is never NULL in this table.
CREATE OR REPLACE TABLE openalex.works.work_authorships
AS
SELECT
  -- work-level attributes (repeated on every authorship row of the same work)
  wk.id                                                       AS work_id,
  -- fall back to the year of publication_date when publication_year is missing
  COALESCE(wk.publication_year, YEAR(wk.publication_date))    AS pub_year,
  CAST(wk.open_access.is_oa AS BOOLEAN)                       AS is_oa,
  CAST(wk.cited_by_count AS INT)                              AS work_cited_by_count,
  wk.topics                                                   AS work_topics,
  wk.concepts                                                 AS work_concepts,

  wk.created_date                                             AS created_date,
  wk.updated_date                                             AS updated_date,

  -- authorship-level attributes (read from the exploded array element)
  au.author.id                                                AS author_id,
  au.author.display_name                                      AS author_display_name,
  au.author.orcid                                             AS author_orcid,
  au.raw_author_name                                          AS raw_author_name,
  -- a missing institutions array becomes an empty array
  COALESCE(au.institutions, ARRAY())                          AS author_institutions,
  au.author_order_number                                      AS author_order_number,
  au.is_corresponding                                         AS is_corresponding
FROM openalex.works.openalex_works AS wk
-- Plain explode is sufficient: the NULL element an OUTER explode would emit for
-- works without authorships is removed by the author-id filter below anyway.
LATERAL VIEW EXPLODE(wk.authorships) AS au
WHERE au.author.id IS NOT NULL;

-- Maintenance for the base table created by the preceding CREATE OR REPLACE.
-- The statements must target openalex.works.work_authorships (the table this
-- notebook builds and the later per-author aggregations read), not a
-- differently named table.

-- Co-locate rows by author and year to improve data skipping for the
-- per-author GROUP BYs downstream.
OPTIMIZE openalex.works.work_authorships
ZORDER BY (author_id, pub_year);

-- Optional but helpful for planning & data skipping
ANALYZE TABLE openalex.works.work_authorships COMPUTE STATISTICS;


#### Create `author_topics`

In [0]:
-- Per-author topic arrays: one output row per author_id with
--   topics      : every topic of the author's works with the number of distinct
--                 works carrying it, sorted by count DESC
--   topic_share : the same topics with value = author's work count for the topic
--                 divided by the total distinct works carrying that topic
--                 (rounded to 7 decimals), sorted by value DESC
-- Ties in either array are broken by topic id so that rebuilding the table from
-- unchanged input yields the same array order.
CREATE OR REPLACE TABLE openalex.authors.author_topics CLUSTER BY (author_id) AS
WITH awt AS (
  -- explode once; distinct by (author, work, topic)
  SELECT DISTINCT
    author_id,
    work_id,
    CAST(t.id AS STRING) AS topic_id,
    t.display_name,
    t.subfield,
    t.field,
    t.domain,
    t.score
  FROM (
    SELECT
      author_id,
      work_id,
      EXPLODE_OUTER(work_topics) AS t
    FROM openalex.works.work_authorships
    WHERE work_id IS NOT NULL
  )
  WHERE t.id IS NOT NULL
),
counts AS (
  -- author-topic counts AND per-topic totals in one pass over awt.
  -- Rows produced by the (topic_id) grouping set carry author_id = NULL; the
  -- base table never stores a NULL author_id, so NULL identifies total rows.
  SELECT
    author_id,
    topic_id,
    -- descriptive fields are taken from the highest-scoring occurrence
    MAX_BY(display_name, score) AS display_name,
    MAX_BY(subfield,     score) AS subfield,
    MAX_BY(field,        score) AS field,
    MAX_BY(domain,       score) AS domain,
    MAX(score)                  AS score,
    COUNT(DISTINCT work_id)     AS cnt
  FROM awt
  GROUP BY GROUPING SETS (
    (author_id, topic_id),
    (topic_id)
  )
),
with_totals AS (
  -- copy the topic-level total onto every author row BEFORE the total rows
  -- are filtered out in the final SELECT
  SELECT
    author_id,
    topic_id,
    display_name,
    subfield,
    field,
    domain,
    score,
    cnt AS topic_count,
    MAX(CASE WHEN author_id IS NULL THEN cnt END)
      OVER (PARTITION BY topic_id) AS topic_total
  FROM counts
)
SELECT
  author_id,
  ARRAY_SORT(
    COLLECT_LIST(
      STRUCT(
        topic_id AS id,
        display_name,
        CAST(topic_count AS INT) AS count,
        score,
        subfield,
        field,
        domain
      )
    ),
    -- count DESC, then id ASC as a deterministic tie-break
    (l, r) -> CASE
      WHEN l.count > r.count THEN -1
      WHEN l.count < r.count THEN 1
      WHEN l.id < r.id THEN -1
      WHEN l.id > r.id THEN 1
      ELSE 0
    END
  ) AS topics,
  ARRAY_SORT(
    COLLECT_LIST(
      STRUCT(
        topic_id AS id,
        display_name,
        ROUND(CAST(topic_count AS DOUBLE) / NULLIF(topic_total, 0), 7) AS value,
        subfield,
        field,
        domain
      )
    ),
    -- value DESC, then id ASC as a deterministic tie-break
    (l, r) -> CASE
      WHEN l.value > r.value THEN -1
      WHEN l.value < r.value THEN 1
      WHEN l.id < r.id THEN -1
      WHEN l.id > r.id THEN 1
      ELSE 0
    END
  ) AS topic_share
FROM with_totals
WHERE author_id IS NOT NULL   -- drop the per-topic total rows
GROUP BY author_id;

-- Compact/recluster the table that was just (re)created with CLUSTER BY (author_id).
-- NOTE(review): FULL presumably forces reclustering of all records rather than
-- only new data — confirm against the OPTIMIZE docs for the runtime in use.
OPTIMIZE openalex.authors.author_topics FULL;

### Create final table `openalex.authors.authors_api` to sync to Elasticsearch

In [0]:
-- Final per-author table: one row per author_id, assembled from
--   * openalex.works.work_authorships   (one row per work/authorship)
--   * openalex.authors.author_topics    (pre-built topics / topic_share arrays)
-- Every read of authorship rows in this statement uses
-- openalex.works.work_authorships so that all parts of an author row are
-- derived from the same base table.
CREATE OR REPLACE TABLE openalex.authors.authors_api
CLUSTER BY AUTO AS
WITH concepts_exploded AS (
  -- One row per (authorship, concept). The generator lives in its own SELECT so
  -- it is never used as a grouping key. Plain EXPLODE: works without concepts
  -- contribute no rows, so NULL concepts cannot occupy a top-5 slot.
  SELECT
    author_id,
    EXPLODE(work_concepts) AS concept
  FROM openalex.works.work_authorships
),
author_concept_counts AS (
  -- Count per (author, concept id). Grouping on the id only (not the whole
  -- struct, whose score varies per work) keeps each concept to a single row.
  SELECT
    author_id,
    concept.id                      AS concept_id,
    ANY_VALUE(concept.wikidata)     AS wikidata,
    ANY_VALUE(concept.display_name) AS display_name,
    ANY_VALUE(concept.level)        AS level,
    -- NOTE(review): highest per-work score is used as the author-level score,
    -- mirroring MAX(score) in author_topics — confirm this is the wanted metric.
    MAX(concept.score)              AS score,
    COUNT(*)                        AS concept_count
  FROM concepts_exploded
  WHERE concept.id IS NOT NULL
  GROUP BY author_id, concept.id
),
top_concepts AS (
  -- keep the 5 most frequent concepts per author; id breaks ties
  SELECT
    author_id,
    concept_id,
    wikidata,
    display_name,
    level,
    score,
    concept_count
  FROM author_concept_counts
  QUALIFY ROW_NUMBER() OVER (
    PARTITION BY author_id
    ORDER BY concept_count DESC, CAST(concept_id AS STRING) ASC
  ) <= 5
),
concepts_packed AS (
  SELECT
    author_id,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          concept_id                 AS id,
          wikidata,
          display_name,
          -- explicit alias: an un-aliased CAST inside STRUCT loses the field name
          CAST(level AS INT)         AS level,
          score,
          CAST(concept_count AS INT) AS count
        )
      ),
      -- count DESC, then id ASC, matching the top-5 ranking order
      (l, r) -> CASE
        WHEN l.count > r.count THEN -1
        WHEN l.count < r.count THEN 1
        WHEN CAST(l.id AS STRING) < CAST(r.id AS STRING) THEN -1
        WHEN CAST(l.id AS STRING) > CAST(r.id AS STRING) THEN 1
        ELSE 0
      END
    ) AS x_concepts
  FROM top_concepts
  GROUP BY author_id
),
-- Compact, spill-friendly affiliations pre-agg: one row per (author, institution)
affiliations_exploded AS (
  SELECT
    author_id,
    i.id                             AS inst_id,
    ANY_VALUE(i.ror)                 AS ror,
    ANY_VALUE(i.display_name)        AS display_name,
    ANY_VALUE(i.country_code)        AS country_code,
    ANY_VALUE(i.type)                AS type,
    ANY_VALUE(i.lineage)             AS lineage,
    SORT_ARRAY(COLLECT_SET(pub_year), false) AS years -- DESCENDING
  FROM (
    SELECT
      author_id,
      pub_year,
      EXPLODE_OUTER(author_institutions) AS i
    FROM openalex.works.work_authorships
    WHERE pub_year IS NOT NULL
  )
  WHERE i.id IS NOT NULL
  GROUP BY author_id, i.id
),
affiliations_packed AS (
  SELECT
    author_id,
    COLLECT_LIST(
      STRUCT(
        STRUCT(inst_id AS id, ror, display_name, country_code, type, lineage) AS institution,
        years
      )
    ) AS affiliations
  FROM affiliations_exploded
  GROUP BY author_id
),
main_agg_pre AS (
  -- single pass over the base table: identity, totals and summary_stats helpers
  SELECT
    b.author_id AS id,
    named_struct('openalex', b.author_id) AS ids,

    -- recent values: taken from the row with the greatest pub_year; within a
    -- year the smallest author_order_number wins (negated so smaller sorts higher)
    MAX_BY(b.author_orcid,
           named_struct('y', b.pub_year, 'ord', -COALESCE(b.author_order_number, 999999999))) AS orcid,
    MAX_BY(b.author_display_name,
           named_struct('y', b.pub_year, 'ord', -COALESCE(b.author_order_number, 999999999))) AS display_name,

    -- alternatives: every distinct non-null display name or raw name seen
    ARRAY_DISTINCT(
      ARRAY_COMPACT(
        CONCAT(COLLECT_LIST(b.author_display_name), COLLECT_LIST(b.raw_author_name))
      )
    ) AS display_name_alternatives,

    -- totals (one base row per work/authorship)
    CAST(COUNT(*) AS INT) AS works_count,
    CAST(SUM(b.work_cited_by_count) AS INT) AS cited_by_count,

    -- helpers for summary_stats (computed once here)
    -- mean citations of works published in the current year or the two before it
    CAST(AVG(CASE WHEN b.pub_year >= YEAR(current_date()) - 2 THEN b.work_cited_by_count END) AS DOUBLE)
      AS two_year_mean,
    CAST(COUNT_IF(b.work_cited_by_count >= 10) AS INT) AS i10_index,

    -- non-null citation counts sorted DESC, input for the h-index
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(b.work_cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations,

    MAX(b.pub_year) AS max_pub_year,
    -- works API + dates
    CONCAT('https://api.openalex.org/works?filter=author.id:',
           REGEXP_EXTRACT(b.author_id, '/(A[0-9]+)$', 1)) AS works_api_url,
    MAX(b.updated_date) AS updated_date,
    CAST(MIN(b.created_date) AS DATE) AS created_date
  FROM openalex.works.work_authorships b
  GROUP BY b.author_id
),

-- ---------- build summary_stats from the precomputed helpers; no join to base ----------
main_agg AS (
  SELECT
    id,
    ids,
    orcid,
    display_name,
    display_name_alternatives,
    works_count,
    cited_by_count,
    max_pub_year,
    named_struct(
      '2yr_mean_citedness', COALESCE(two_year_mean, 0),
      -- h-index: largest 1-based rank r such that the r-th most cited work has
      -- at least r citations. The lambda index is 0-based, hence idx + 1.
      -- ARRAY_MAX of an empty array is NULL, so authors without citation data get 0.
      'h_index',
        CAST(
          COALESCE(
            ARRAY_MAX(
              TRANSFORM(
                sorted_citations,
                (citation, idx) -> CASE WHEN citation >= idx + 1 THEN idx + 1 ELSE 0 END
              )
            ),
            0
          ) AS INT
        ),
      'i10_index', COALESCE(i10_index, 0)
    ) AS summary_stats,
    works_api_url,
    updated_date,
    created_date
  FROM main_agg_pre
)

-- ---------- final stitch: join the small per-author topic/concept/affiliation arrays ----------
SELECT
  m.id,
  m.orcid,
  m.display_name,
  m.display_name_alternatives,
  m.works_count,
  m.cited_by_count,
  m.summary_stats,
  m.ids,
  COALESCE(ap.affiliations, ARRAY()) AS affiliations,
  /* last_known_institutions: distinct institutions on the author's rows from
     their most recent publication year */
  (
    SELECT
      COLLECT_SET(STRUCT(i.id, i.ror, i.display_name, i.country_code, i.type, i.lineage))
    FROM (
      SELECT
        EXPLODE_OUTER(b4.author_institutions) AS i
      FROM openalex.works.work_authorships b4
      WHERE b4.author_id = m.id
        AND b4.pub_year = m.max_pub_year
    )
    WHERE i.id IS NOT NULL
  ) AS last_known_institutions,
  COALESCE(tp.topics, ARRAY()) AS topics,
  -- topic_share.value is produced by the author_topics build (author's work
  -- count for the topic relative to the topic's total work count)
  COALESCE(tp.topic_share, ARRAY()) AS topic_share,
  COALESCE(cp.x_concepts, ARRAY()) AS x_concepts,
  /* counts_by_year: per-year works / OA works / citations, sorted ascending by year */
  (
    SELECT
      SORT_ARRAY(
        COLLECT_LIST(
          STRUCT(
            year,
            works_count,
            oa_works_count,
            cited_by_count
          )
        ),
        true
      )
    FROM (
      SELECT
        CAST(b2.pub_year AS INT) AS year,
        CAST(COUNT(*) AS INT) AS works_count,
        CAST(SUM(CASE WHEN b2.is_oa THEN 1 ELSE 0 END) AS INT) AS oa_works_count,
        CAST(SUM(b2.work_cited_by_count) AS INT) AS cited_by_count
      FROM openalex.works.work_authorships b2
      WHERE b2.author_id = m.id
        AND b2.pub_year IS NOT NULL
      GROUP BY CAST(b2.pub_year AS INT)
    )
  ) AS counts_by_year,
  m.works_api_url,
  m.updated_date,
  m.created_date
FROM main_agg m
LEFT JOIN openalex.authors.author_topics tp ON m.id = tp.author_id
LEFT JOIN concepts_packed cp ON m.id = cp.author_id
LEFT JOIN affiliations_packed ap ON m.id = ap.author_id;