In [None]:
-- Capture the current keywords_api rows (id, updated_date and a content hash)
-- into keywords_api_hash so they remain available after keywords_api is replaced.
-- The hash covers id, display_name, works_count and cited_by_count; NULL values
-- are normalised ('' for the name, '0' for the counts) before hashing.
CREATE OR REPLACE TABLE openalex.common.keywords_api_hash AS
WITH current_rows AS (
  SELECT
    id,
    updated_date,
    -- Pipe-delimited string of the hashed fields.
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    ) AS hash_input
  FROM openalex.common.keywords_api
)
SELECT
  id,
  updated_date,
  xxhash64(hash_input) AS content_hash
FROM current_rows

In [None]:
-- Rebuild keywords_api: one row per keyword id found in openalex_works, with
-- the number of distinct works and the summed cited_by_count of those works,
-- joined to openalex.common.keywords for the earliest created_datetime.
-- Keywords without a matching keyword_id in openalex.common.keywords are dropped
-- by the inner join. updated_date is written as a typed NULL placeholder.
CREATE OR REPLACE TABLE openalex.common.keywords_api AS
WITH exploded AS (
  -- One row per element of each work's keywords array.
  SELECT
    id AS work_id,
    cited_by_count,
    EXPLODE(keywords) AS keyword
  FROM openalex.works.openalex_works
),
dedup AS (
  -- Keep a single row per (work, keyword id) so a keyword repeated within one
  -- work does not add that work's cited_by_count more than once.
  SELECT
    work_id,
    cited_by_count,
    keyword
  FROM exploded
  QUALIFY ROW_NUMBER() OVER (PARTITION BY work_id, keyword.id ORDER BY work_id) = 1
),
aggregated_counts AS (
  SELECT
    keyword.id AS id,
    -- MIN instead of ANY_VALUE: deterministic across rebuilds, so a keyword whose
    -- works carry differing names cannot flip its name (and content hash) from
    -- run to run.
    MIN(keyword.display_name) AS display_name,
    CAST(COUNT(DISTINCT work_id) AS INT) AS works_count,
    -- NOTE(review): the INT cast could overflow if a keyword's total citations
    -- ever exceed 2147483647 -- confirm the expected range before widening.
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count
  FROM dedup
  GROUP BY keyword.id
),
keyword_created AS (
  -- Earliest creation timestamp per keyword_id.
  SELECT
    keyword_id,
    MIN(created_datetime) AS created_datetime
  FROM openalex.common.keywords
  GROUP BY keyword_id
)
SELECT
  ac.id,
  ac.display_name,
  ac.works_count,
  ac.cited_by_count,
  CONCAT('https://api.openalex.org/works?filter=keywords.id:keywords/', kw.keyword_id) AS works_api_url,
  DATE_TRUNC('SECOND', CAST(kw.created_datetime AS TIMESTAMP)) AS created_date,
  CAST(NULL AS TIMESTAMP) AS updated_date
FROM aggregated_counts AS ac
INNER JOIN keyword_created AS kw
  -- ac.id is the full keyword URL; strip the prefix to get the bare keyword_id.
  ON kw.keyword_id = REPLACE(ac.id, 'https://openalex.org/keywords/', '')

In [None]:
-- Set updated_date on every keywords_api row by comparing its content hash with
-- the hash stored in keywords_api_hash:
--   * id absent from the snapshot, or hash differs -> current timestamp
--   * hash unchanged                               -> snapshot's updated_date,
--     falling back to the current timestamp when that value is NULL so an
--     unchanged row is never left without an updated_date.
-- The hash expression must stay identical to the one used to build
-- keywords_api_hash. The CTE sits inside USING so the statement does not rely on
-- the engine accepting a WITH clause in front of MERGE.
MERGE INTO openalex.common.keywords_api AS target
USING (
  WITH new_hashes AS (
    SELECT
      id,
      xxhash64(CONCAT_WS('|',
        CAST(id AS STRING),
        COALESCE(display_name, ''),
        COALESCE(CAST(works_count AS STRING), '0'),
        COALESCE(CAST(cited_by_count AS STRING), '0')
      )) AS content_hash
    FROM openalex.common.keywords_api
  )
  SELECT
    n.id,
    CASE
      WHEN p.id IS NULL OR n.content_hash <> p.content_hash
        THEN DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
      ELSE COALESCE(p.updated_date, DATE_TRUNC('SECOND', CURRENT_TIMESTAMP()))
    END AS new_updated_date
  FROM new_hashes AS n
  LEFT JOIN openalex.common.keywords_api_hash AS p
    ON n.id = p.id
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date