In [None]:
-- Pre-rebuild snapshot of openalex.common.concepts_api.
-- Keeps, per row, the id, the updated_date currently stored, and a 64-bit
-- fingerprint (xxhash64) of the change-tracked columns.
-- The field list, field order, '|' separator and NULL placeholders below must
-- stay identical to the hash computed after the rebuild; otherwise every row
-- would compare as changed.
CREATE OR REPLACE TABLE openalex.common.concepts_api_hash AS
WITH fingerprint_input AS (
  SELECT
    id,
    updated_date,
    -- Every nullable field is given a placeholder so it still contributes a
    -- value to the joined string when it is NULL.
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(CAST(level AS STRING), '0'),
      COALESCE(description, ''),
      COALESCE(wikidata, ''),
      COALESCE(image_url, ''),
      COALESCE(image_thumbnail_url, ''),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0'),
      COALESCE(TO_JSON(ids), '{}')
    ) AS fingerprint_text
  FROM openalex.common.concepts_api
)
SELECT
  id,
  updated_date,
  xxhash64(fingerprint_text) AS content_hash
FROM fingerprint_input

In [None]:
-- Rebuild openalex.common.concepts_api from the works table (per-concept counts)
-- and the concepts table (level, wikidata id, Wikipedia-derived fields).
-- A concept is emitted only if it appears on at least one work AND its row in
-- openalex.common.concepts has a wikidata_id and no merge_into_id (inner join).
-- updated_date is written as NULL here; a later step is expected to fill it in.
CREATE OR REPLACE TABLE openalex.common.concepts_api AS
WITH exploded AS (
  -- One row per entry of each work's concepts array.
  SELECT
    id AS work_id,
    cited_by_count,
    EXPLODE(concepts) AS concept
  FROM openalex.works.openalex_works
),
dedup AS (
  -- Keep a single row per (work, concept) pair so a pair is never counted twice.
  -- The ORDER BY key equals a partition key, so which duplicate survives is arbitrary.
  SELECT
    work_id,
    cited_by_count,
    concept
  FROM exploded
  QUALIFY ROW_NUMBER() OVER (PARTITION BY work_id, concept.id ORDER BY work_id) = 1
),
aggregated_counts AS (
  SELECT
    concept.id AS concept_id,
    -- MAX instead of ANY_VALUE: when works carry differing names for the same
    -- concept, the pick no longer depends on execution order. display_name feeds
    -- the content hash used for change detection, so an unstable pick would
    -- bump updated_date without a real content change.
    MAX(concept.display_name) AS display_name,
    CAST(COUNT(DISTINCT work_id) AS INT) AS works_count,
    -- NOTE(review): INT tops out at 2,147,483,647; confirm that the summed
    -- citations of the largest concept stay below that.
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count
  FROM dedup
  GROUP BY concept.id
),
concepts_json AS (
  -- Active concepts only, with the first page object of the stored Wikipedia
  -- API response pulled out as a JSON string.
  SELECT
    concept_id,
    level,
    wikidata_id,
    created_date,
    get_json_object(wikipedia_json, '$.query.pages[0]') AS wikipedia_data
  FROM openalex.common.concepts
  WHERE wikidata_id IS NOT NULL
    AND merge_into_id IS NULL
),
concepts_parsed AS (
  -- Image URLs and short description extracted from the Wikipedia page object;
  -- each is NULL when the JSON path is absent.
  SELECT
    concept_id,
    level,
    wikidata_id,
    created_date,
    get_json_object(wikipedia_data, '$.original.source') AS parsed_image_url,
    get_json_object(wikipedia_data, '$.thumbnail.source') AS parsed_image_thumbnail_url,
    get_json_object(wikipedia_data, '$.terms.description[0]') AS parsed_description
  FROM concepts_json
)
SELECT
  con.concept_id AS id,
  ac.display_name,
  con.level,
  con.parsed_description AS description,
  con.wikidata_id AS wikidata,
  con.parsed_image_url AS image_url,
  con.parsed_image_thumbnail_url AS image_thumbnail_url,
  ac.works_count,
  ac.cited_by_count,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/C', con.concept_id),
    'wikidata', con.wikidata_id,
    -- Wikipedia link is derived from the display name: lower-cased, spaces -> underscores.
    'wikipedia', CONCAT('https://en.wikipedia.org/wiki/', REPLACE(LOWER(ac.display_name), ' ', '_')),
    'umls_aui', CAST(NULL AS ARRAY<STRING>),
    'umls_cui', CAST(NULL AS ARRAY<STRING>),
    'mag', CAST(NULL AS STRING)
  ) AS ids,
  CONCAT('https://api.openalex.org/works?filter=concepts.id:', con.concept_id) AS works_api_url,
  -- The remaining fields are typed NULL placeholders; this statement does not populate them.
  CAST(NULL AS STRUCT<`2yr_mean_citedness`: DOUBLE, h_index: INT, i10_index: INT>) AS summary_stats,
  CAST(NULL AS MAP<STRING, STRING>) AS international,
  CAST(NULL AS ARRAY<STRING>) AS ancestors,
  CAST(NULL AS ARRAY<STRING>) AS related_concepts,
  CAST(NULL AS ARRAY<STRING>) AS counts_by_year,
  con.created_date,
  CAST(NULL AS TIMESTAMP) AS updated_date
FROM aggregated_counts AS ac
INNER JOIN concepts_parsed AS con
  ON con.concept_id = ac.concept_id

In [None]:
-- Stamp updated_date on the freshly rebuilt concepts_api by comparing each row's
-- content hash with the pre-rebuild snapshot in concepts_api_hash:
--   * id not in the snapshot        -> current timestamp (new concept)
--   * hash differs from snapshot    -> current timestamp (content changed)
--   * hash unchanged                -> keep the snapshot's updated_date
-- Timestamps are truncated to whole seconds.
-- The hash expression must match the one used to build the snapshot exactly
-- (same fields, order, separator and NULL placeholders).
WITH new_hashes AS (
  SELECT
    id,
    xxhash64(CONCAT_WS('|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(CAST(level AS STRING), '0'),
      COALESCE(description, ''),
      COALESCE(wikidata, ''),
      COALESCE(image_url, ''),
      COALESCE(image_thumbnail_url, ''),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0'),
      COALESCE(TO_JSON(ids), '{}')
    )) AS content_hash
  FROM openalex.common.concepts_api
)
MERGE INTO openalex.common.concepts_api AS target
USING (
  SELECT
    n.id,
    CASE
      WHEN p.id IS NULL THEN DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
      WHEN n.content_hash <> p.content_hash THEN DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
      -- The snapshot can hold a NULL updated_date (the rebuild writes NULL, so a
      -- snapshot taken before this step ever ran captures it). Fall back to the
      -- current timestamp instead of carrying that NULL forward indefinitely.
      ELSE COALESCE(p.updated_date, DATE_TRUNC('SECOND', CURRENT_TIMESTAMP()))
    END AS new_updated_date
  FROM new_hashes AS n
  LEFT JOIN openalex.common.concepts_api_hash AS p
    ON n.id = p.id
) AS source
ON target.id = source.id
-- Every matched row is updated on purpose: the rebuild left updated_date NULL on all rows.
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date