# Create Topic Hierarchy API Tables

Creates enriched API tables for topics, subfields, fields, and domains with:
- Hash-based `updated_date` that only changes when content changes
- Hierarchy relationships (parent, children, siblings)
- Aggregated `works_count` and `cited_by_count`

Uses:
- `openalex.common.topics` - Topic master table (has subfield_id, field_id, domain_id)
- `openalex.common.subfields` - Subfield master table (no parent FK)
- `openalex.common.fields` - Field master table (no parent FK)
- `openalex.common.domains` - Domain master table
- `openalex.works.openalex_works` - For works counts

Creates:
- `openalex.common.topics_api` - Enriched topics table
- `openalex.common.subfields_api` - Enriched subfields table
- `openalex.common.fields_api` - Enriched fields table
- `openalex.common.domains_api` - Enriched domains table
- `openalex.common.*_api_hash` - Hash snapshots for change detection

Note: subfields and fields tables lack parent FK columns. Hierarchy is derived from
the topics table which has subfield_id, field_id, and domain_id.

In [ ]:
-- Snapshot of topics_api taken BEFORE the table is rebuilt.
-- Keeps each row's id, its current updated_date, and a 64-bit hash of the
-- content columns so the post-rebuild step can tell which rows really changed.
-- NOTE: the columns fed to CONCAT_WS (and their order) must stay identical to
-- the hash computed in the post-rebuild MERGE cell for topics; any drift makes
-- every row look "changed".
CREATE OR REPLACE TABLE openalex.common.topics_api_hash AS
WITH current_rows AS (
  SELECT
    id,
    updated_date,
    -- NULLs are replaced with fixed placeholders so the joined key is stable
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(TO_JSON(keywords), '[]'),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(subfield), '{}'),
      COALESCE(TO_JSON(field), '{}'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    ) AS content_key
  FROM openalex.common.topics_api
)
SELECT
  id,
  updated_date,
  xxhash64(content_key) AS content_hash
FROM current_rows

In [ ]:
-- Snapshot of subfields_api taken BEFORE the table is rebuilt.
-- Keeps each row's id, its current updated_date, and a 64-bit hash of the
-- content columns so the post-rebuild step can tell which rows really changed.
-- NOTE: the columns fed to CONCAT_WS (and their order) must stay identical to
-- the hash computed in the post-rebuild MERGE cell for subfields.
CREATE OR REPLACE TABLE openalex.common.subfields_api_hash AS
WITH current_rows AS (
  SELECT
    id,
    updated_date,
    -- NULLs are replaced with fixed placeholders so the joined key is stable
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(field), '{}'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(topics), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    ) AS content_key
  FROM openalex.common.subfields_api
)
SELECT
  id,
  updated_date,
  xxhash64(content_key) AS content_hash
FROM current_rows

In [ ]:
-- Snapshot of fields_api taken BEFORE the table is rebuilt.
-- Keeps each row's id, its current updated_date, and a 64-bit hash of the
-- content columns so the post-rebuild step can tell which rows really changed.
-- NOTE: the columns fed to CONCAT_WS (and their order) must stay identical to
-- the hash computed in the post-rebuild MERGE cell for fields.
CREATE OR REPLACE TABLE openalex.common.fields_api_hash AS
WITH current_rows AS (
  SELECT
    id,
    updated_date,
    -- NULLs are replaced with fixed placeholders so the joined key is stable
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(subfields), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    ) AS content_key
  FROM openalex.common.fields_api
)
SELECT
  id,
  updated_date,
  xxhash64(content_key) AS content_hash
FROM current_rows

In [ ]:
-- Snapshot of domains_api taken BEFORE the table is rebuilt.
-- Keeps each row's id, its current updated_date, and a 64-bit hash of the
-- content columns so the post-rebuild step can tell which rows really changed.
-- NOTE: the columns fed to CONCAT_WS (and their order) must stay identical to
-- the hash computed in the post-rebuild MERGE cell for domains.
CREATE OR REPLACE TABLE openalex.common.domains_api_hash AS
WITH current_rows AS (
  SELECT
    id,
    updated_date,
    -- NULLs are replaced with fixed placeholders so the joined key is stable
    CONCAT_WS(
      '|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(fields), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    ) AS content_key
  FROM openalex.common.domains_api
)
SELECT
  id,
  updated_date,
  xxhash64(content_key) AS content_hash
FROM current_rows

In [None]:
-- Staging table consumed by the four *_api builds. It mixes two kinds of rows,
-- told apart by record_type:
--   'hierarchy'
--       id1 / id2 / id3 = subfield_id / field_id / domain_id, counts are 0
--   'topic' | 'subfield' | 'field' | 'domain'
--       id1 = entity id, id2 = id3 = 0, works_count / cited_by_count aggregated
-- Works are attributed through primary_topic only; works without a
-- primary_topic contribute to no count.
CREATE OR REPLACE TABLE openalex.common.topic_hierarchy_staging AS
WITH tree_edges AS (
  -- every distinct subfield -> field -> domain path present in the topics table
  SELECT DISTINCT
    subfield_id,
    field_id,
    domain_id
  FROM openalex.common.topics
),
classified_works AS (
  -- one row per work, with the numeric ids parsed out of the OpenAlex URLs
  SELECT
    w.id AS work_id,
    CAST(w.cited_by_count AS BIGINT) AS cited_by_count,
    CAST(REPLACE(w.primary_topic.id, 'https://openalex.org/T', '') AS INT) AS topic_id,
    CAST(REPLACE(w.primary_topic.subfield.id, 'https://openalex.org/subfields/', '') AS INT) AS subfield_id,
    CAST(REPLACE(w.primary_topic.field.id, 'https://openalex.org/fields/', '') AS INT) AS field_id,
    CAST(REPLACE(w.primary_topic.domain.id, 'https://openalex.org/domains/', '') AS INT) AS domain_id
  FROM openalex.works.openalex_works w
  WHERE w.primary_topic IS NOT NULL
),
per_level AS (
  -- stack the same works once per hierarchy level so one GROUP BY covers all four
  SELECT 'topic' AS record_type, topic_id AS entity_id, work_id, cited_by_count
  FROM classified_works
  UNION ALL
  SELECT 'subfield' AS record_type, subfield_id AS entity_id, work_id, cited_by_count
  FROM classified_works
  UNION ALL
  SELECT 'field' AS record_type, field_id AS entity_id, work_id, cited_by_count
  FROM classified_works
  UNION ALL
  SELECT 'domain' AS record_type, domain_id AS entity_id, work_id, cited_by_count
  FROM classified_works
),
level_totals AS (
  SELECT
    record_type,
    entity_id,
    CAST(COUNT(DISTINCT work_id) AS INT) AS works_count,
    CAST(SUM(cited_by_count) AS BIGINT) AS cited_by_count
  FROM per_level
  GROUP BY record_type, entity_id
)
SELECT
  'hierarchy' AS record_type,
  subfield_id AS id1,
  field_id AS id2,
  domain_id AS id3,
  0 AS works_count,
  CAST(0 AS BIGINT) AS cited_by_count
FROM tree_edges
UNION ALL
SELECT
  record_type,
  entity_id AS id1,
  0 AS id2,
  0 AS id3,
  works_count,
  cited_by_count
FROM level_totals

In [ ]:
-- Build domains_api (4 rows)
-- One output row per row of openalex.common.domains, enriched with:
--   fields    : child fields found under the domain in the staging hierarchy
--   siblings  : every other domain (domains have no parent level above them)
--   works_count / cited_by_count : totals from the 'domain' staging rows
-- updated_date is left NULL here; it is filled in by the hash-comparison MERGE.
CREATE OR REPLACE TABLE openalex.common.domains_api AS
WITH hierarchy AS (
  SELECT DISTINCT id1 AS subfield_id, id2 AS field_id, id3 AS domain_id
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'hierarchy'
),
domain_fields AS (
  SELECT
    h.domain_id,
    -- sorted by the id URL string so the array (and its hash) is deterministic
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/fields/', f.field_id),
          'display_name', f.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS fields
  FROM (
    -- collapse the subfield level so each field appears once per domain
    SELECT DISTINCT domain_id, field_id FROM hierarchy
  ) h
  JOIN openalex.common.fields f ON h.field_id = f.field_id
  GROUP BY h.domain_id
),
domain_siblings AS (
  SELECT
    d1.domain_id,
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/domains/', d2.domain_id),
          'display_name', d2.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS siblings
  FROM openalex.common.domains d1
  JOIN openalex.common.domains d2 ON d1.domain_id <> d2.domain_id
  GROUP BY d1.domain_id
),
domain_counts AS (
  SELECT id1 AS domain_id, works_count, cited_by_count
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'domain'
)
SELECT
  d.domain_id AS id,
  d.display_name,
  d.description,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/domains/', d.domain_id),
    'wikidata', d.wikidata_url,
    'wikipedia', d.wikipedia_url
  ) AS ids,
  COALESCE(FROM_JSON(d.display_name_alternatives, 'ARRAY<STRING>'), ARRAY()) AS display_name_alternatives,
  COALESCE(df.fields, ARRAY()) AS fields,
  COALESCE(ds.siblings, ARRAY()) AS siblings,
  COALESCE(dc.works_count, 0) AS works_count,
  -- Kept as BIGINT (the staging type): a domain-wide citation total can exceed
  -- the INT range, and narrowing would raise under ANSI mode or wrap silently.
  COALESCE(dc.cited_by_count, CAST(0 AS BIGINT)) AS cited_by_count,
  CONCAT('https://api.openalex.org/works?filter=topics.domain.id:', d.domain_id) AS works_api_url,
  CAST(NULL AS TIMESTAMP) AS updated_date,
  DATE_TRUNC('SECOND', d.created_date) AS created_date
FROM openalex.common.domains d
LEFT JOIN domain_fields df ON d.domain_id = df.domain_id
LEFT JOIN domain_siblings ds ON d.domain_id = ds.domain_id
LEFT JOIN domain_counts dc ON d.domain_id = dc.domain_id

In [None]:
-- Build fields_api (~27 rows)
-- One output row per field that appears in the staging hierarchy, enriched with:
--   domain    : parent domain (derived from topics, since fields has no parent FK)
--   subfields : child subfields
--   siblings  : other fields under the SAME domain
--   works_count / cited_by_count : totals from the 'field' staging rows
-- updated_date is left NULL here; it is filled in by the hash-comparison MERGE.
-- NOTE(review): assumes each field maps to exactly one domain in the topics
-- table; a field listed under two domains would produce duplicate rows.
CREATE OR REPLACE TABLE openalex.common.fields_api AS
WITH hierarchy AS (
  SELECT DISTINCT id1 AS subfield_id, id2 AS field_id, id3 AS domain_id
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'hierarchy'
),
field_domain AS (
  SELECT DISTINCT field_id, domain_id FROM hierarchy
),
field_subfields AS (
  SELECT
    h.field_id,
    -- sorted by the id URL string so the array (and its hash) is deterministic
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/subfields/', s.subfield_id),
          'display_name', s.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS subfields
  FROM (
    SELECT DISTINCT field_id, subfield_id FROM hierarchy
  ) h
  JOIN openalex.common.subfields s ON h.subfield_id = s.subfield_id
  GROUP BY h.field_id
),
field_siblings AS (
  -- Siblings share a parent: pair each field only with the other fields of its
  -- own domain (same rule topics_api applies within a subfield), rather than
  -- with every other field in the table.
  SELECT
    fd1.field_id,
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/fields/', f2.field_id),
          'display_name', f2.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS siblings
  FROM field_domain fd1
  JOIN field_domain fd2
    ON fd1.domain_id = fd2.domain_id
   AND fd1.field_id <> fd2.field_id
  JOIN openalex.common.fields f2 ON fd2.field_id = f2.field_id
  GROUP BY fd1.field_id
),
field_counts AS (
  SELECT id1 AS field_id, works_count, cited_by_count
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'field'
)
SELECT
  f.field_id AS id,
  f.display_name,
  f.description,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/fields/', f.field_id),
    'wikidata', f.wikidata_url,
    'wikipedia', f.wikipedia_url
  ) AS ids,
  COALESCE(FROM_JSON(f.display_name_alternatives, 'ARRAY<STRING>'), ARRAY()) AS display_name_alternatives,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/domains/', d.domain_id),
    'display_name', d.display_name
  ) AS domain,
  COALESCE(fs.subfields, ARRAY()) AS subfields,
  COALESCE(fsib.siblings, ARRAY()) AS siblings,
  COALESCE(fc.works_count, 0) AS works_count,
  -- Kept as BIGINT (the staging type) instead of narrowing to INT, which would
  -- raise under ANSI mode or wrap silently once a total passes the INT range.
  COALESCE(fc.cited_by_count, CAST(0 AS BIGINT)) AS cited_by_count,
  CONCAT('https://api.openalex.org/works?filter=topics.field.id:', f.field_id) AS works_api_url,
  CAST(NULL AS TIMESTAMP) AS updated_date,
  DATE_TRUNC('SECOND', f.created_date) AS created_date
FROM openalex.common.fields f
-- inner joins: a field with no topics has no known domain and is left out
JOIN field_domain fd ON f.field_id = fd.field_id
JOIN openalex.common.domains d ON fd.domain_id = d.domain_id
LEFT JOIN field_subfields fs ON f.field_id = fs.field_id
LEFT JOIN field_siblings fsib ON f.field_id = fsib.field_id
LEFT JOIN field_counts fc ON f.field_id = fc.field_id

In [None]:
-- Build subfields_api (~250 rows)
-- One output row per subfield that appears in the staging hierarchy, enriched with:
--   field / domain : ancestors (derived from topics, since subfields has no parent FK)
--   topics         : child topics
--   siblings       : other subfields under the SAME field
--   works_count / cited_by_count : totals from the 'subfield' staging rows
-- updated_date is left NULL here; it is filled in by the hash-comparison MERGE.
-- NOTE(review): assumes each subfield maps to exactly one (field, domain) pair
-- in the topics table; otherwise the hierarchy join yields duplicate rows.
CREATE OR REPLACE TABLE openalex.common.subfields_api AS
WITH hierarchy AS (
  SELECT DISTINCT id1 AS subfield_id, id2 AS field_id, id3 AS domain_id
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'hierarchy'
),
subfield_field AS (
  -- parent field of every subfield, used to find siblings
  SELECT DISTINCT subfield_id, field_id FROM hierarchy
),
subfield_topics AS (
  SELECT
    t.subfield_id,
    -- sorted by the id URL string so the array (and its hash) is deterministic
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/T', t.topic_id),
          'display_name', t.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS topics
  FROM openalex.common.topics t
  GROUP BY t.subfield_id
),
subfield_siblings AS (
  -- Siblings share a parent: pair each subfield only with the other subfields
  -- of its own field (same rule topics_api applies within a subfield), rather
  -- than with every other subfield in the table.
  SELECT
    sf1.subfield_id,
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/subfields/', s2.subfield_id),
          'display_name', s2.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS siblings
  FROM subfield_field sf1
  JOIN subfield_field sf2
    ON sf1.field_id = sf2.field_id
   AND sf1.subfield_id <> sf2.subfield_id
  JOIN openalex.common.subfields s2 ON sf2.subfield_id = s2.subfield_id
  GROUP BY sf1.subfield_id
),
subfield_counts AS (
  SELECT id1 AS subfield_id, works_count, cited_by_count
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'subfield'
)
SELECT
  s.subfield_id AS id,
  s.display_name,
  s.description,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/subfields/', s.subfield_id),
    'wikidata', s.wikidata_url,
    'wikipedia', s.wikipedia_url
  ) AS ids,
  COALESCE(FROM_JSON(s.display_name_alternatives, 'ARRAY<STRING>'), ARRAY()) AS display_name_alternatives,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/fields/', f.field_id),
    'display_name', f.display_name
  ) AS field,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/domains/', d.domain_id),
    'display_name', d.display_name
  ) AS domain,
  COALESCE(st.topics, ARRAY()) AS topics,
  COALESCE(ssib.siblings, ARRAY()) AS siblings,
  COALESCE(sc.works_count, 0) AS works_count,
  -- Kept as BIGINT (the staging type) instead of narrowing to INT, which would
  -- raise under ANSI mode or wrap silently once a total passes the INT range.
  COALESCE(sc.cited_by_count, CAST(0 AS BIGINT)) AS cited_by_count,
  CONCAT('https://api.openalex.org/works?filter=topics.subfield.id:', s.subfield_id) AS works_api_url,
  CAST(NULL AS TIMESTAMP) AS updated_date,
  DATE_TRUNC('SECOND', s.created_date) AS created_date
FROM openalex.common.subfields s
-- inner joins: a subfield with no topics has no known parents and is left out
JOIN hierarchy h ON s.subfield_id = h.subfield_id
JOIN openalex.common.fields f ON h.field_id = f.field_id
JOIN openalex.common.domains d ON h.domain_id = d.domain_id
LEFT JOIN subfield_topics st ON s.subfield_id = st.subfield_id
LEFT JOIN subfield_siblings ssib ON s.subfield_id = ssib.subfield_id
LEFT JOIN subfield_counts sc ON s.subfield_id = sc.subfield_id

In [ ]:
-- Build topics_api (~4,500 rows)
-- One output row per topic whose subfield, field and domain all resolve, with:
--   subfield / field / domain : ancestors, taken from the FK columns on topics
--   siblings                  : other topics under the same subfield
--   works_count / cited_by_count : totals from the 'topic' staging rows
-- updated_date is left NULL here; it is filled in by the hash-comparison MERGE.
CREATE OR REPLACE TABLE openalex.common.topics_api AS
WITH topic_siblings AS (
  SELECT
    t1.topic_id,
    -- sorted by the id URL string so the array (and its hash) is deterministic
    ARRAY_SORT(
      COLLECT_LIST(
        NAMED_STRUCT(
          'id', CONCAT('https://openalex.org/T', t2.topic_id),
          'display_name', t2.display_name
        )
      ),
      (l, r) -> CASE WHEN l.id < r.id THEN -1 WHEN l.id > r.id THEN 1 ELSE 0 END
    ) AS siblings
  FROM openalex.common.topics t1
  JOIN openalex.common.topics t2 ON t1.subfield_id = t2.subfield_id AND t1.topic_id <> t2.topic_id
  GROUP BY t1.topic_id
),
topic_counts AS (
  SELECT id1 AS topic_id, works_count, cited_by_count
  FROM openalex.common.topic_hierarchy_staging
  WHERE record_type = 'topic'
)
SELECT
  t.topic_id AS id,
  t.display_name,
  t.summary AS description,
  -- keywords is stored as a '; '-separated string. A NULL source yields an
  -- empty array, like every other array column; the content hash already
  -- treats NULL and [] identically, so this does not disturb updated_date.
  COALESCE(TRANSFORM(SPLIT(t.keywords, '; '), x -> TRIM(x)), ARRAY()) AS keywords,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/T', t.topic_id),
    'wikipedia', t.wikipedia_url
  ) AS ids,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/subfields/', s.subfield_id),
    'display_name', s.display_name
  ) AS subfield,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/fields/', f.field_id),
    'display_name', f.display_name
  ) AS field,
  NAMED_STRUCT(
    'id', CONCAT('https://openalex.org/domains/', d.domain_id),
    'display_name', d.display_name
  ) AS domain,
  COALESCE(tsib.siblings, ARRAY()) AS siblings,
  COALESCE(tc.works_count, 0) AS works_count,
  -- Kept as BIGINT (the staging type); the previous narrowing cast to INT
  -- served no purpose and could raise under ANSI mode or wrap on overflow.
  COALESCE(tc.cited_by_count, CAST(0 AS BIGINT)) AS cited_by_count,
  CONCAT('https://api.openalex.org/works?filter=topics.id:T', t.topic_id) AS works_api_url,
  CAST(NULL AS TIMESTAMP) AS updated_date,
  DATE_TRUNC('SECOND', t.created_date) AS created_date
FROM openalex.common.topics t
JOIN openalex.common.subfields s ON t.subfield_id = s.subfield_id
JOIN openalex.common.fields f ON t.field_id = f.field_id
JOIN openalex.common.domains d ON t.domain_id = d.domain_id
LEFT JOIN topic_siblings tsib ON t.topic_id = tsib.topic_id
LEFT JOIN topic_counts tc ON t.topic_id = tc.topic_id

In [ ]:
-- Topics: compare hashes and set updated_date only when content changed.
-- new_hashes recomputes the content hash on the freshly rebuilt table; it is
-- compared with the pre-rebuild snapshot in topics_api_hash:
--   * same hash and a known previous timestamp -> keep the previous updated_date
--   * anything else (new id, changed content, or a previous updated_date that
--     is NULL)                                 -> stamp the current time
-- The NULL case matters after an interrupted run: the rebuild writes NULL
-- updated_date, so the next snapshot would otherwise carry that NULL forward
-- for every unchanged row indefinitely.
-- The hashed column list must stay identical to the snapshot cell's.
WITH new_hashes AS (
  SELECT id,
    xxhash64(CONCAT_WS('|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(TO_JSON(keywords), '[]'),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(subfield), '{}'),
      COALESCE(TO_JSON(field), '{}'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    )) AS content_hash
  FROM openalex.common.topics_api
)
MERGE INTO openalex.common.topics_api AS target
USING (
  SELECT n.id,
    CASE
      WHEN p.id IS NOT NULL
       AND p.updated_date IS NOT NULL
       AND n.content_hash = p.content_hash
      THEN p.updated_date
      ELSE DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
    END AS new_updated_date
  FROM new_hashes n
  LEFT JOIN openalex.common.topics_api_hash p ON n.id = p.id
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date

In [ ]:
-- Subfields: compare hashes and set updated_date only when content changed.
-- new_hashes recomputes the content hash on the freshly rebuilt table; it is
-- compared with the pre-rebuild snapshot in subfields_api_hash:
--   * same hash and a known previous timestamp -> keep the previous updated_date
--   * anything else (new id, changed content, or a previous updated_date that
--     is NULL)                                 -> stamp the current time
-- The NULL case matters after an interrupted run: the rebuild writes NULL
-- updated_date, so the next snapshot would otherwise carry that NULL forward
-- for every unchanged row indefinitely.
-- The hashed column list must stay identical to the snapshot cell's.
WITH new_hashes AS (
  SELECT id,
    xxhash64(CONCAT_WS('|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(field), '{}'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(topics), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    )) AS content_hash
  FROM openalex.common.subfields_api
)
MERGE INTO openalex.common.subfields_api AS target
USING (
  SELECT n.id,
    CASE
      WHEN p.id IS NOT NULL
       AND p.updated_date IS NOT NULL
       AND n.content_hash = p.content_hash
      THEN p.updated_date
      ELSE DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
    END AS new_updated_date
  FROM new_hashes n
  LEFT JOIN openalex.common.subfields_api_hash p ON n.id = p.id
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date

In [ ]:
-- Fields: compare hashes and set updated_date only when content changed.
-- new_hashes recomputes the content hash on the freshly rebuilt table; it is
-- compared with the pre-rebuild snapshot in fields_api_hash:
--   * same hash and a known previous timestamp -> keep the previous updated_date
--   * anything else (new id, changed content, or a previous updated_date that
--     is NULL)                                 -> stamp the current time
-- The NULL case matters after an interrupted run: the rebuild writes NULL
-- updated_date, so the next snapshot would otherwise carry that NULL forward
-- for every unchanged row indefinitely.
-- The hashed column list must stay identical to the snapshot cell's.
WITH new_hashes AS (
  SELECT id,
    xxhash64(CONCAT_WS('|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(domain), '{}'),
      COALESCE(TO_JSON(subfields), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    )) AS content_hash
  FROM openalex.common.fields_api
)
MERGE INTO openalex.common.fields_api AS target
USING (
  SELECT n.id,
    CASE
      WHEN p.id IS NOT NULL
       AND p.updated_date IS NOT NULL
       AND n.content_hash = p.content_hash
      THEN p.updated_date
      ELSE DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
    END AS new_updated_date
  FROM new_hashes n
  LEFT JOIN openalex.common.fields_api_hash p ON n.id = p.id
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date

In [ ]:
-- Domains: compare hashes and set updated_date only when content changed.
-- new_hashes recomputes the content hash on the freshly rebuilt table; it is
-- compared with the pre-rebuild snapshot in domains_api_hash:
--   * same hash and a known previous timestamp -> keep the previous updated_date
--   * anything else (new id, changed content, or a previous updated_date that
--     is NULL)                                 -> stamp the current time
-- The NULL case matters after an interrupted run: the rebuild writes NULL
-- updated_date, so the next snapshot would otherwise carry that NULL forward
-- for every unchanged row indefinitely.
-- The hashed column list must stay identical to the snapshot cell's.
WITH new_hashes AS (
  SELECT id,
    xxhash64(CONCAT_WS('|',
      CAST(id AS STRING),
      COALESCE(display_name, ''),
      COALESCE(description, ''),
      COALESCE(ids.openalex, ''),
      COALESCE(ids.wikidata, ''),
      COALESCE(ids.wikipedia, ''),
      COALESCE(TO_JSON(display_name_alternatives), '[]'),
      COALESCE(TO_JSON(fields), '[]'),
      COALESCE(TO_JSON(siblings), '[]'),
      COALESCE(CAST(works_count AS STRING), '0'),
      COALESCE(CAST(cited_by_count AS STRING), '0')
    )) AS content_hash
  FROM openalex.common.domains_api
)
MERGE INTO openalex.common.domains_api AS target
USING (
  SELECT n.id,
    CASE
      WHEN p.id IS NOT NULL
       AND p.updated_date IS NOT NULL
       AND n.content_hash = p.content_hash
      THEN p.updated_date
      ELSE DATE_TRUNC('SECOND', CURRENT_TIMESTAMP())
    END AS new_updated_date
  FROM new_hashes n
  LEFT JOIN openalex.common.domains_api_hash p ON n.id = p.id
) AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET target.updated_date = source.new_updated_date

In [ ]:
-- Sanity check: row count of each rebuilt *_api table, one result row per
-- entity type (compare against the approximate sizes noted in the build cells).
SELECT
  'topics' AS entity,
  COUNT(*) AS count
FROM openalex.common.topics_api
UNION ALL
SELECT
  'subfields' AS entity,
  COUNT(*) AS count
FROM openalex.common.subfields_api
UNION ALL
SELECT
  'fields' AS entity,
  COUNT(*) AS count
FROM openalex.common.fields_api
UNION ALL
SELECT
  'domains' AS entity,
  COUNT(*) AS count
FROM openalex.common.domains_api

In [ ]:
-- Clean up staging table: topic_hierarchy_staging is only an intermediate for
-- the four *_api builds in this notebook. IF EXISTS keeps this cell safe to
-- re-run when the table has already been dropped.
DROP TABLE IF EXISTS openalex.common.topic_hierarchy_staging