In [0]:
-- Count the distinct work_ids in work_topic that also exist as an id in openalex_works,
-- formatted with thousands separators.
-- Observed result: 216,437,323 work_id matches in openalex_works (Sept)
SELECT
  format_number(COUNT(DISTINCT wt.work_id), 0) AS works_with_topics_count
FROM openalex.works.work_topic AS wt
WHERE EXISTS (
  SELECT 1
  FROM openalex.works.openalex_works AS w
  WHERE w.id = wt.work_id
)

In [0]:
-- Total rows in work_topic.
-- Observed: 550,011,744 before merge_into_id, after 587,365,294
SELECT COUNT(*)
FROM openalex.works.work_topic

In [0]:
-- Rebuild openalex.works.work_topic from openalex.mid.work_topic joined to openalex.mid.work.
--   * direct_topics:   works with merge_into_id IS NULL keep their own topic rows.
--   * indirect_topics: works with merge_into_id IS NOT NULL take the topic rows of the
--                      work they point to, but stay keyed by their own paper_id.
-- The two branches are disjoint on work_id (NULL vs. NOT NULL merge_into_id), so
-- UNION ALL is used and no de-duplication is needed between them.
CREATE OR REPLACE TABLE openalex.works.work_topic
CLUSTER BY (work_id, topic_id) AS
-- Direct topics: works where merge_into_id IS NULL
WITH direct_topics AS (
  SELECT
    wt.paper_id AS work_id,
    wt.topic_id,
    wt.score,
    wt.topic_rank,
    wt.algorithm_version,
    -- Single quotes: a double-quoted token is only a string literal when
    -- ANSI double-quoted identifiers are disabled.
    'mid.work_topic' AS source,
    wt.updated_date AS updated_datetime,
    current_timestamp() AS created_datetime
  FROM openalex.mid.work_topic AS wt
  INNER JOIN openalex.mid.work AS w
    ON wt.paper_id = w.paper_id
  WHERE w.merge_into_id IS NULL
),
-- Indirect topics: works where merge_into_id IS NOT NULL
-- Get topics from merge_into_id, keep assigned to original paper_id
indirect_topics AS (
  SELECT
    w.paper_id AS work_id,
    wt.topic_id,
    wt.score,
    wt.topic_rank,
    wt.algorithm_version,
    'mid.work_topic' AS source,
    wt.updated_date AS updated_datetime,
    current_timestamp() AS created_datetime
  FROM openalex.mid.work AS w
  INNER JOIN openalex.mid.work_topic AS wt
    ON wt.paper_id = w.merge_into_id
  WHERE w.merge_into_id IS NOT NULL
)
-- Explicit column lists so the UNION ALL (which matches by position) cannot be
-- silently misaligned if one CTE is edited later.
SELECT
  work_id,
  topic_id,
  score,
  topic_rank,
  algorithm_version,
  source,
  updated_datetime,
  created_datetime
FROM direct_topics
UNION ALL
SELECT
  work_id,
  topic_id,
  score,
  topic_rank,
  algorithm_version,
  source,
  updated_datetime,
  created_datetime
FROM indirect_topics

### Create `backfill` table

In [0]:
-- Copy the domain / field / subfield lookup tables from the openalex_postgres.mid
-- schema into openalex.common. The three statements are independent of each other.
CREATE OR REPLACE TABLE openalex.common.domains AS
SELECT * FROM openalex_postgres.mid.domain;

CREATE OR REPLACE TABLE openalex.common.fields AS
SELECT * FROM openalex_postgres.mid.field;

CREATE OR REPLACE TABLE openalex.common.subfields AS
SELECT * FROM openalex_postgres.mid.subfield;

-- Disabled: openalex.common.topics is not refreshed by this cell.
--CREATE OR REPLACE TABLE openalex.common.topics AS
--SELECT * FROM openalex_postgres.mid.topic;


In [0]:
-- Total rows currently in work_topic.
SELECT COUNT(*)
FROM openalex.works.work_topic

In [0]:
-- Ad hoc preview of every column of the backfill table.
SELECT backfill.*
FROM openalex.works.work_topics_backfill AS backfill

In [0]:
-- Total rows in the backfill table.
-- Observed: 228,083,708 (before); 240,613,815 (after) + 12.5M
SELECT COUNT(*)
FROM openalex.works.work_topics_backfill

In [0]:
    -- {
    --   "id": "https://openalex.org/T10602",
    --   "display_name": "Glycosylation and Glycoproteins Research",
    --   "score": 0.9678,
    --   "subfield": {
    --     "id": "https://openalex.org/subfields/1312",
    --     "display_name": "Molecular Biology"
    --   },
    --   "field": {
    --     "id": "https://openalex.org/fields/13",
    --     "display_name": "Biochemistry, Genetics and Molecular Biology"
    --   },
    --   "domain": {
    --     "id": "https://openalex.org/domains/1",
    --     "display_name": "Life Sciences"
    --   }
    -- }
-- Build one row per work_id carrying a `topics` array of structs
-- {id, display_name, score, subfield, field, domain}, ordered by score descending.
-- Element 0 of that array is what downstream code reads as the primary topic, so the
-- ordering must be deterministic.
CREATE OR REPLACE TABLE openalex.works.work_topics_backfill
CLUSTER BY (work_id)
AS
WITH topics_metadata AS (
  -- One row per topic with its subfield / field / domain as {id, display_name} structs.
  -- Inner joins: a topic with no matching subfield, field or domain row is dropped,
  -- and so are its work_topic rows in the final join below.
  SELECT
    t.topic_id,
    t.display_name,
    NAMED_STRUCT(
      'id', concat('https://openalex.org/subfields/', s.subfield_id),
      'display_name', s.display_name
    ) AS subfield,
    NAMED_STRUCT(
      'id', concat('https://openalex.org/fields/', f.field_id),
      'display_name', f.display_name
    ) AS field,
    NAMED_STRUCT(
      'id', concat('https://openalex.org/domains/', d.domain_id),
      'display_name', d.display_name
    ) AS domain
  FROM openalex.common.topics AS t
  INNER JOIN openalex.common.subfields AS s USING (subfield_id)
  INNER JOIN openalex.common.fields AS f USING (field_id)
  INNER JOIN openalex.common.domains AS d USING (domain_id)
)
SELECT
  wt.work_id,
  array_sort(
    array_agg(
      NAMED_STRUCT(
        'id', concat('https://openalex.org/T', topic_id),
        'display_name', tm.display_name,
        'score', wt.score,
        'subfield', tm.subfield,
        'field', tm.field,
        'domain', tm.domain
      )
    ),
    -- Comparator: highest score first, NULL scores last, ties broken by topic id.
    -- Previously a NULL score compared as "equal" to every other score and equal
    -- scores had no tie-break, so the order (and therefore topics[0]) was not
    -- well defined.
    (a, b) -> CASE
      WHEN a.score > b.score THEN -1
      WHEN a.score < b.score THEN 1
      WHEN a.score IS NULL AND b.score IS NOT NULL THEN 1
      WHEN a.score IS NOT NULL AND b.score IS NULL THEN -1
      WHEN a.id < b.id THEN -1
      WHEN a.id > b.id THEN 1
      ELSE 0
    END
  ) AS topics,
  -- max() instead of first(): same value when all rows of a work share one source,
  -- and deterministic when they do not.
  max(wt.source) AS source,
  max(wt.created_datetime) AS created_datetime,
  max(wt.updated_datetime) AS updated_datetime
FROM openalex.works.work_topic AS wt
INNER JOIN topics_metadata AS tm USING (topic_id)
GROUP BY wt.work_id
-- enriched_work_topics AS (
--   SELECT
--     work_id,
--     array_sort(
--       array_agg(
--         struct(
--           concat('https://openalex.org/T',topic_id) AS id,
--           wikidata,
--           display_name,
--           level,
--           score
--         )
--       ),
--       (left, right) -> CASE
--         WHEN left.score > right.score THEN -1
--         WHEN left.score < right.score THEN 1
--         ELSE 0
--       END
--     ) AS topics,
--     first(source) as source,
--     max(updated_datetime) as updated_datetime,
--     current_timestamp() as created_datetime
--   FROM openalex.works.work_topic
--   JOIN topic_metadata USING (concept_id)
--   GROUP BY work_id
-- )
-- SELECT * FROM enriched_work_topics

In [0]:
-- Materialize, per topic_id, a `topic_struct` with the same field layout as the
-- elements of the works `topics` array. `score` is a NULL DOUBLE placeholder here.
-- Topics without a matching subfield, field or domain row are excluded (inner joins).
CREATE OR REPLACE TABLE openalex.common.topics_structured
AS
SELECT
  topic_id,
  NAMED_STRUCT(
    'id', concat('https://openalex.org/T', topic.topic_id),
    'display_name', topic.display_name,
    'score', CAST(NULL AS DOUBLE),  -- placeholder
    'subfield', NAMED_STRUCT(
      'id', concat('https://openalex.org/subfields/', sub.subfield_id),
      'display_name', sub.display_name
    ),
    'field', NAMED_STRUCT(
      'id', concat('https://openalex.org/fields/', fld.field_id),
      'display_name', fld.display_name
    ),
    'domain', NAMED_STRUCT(
      'id', concat('https://openalex.org/domains/', dom.domain_id),
      'display_name', dom.display_name
    )
  ) AS topic_struct
FROM openalex.common.topics AS topic
INNER JOIN openalex.common.subfields AS sub USING (subfield_id)
INNER JOIN openalex.common.fields AS fld USING (field_id)
INNER JOIN openalex.common.domains AS dom USING (domain_id);


In [0]:
-- Backfill `topics` and `primary_topic` on openalex_works from work_topics_backfill.
-- Only works whose `topics` is still NULL are matched, so populated rows are untouched.
-- Recorded count: 216,437,323 (presumably rows affected by a previous run - confirm)
MERGE INTO openalex.works.openalex_works AS tgt
USING (
  SELECT
    backfill.work_id,
    backfill.topics
  FROM openalex.works.work_topics_backfill AS backfill
) AS src
ON tgt.id = src.work_id
  -- don't force update if fields are populated already
  AND tgt.topics IS NULL
WHEN MATCHED THEN
  UPDATE SET
    tgt.topics = src.topics,
    -- primary topic is the first element of the topics array
    tgt.primary_topic = src.topics[0]

### API example

In [0]:
-- %python
-- # work_id 1775749144
-- API_JSON = """is_paratext": false,
--   "primary_topic": {
--     "id": "https://openalex.org/T10602",
--     "display_name": "Glycosylation and Glycoproteins Research",
--     "score": 0.9678,
--     "subfield": {
--       "id": "https://openalex.org/subfields/1312",
--       "display_name": "Molecular Biology"
--     },
--     "field": {
--       "id": "https://openalex.org/fields/13",
--       "display_name": "Biochemistry, Genetics and Molecular Biology"
--     },
--     "domain": {
--       "id": "https://openalex.org/domains/1",
--       "display_name": "Life Sciences"
--     }
--   },
--   "topics": [
--     {
--       "id": "https://openalex.org/T10602",
--       "display_name": "Glycosylation and Glycoproteins Research",
--       "score": 0.9678,
--       "subfield": {
--         "id": "https://openalex.org/subfields/1312",
--         "display_name": "Molecular Biology"
--       },
--       "field": {
--         "id": "https://openalex.org/fields/13",
--         "display_name": "Biochemistry, Genetics and Molecular Biology"
--       },
--       "domain": {
--         "id": "https://openalex.org/domains/1",
--         "display_name": "Life Sciences"
--       }
--     },
--     {
--       "id": "https://openalex.org/T11399",
--       "display_name": "Muscle metabolism and nutrition",
--       "score": 0.951,
--       "subfield": {
--         "id": "https://openalex.org/subfields/1307",
--         "display_name": "Cell Biology"
--       },
--       "field": {
--         "id": "https://openalex.org/fields/13",
--         "display_name": "Biochemistry, Genetics and Molecular Biology"
--       },
--       "domain": {
--         "id": "https://openalex.org/domains/1",
--         "display_name": "Life Sciences"
--       }
--     },
--     {
--       "id": "https://openalex.org/T14135",
--       "display_name": "Cancer and biochemical research",
--       "score": 0.9502,
--       "subfield": {
--         "id": "https://openalex.org/subfields/1312",
--         "display_name": "Molecular Biology"
--       },
--       "field": {
--         "id": "https://openalex.org/fields/13",
--         "display_name": "Biochemistry, Genetics and Molecular Biology"
--       },
--       "domain": {
--         "id": "https://openalex.org/domains/1",
--         "display_name": "Life Sciences"
--       }
--     }
--   ],
--   "keywords": [],
--   "concepts": ["""

In [0]:
-- ALTER TABLE openalex.works.openalex_works ADD COLUMN primary_topic
--     STRUCT<id STRING, display_name STRING, score FLOAT,
--         subfield STRUCT<id STRING, display_name STRING>,
--         field STRUCT<id STRING, display_name STRING>,
--         domain STRUCT<id STRING, display_name STRING>> AFTER apc_list;
-- ALTER TABLE openalex.works.openalex_works ADD COLUMN topics
--     ARRAY<STRUCT<id STRING, display_name STRING, score FLOAT,
--         subfield STRUCT<id STRING, display_name STRING>,
--         field STRUCT<id STRING, display_name STRING>,
--         domain STRUCT<id STRING, display_name STRING>>> AFTER primary_topic;