In [0]:
-- Preview query: unpack the raw Wikipedia / Wikidata JSON strings stored on each
-- concept row into flat, individually addressable columns.
WITH concepts_json AS (
  -- Keep only concepts that have a wikidata_id and whose merge_into_id is unset,
  -- and isolate the per-concept JSON fragment from each payload.
  SELECT
    -- First element of the `query.pages` array in the Wikipedia payload.
    get_json_object(wikipedia_json, '$.query.pages[0]') AS wikipedia_data,
    -- The object under `entities` is keyed by a name that is not known up front,
    -- so read the first key and build the JSON path to it dynamically.
    get_json_object(
      wikidata_json,
      CONCAT('$.entities.', json_object_keys(get_json_object(wikidata_json, '$.entities'))[0])
    ) AS wikidata_data,
    *
  FROM openalex.common.concepts
  WHERE wikidata_id IS NOT NULL
    AND merge_into_id IS NULL
),
concepts_parsed AS (
  -- Scalar fields pulled out of the Wikipedia fragment; get_json_object yields
  -- NULL for any path that is absent.
  SELECT
    get_json_object(wikipedia_data, '$.terms.description[0]') AS description,
    get_json_object(wikipedia_data, '$.title') AS wikipedia_title,
    get_json_object(wikipedia_data, '$.original.source') AS image_url,
    get_json_object(wikipedia_data, '$.thumbnail.source') AS image_thumbnail_url,
    get_json_object(wikipedia_data, '$.pageprops.wikibase_item') AS raw_wikidata_id,
    -- `*` already carries wikipedia_data and wikidata_data (they are the first two
    -- columns of concepts_json); listing them explicitly as well would emit each
    -- of those column names twice in the result.
    -- TODO: extract international data from wikidata_data
    --       (probably from_json followed by transform_keys).
    -- NOTE(review): if openalex.common.concepts itself has columns named
    -- description / image_url / image_thumbnail_url, `*` duplicates those names
    -- too -- confirm against the table schema.
    *
  FROM concepts_json
)
SELECT * FROM concepts_parsed


In [0]:
-- need legacy.mag_advanced_field_of_study_children,
-- legacy.mag_advanced_field_of_study_extended_attributes

-- Build one (_id, _source) document per concept: usage counts aggregated from the
-- works table, joined to metadata parsed from the concepts table.
WITH exploded AS (
  -- One row per element of each work's `concepts` array.
  SELECT
    id AS work_id,
    cited_by_count,
    explode(concepts) AS concept
  FROM openalex.works.openalex_works
),
-- One row per (work_id, concept.id), so a concept listed more than once on the
-- same work contributes that work's citations only once.
dedup AS (
  SELECT
    work_id,
    cited_by_count,
    concept
  FROM exploded
  QUALIFY row_number() OVER (PARTITION BY work_id, concept.id ORDER BY work_id, concept.id) = 1
),
-- Aggregate per concept id.
-- Grouping on the id alone guarantees a single row (and therefore a single _id)
-- per concept even if the display name embedded in the works differs between
-- rows; max() then picks one of those names deterministically.
aggregated_counts AS (
  SELECT
    concept.id AS concept_id,
    max(concept.display_name) AS display_name,
    count(DISTINCT work_id) AS works_count,
    sum(cited_by_count) AS cited_by_count
  FROM dedup
  GROUP BY concept.id
),
concepts_json AS (
  -- Keep only concepts that have a wikidata_id and whose merge_into_id is unset,
  -- and isolate the per-concept JSON fragment from each payload.
  SELECT
    -- First element of the `query.pages` array in the Wikipedia payload.
    get_json_object(wikipedia_json, '$.query.pages[0]') AS wikipedia_data,
    -- The object under `entities` is keyed by a name that is not known up front,
    -- so read the first key and build the JSON path to it dynamically.
    get_json_object(
      wikidata_json,
      CONCAT('$.entities.', json_object_keys(get_json_object(wikidata_json, '$.entities'))[0])
    ) AS wikidata_data,
    *
  FROM openalex.common.concepts
  WHERE wikidata_id IS NOT NULL
    AND merge_into_id IS NULL
),
concepts_parsed AS (
  -- Scalar fields pulled out of the Wikipedia fragment; get_json_object yields
  -- NULL for any path that is absent.
  SELECT
    get_json_object(wikipedia_data, '$.title') AS wikipedia_title,
    get_json_object(wikipedia_data, '$.original.source') AS image_url,
    get_json_object(wikipedia_data, '$.thumbnail.source') AS image_thumbnail_url,
    get_json_object(wikipedia_data, '$.terms.description[0]') AS description,
    get_json_object(wikipedia_data, '$.pageprops.wikibase_item') AS raw_wikidata_id,
    -- `*` already carries wikidata_data from concepts_json, so it is not listed
    -- a second time here.
    -- TODO: extract international data from wikidata_data
    --       (probably from_json followed by transform_keys).
    *
  FROM concepts_json
)
-- Join the per-concept counts with the parsed concepts metadata.
-- Inner join: only concepts present on both sides produce a document.
SELECT
  concept_id AS _id,
  STRUCT(
    ac.concept_id AS id,
    con.wikidata_id AS wikidata,
    ac.display_name,
    con.level,
    con.description,
    ac.works_count,
    ac.cited_by_count,
    -- NULL fields below are placeholders that this query does not populate.
    NULL AS summary_stats,
    named_struct(
      "openalex", con.concept_id,
      "wikidata", con.wikidata_id,
      -- NOTE(review): the URL is derived from the lower-cased display name, while
      -- the wikipedia_title parsed in concepts_parsed is never used -- confirm
      -- which of the two is the intended source for this link.
      "wikipedia", CONCAT("https://en.wikipedia.org/wiki/",
        replace(lower(ac.display_name), " ", "_")),
      "umls_aui", NULL,
      "umls_cui", NULL,
      "mag", NULL
    ) AS ids,
    con.image_url,
    con.image_thumbnail_url,
    NULL AS international,
    NULL AS ancestors,
    NULL AS related_concepts,
    NULL AS counts_by_year,
    CONCAT("https://api.openalex.org/works?filter=concepts.id:", con.concept_id) AS works_api_url,
    con.updated_date,
    con.created_date
  ) AS _source
FROM aggregated_counts AS ac
INNER JOIN concepts_parsed AS con USING (concept_id)