### Create `openalex.works.work_references` table (if not exists)

In [0]:
%sql
-- Bootstrap openalex.works.work_references: one row per reference entry of each
-- row in locations_mapped. ref_ind is the position of the entry inside the
-- `references` array, as returned by posexplode.
--
-- The link/parse columns (cited_work_id, normalized_title, parsed_doi,
-- parsed_first_author, parsed_title, title_author) were originally added after
-- this table was first created. They are declared here as typed NULLs so that a
-- freshly created table already has every column that the later INSERT and MERGE
-- cells reference; they are populated by those later steps.
-- IF NOT EXISTS: an already existing table is left untouched by this cell.
CREATE TABLE IF NOT EXISTS openalex.works.work_references
CLUSTER BY AUTO
AS
  WITH references_exploded AS (
    -- posexplode emits no row for a NULL or empty `references` array
    SELECT
      native_id,
      native_id_namespace,
      work_id AS citing_work_id,
      provenance,
      posexplode(references) AS (ref_ind, ref)
    FROM openalex.works.locations_mapped
  )
  SELECT
    native_id,
    native_id_namespace,
    citing_work_id,
    CAST(NULL AS BIGINT) AS cited_work_id,        -- filled in by the MERGE steps
    ref_ind,
    ref.doi AS doi,
    ref.pmid AS pmid,
    ref.arxiv AS arxiv,
    ref.title AS title,
    CAST(NULL AS STRING) AS normalized_title,     -- calculated later
    ref.authors AS authors,
    ref.year AS year,
    ref.raw AS raw,
    CAST(NULL AS STRING) AS parsed_doi,           -- calculated later
    CAST(NULL AS STRING) AS parsed_first_author,  -- calculated later
    CAST(NULL AS STRING) AS parsed_title,         -- calculated later
    CAST(NULL AS STRING) AS title_author,         -- calculated later
    provenance,
    current_timestamp() AS created_timestamp,
    current_timestamp() AS updated_timestamp
  FROM references_exploded


### Insert fresh records for Parsing into `work_references`

In [0]:
%sql
-- Append references for works that are not yet present in work_references.
-- Granularity: the existence check is per citing work (work_id vs. citing_work_id),
-- so a work that already has at least one row is skipped entirely; existing rows
-- are never updated or re-inserted by this cell.
-- Rationale: references don't change often; insert-only avoids data shift issues.
-- To fully re-process a work, delete its references first (see the template at
-- the bottom of this cell), then run this insert.
-- NOTE(review): a NULL work_id never satisfies the equality in the existence
-- check, so such rows would be selected again on every run - confirm whether
-- locations_mapped can contain NULL work_id values.
-- 10/23
    -- 84,848,991 records inserted
    -- 3,758,687,070 total records
    -- 1,526,343,813 cited_work_id is NULL

INSERT INTO openalex.works.work_references (
  native_id,
  native_id_namespace,
  citing_work_id,
  cited_work_id,
  ref_ind,
  doi,
  pmid,
  arxiv,
  title,
  normalized_title,
  authors,
  year,
  raw,
  parsed_doi,
  parsed_first_author,
  parsed_title,
  title_author,
  provenance,
  created_timestamp,
  updated_timestamp
)
WITH unprocessed_locations AS (
  -- locations whose work has no row in work_references yet
  SELECT
    lm.native_id,
    lm.native_id_namespace,
    lm.work_id AS citing_work_id,
    lm.provenance,
    lm.references
  FROM openalex.works.locations_mapped AS lm
  WHERE NOT EXISTS (
    SELECT 1
    FROM openalex.works.work_references AS wr
    WHERE wr.citing_work_id = lm.work_id
  )
)
SELECT
  loc.native_id,
  loc.native_id_namespace,
  loc.citing_work_id,
  CAST(NULL AS BIGINT) AS cited_work_id,        -- calculated later
  exploded.ref_ind,
  exploded.ref.doi AS doi,
  exploded.ref.pmid AS pmid,
  exploded.ref.arxiv AS arxiv,
  exploded.ref.title AS title,
  CAST(NULL AS STRING) AS normalized_title,     -- calculated later
  exploded.ref.authors AS authors,
  exploded.ref.year AS year,
  exploded.ref.raw AS raw,
  CAST(NULL AS STRING) AS parsed_doi,           -- calculated later
  CAST(NULL AS STRING) AS parsed_first_author,  -- calculated later
  CAST(NULL AS STRING) AS parsed_title,         -- calculated later
  CAST(NULL AS STRING) AS title_author,         -- calculated later
  loc.provenance,
  current_timestamp() AS created_timestamp,
  current_timestamp() AS updated_timestamp
FROM unprocessed_locations AS loc
-- one output row per array element; NULL/empty arrays produce no rows
LATERAL VIEW posexplode(loc.references) exploded AS ref_ind, ref;
-- For full re-processing of specific works, run this first:
-- DELETE FROM openalex.works.work_references
-- WHERE citing_work_id IN (SELECT work_id FROM openalex.works.locations_mapped WHERE ...);

### Merge with `work_id_map.doi`

In [0]:
%sql
-- ============================================================
-- STEP 2: Link references to cited works via DOI matching
-- ============================================================
-- Matches references to work_id_map on case-insensitive DOI and fills in
-- cited_work_id (paper_id preferred, id as fallback), pmid and title_author.
-- work_id_map is first collapsed to one row per lower-cased DOI so each
-- reference matches at most one source row.
-- Only references whose cited_work_id is still NULL are updated.
-- 10/23 added 29,913,749 IDs for 84,848,991 new records, recovered 39,496 pmid

MERGE INTO openalex.works.work_references AS refs
USING (
  SELECT
    lower(doi)        AS doi,
    min(paper_id)     AS paper_id,
    min(id)           AS work_id,
    min(pmid)         AS pmid,
    max(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE doi IS NOT NULL
  GROUP BY lower(doi)
) AS doi_map
  ON lower(refs.doi) = doi_map.doi
WHEN MATCHED AND refs.cited_work_id IS NULL THEN
  UPDATE SET
    refs.cited_work_id     = coalesce(doi_map.paper_id, doi_map.work_id),
    -- map values win when present; otherwise the reference keeps its own value
    refs.pmid              = coalesce(doi_map.pmid, refs.pmid),
    refs.title_author      = coalesce(doi_map.title_author, refs.title_author),
    refs.updated_timestamp = current_timestamp();

### Merge with `work_id_map.pmid`

In [0]:
%sql
-- Link references to cited works via PMID matching.
-- Only work_id_map rows that have a PMID but no DOI are used as candidates, and
-- only references whose cited_work_id is still NULL are updated.
-- cited_work_id prefers paper_id and falls back to id.
-- The source is grouped by lower(pmid) - the same expression used as the merge
-- key - so it holds exactly one row per key. Grouping by the raw pmid could emit
-- several rows with the same lower-cased key, which makes MERGE fail when one
-- target row matches multiple source rows.
-- 10/23 added 11,311,131 IDs (where doi was null) for 84,848,991 new records
MERGE INTO openalex.works.work_references AS target
USING (
  SELECT
    lower(pmid) AS pmid,
    MIN(paper_id) AS paper_id,
    MIN(id) AS work_id,
    MAX(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE pmid IS NOT NULL
    AND doi IS NULL -- keep doi is null because it otherwise adds a lot of erroneous refs
  GROUP BY lower(pmid)
) AS source
ON lower(target.pmid) = source.pmid
WHEN MATCHED AND target.cited_work_id IS NULL
THEN UPDATE SET
  target.cited_work_id = COALESCE(source.paper_id, source.work_id),
  target.title_author = COALESCE(source.title_author, target.title_author), -- bring it in if exists
  target.updated_timestamp = current_timestamp();

### Merge with `work_id_map.title_author`

In [0]:
%sql
-- NOTE(review): the notebook heading labels this cell as the merge with
-- work_id_map.title_author, but the statement below is the same PMID-based merge
-- as the preceding cell (same source filter, same join key), and the run note
-- repeats the same count. Nothing here matches on title_author. A
-- title_author-keyed merge probably still needs to be written - confirm the
-- intended join key and filters with the pipeline owner before changing it.
--
-- As written: links references to cited works via PMID, using work_id_map rows
-- that have a PMID but no DOI, and only updates references whose cited_work_id
-- is still NULL. The source is grouped by lower(pmid) (the merge key) so it
-- holds exactly one row per key.
-- 10/23 added 11,311,131 IDs for 84,848,991 new records
MERGE INTO openalex.works.work_references AS target
USING (
  SELECT
    lower(pmid) AS pmid,
    MIN(paper_id) AS paper_id,
    MIN(id) AS work_id,
    MAX(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE pmid IS NOT NULL
    AND doi IS NULL
  GROUP BY lower(pmid)
) AS source
ON lower(target.pmid) = source.pmid
WHEN MATCHED AND target.cited_work_id IS NULL
THEN UPDATE SET
  target.cited_work_id = COALESCE(source.paper_id, source.work_id),
  target.title_author = COALESCE(source.title_author, target.title_author), -- bring it in if exists
  target.updated_timestamp = current_timestamp();

In [0]:
%sql
  -- Parsed locations that carry a non-empty references array.
  -- Rows whose array holds a single entry with doi, pmid, title, arxiv and raw
  -- all NULL are excluded; every other non-empty array is kept.
  SELECT
    provenance,
    native_id,
    native_id_namespace,
    references
  FROM openalex.works.locations_parsed
  WHERE references IS NOT NULL
    AND size(references) > 0
    AND (
      size(references) <> 1
      OR references[0].doi IS NOT NULL
      OR references[0].pmid IS NOT NULL
      OR references[0].title IS NOT NULL
      OR references[0].arxiv IS NOT NULL
      OR references[0].raw IS NOT NULL
    )

In [0]:
%sql
-- Per-provenance summary of reference coverage in locations_parsed.
-- filtered_refs keeps locations with a non-empty references array, excluding
-- arrays made of a single entry whose doi, pmid, title, arxiv and raw are all
-- NULL, and counts the non-NULL values of each field per location.
-- The final select reports, per provenance: number of locations, total
-- references, total non-NULL values per field, and how many locations have at
-- least one non-NULL value for each field.
WITH filtered_refs AS (
  SELECT
    provenance,
    native_id,
    native_id_namespace,
    references,
    -- references.<field> yields an array of that field; array_compact drops NULLs
    size(array_compact(references.doi)) AS num_dois,
    size(array_compact(references.pmid)) AS num_pmids,
    size(array_compact(references.title)) AS num_titles,
    size(array_compact(references.raw)) AS num_raw,
    size(array_compact(references.authors)) AS num_authors
  FROM openalex.works.locations_parsed
  WHERE references IS NOT NULL
    AND size(references) > 0
    AND NOT (
      size(references) = 1
      AND references[0].doi IS NULL
      AND references[0].pmid IS NULL
      AND references[0].title IS NULL
      AND references[0].arxiv IS NULL
      AND references[0].raw IS NULL
    )
)
SELECT
  provenance,
  format_number(count(*), 0) AS total_record_count,
  format_number(sum(size(references)), 0) AS total_reference_count,
  format_number(sum(num_dois), 0) AS total_dois,
  format_number(sum(num_pmids), 0) AS total_pmids,
  format_number(sum(num_titles), 0) AS total_titles,
  format_number(sum(num_authors), 0) AS total_authors,
  format_number(sum(num_raw), 0) AS total_raw_strings,
  format_number(count_if(num_dois > 0), 0) AS has_any_dois,
  format_number(count_if(num_pmids > 0), 0) AS has_any_pmids,
  format_number(count_if(num_titles > 0), 0) AS has_any_titles,
  format_number(count_if(num_authors > 0), 0) AS has_any_authors,
  format_number(count_if(num_raw > 0), 0) AS has_any_raw_strings
FROM filtered_refs
GROUP BY provenance
-- Order by the numeric count, not by output column 2: that column is the
-- format_number() string, which sorts lexicographically ('9,999' > '10,000,000').
ORDER BY count(*) DESC

### Aggregate before `openalex_works`

In [0]:
%sql
-- Rebuild openalex.works.referenced_works: one row per citing work holding the
-- array of its linked cited_work_id values.
-- Only references with a non-NULL cited_work_id are included. Each reference is
-- wrapped as a (ref_ind, cited_work_id) struct so that sorting the collected
-- structs orders the list by ref_ind first; the struct is then reduced to its
-- cited_work_id field.
CREATE OR REPLACE TABLE openalex.works.referenced_works
AS
WITH linked_refs AS (
  SELECT
    citing_work_id,
    STRUCT(ref_ind, cited_work_id) AS ordered_ref
  FROM openalex.works.work_references
  WHERE cited_work_id IS NOT NULL
)
SELECT
  citing_work_id,
  TRANSFORM(
    SORT_ARRAY(COLLECT_LIST(ordered_ref)),
    r -> r.cited_work_id
  ) AS referenced_works
FROM linked_refs
GROUP BY citing_work_id;
-- Compact the rebuilt table and co-locate rows by citing_work_id.
OPTIMIZE openalex.works.referenced_works ZORDER BY (citing_work_id);