In [None]:
%sql
-- Rebuild the SDG frontfill input table: the works that still need SDG inference.
-- Works already present in works_sdg_frontfill (the output table) are removed by the
-- LEFT ANTI JOIN, so only works without an existing output row are queued.
CREATE OR REPLACE TABLE openalex.works.works_sdg_frontfill_input AS
SELECT
  w.id AS work_id,
  w.title,
  w.abstract
FROM openalex.works.openalex_works_base w
LEFT ANTI JOIN openalex.works.works_sdg_frontfill lm
  ON w.id = lm.work_id
WHERE w.type IN ('article', 'book', 'review', 'book-chapter', 'preprint', 'dissertation')
    -- Only process recent works (id > 6600000000)
    AND w.id > 6600000000
    -- A title is required. This was previously implicit: length(NULL) is NULL, so the
    -- character-count check below already dropped every row with a NULL title.
    AND w.title IS NOT NULL
    -- Require at least 25 characters of combined text (title + abstract)
    AND (length(w.title) + coalesce(length(w.abstract), 0)) >= 25
    -- Require at least 5 real words for meaningful SDG classification.
    -- concat_ws skips a NULL abstract instead of appending a dangling separator, the
    -- split is on any run of whitespace, and empty tokens are discarded, so a missing
    -- abstract or repeated spaces can no longer inflate the word count.
    AND array_size(
          filter(
            split(concat_ws(' ', w.title, w.abstract), '\\s+'),
            tok -> tok <> ''
          )
        ) >= 5;

In [None]:
%sql
-- Compact the freshly rebuilt input table and Z-order it on work_id,
-- the column the table is keyed on.
OPTIMIZE openalex.works.works_sdg_frontfill_input
ZORDER BY (work_id);

In [None]:
%sql
-- Sanity check: how many works are queued for SDG inference
-- (formatted with thousands separators and no decimals).
SELECT
  FORMAT_NUMBER(COUNT(*), 0) AS num_works_to_process
FROM
  openalex.works.works_sdg_frontfill_input