### Create `openalex_works` table, include curation data

In [0]:
-- Register helper function
-- get_highest_priority_value(all_structs, field_name)
--
-- Returns the `field_value` of the struct with the numerically lowest
-- `priority` among the entries of `all_structs` whose `field_value` is not
-- NULL. Ties keep the entry that appears first in the array. Returns NULL when
-- the array is NULL, empty, or holds only NULL values.
--
-- Parameters:
--   all_structs  array of (field_value, priority) pairs.
--   field_name   not read by the body; kept so existing call sites still bind.
--
-- A NULL priority is ranked after every real priority instead of being
-- discarded, and the first non-NULL value always replaces the empty seed.
-- Previously the seed priority of 999 combined with a strict `<` silently
-- dropped values whose priority was NULL or >= 999, returning NULL even though
-- a value existed.
--
-- NOTE(review): because of IF NOT EXISTS, an already-registered version of
-- this function is not replaced by re-running this statement.
CREATE FUNCTION IF NOT EXISTS get_highest_priority_value(
    all_structs ARRAY<STRUCT<field_value: STRING, priority: INT>>, field_name STRING
  )
  RETURNS STRING
  RETURN
    (
      SELECT
        AGGREGATE(
          FILTER(all_structs, y -> y.field_value IS NOT NULL),
          -- Seed accumulator: "nothing chosen yet" (field_value IS NULL).
          STRUCT(CAST(NULL AS STRING) AS field_value, 999 AS priority),
          (acc, x) -> CASE
            -- Nothing chosen yet: take the first non-NULL value unconditionally.
            WHEN acc.field_value IS NULL THEN x
            -- Otherwise only a strictly better (lower) priority wins;
            -- NULL priorities sort last.
            WHEN COALESCE(x.priority, 2147483647) < COALESCE(acc.priority, 2147483647) THEN x
            ELSE acc
          END
        ).field_value
    );

-- Final pipeline to create openalex_works
-- (Re)creates <catalog>.works.openalex_works, where the catalog name is
-- 'openalex' followed by the :env_suffix notebook parameter.
CREATE
OR REPLACE TABLE identifier(
  'openalex' || :env_suffix || '.works.openalex_works'
) CLUSTER BY AUTO
TBLPROPERTIES (
  'delta.dataSkippingNumIndexedCols' = 36,
  -- deleted-file retention: default is 7 (days, presumably — confirm)
  'delta.deletedFileRetentionDuration' = '60 days',
  -- log retention: default is 30 (days, presumably — confirm)
  'delta.logRetentionDuration' = '60 days'
) AS (
  WITH mat_sources AS (
    -- Source attributes, each enriched with the display names of its linked
    -- institution and publisher (NULL when there is no matching row).
    SELECT
      src.id AS source_id,
      src.display_name,
      src.issn AS issn_l,
      src.issns,
      src.is_in_doaj,
      src.is_core,
      src.publisher AS source_publisher,
      src.publisher_id,
      src.institution_id,
      src.repository_id,
      src.apc_prices,
      src.apc_usd,
      src.type AS source_type,
      inst.display_name AS institution_name,
      pub.display_name AS publisher_name,
      src.is_in_doaj_start_year,
      src.high_oa_rate_start_year,
      src.is_oa_high_oa_rate,
      src.is_fully_open_in_jstage,
      src.doaj_license
    FROM
      openalex.sources.sources AS src
      LEFT JOIN openalex.institutions.institutions AS inst
        ON inst.id = src.institution_id
      LEFT JOIN openalex.publishers.publishers AS pub
        ON pub.id = src.publisher_id
  ),
  priority_table AS (
    -- Pass-through of the provenance priority lookup. `base` joins it on
    -- `provenance` to attach a `priority` to every location row.
    SELECT * FROM openalex.system.priority_table
  ),
  openapc_paid AS (
    -- One row per work: the APC amounts taken from the record with the
    -- greatest `year` when a work has several OpenAPC rows.
    SELECT
      paper_id AS work_id,
      MAX_BY(CAST(apc_in_euro AS DOUBLE), year) AS apc_in_euro,
      MAX_BY(CAST(apc_in_usd AS DOUBLE), year) AS apc_in_usd
    FROM
      openalex.mid.work_openapc
    GROUP BY
      paper_id
  ),
  base AS (
    -- One row per mapped location, enriched with its provenance priority and
    -- source attributes, plus derived open-access flags.
    --
    -- Keeps at most the 10 most recently created rows per
    -- (work_id, provenance) via row_num / QUALIFY.
    --
    -- oa_status codes, using the labels that appear elsewhere in this file:
    --   1 diamond, 2 gold, 3 hybrid, 4 bronze, 5 green, 6 closed.
    --
    -- Several columns (is_in_doaj_raw, is_in_doaj_stg, is_oa_high_rate,
    -- source_is_oa, composite_is_oa, host_type) are referenced by later
    -- expressions in this same SELECT list, so their order matters.
    SELECT
      a.work_id,
      a.provenance,
      a.native_id,
      a.native_id_namespace,
      a.best_doi,
      a.title,
      a.type,
      a.abstract,
      a.referenced_works_count,
      a.referenced_works,
      a.abstract_inverted_index,
      b.priority,
      a.openalex_created_dt,
      a.openalex_updated_dt,
      s.source_id,
      s.display_name,
      s.issn_l,
      s.issns,
      s.is_in_doaj,
      -- Treat an apc_prices array with no first element as missing.
      CASE
        WHEN GET(s.apc_prices, 0) IS NULL THEN NULL
        ELSE s.apc_prices
      END AS apc_prices,
      s.apc_usd,
      s.is_core,
      a.is_oa,
      COALESCE(a.is_oa, FALSE) AS is_oa_raw,
      COALESCE(s.is_in_doaj, FALSE) AS is_in_doaj_raw,
      -- In DOAJ, and published on/after the DOAJ start year when one is known.
      COALESCE(
        is_in_doaj_raw
        AND (
          ISNULL(s.is_in_doaj_start_year)
          OR YEAR(a.published_date) >= s.is_in_doaj_start_year
        ),
        FALSE
      ) AS is_in_doaj_stg,
      -- AND binds tighter than OR: (high OA rate within its start year)
      -- OR fully open in J-STAGE.
      COALESCE(
        s.is_oa_high_oa_rate
        AND (
          ISNULL(s.high_oa_rate_start_year)
          OR YEAR(a.published_date) >= s.high_oa_rate_start_year
        )
        OR s.is_fully_open_in_jstage,
        FALSE
      ) AS is_oa_high_rate,
      (
        is_oa_high_rate
        or is_in_doaj_stg
      ) AS source_is_oa,
      (
        is_oa_raw
        OR source_is_oa
      ) AS composite_is_oa,
      s.is_in_doaj_start_year,
      s.source_type,
      a.source_name,
      a.publisher,
      a.published_date,
      a.volume,
      a.issue,
      a.first_page,
      a.last_page,
      COALESCE(a.language_classification.language, a.language) as language, --prefer fasttext classified language
      a.authors,
      -- Rewrite legacy dx.doi.org hosts to doi.org. The dots are escaped so
      -- that only the literal host name is matched.
      TRANSFORM(
        a.urls,
        x -> STRUCT(
          REGEXP_REPLACE(x.url, 'dx\\.doi\\.org', 'doi.org') AS url,
          x.content_type
        )
      ) AS urls,
      -- Prefer the DOAJ license while the work falls inside the DOAJ window.
      CASE
        WHEN is_in_doaj_stg
        AND s.doaj_license IS NOT NULL THEN s.doaj_license
        ELSE a.license
      END AS license,
      s.institution_name,
      s.publisher_name,
      s.institution_id,
      s.publisher_id,
      a.version,
      -- arXiv records: prefer the first 'html' url as the landing page.
      CASE
        WHEN LOWER(a.native_id) LIKE '%arxiv.org%' THEN COALESCE(
          GET(
            FILTER(a.urls, x -> x.content_type = 'html').url,
            0
          ),
          a.landing_page_url
        )
        ELSE a.landing_page_url
      END AS landing_page_url,
      -- arXiv records: build the PDF url from the third ':'-separated part of
      -- native_id. SPLIT_PART returns '' when that part is absent, so NULLIF
      -- turns it into NULL; CONCAT then yields NULL and the COALESCE falls
      -- back to the recorded pdf_url instead of emitting a url with no id.
      CASE
        WHEN LOWER(a.native_id) LIKE '%arxiv.org%' THEN COALESCE(
          CONCAT(
            'https://arxiv.org/pdf/',
            NULLIF(SPLIT_PART(a.native_id, ':', 3), '')
          ),
          a.pdf_url
        )
        ELSE a.pdf_url
      END AS pdf_url,
      a.is_retracted,
      s.repository_id,
      a.pdf_s3_id,
      a.grobid_s3_id,
      -- Recency rank within (work_id, provenance); filtered by QUALIFY below.
      ROW_NUMBER() OVER (
        PARTITION BY a.work_id,
        a.provenance
        ORDER BY
          a.created_date DESC
      ) AS row_num,
      -- No source => NULL; crossref rows whose source is not a repository are
      -- 'publisher'; everything else is 'repository'.
      CASE
        WHEN s.source_id IS NULL THEN NULL
        WHEN (
          a.provenance = 'crossref'
          AND s.source_type != 'repository'
        ) THEN 'publisher'
        ELSE 'repository'
      END AS host_type,
      CASE
        WHEN composite_is_oa
        AND host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(s.apc_usd) = 0
          AND source_is_oa THEN 1
          WHEN source_is_oa THEN 2
          WHEN a.license IS NOT NULL
          AND a.license != 'publisher-specific-oa' THEN 3
          ELSE 4
        END
        WHEN host_type IS NULL
        AND (
          a.is_oa
          OR composite_is_oa
        ) THEN 2
        WHEN (
          a.is_oa
          OR composite_is_oa
        )
        AND host_type = 'repository' THEN 5
        ELSE 6
      END AS oa_status
    FROM
      identifier(
        'openalex' || :env_suffix || '.works.locations_mapped'
      ) a
      LEFT JOIN priority_table b USING (provenance)
      LEFT JOIN mat_sources s ON a.source_id = s.source_id QUALIFY row_num <= 10
  ),
  -- CURATION BLOCK
  curation_requests_clean AS (
    -- Latest curation request per (normalized DOI, previous URL).
    --
    -- Normalization:
    --   * doi: trimmed and lower-cased BEFORE the doi.org / dx.doi.org prefix
    --     is stripped, so an upper-case or space-padded prefix is removed too;
    --     trimmed once more afterwards for whitespace that followed the prefix.
    --   * previous_url / new_url: trimmed, with blank strings mapped to NULL.
    --     Downstream CTEs branch on prev_url / new_url IS [NOT] NULL and compare
    --     against COALESCE(<url>, ''), so a blank string would otherwise match
    --     every location that has no URL.
    WITH normalized AS (
      SELECT
        TRIM(
          REGEXP_REPLACE(
            LOWER(TRIM(doi)),
            '^https?://(dx\\.)?doi\\.org/',
            ''
          )
        ) AS doi,
        NULLIF(TRIM(previous_url), '') AS prev_url,
        NULLIF(TRIM(new_url), '') AS new_url,
        ingestion_timestamp
      FROM
        openalex.unpaywall.curation_requests
    ),
    ranked AS (
      SELECT
        doi,
        prev_url,
        new_url,
        -- rn = 1 is the most recently ingested request for the key.
        ROW_NUMBER() OVER (
          PARTITION BY doi,
          prev_url
          ORDER BY
            ingestion_timestamp DESC
        ) AS rn
      FROM
        normalized
    )
    SELECT
      doi,
      prev_url,
      new_url
    FROM
      ranked
    WHERE
      rn = 1
  ),
  cr_matches AS (
    -- Base locations targeted by a curation request that names a previous URL:
    -- same DOI, and the previous URL equals the location's pdf_url,
    -- landing_page_url or any entry of urls (scheme- and case-insensitive).
    SELECT
      loc.*,
      req.new_url
    FROM
      base AS loc
      INNER JOIN curation_requests_clean AS req
        ON req.doi = LOWER(loc.best_doi)
        AND req.prev_url IS NOT NULL
    WHERE
      REGEXP_REPLACE(LOWER(COALESCE(loc.pdf_url, '')), '^https?://', '')
        = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
      OR REGEXP_REPLACE(LOWER(COALESCE(loc.landing_page_url, '')), '^https?://', '')
        = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
      OR ARRAY_CONTAINS(
        TRANSFORM(
          loc.urls,
          entry -> REGEXP_REPLACE(LOWER(entry.url), '^https?://', '')
            = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
        ),
        TRUE
      )
  ),
  cr_upserts AS (
    -- Replacement requests (previous URL and new URL both present): re-emit
    -- each matched base location with every occurrence of the previous URL
    -- swapped for the new one. URL comparison ignores scheme and case.
    -- Column order mirrors `base`, since the rows are unioned by position.
    SELECT
      loc.work_id,
      loc.provenance,
      loc.native_id,
      loc.native_id_namespace,
      loc.best_doi,
      loc.title,
      loc.type,
      loc.abstract,
      loc.referenced_works_count,
      loc.referenced_works,
      loc.abstract_inverted_index,
      loc.priority,
      loc.openalex_created_dt,
      loc.openalex_updated_dt,
      loc.source_id,
      loc.display_name,
      loc.issn_l,
      loc.issns,
      loc.is_in_doaj,
      loc.apc_prices,
      loc.apc_usd,
      loc.is_core,
      loc.is_oa,
      loc.is_oa_raw,
      loc.is_in_doaj_raw,
      loc.is_in_doaj_stg,
      loc.is_oa_high_rate,
      loc.source_is_oa,
      loc.composite_is_oa,
      loc.is_in_doaj_start_year,
      loc.source_type,
      loc.source_name,
      loc.publisher,
      loc.published_date,
      loc.volume,
      loc.issue,
      loc.first_page,
      loc.last_page,
      loc.language,
      loc.authors,
      -- urls: swap matching entries, keep the rest untouched
      TRANSFORM(
        loc.urls,
        entry -> STRUCT(
          CASE
            WHEN REGEXP_REPLACE(LOWER(entry.url), '^https?://', '')
              = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
            THEN req.new_url
            ELSE entry.url
          END AS url,
          entry.content_type
        )
      ) AS urls,
      loc.license,
      loc.institution_name,
      loc.publisher_name,
      loc.institution_id,
      loc.publisher_id,
      loc.version,
      -- landing_page_url: replaced when it is the previous URL
      CASE
        WHEN REGEXP_REPLACE(LOWER(COALESCE(loc.landing_page_url, '')), '^https?://', '')
          = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
        THEN req.new_url
        ELSE loc.landing_page_url
      END AS landing_page_url,
      -- pdf_url: replaced when it is the previous URL
      CASE
        WHEN REGEXP_REPLACE(LOWER(COALESCE(loc.pdf_url, '')), '^https?://', '')
          = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
        THEN req.new_url
        ELSE loc.pdf_url
      END AS pdf_url,
      loc.is_retracted,
      loc.repository_id,
      loc.pdf_s3_id,
      loc.grobid_s3_id,
      loc.row_num,
      loc.host_type,
      loc.oa_status
    FROM
      base AS loc
      INNER JOIN curation_requests_clean AS req
        ON LOWER(loc.best_doi) = req.doi
        AND (
          REGEXP_REPLACE(LOWER(COALESCE(loc.pdf_url, '')), '^https?://', '')
            = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
          OR REGEXP_REPLACE(LOWER(COALESCE(loc.landing_page_url, '')), '^https?://', '')
            = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
          OR ARRAY_CONTAINS(
            TRANSFORM(
              loc.urls,
              entry -> REGEXP_REPLACE(LOWER(entry.url), '^https?://', '')
                = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
            ),
            TRUE
          )
        )
    WHERE
      req.prev_url IS NOT NULL
      AND req.new_url IS NOT NULL -- a replacement, not a removal
  ),
  cr_nullify AS (
    -- Removal requests (previous URL present, new URL absent): re-emit each
    -- matched base location with every occurrence of the previous URL set to
    -- NULL. URL comparison ignores scheme and case.
    --
    -- When the removed URL is the location's pdf_url, the location-level OA
    -- evidence is withdrawn: is_oa / is_oa_raw become FALSE and
    -- composite_is_oa falls back to source_is_oa. oa_status follows the same
    -- rule: it becomes 6 (closed) only when the source itself is not OA.
    -- Previously it was forced to 6 unconditionally, which produced rows that
    -- were composite_is_oa = TRUE and "closed" at the same time.
    --
    -- Column order mirrors `base`, since the rows are unioned by position.
    SELECT
      b.work_id,
      b.provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      b.priority,
      b.openalex_created_dt,
      b.openalex_updated_dt,
      b.source_id,
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      /* Update is_oa to false if pdf_url is being nullified */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN false
        ELSE b.is_oa
      END AS is_oa,
      /* Update is_oa_raw to false if pdf_url is being nullified */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN false
        ELSE b.is_oa_raw
      END AS is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      /* When pdf_url is nullified, composite_is_oa depends only on source_is_oa */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN b.source_is_oa
        ELSE b.composite_is_oa
      END AS composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      /* Nullify matching entries of the urls array (content_type is kept) */
      transform(
        b.urls,
        u -> struct(
          CASE
            WHEN regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
            ELSE u.url
          END as url,
          u.content_type
        )
      ) AS urls,
      b.license,
      b.institution_name,
      b.publisher_name,
      b.institution_id,
      b.publisher_id,
      b.version,
      /* landing_page_url: set to null if it is the previous URL */
      CASE
        WHEN regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
        ELSE b.landing_page_url
      END AS landing_page_url,
      /* pdf_url: set to null if it is the previous URL */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
        ELSE b.pdf_url
      END AS pdf_url,
      b.is_retracted,
      b.repository_id,
      b.pdf_s3_id,
      b.grobid_s3_id,
      b.row_num,
      b.host_type,
      /* oa_status: closed (6) only when the pdf_url is nullified AND the
         source is not OA; otherwise the status derived in base still holds */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
        AND NOT b.source_is_oa THEN 6
        ELSE b.oa_status
      END AS oa_status
    FROM
      base AS b
      JOIN curation_requests_clean c ON lower(b.best_doi) = c.doi
      AND (
        regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR array_contains(
          transform(
            b.urls,
            u -> regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
          ),
          true
        )
      )
    WHERE
      c.prev_url IS NOT NULL
      AND c.new_url IS NULL -- nullification, not update
  ),
  cr_mark_oa AS (
    -- "Mark as OA" requests (no previous URL, new URL present) whose new URL
    -- already exists on a base location as pdf_url, landing_page_url or an
    -- entry of urls (scheme- and case-insensitive). The location is re-emitted
    -- flagged open access, with oa_status recomputed.
    -- Column order mirrors `base`, since the rows are unioned by position.
    SELECT
      loc.work_id,
      loc.provenance,
      loc.native_id,
      loc.native_id_namespace,
      loc.best_doi,
      loc.title,
      loc.type,
      loc.abstract,
      loc.referenced_works_count,
      loc.referenced_works,
      loc.abstract_inverted_index,
      loc.priority,
      loc.openalex_created_dt,
      loc.openalex_updated_dt,
      loc.source_id,
      loc.display_name,
      loc.issn_l,
      loc.issns,
      loc.is_in_doaj,
      loc.apc_prices,
      loc.apc_usd,
      loc.is_core,
      TRUE AS is_oa,
      TRUE AS is_oa_raw,
      loc.is_in_doaj_raw,
      loc.is_in_doaj_stg,
      loc.is_oa_high_rate,
      loc.source_is_oa,
      -- forced to TRUE for publisher-hosted rows only; others keep their value
      CASE
        WHEN loc.host_type = 'publisher' THEN TRUE
        ELSE loc.composite_is_oa
      END AS composite_is_oa,
      loc.is_in_doaj_start_year,
      loc.source_type,
      loc.source_name,
      loc.publisher,
      loc.published_date,
      loc.volume,
      loc.issue,
      loc.first_page,
      loc.last_page,
      loc.language,
      loc.authors,
      loc.urls,
      loc.license,
      loc.institution_name,
      loc.publisher_name,
      loc.institution_id,
      loc.publisher_id,
      loc.version,
      loc.landing_page_url,
      loc.pdf_url,
      loc.is_retracted,
      loc.repository_id,
      loc.pdf_s3_id,
      loc.grobid_s3_id,
      loc.row_num,
      loc.host_type,
      -- publisher rows: 1 diamond / 2 gold / 3 hybrid / 4 bronze; all others: 5 green
      CASE
        WHEN loc.host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(loc.apc_usd) = 0
          AND loc.source_is_oa THEN 1
          WHEN loc.source_is_oa THEN 2
          WHEN loc.license IS NOT NULL
          AND loc.license <> 'publisher-specific-oa' THEN 3
          ELSE 4
        END
        ELSE 5
      END AS oa_status
    FROM
      base AS loc
      INNER JOIN curation_requests_clean AS req
        ON LOWER(loc.best_doi) = req.doi
    WHERE
      req.prev_url IS NULL
      AND req.new_url IS NOT NULL
      AND (
        REGEXP_REPLACE(LOWER(COALESCE(loc.pdf_url, '')), '^https?://', '')
          = REGEXP_REPLACE(LOWER(req.new_url), '^https?://', '')
        OR REGEXP_REPLACE(LOWER(COALESCE(loc.landing_page_url, '')), '^https?://', '')
          = REGEXP_REPLACE(LOWER(req.new_url), '^https?://', '')
        OR ARRAY_CONTAINS(
          TRANSFORM(
            loc.urls,
            entry -> REGEXP_REPLACE(LOWER(entry.url), '^https?://', '')
              = REGEXP_REPLACE(LOWER(req.new_url), '^https?://', '')
          ),
          TRUE
        )
      )
  ),
  base_filtered AS (
    -- Base locations that are NOT superseded by a curated copy.
    --
    -- A base row is dropped when, within the same (work_id, provenance), a row
    -- in cr_matches or cr_mark_oa
    --   * is that very row (same row_num, which is unique per
    --     (work_id, provenance) in base), or
    --   * shares a non-NULL pdf_url or landing_page_url with it
    --     (scheme- and case-insensitive).
    --
    -- The URL comparisons deliberately do NOT coalesce NULL to '': doing so
    -- made two rows that both lack a URL compare equal, which removed unrelated
    -- sibling locations that no curation CTE re-emits. For the same reason the
    -- cr_mark_oa filter no longer removes every row of the (work_id, provenance)
    -- pair, only the marked row and rows sharing one of its URLs.
    SELECT
      *
    FROM
      base b
    WHERE
      NOT EXISTS (
        SELECT
          1
        FROM
          cr_matches m
        WHERE
          m.work_id = b.work_id
          AND m.provenance = b.provenance
          AND (
            m.row_num = b.row_num
            OR REGEXP_REPLACE(LOWER(m.pdf_url), '^https?://', '') = REGEXP_REPLACE(LOWER(b.pdf_url), '^https?://', '')
            OR REGEXP_REPLACE(LOWER(m.landing_page_url), '^https?://', '') = REGEXP_REPLACE(LOWER(b.landing_page_url), '^https?://', '')
          )
      )
      AND NOT EXISTS (
        SELECT
          1
        FROM
          cr_mark_oa o
        WHERE
          o.work_id = b.work_id
          AND o.provenance = b.provenance
          AND (
            o.row_num = b.row_num
            OR REGEXP_REPLACE(LOWER(o.pdf_url), '^https?://', '') = REGEXP_REPLACE(LOWER(b.pdf_url), '^https?://', '')
            OR REGEXP_REPLACE(LOWER(o.landing_page_url), '^https?://', '') = REGEXP_REPLACE(LOWER(b.landing_page_url), '^https?://', '')
          )
      )
  ),
  cr_new_locations AS (
    -- New locations for curation requests that add a URL (no previous URL)
    -- which does not already exist on any base location of the work.
    -- Each emitted row copies the work/source metadata of a base row that
    -- shares the DOI, is tagged provenance 'curation' with priority 999, and
    -- is flagged open access.
    -- NOTE(review): the join yields one row per base row with that DOI, so a
    -- work with several base locations produces several identical curation
    -- rows — presumably collapsed by the later landing-page ranking; confirm.
    -- Column order mirrors `base`, since the rows are unioned by position.
    SELECT
      b.work_id,
      'curation' AS provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      999 AS priority,
      b.openalex_created_dt,
      CURRENT_TIMESTAMP() AS openalex_updated_dt,
      b.source_id,
      -- Use existing source
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      TRUE AS is_oa,
      TRUE AS is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      TRUE AS composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      -- Single-entry urls array; the URL is classified as 'pdf' when it
      -- contains '.pdf' or '/pdf/' (case-insensitive), otherwise 'html'.
      ARRAY(
        STRUCT(
          c.new_url AS url,
          CASE
            WHEN LOWER(c.new_url) LIKE '%.pdf%'
            OR LOWER(c.new_url) LIKE '%/pdf/%' THEN 'pdf'
            ELSE 'html'
          END AS content_type
        )
      ) AS urls,
      NULL AS license,
      b.institution_name,
      b.publisher_name,
      b.institution_id,
      b.publisher_id,
      'publishedVersion' AS version,
      -- Set landing_page_url or pdf_url based on URL type
      -- (same pdf/html test as above; exactly one of the two is populated)
      CASE
        WHEN LOWER(c.new_url) LIKE '%.pdf%'
        OR LOWER(c.new_url) LIKE '%/pdf/%' THEN NULL
        ELSE c.new_url
      END AS landing_page_url,
      CASE
        WHEN LOWER(c.new_url) LIKE '%.pdf%'
        OR LOWER(c.new_url) LIKE '%/pdf/%' THEN c.new_url
        ELSE NULL
      END AS pdf_url,
      b.is_retracted,
      b.repository_id,
      CAST(NULL AS STRING) AS pdf_s3_id,
      CAST(NULL AS STRING) AS grobid_s3_id,
      1 AS row_num,
      b.host_type,
      -- NOTE(review): this tests the base row's composite_is_oa, not the TRUE
      -- emitted above, and has no hybrid (3) branch — confirm this is intended.
      CASE
        WHEN b.composite_is_oa
        AND b.host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(b.apc_usd) = 0
          AND b.source_is_oa THEN 1
          WHEN b.source_is_oa THEN 2 -- gold
          ELSE 4
        END -- bronze
        WHEN b.host_type = 'repository' THEN 5
        ELSE 4
      END AS oa_status
    FROM
      curation_requests_clean c
      INNER JOIN base b ON LOWER(b.best_doi) = c.doi
    WHERE
      c.new_url IS NOT NULL
      AND c.prev_url IS NULL -- this indicates a new URL addition, not a replacement
      AND NOT EXISTS (
        -- Ensure this URL doesn't already exist for this work
        -- (checked against pdf_url, landing_page_url and every urls entry of
        -- all base rows of the work, ignoring scheme and case)
        SELECT
          1
        FROM
          base b2
        WHERE
          b2.work_id = b.work_id
          AND (
            REGEXP_REPLACE(
              LOWER(COALESCE(b2.pdf_url, '')),
              '^https?://',
              ''
            ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
            OR REGEXP_REPLACE(
              LOWER(COALESCE(b2.landing_page_url, '')),
              '^https?://',
              ''
            ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
            OR ARRAY_CONTAINS(
              TRANSFORM(
                b2.urls,
                u -> REGEXP_REPLACE(LOWER(u.url), '^https?://', '') = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
              ),
              TRUE
            )
          )
      )
  ),
  base_with_cr AS (
    -- Untouched base locations plus every curated variant. The branches are
    -- combined by column POSITION, so all five inputs must keep the same
    -- column order as `base`.
    SELECT * FROM base_filtered
    UNION ALL
    SELECT * FROM cr_upserts
    UNION ALL
    SELECT * FROM cr_nullify
    UNION ALL
    SELECT * FROM cr_new_locations
    UNION ALL
    SELECT * FROM cr_mark_oa
  ),
  -- deduplicate and rank landing page urls
  base_with_landing_page_rank AS (
    -- Ranks locations that share a landing page within a work; rank 1 is the
    -- preferred one. The landing page key is landing_page_url, else the first
    -- 'html' entry of urls, else ''.
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY
          work_id,
          COALESCE(
            landing_page_url,
            GET(FILTER(urls, entry -> entry.content_type = "html").url, 0),
            ''
          )
        ORDER BY
          -- 1) provenance / version score (lower is better)
          CASE
            WHEN provenance = 'crossref' AND best_doi IS NOT NULL THEN 1
            WHEN provenance = 'crossref' THEN 2
            WHEN version = 'publishedVersion' AND pdf_url IS NOT NULL THEN 3
            WHEN version = 'publishedVersion' THEN 4
            WHEN version = 'acceptedVersion' AND pdf_url IS NOT NULL THEN 5
            WHEN version = 'acceptedVersion' THEN 6
            WHEN version = 'submittedVersion' AND pdf_url IS NOT NULL THEN 7
            WHEN version = 'submittedVersion' THEN 8
            ELSE 9
          END,
          -- 2) preferred hosts, judged on pdf_url when present, else landing_page_url
          CASE
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), "europepmc.org") THEN 1
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), "/pmc/") THEN 2
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), "arxiv") THEN 3
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), ".edu") THEN 4
            ELSE 5
          END,
          -- 3) provenance priority from priority_table
          priority,
          -- 4) repository records only: native_id as the final tie-breaker
          CASE
            WHEN provenance IN ('repo', 'repo_backfill') THEN native_id
          END
      ) AS landing_page_rank
    FROM
      base_with_cr
  ),
  collect_all_values AS (
    -- Collapses the rank-1 location rows into one row per work.
    -- Scalar fields are gathered as arrays of (value, priority) structs so a
    -- later step can pick one value per field; locations are gathered as a
    -- set of structs.
    select
      work_id,
      collect_list(struct(best_doi, priority)) as best_dois,
      collect_list(struct(title, priority)) as titles,
      collect_list(struct(publisher, priority)) as publishers,
      collect_list(struct(abstract, priority)) as abstracts,
      -- union of all locations' reference lists, de-duplicated
      array_distinct(flatten(collect_list(referenced_works))) as referenced_works,
      -- preserve natural order (whatever it may be), sort in JSON
      collect_list(struct(abstract_inverted_index, priority)) as abstract_inverted_indexes,
      collect_list(struct(volume, priority)) as volumes,
      collect_list(struct(issue, priority)) as issues,
      collect_list(struct(first_page, priority)) as first_pages,
      collect_list(struct(last_page, priority)) as last_pages,
      collect_list(struct(language, priority)) as languages,
      collect_list(struct(type, priority)) as types,
      -- date/timestamp candidates: entries with a NULL value are removed here
      filter(
        collect_list(struct(published_date, priority)),
        x -> x.published_date is not null
      ) as published_dates,
      filter(
        collect_list(struct(openalex_created_dt, priority)),
        x -> x.openalex_created_dt is not null
      ) as openalex_created_dts,
      filter(
        collect_list(struct(openalex_updated_dt, priority)),
        x -> x.openalex_updated_dt is not null
      ) as openalex_updated_dts,
      -- distinct (namespace, id) pairs, excluding the 'pmh' namespace
      filter(
        collect_set(struct(native_id_namespace, native_id)),
        x -> lower(x.native_id_namespace) != 'pmh'
      ) as ids,
      -- locations
      collect_set(
        struct(
          -- 'repo_backfill' is reported as 'repo'
          case
            when provenance = 'repo_backfill' then 'repo'
            else provenance
          end as provenance,
          native_id,
          -- same provenance/version score used for landing_page_rank
          case
            when provenance = 'crossref'
            and best_doi is not null then 1 -- publisher with a doi
            when provenance = 'crossref' then 2 -- publisher without a doi
            when version = 'publishedVersion'
            and pdf_url is not null then 3 -- published version with a pdf url
            when version = 'publishedVersion' then 4 -- published version without a pdf url
            when version = 'acceptedVersion'
            and pdf_url is not null then 5 -- accepted version with a pdf url
            when version = 'acceptedVersion' then 6 -- accepted version without a pdf url
            when version = 'submittedVersion'
            and pdf_url is not null then 7 -- submitted version with a pdf url
            when version = 'submittedVersion' then 8 -- submitted version without a pdf url
            else 9
          end as sort_score,
          -- NOTE(review): composite_is_oa is defined in base as
          -- (is_oa_raw OR source_is_oa), so both branches look equivalent
          -- unless a curation CTE has overridden one of the flags — confirm.
          case
            when host_type = 'repository' then (
              is_oa_raw
              OR composite_is_oa
            )
            else composite_is_oa
          end as is_oa,
          -- landing page falls back to the first 'html' entry of urls
          coalesce(
            landing_page_url,
            get(
              filter(urls, x -> x.content_type = "html").url,
              0
            )
          ) as landing_page_url,
          pdf_url,
          -- preferred-host score, judged on pdf_url when present, else landing_page_url
          case
            when contains(
              coalesce(pdf_url, landing_page_url),
              "europepmc.org"
            ) then 1
            when contains(coalesce(pdf_url, landing_page_url), "/pmc/") then 2
            when contains(coalesce(pdf_url, landing_page_url), "arxiv") then 3
            when contains(coalesce(pdf_url, landing_page_url), ".edu") then 4
            else 5
          end as url_sort_score,
          oa_status,
          version = 'publishedVersion' as is_published,
          version in ('acceptedVersion','publishedVersion') as is_accepted,
          -- nested source object; NULL when the location has no source
          CASE 
            WHEN source_id IS NULL THEN NULL
            ELSE struct(
              concat("https://openalex.org/S", source_id) as id,
              display_name AS display_name,
              issn_l,
              issns as issn,
              source_is_oa as is_oa,
              is_in_doaj,
              is_core,
              -- repositories are hosted by an institution (I…), everything else by a publisher (P…)
              case
                when source_type = 'repository' then concat('https://openalex.org/I', institution_id)
                else concat('https://openalex.org/P', publisher_id)
              end as host_organization,
              case
                when source_type = 'repository' then institution_name
                else publisher_name
              end as host_organization_name,
              -- NOTE(review): unlike host_organization above, this also treats
              -- source_type 'metadata' as institution-hosted and requires a
              -- non-NULL institution_id; when publisher_id is NULL the ELSE
              -- branch yields a one-element array holding NULL. Confirm intent.
              IF ((source_type = 'repository' or source_type = 'metadata') and institution_id IS NOT NULL,
                ARRAY(CONCAT('https://openalex.org/I', institution_id)), 
                ARRAY(CONCAT('https://openalex.org/P', publisher_id))
              ) AS host_organization_lineage,
              -- always emitted as an empty string array at this stage
              CAST(ARRAY() AS ARRAY<STRING>) AS host_organization_lineage_names,
              source_type as type
            )
          END as source,
          apc_prices,
          apc_usd,
          license,
          IF(
            license IS NOT NULL,
            CONCAT('https://openalex.org/licenses/', license),
            NULL
          ) as license_id,
          version,
          host_type,
          -- endpoint_id / pmh_id are only populated for repository provenances
          case
            when provenance in ('repo', 'repo_backfill') then repository_id
          end as endpoint_id,
          case
            when provenance in ('repo', 'repo_backfill') then native_id
          end as pmh_id,
          provenance = 'crossref' as is_unpaywall_record,
          pdf_s3_id,
          grobid_s3_id,
          type as location_type,
          cast(openalex_updated_dt as timestamp) as updated
        )
      ) as locations,
      -- TRUE when any contributing location is flagged retracted
      exists(collect_set(is_retracted), x -> x = True) as is_retracted,
      array_contains(collect_set(provenance), 'crossref') as indexed_in_crossref
    from
      base_with_landing_page_rank
    where
      landing_page_rank = 1
    group by
      work_id
  ),
  abstracts_backfill AS (
    -- Backfilled abstracts keyed by work: plain text and inverted index.
    SELECT
      bf_src.work_id,
      bf_src.abstract,
      bf_src.abstract_inverted_index
    FROM
      openalex.abstracts.abstracts_backfill AS bf_src
  ),
  approved_curations AS (
    -- Per work: a map from curated property name to its approved value,
    -- restricted to the 'type' and 'language' properties of 'works' entities.
    -- work_id is entity_id with its first character removed, cast to BIGINT.
    SELECT
      CAST(SUBSTRING(entity_id, 2) AS BIGINT) AS work_id,
      MAP_FROM_ENTRIES(
        COLLECT_LIST(STRUCT(property, property_value))
      ) AS curations
    FROM
      openalex.curations.approved_curations
    WHERE
      status = 'approved'
      AND entity = 'works'
      AND property IN ('type', 'language')
    GROUP BY
      work_id
  ),
  set_fields AS (
    SELECT
      work_id as id,
      get_highest_priority_value(titles, titles.title) AS title,
      CONCAT(
        'https://doi.org/',
        get_highest_priority_value(best_dois, best_dois.best_doi)
      ) AS best_doi,
      get_highest_priority_value(publishers, publishers.publisher) AS publisher,
      get_highest_priority_value(
        concat(
          abstracts,
          CASE
            WHEN bf.abstract IS NOT NULL THEN array(
              named_struct('abstract', bf.abstract, 'priority', 998)
            )
            ELSE array()
          END
        ),
        abstracts.abstract
      ) AS abstract,
      SIZE(referenced_works) AS referenced_works_count,
      referenced_works,
      CASE
        WHEN (
          -- Springer/Elsevier by publisher string or host org id
          (
            publisher IS NOT NULL
            AND lower(publisher) RLIKE '(springer|elsevier)'
          )
          OR exists(
            locations,
            x -> x.source.host_organization IS NOT NULL
            AND get(split(x.source.host_organization, '/'), 4) IN ('P4310320990', 'P4310319965')
          )
        ) -- Allow only if best OA is diamond/gold/hybrid (1/2/3)
        AND coalesce(
          try_element_at(
            transform(
              filter(locations, x -> x.is_oa),
              y -> y.oa_status
            ),
            1
          ),
          0
        ) NOT IN (1, 2, 3) THEN NULL
        ELSE get_highest_priority_value(
          concat(
            abstract_inverted_indexes,
            CASE
              WHEN bf.abstract_inverted_index IS NOT NULL THEN array(
                named_struct(
                  'abstract_inverted_index',
                  bf.abstract_inverted_index,
                  'priority',
                  998
                )
              )
              ELSE array()
            END
          ),
          abstract_inverted_indexes.abstract_inverted_index
        )
      END AS abstract_inverted_index,
      CASE
        WHEN id > 6600000000 THEN TRUE
        ELSE FALSE
      END AS is_xpac,
      get_highest_priority_value(volumes, volumes.volume) AS volume,
      get_highest_priority_value(issues, issues.issue) AS issue,
      get_highest_priority_value(first_pages, first_pages.first_page) AS first_page,
      get_highest_priority_value(last_pages, last_pages.last_page) AS last_page,
      COALESCE(
        ELEMENT_AT(ac.curations, 'language'),
        get_highest_priority_value(languages, languages.language)
      ) AS language,
      COALESCE(
        ELEMENT_AT(ac.curations, 'type'),
        get_highest_priority_value(types, types.type)
      ) AS type,
      TRY_CAST(
        get_highest_priority_value(
          openalex_created_dts,
          openalex_created_dts.openalex_created_dt
        ) AS DATE
      ) AS created_date,
      CAST(
        get_highest_priority_value(
          openalex_updated_dts,
          openalex_updated_dts.openalex_updated_dt
        ) AS TIMESTAMP
      ) AS updated_date,
      TRY_CAST(
        get_highest_priority_value(published_dates, published_dates.published_date) AS DATE
      ) AS publication_date,
      YEAR(publication_date) AS publication_year,
      MAP_FROM_ENTRIES(
        AGGREGATE(
          ids,
          ARRAY(
            NAMED_STRUCT(
              'native_id_namespace',
              'openalex',
              'native_id',
              CONCAT(
                'https://openalex.org/W',
                CAST(work_id AS STRING)
              )
            )
          ),
          (acc, x) -> CASE
            WHEN SIZE(
              FILTER(
                acc,
                y -> y.native_id_namespace = x.native_id_namespace
              )
            ) = 0 THEN acc || ARRAY(x)
            ELSE acc
          END
        )
      ) AS ids,
      ARRAY_SORT(
        locations,
        (x, y) -> IF(
          x.sort_score < y.sort_score,
          -1,
          IF(
            x.sort_score > y.sort_score,
            1,
            IF(
              x.url_sort_score < y.url_sort_score,
              -1,
              IF(x.url_sort_score > y.url_sort_score, 1, 0)
            )
          )
        )
      ) AS locations_sorted,
      authorships,
      (
        is_retracted
        OR lower(title) like 'retracted article%'
        OR lower(title) like 'retracted: %'
      ) AS is_retracted,
      indexed_in_crossref
    FROM
      collect_all_values
      LEFT JOIN identifier(
        'openalex' || :env_suffix || '.works.authors_and_affiliations'
      ) USING (work_id)
      LEFT JOIN abstracts_backfill bf USING (work_id)
      LEFT JOIN approved_curations ac USING (work_id)
  )
  SELECT
    s.id,
    s.best_doi AS doi,
    s.title,
    s.authorships,
    -- @TODO make changes later to calculate them upstream - they will have to rely on Walden logic
    CAST(ARRAY() AS ARRAY<STRING>) AS corresponding_author_ids,
    CAST(ARRAY() AS ARRAY<STRING>) AS corresponding_institution_ids,
    COALESCE(SIZE(ARRAY_DISTINCT(FLATTEN(s.authorships.countries))),0) as countries_distinct_count,
    s.publication_date,
    s.publication_year,
    s.abstract,
    s.abstract IS NOT NULL AS has_abstract,
    s.referenced_works_count,
    s.referenced_works,
    CAST(ARRAY() AS ARRAY < STRING >) AS related_works,
    s.abstract_inverted_index,
    CONCAT(
      'https://api.openalex.org/works?filter=cites:W',
      CAST(s.id AS STRING)
    ) AS cited_by_api_url,
    CAST(0 AS INT) as cited_by_count,
    CAST(
      ARRAY() AS ARRAY < STRUCT < year: INT,
      cited_by_count: INT > >
    ) AS counts_by_year,
    CAST(0.0 AS DOUBLE) as fwci,
    CAST(
      NULL AS STRUCT < value DOUBLE,
      is_in_top_1_percent BOOLEAN,
      is_in_top_10_percent BOOLEAN >
    ) as citation_normalized_percentile,
    CAST(NULL AS STRUCT < min INT, max INT >) as cited_by_percentile_year,
    MAP_FILTER(
      TRANSFORM_VALUES(
        MAP_CONCAT(
          MAP_FILTER(s.ids, (k, v) -> k != 'mag'),
          map('mag', COALESCE(
            IF(id < 4200000000, CAST(id AS STRING), NULL),
            s.ids['mag']
          ))
        ),
        (k, v) -> IF(k = 'pmid', CONCAT('https://pubmed.ncbi.nlm.nih.gov/', v), v)
      ),
      (k, v) -> v IS NOT NULL
    ) as ids,
    -- transform_values(ids, (k, v) -> case when k = 'pmid' then CONCAT('https://pubmed.ncbi.nlm.nih.gov/', v) else v end)
    ARRAY_SORT(
      ARRAY_DISTINCT(
        ARRAY_COMPACT(
          FLATTEN(
            TRANSFORM(
              s.locations_sorted,
              loc -> CASE
                WHEN loc.provenance IN ('crossref', 'pubmed', 'datacite') THEN array(
                  loc.provenance,
                  IF(loc.source.is_in_doaj, 'doaj', NULL)
                )
                WHEN loc.provenance = 'repo'
                AND lower(loc.native_id) like 'oai:arxiv.org%' THEN array('arxiv')
                WHEN loc.provenance = 'repo'
                AND lower(loc.native_id) like 'oai:doaj.org/%' THEN array('doaj')
                WHEN loc.provenance = 'mag'
                AND lower(loc.source.display_name) = 'pubmed' THEN array('pubmed')
                ELSE array()
              END
            )
          )
        )
      )
    ) as indexed_in,
    s.language,
    s.publisher,
    STRUCT(s.volume, s.issue, s.first_page, s.last_page) AS biblio,
    CASE
      WHEN GET(GET(locations_sorted, 0).apc_prices, 0).price IS NOT NULL THEN STRUCT(
        GET(GET(locations_sorted, 0).apc_prices, 0).price AS value,
        GET(GET(locations_sorted, 0).apc_prices, 0).currency,
        GET(locations_sorted, 0).apc_usd AS value_usd
      )
    END AS apc_list,
    CASE
      /* OpenAPC takes precedence whenever present */
      WHEN o.apc_in_euro IS NOT NULL OR o.apc_in_usd IS NOT NULL THEN STRUCT(
        o.apc_in_euro AS value,
        'EUR'         AS currency,
        o.apc_in_usd  AS value_usd
      )
      /* Otherwise: only relevant for gold/hybrid AND if a list price exists */
      WHEN GET(FILTER(s.locations_sorted, x -> x.is_oa), 0).oa_status IN (2, 3)
        AND GET(GET(s.locations_sorted, 0).apc_prices, 0).price IS NOT NULL 
        THEN apc_list
        ELSE NULL
    END AS apc_paid,
    CAST(
      NULL AS STRUCT < id STRING,
      display_name STRING,
      score FLOAT,
      subfield STRUCT < id STRING,
      display_name STRING >,
      field STRUCT < id STRING,
      display_name STRING >,
      domain STRUCT < id STRING,
      display_name STRING > >
    ) AS primary_topic,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT,
      subfield STRUCT < id STRING,
      display_name STRING >,
      field STRUCT < id STRING,
      display_name STRING >,
      domain STRUCT < id STRING,
      display_name STRING > > >
    ) AS topics,
    xxhash64(concat_ws('|', doi, title, abstract, GET(s.locations_sorted, 0).source.display_name)) as topics_key,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT > >
    ) AS keywords,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id BIGINT,
      wikidata STRING,
      display_name STRING,
      level INT,
      score FLOAT > >
    ) AS concepts,
    SIZE(s.locations_sorted) AS locations_count,
    TRANSFORM(
      s.locations_sorted,
      x -> STRUCT(
        COALESCE(x.is_oa, FALSE) AS is_oa,
        x.landing_page_url,
        x.pdf_url,
        x.is_published,
        x.is_accepted,
        x.source,
        x.license,
        x.license_id,
        x.version,
        x.oa_status,
        x.host_type,
        x.endpoint_id,
        x.pmh_id,
        x.is_unpaywall_record,
        x.location_type,
        x.pdf_s3_id,
        x.grobid_s3_id,
        CAST(x.updated AS TIMESTAMP) AS updated,
        x.provenance,
        x.native_id
      )
    ) AS locations,
    GET(s.locations_sorted, 0) AS primary_location,
    GET(FILTER(s.locations_sorted, x -> x.is_oa), 0) AS best_oa_location,
    STRUCT(
      CASE
        WHEN GET(FILTER(s.locations_sorted, x -> x.is_oa), 0) IS NOT NULL
            AND GET(FILTER(s.locations_sorted, x -> x.is_oa), 0).pdf_s3_id IS NOT NULL
        THEN TRUE ELSE FALSE
      END AS pdf,
      CASE
        WHEN GET(FILTER(s.locations_sorted, x -> x.is_oa), 0) IS NOT NULL
            AND GET(FILTER(s.locations_sorted, x -> x.is_oa), 0).grobid_s3_id IS NOT NULL
        THEN TRUE ELSE FALSE
      END AS `grobid_xml`
    ) AS has_content,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT > >
    ) AS sustainable_development_goals,
    CAST(
      ARRAY() AS ARRAY < STRUCT < funder STRING,
      funder_display_name STRING,
      award_id STRING > >
    ) AS grants,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      funder_award_id STRING,
      funder_id STRING,
      funder_display_name STRING,
      doi STRING > >
    ) AS awards,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      ror STRING > >
    ) AS funders,
    STRUCT(
      best_oa_location IS NOT NULL AS is_oa,
      CASE
        WHEN best_oa_location.oa_status = 1 THEN 'diamond'
        WHEN best_oa_location.oa_status = 2 THEN 'gold'
        WHEN best_oa_location.oa_status = 3 THEN 'hybrid'
        WHEN best_oa_location.oa_status = 4 THEN 'bronze'
        WHEN best_oa_location.oa_status = 5 THEN 'green'
        ELSE 'closed'
      END AS oa_status,
      COALESCE(
        best_oa_location.pdf_url,
        best_oa_location.landing_page_url
      ) AS oa_url,
      CAST(NULL AS BOOLEAN) AS any_repository_has_fulltext
    ) AS open_access,
    s.type,
    CAST(NULL as STRING) AS type_crossref,
    s.type = 'paratext' AS is_paratext,    
    s.is_retracted,
    s.indexed_in_crossref,
    s.is_xpac,
    m.mesh_formatted AS mesh,
    CAST(NULL AS STRING) AS fulltext,
    s.created_date,
    s.updated_date
  FROM
    set_fields s
    LEFT JOIN openapc_paid o ON s.id = o.work_id
    LEFT JOIN (
      SELECT
        pmid,
        COLLECT_LIST(
          STRUCT(
            descriptor_ui,
            descriptor_name,
            qualifiers._UI AS qualifier_ui,
            qualifiers._VALUE AS qualifier_name,
            CASE
              WHEN is_major_topic = 'Y' THEN TRUE
              ELSE FALSE
            END AS is_major_topic
          )
        ) AS mesh_formatted
      FROM
        (
          SELECT
            pmid,
            EXPLODE(mesh.MeshHeading) AS mesh_exploded,
            mesh_exploded.DescriptorName._UI AS descriptor_ui,
            mesh_exploded.DescriptorName._VALUE AS descriptor_name,
            EXPLODE_OUTER(
              ARRAYS_ZIP(
                mesh_exploded.QualifierName._UI,
                mesh_exploded.QualifierName._VALUE
              )
            ) AS qualifiers,
            mesh_exploded.DescriptorName._MajorTopicYN AS is_major_topic
          FROM
            (
              SELECT
                FILTER(
                  PubmedData.ArticleIdList.ArticleId,
                  x -> x._IdType = 'pubmed'
                )._VALUE [0] AS pmid,
                MedlineCitation.MeshHeadingList AS mesh
              FROM
                openalex.pubmed.pubmed_items
            )
        )
      GROUP BY
        pmid
    ) m ON s.ids.pmid = m.pmid
);

### End CREATE Table

In [0]:
-- UPDATE identifier('openalex' || :env_suffix || '.works.openalex_works') 
-- SET topics_key = xxhash64(concat_ws('|', doi, title, abstract, primary_location.source.display_name));

In [0]:
-- OPTIMIZE identifier('openalex' || :env_suffix || '.works.openalex_works') FULL;

### `MERGE` other fields

#### Merge citations

In [0]:
-- MERGE: backfill referenced_works from the legacy citation table (openalex.mid.citation).
-- For every work already present in openalex_works, the legacy references are
-- unioned (de-duplicated) into the existing list and the count is recomputed.
WITH prod_ref_works AS (
  -- one row per citing paper with the distinct set of papers it references
  SELECT
    paper_id AS id,
    collect_set(paper_reference_id) AS referenced_works
  FROM openalex.mid.citation
  GROUP BY paper_id
)
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING prod_ref_works AS source
ON target.id = source.id
WHEN MATCHED THEN UPDATE SET
  -- array_union yields NULL when either argument is NULL, so a work without a
  -- reference list is treated as having an empty one; otherwise the legacy
  -- references would be discarded and the count would be derived from NULL.
  referenced_works = array_union(
    COALESCE(target.referenced_works, ARRAY()),
    source.referenced_works
  ),
  referenced_works_count = size(
    array_union(
      COALESCE(target.referenced_works, ARRAY()),
      source.referenced_works
    )
  );

-- Derive citation counts from the reference lists stored in openalex_works and MERGE them back.
-- Original notes: far fewer changes than propagating through locations_mapped and 17 CTEs,
-- no need to select distinct work_id data; runtime about 1 min, updates 67M rows.
WITH citing_edges AS (
  -- one row per (citing work, cited work); citing works dated in the future
  -- (or without a publication year) are left out by the year predicate
  SELECT
    w.id,
    w.publication_year,
    explode(w.referenced_works) AS cited_work_id
  FROM identifier('openalex' || :env_suffix || '.works.openalex_works') AS w
  WHERE w.referenced_works_count > 0
    AND w.publication_year <= year(current_date())
),
yearly_citations AS (
  -- citations received per cited work and citing year
  SELECT
    cited_work_id,
    publication_year,
    count(*) AS cited_by_count
  FROM citing_edges
  GROUP BY
    cited_work_id,
    publication_year
),
per_work_citations AS (
  SELECT
    cited_work_id,
    -- newest year first; only years from 2012 onwards are kept in the breakdown
    filter(
      sort_array(
        collect_list(
          struct(
            publication_year AS year,
            cited_by_count AS cited_by_count
          )
        ),
        false
      ),
      yr -> yr.year >= 2012
    ) AS counts_by_year,
    -- the total still covers every year, including those before 2012
    sum(cited_by_count) AS cited_by_count
  FROM yearly_citations
  GROUP BY cited_work_id
)
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING per_work_citations AS src
  ON tgt.id = src.cited_work_id
WHEN MATCHED THEN UPDATE SET
  tgt.cited_by_count = src.cited_by_count,
  tgt.counts_by_year = src.counts_by_year;



#### Merge full-text

In [0]:
-------- Merge fulltext from PDFs --------
-- Matches PDF fulltext to works by DOI first and by PMH id second, then writes
-- exactly one fulltext per work into openalex_works.fulltext.
WITH pdf_fulltext_for_merge AS (
    -- DOI-based matching: rank the PDF records of each DOI, longest fulltext first
    SELECT
        CONCAT('https://doi.org/', LOWER(FILTER(ids, x -> x.namespace = 'doi')[0].id)) AS doi_normalized,
        NULL AS pmh_id,
        fulltext,
        'doi' AS match_type,
        ROW_NUMBER() OVER (
            PARTITION BY CONCAT('https://doi.org/', LOWER(FILTER(ids, x -> x.namespace = 'doi')[0].id))
            ORDER BY LENGTH(fulltext) DESC
        ) AS rn
    FROM openalex.pdf.pdf_combined
    WHERE SIZE(FILTER(ids, x -> x.namespace = 'doi')) > 0
      AND fulltext IS NOT NULL
      AND TRIM(fulltext) != ''

    UNION ALL

    -- PMH ID-based matching: same ranking, keyed by the PMH id
    SELECT
        NULL AS doi_normalized,
        FILTER(ids, x -> x.namespace = 'pmh')[0].id AS pmh_id,
        fulltext,
        'pmh' AS match_type,
        ROW_NUMBER() OVER (
            PARTITION BY FILTER(ids, x -> x.namespace = 'pmh')[0].id
            ORDER BY LENGTH(fulltext) DESC
        ) AS rn
    FROM openalex.pdf.pdf_combined
    WHERE SIZE(FILTER(ids, x -> x.namespace = 'pmh')) > 0
      AND fulltext IS NOT NULL
      AND TRIM(fulltext) != ''
      -- Only include PMH records that don't have DOIs (to avoid duplicates)
      AND SIZE(FILTER(ids, x -> x.namespace = 'doi')) = 0
),
pdf_fulltext_deduped AS (
    -- keep the top-ranked (longest) fulltext per DOI / PMH id
    SELECT doi_normalized, pmh_id, fulltext, match_type
    FROM pdf_fulltext_for_merge
    WHERE rn = 1
),
works_with_locations AS (
    -- one row per (work, location); only the PMH branch needs location-level rows
    SELECT
        w.id,
        EXPLODE(w.locations) AS location
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works') w
),
matched_fulltext AS (
    -- DOI matches: read the works table directly instead of exploding every
    -- location and then collapsing the rows again with DISTINCT
    SELECT
        w.id AS work_id,
        p.fulltext,
        p.match_type
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works') w
    INNER JOIN pdf_fulltext_deduped p
        ON LOWER(w.doi) = p.doi_normalized
    WHERE p.doi_normalized IS NOT NULL

    UNION ALL

    -- PMH ID matches: a work can match through any of its locations
    SELECT
        w.id AS work_id,
        p.fulltext,
        p.match_type
    FROM works_with_locations w
    INNER JOIN pdf_fulltext_deduped p
        ON w.location.pmh_id = p.pmh_id
    WHERE p.pmh_id IS NOT NULL
      AND w.location.pmh_id IS NOT NULL
),
final_fulltext AS (
    -- Deduplicate in case a work matches on both DOI and PMH, or on several PMH ids.
    -- Prefer DOI matches over PMH matches; within the same match type prefer the
    -- longest fulltext so the pick no longer depends on row arrival order
    -- (candidates of identical length can still tie).
    SELECT
        work_id,
        fulltext,
        ROW_NUMBER() OVER (
            PARTITION BY work_id
            ORDER BY
                CASE WHEN match_type = 'doi' THEN 1 ELSE 2 END,
                LENGTH(fulltext) DESC
        ) AS priority_rn
    FROM matched_fulltext
)
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (SELECT work_id, fulltext FROM final_fulltext WHERE priority_rn = 1) AS source
ON target.id = source.work_id
WHEN MATCHED THEN
UPDATE SET
  target.fulltext = source.fulltext;

#### Merge Concepts

In [0]:
---------- MERGE concepts and keywords from the concepts backfill table --------
-- (per the original note, the backfill holds concepts aggregated and sorted by score)
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING openalex.works.work_concepts_backfill AS src
  ON tgt.id = src.work_id
WHEN MATCHED THEN UPDATE SET
  tgt.concepts = src.concepts,
  -- keywords are kept only when their score is strictly positive
  tgt.keywords = FILTER(src.keywords, kw -> kw.score > 0);

---------- MERGE predicted concepts/keywords, matched on concept_key --------
-- Only works that still have no concepts are touched. The number of keywords kept
-- per work follows a tanh curve driven by how many keywords clear filter_threshold.
-- ============= Tunable parameters =============
DECLARE OR REPLACE VARIABLE filter_threshold FLOAT DEFAULT 0.20;  -- score cutoff for filtering
DECLARE OR REPLACE VARIABLE base_mid         FLOAT DEFAULT 5.0;   -- target median size (bell center)
DECLARE OR REPLACE VARIABLE half_range       FLOAT DEFAULT 6.0;   -- maximum deviation from median (-+ range)
DECLARE OR REPLACE VARIABLE center_size      INT   DEFAULT 7;     -- where the tanh crosses 0 (inflection point)
DECLARE OR REPLACE VARIABLE slope            FLOAT DEFAULT 0.05;  -- steepness of the tanh curve

MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING (
  -- one prediction per concept_key; rows with neither concepts nor keywords are ignored
  SELECT
    concept_key,
    first(concepts_enriched) AS concepts,
    first(keywords) AS keywords
  FROM openalex.works.openalex_works_concepts_predicted
  WHERE size(concepts_enriched) > 0
     OR size(keywords) > 0
  GROUP BY concept_key
) AS src
  ON (tgt.concepts IS NULL OR size(tgt.concepts) = 0)
 AND xxhash64(
       -- sanitize later
       concat_ws(
         '|',
         tgt.title,
         tgt.abstract,
         tgt.primary_location.source.display_name,
         tgt.primary_location.source.type
       )
     ) = src.concept_key
WHEN MATCHED THEN UPDATE SET
  -- too many concepts from the model - up to 130 - so keep the first 40
  tgt.concepts = slice(src.concepts, 1, 40),
  -- keep positive-score keywords, capped at a length between 2 and 12
  tgt.keywords = slice(
    filter(src.keywords, kw -> kw.score > 0),
    1,
    greatest(
      2,
      least(
        12,
        round(
          base_mid
          + half_range * tanh(
              (size(filter(src.keywords, kw -> kw.score > filter_threshold)) - center_size) * slope
            )
        )
      )
    )
  );


#### Merge Topics

In [0]:

-- MERGE from Topics backfill
-- Overwrites topics for every matching work with id below 6600000000; unlike the
-- frontfill merges, it does not check whether topics are already populated.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (
  SELECT
    work_id,
    topics
  FROM openalex.works.work_topics_backfill
) AS source
ON target.id < 6600000000
  AND target.id = source.work_id
WHEN MATCHED THEN
  UPDATE SET
    target.topics = source.topics,
    -- GET returns NULL for an empty or NULL array, whereas the [0] subscript
    -- raises an out-of-bounds error when ANSI mode is enabled
    target.primary_topic = GET(source.topics, 0);

-- MERGE from Topics frontfill
-- Matches on topics_key and only fills works whose topics are still empty.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (
  -- one topics array per key; FIRST picks an arbitrary row when a key repeats
  SELECT
    topics_key,
    FIRST(topics) AS topics
  FROM openalex.works.work_topics_frontfill
  GROUP BY topics_key
) AS source
-- don't force update if topics are populated already
ON target.id > 6600000000 -- speed this up
  AND (target.topics IS NULL OR size(target.topics) = 0)
  AND target.topics_key = source.topics_key
WHEN MATCHED THEN
  UPDATE SET
    target.topics = source.topics,
    -- GET returns NULL for an empty or NULL array, whereas the [0] subscript
    -- raises an out-of-bounds error when ANSI mode is enabled
    target.primary_topic = GET(source.topics, 0);

-- MERGE Topics BY DOI
-- Fills topics for works that still have none, matching on the lower-cased DOI.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (
  SELECT
    LOWER(doi) AS doi,
    FIRST(topics) AS topics
  FROM openalex.works.work_topics_frontfill
  WHERE doi IS NOT NULL
  -- Group on the lower-cased value: a bare `GROUP BY doi` resolves to the raw
  -- column, so DOIs differing only in case would yield several source rows with
  -- the same key and the MERGE would fail on multiple matches for one target row.
  GROUP BY LOWER(doi)
) AS source
-- don't force update if topics are populated already (@TODO - should we revisit this)
-- NOTE(review): target.doi is compared as stored; this assumes it is already lower-case - confirm
ON (target.topics IS NULL OR size(target.topics) = 0)
  AND target.doi = source.doi
WHEN MATCHED THEN
  UPDATE SET
    target.topics = source.topics,
    -- GET returns NULL for an empty or NULL array, whereas the [0] subscript
    -- raises an out-of-bounds error when ANSI mode is enabled
    target.primary_topic = GET(source.topics, 0);

### MERGE fwci and citation percentiles

In [0]:
-- FWCI + cohort percentile (pub+3 within pub_year/subfield_id/work_type)
-- + cited_by_percentile_year (global by eval_year)
-- Computes everything from citation edges (referenced_works); no counts_by_year usage.
-- The final MERGE writes fwci, citation_normalized_percentile and
-- cited_by_percentile_year back into openalex_works.

WITH base AS (  -- candidate works + work_type mapping
  -- work_type: 'conference_article' for articles whose primary source is a conference,
  -- the type itself for article/book/review/book-chapter, NULL for everything else
  -- (NULL work types are dropped later in three_years).
  -- Only works with a primary-topic subfield and a resolvable publication year qualify.
  SELECT
    id AS work_id,
    CASE
      WHEN type = 'article'
           AND primary_location.source.type = 'conference' THEN 'conference_article'
      WHEN type IN ('article', 'book', 'review', 'book-chapter') THEN type
      ELSE NULL
    END AS work_type,
    COALESCE(publication_year, YEAR(publication_date)) AS pub_year,
    primary_topic.subfield.id AS subfield_id
  FROM identifier('openalex' || :env_suffix || '.works.openalex_works')
  WHERE primary_topic.subfield.id IS NOT NULL
    AND COALESCE(publication_year, YEAR(publication_date)) IS NOT NULL
),

-- All citation edges: (citing_year -> cited_work_id)
-- One row per reference of every work that has references and a publication year
-- no later than the current year.
edges AS (
  SELECT
    w.publication_year AS citing_year,
    EXPLODE(COALESCE(w.referenced_works, ARRAY())) AS cited_work_id
  FROM identifier('openalex' || :env_suffix || '.works.openalex_works') AS w
  WHERE w.referenced_works_count > 0
    AND w.publication_year IS NOT NULL
    AND w.publication_year <= YEAR(CURRENT_DATE())
),

-- Per-work pub+3 citations via edges (join + conditional sum)
-- Counts edges whose citing year lies between pub_year and pub_year + 3 (capped at
-- the current year); the LEFT JOIN keeps uncited works with a count of 0.
three_years AS (
  SELECT
    b.work_id,
    b.subfield_id,
    b.pub_year,
    b.work_type,
    SUM(
      CASE
        WHEN e.citing_year BETWEEN b.pub_year AND LEAST(b.pub_year + 3, YEAR(CURRENT_DATE()))
        THEN 1 ELSE 0
      END
    ) AS pub_plus_3_citations
  FROM base b
  LEFT JOIN edges e
    ON e.cited_work_id = b.work_id
  WHERE b.work_type IS NOT NULL
  GROUP BY b.work_id, b.subfield_id, b.pub_year, b.work_type
),

-- Join cohort statistics keyed by (publication_year, subfield_id, work_type) to
-- compute FWCI and carry the p90/p99 thresholds.
-- fwci is NULL when the cohort row is missing or its mean is not positive.
with_fwci AS (
  SELECT
    t.work_id,
    t.subfield_id,
    t.pub_year,
    t.work_type,
    t.pub_plus_3_citations,
    CASE
      WHEN d.mean_citations IS NULL OR d.mean_citations <= 0 THEN NULL
      ELSE t.pub_plus_3_citations / d.mean_citations
    END AS fwci,
    d.p90_threshold,
    d.p99_threshold
  FROM three_years t
  LEFT JOIN openalex.common.citations_mean_pub_year_type d
    ON d.publication_year = t.pub_year
   AND d.subfield_id      = t.subfield_id
   AND d.work_type        = t.work_type
),

-- Cohort percentile for pub+3 within (pub_year, subfield_id, work_type) + top-1/10 flags
-- work_id is the secondary sort key, so works with equal citation counts still
-- receive distinct ranks. The top-1/10 flags are FALSE when no threshold is available.
with_percentile AS (
  SELECT
    work_id,
    subfield_id,
    pub_year,
    work_type,
    pub_plus_3_citations,
    ROUND(fwci, 8) AS fwci,
    ROUND(
      PERCENT_RANK() OVER (
        PARTITION BY pub_year, subfield_id, work_type
        ORDER BY pub_plus_3_citations, work_id
      ), 8
    ) AS citation_pct_cohort,
    (p99_threshold IS NOT NULL AND pub_plus_3_citations >= p99_threshold) AS is_in_top_1_percent,
    (p90_threshold IS NOT NULL AND pub_plus_3_citations >= p90_threshold) AS is_in_top_10_percent
  FROM with_fwci
),

/* ===== cited_by_percentile_year (global by eval_year), computed from edges ===== */

-- citations received per (cited work, citing year)
by_year AS (
  SELECT
    e.cited_work_id,
    e.citing_year,
    COUNT(*) AS citation_count
  FROM edges e
  GROUP BY e.cited_work_id, e.citing_year
),

-- eval_year = most recent year in which the work was cited, floored at 1920
latest AS (
  SELECT
    cited_work_id AS work_id,
    GREATEST(MAX(citing_year), 1920) AS eval_year
  FROM by_year
  GROUP BY cited_work_id
),

work_counts AS (
  -- citation_count for that work in its eval_year (0 if none)
  SELECT
    l.work_id,
    l.eval_year,
    COALESCE(MAX(CASE WHEN b.citing_year = l.eval_year THEN b.citation_count END), 0) AS citation_count
  FROM latest l
  LEFT JOIN by_year b
    ON b.cited_work_id = l.work_id
  GROUP BY l.work_id, l.eval_year
),

-- percent rank of every distinct citation count observed within a citing year
per_year_dist AS (
  SELECT
    citing_year,
    citation_count,
    PERCENT_RANK() OVER (PARTITION BY citing_year ORDER BY citation_count) AS pct
  FROM (SELECT DISTINCT citing_year, citation_count FROM by_year)
),

-- lower_pct: highest percent rank among counts <= the work's count in its eval_year;
-- upper_pct: lowest percent rank among counts >= the work's count.
-- The inner join drops works whose eval_year has no distribution rows.
bounds AS (
  SELECT
    w.work_id,
    MAX(CASE WHEN d.citation_count <= w.citation_count THEN d.pct END) AS lower_pct,
    MIN(CASE WHEN d.citation_count >= w.citation_count THEN d.pct END) AS upper_pct
  FROM work_counts w
  JOIN per_year_dist d
    ON d.citing_year = w.eval_year
  GROUP BY w.work_id
),

-- Convert the bounds to integer percentages (missing bounds count as 0).
-- 'min' is capped at 99; when both bounds round to the same value, 'min' is
-- lowered by one (never below 0) so the struct still describes a range.
formatted_year_pct AS (
  SELECT
    work_id,
    NAMED_STRUCT(
      'min',
        CASE
          WHEN ROUND(COALESCE(lower_pct, 0) * 100) = 100 THEN 99
          WHEN ROUND(COALESCE(lower_pct, 0) * 100) = ROUND(COALESCE(upper_pct, 0) * 100)
            THEN GREATEST(CAST(ROUND(COALESCE(lower_pct, 0) * 100) AS INT) - 1, 0)
          ELSE CAST(ROUND(COALESCE(lower_pct, 0) * 100) AS INT)
        END,
      'max',
        CASE
          WHEN ROUND(COALESCE(upper_pct, 0) * 100) = 100 THEN 100
          ELSE CAST(ROUND(COALESCE(upper_pct, 0) * 100) AS INT)
        END
    ) AS cited_by_percentile_year
  FROM bounds
),

-- One row per work from with_percentile; cited_by_percentile_year is NULL when the
-- work has no row in formatted_year_pct. Works absent from with_percentile get no update.
updates AS (
  SELECT
    p.work_id,
    p.fwci,
    NAMED_STRUCT(
      'value', p.citation_pct_cohort,
      'is_in_top_1_percent', p.is_in_top_1_percent,
      'is_in_top_10_percent', p.is_in_top_10_percent
    ) AS citation_normalized_percentile,
    y.cited_by_percentile_year
  FROM with_percentile p
  LEFT JOIN formatted_year_pct y
    ON y.work_id = p.work_id
)
-- Preview:
-- SELECT * FROM updates;

-- COALESCE keeps the value already stored on the target whenever the freshly
-- computed value is NULL.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING updates AS source
  ON target.id = source.work_id
WHEN MATCHED
THEN UPDATE SET
  target.fwci = COALESCE(source.fwci, target.fwci),
  target.citation_normalized_percentile =
    COALESCE(source.citation_normalized_percentile, target.citation_normalized_percentile),
  target.cited_by_percentile_year =
    COALESCE(source.cited_by_percentile_year, target.cited_by_percentile_year);

### Merge `sustainable_development_goals`

In [0]:
-- MERGE from SDG backfill
-- Every work with a row in the backfill table gets its
-- sustainable_development_goals overwritten; no "already populated" check is made.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING (
  SELECT
    paper_id,
    sustainable_development_goals
  FROM openalex.works.work_sdg_backfill
) AS src
  ON tgt.id = src.paper_id
WHEN MATCHED THEN
  UPDATE SET tgt.sustainable_development_goals = src.sustainable_development_goals;


### Merge `grants`

In [0]:
-- Rebuild `grants` from the legacy work_funder table: one struct per
-- (funder, award id), sorted, for every work present in openalex_works.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (
  WITH funders_exploded AS (
    -- explode_outer keeps funders that have no award ids (award_id is NULL then)
    SELECT
      paper_id,
      funder_id,
      display_name,
      explode_outer(award_ids) AS award_id
    FROM openalex.mid.work_funder
    INNER JOIN openalex.common.funder USING (funder_id)
  )
  SELECT
    paper_id,
    array_sort(
      collect_list(
        struct(
          -- single-quoted literal, matching the rest of the file; a double-quoted
          -- token is parsed as an identifier when double-quoted identifiers are enabled
          CONCAT('https://openalex.org/F', funder_id) AS funder,
          display_name AS funder_display_name,
          award_id
        )
      )
    ) AS grants
  FROM funders_exploded
  GROUP BY paper_id
) AS source
ON target.id = source.paper_id
WHEN MATCHED THEN UPDATE
  SET target.grants = source.grants;

### Merge `awards`

In [0]:
-- Populate `awards`: the distinct set of award structs per work, with the funder
-- resolved by matching either the funder DOI or the funder ROR id.
-- NOTE(review): the join is an inner join, so awards without a matching funder are
-- dropped and the funder_name fallback only applies when a matched funder has a
-- NULL display_name - confirm this is intended.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING (
  SELECT
    work_id,
    collect_set(
      named_struct(
        'id', CONCAT('https://openalex.org/G', id),
        'funder_award_id', award_id,
        'funder_id', CONCAT('https://openalex.org/F', funder_id),
        'funder_display_name', COALESCE(f.display_name, a.funder_name),
        'doi', doi_url
      )
    ) AS awards
  FROM openalex.works.work_awards AS a
  INNER JOIN openalex.common.funder AS f
    ON a.funder_ids.doi = f.doi
    OR a.funder_ids.ror_id = f.ror_id
  WHERE work_id IS NOT NULL
  GROUP BY work_id
) AS src
  ON tgt.id = src.work_id
WHEN MATCHED THEN
  UPDATE SET tgt.awards = src.awards;

### Merge `funders`

In [0]:
%sql
-- 1) Build rolled-up funders from AWARDS + GRANTS + FULLTEXT (no lateral view; explode in SELECT)
-- The awards/grants inputs are read from the same environment-specific
-- openalex_works table that is merged into below, so a non-empty :env_suffix
-- no longer mixes in rows from the un-suffixed table.
WITH from_awards AS (
  -- distinct non-null funder ids referenced by each work's awards
  SELECT
    id AS work_id,
    explode(
      array_distinct(
        array_compact(
          awards.funder_id
        )
      )
    ) AS funder_id
  FROM identifier('openalex' || :env_suffix || '.works.openalex_works')
  WHERE size(awards) > 0
),
from_awards_enriched AS (
  -- attach ROR and display name; funder ids are stored as full OpenAlex URLs
  SELECT
    a.work_id,
    a.funder_id,
    mf.ror_id AS ror,
    mf.display_name AS display_name
  FROM from_awards a
  LEFT JOIN openalex.mid.funder mf
    ON CONCAT('https://openalex.org/F', mf.funder_id) = a.funder_id
),
from_grants AS (
  -- distinct non-null funder ids referenced by each work's grants
  SELECT
    id AS work_id,
    explode(
      array_distinct(
        array_compact(
          grants.funder
        )
      )
    ) AS funder_id
  FROM identifier('openalex' || :env_suffix || '.works.openalex_works')
  WHERE size(grants) > 0
),
from_grants_enriched AS (
  SELECT
    a.work_id,
    a.funder_id,
    mf.ror_id AS ror,
    mf.display_name AS display_name
  FROM from_grants a
  LEFT JOIN openalex.mid.funder mf
    ON CONCAT('https://openalex.org/F', mf.funder_id) = a.funder_id
),
from_fulltext_enriched AS (
  -- fulltext-derived funders, restricted to names listed in funder_names_keep
  SELECT
    ft.work_id,
    ft.funder_id,
    /* fulltext already has both name & ror */
    ft.ror_id AS ror,
    ft.funder_display_name AS display_name
  FROM openalex.works.fulltext_work_funders ft
  INNER JOIN openalex.common.funder_names_keep keep ON keep.name = ft.funder_name
),
unioned AS (
  SELECT work_id, funder_id, ror, display_name FROM from_awards_enriched
  UNION ALL
  SELECT work_id, funder_id, ror, display_name FROM from_grants_enriched
  UNION ALL
  SELECT work_id, funder_id, ror, display_name FROM from_fulltext_enriched
),
dedup AS (
  -- one row per (work_id, funder_id), pick deterministic values
  SELECT
    work_id,
    funder_id,
    MAX(display_name) AS display_name,
    MAX(ror) AS ror
  FROM unioned
  GROUP BY work_id, funder_id
),
rolled_up AS (
  SELECT
    work_id,
    -- order by funder_id via lexicographic struct ordering (id is first field)
    sort_array(
      collect_list(
        struct(
          funder_id AS id,
          display_name,
          ror
        )
      )
    ) AS funders
  FROM dedup
  GROUP BY work_id
)
-- SELECT * FROM rolled_up;
-- 2) Merge into openalex_works.funders (array<struct<id:string, display_name:string, ror:string>>)
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING rolled_up AS source
  ON target.id = source.work_id
WHEN MATCHED THEN
  UPDATE SET target.funders = source.funders;

### Merge `authorships`

In [0]:
-- Replace authorships and the corresponding-author fields with the moderated
-- backfill, but only when the backfill row carries at least one authorship.
-- NOTE(review): countries_distinct_count is derived from authorships when the table
-- is created and is not refreshed here - confirm it is recomputed elsewhere.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING (
  SELECT
    paper_id AS work_id,
    authorships,
    corresponding_author_ids,
    corresponding_institution_ids
  FROM openalex.authors.work_authorships_backfill_moderated
) AS src
  ON tgt.id = src.work_id
WHEN MATCHED
  AND src.authorships IS NOT NULL
  AND SIZE(src.authorships) > 0
THEN UPDATE SET
  tgt.authorships = src.authorships,
  tgt.corresponding_author_ids = src.corresponding_author_ids,
  tgt.corresponding_institution_ids = src.corresponding_institution_ids;

### Merge `work.type`

In [0]:
-- Backfill `type` / `type_crossref` from the legacy openalex.mid.work table.
-- Works that have an approved 'type' or 'language' curation are left untouched.
-- NOTE(review): a language-only curation also blocks the type backfill - confirm
-- whether only 'type' curations should count here.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS target
USING (
  WITH curated_works AS (
    -- only the existence of a curation matters for this merge, so the ids are
    -- de-duplicated directly instead of building an unused property map
    SELECT DISTINCT
      CAST(SUBSTRING(entity_id, 2) AS BIGINT) AS work_id
    FROM
      openalex.curations.approved_curations
    WHERE
      entity = 'works'
      AND property IN ('type', 'language')
      AND status = 'approved'
  )
  SELECT
    w.paper_id AS work_id,
    w.type,
    w.type_crossref,
    cw.work_id IS NOT NULL AS has_curation
  FROM openalex.mid.work w
  LEFT JOIN curated_works cw ON w.paper_id = cw.work_id
) AS source
ON target.id = source.work_id
WHEN MATCHED
  AND source.type IS NOT NULL
  AND source.has_curation = FALSE
  -- NULL-safe comparison: with `<>` a NULL target type compares as unknown, so
  -- works without any type were never backfilled
  AND target.type IS DISTINCT FROM source.type
THEN UPDATE SET
  target.type = source.type,
  target.type_crossref = source.type_crossref,
  -- is_paratext is defined as type = 'paratext' when the table is created;
  -- keep it consistent with the type written here
  target.is_paratext = (source.type = 'paratext');

### MERGE `related_works`

In [0]:
-- Fill related_works from the backfill table; rows whose list is NULL or empty
-- are skipped, so existing values are never replaced by an empty list.
MERGE INTO identifier('openalex' || :env_suffix || '.works.openalex_works') AS tgt
USING (
  SELECT
    work_id,
    related_works
  FROM openalex.works.related_works_backfill
) AS src
  ON tgt.id = src.work_id
WHEN MATCHED
  AND src.related_works IS NOT NULL
  AND SIZE(src.related_works) > 0
THEN UPDATE SET
  tgt.related_works = src.related_works