# Create Works Base

## Overview

This notebook creates `openalex_works_base`, the foundational table for the works pipeline. It collects and deduplicates data from multiple provenances (crossref, pubmed, datacite, repo, etc.) into a single row per work.

## Data Flow

```
locations_mapped (multiple rows per work, one per provenance)
    |
    v
openalex_works_base (one row per work, deduplicated)
    |
    v
CreateWorksEnriched (adds citations, topics, authorships, etc.)
    |
    v
openalex_works (final output)
```

## Key Transformations

### Field Deduplication
Fields are deduplicated using priority-based selection via `get_highest_priority_value()`:
- Priority order defined in `openalex.system.priority_table`
- For each field, the non-null value from the highest-priority provenance (the lowest `priority` number) is selected

### Locations Collection
All locations (URLs, sources) are collected into an array, deduplicated by landing_page_url, and sorted by:
1. Version (publishedVersion > acceptedVersion > submittedVersion)
2. URL quality score (europepmc > pmc > arxiv > .edu)
3. Provenance priority

### Authorships
Authors are deduplicated by selecting from the highest-priority provenance, then enriched with:
- Legacy affiliations from `works_legacy.raw_affiliation_strings`
- Legacy `is_corresponding` from `works_legacy.work_authors`

### Curation Requests
URL corrections from `openalex.unpaywall.curation_requests` are applied to update/nullify/add locations.

## References Handling

**Important:** This notebook only collects pre-resolved `referenced_works` (work IDs from legacy/MAG data).
It does NOT collect raw `references` data - that is handled separately by `parse_work_references.ipynb`.

See `qa/issues/open/references-pipeline-refactor-2026-01/` for planned improvements to this flow.

### Create `openalex_works_base` table, include curation data

In [None]:
-- Register helper function.
--
-- get_highest_priority_value(all_structs, field_name)
--   all_structs : candidate (field_value, priority) pairs for one field of one work.
--   field_name  : not referenced by the body; kept so existing call sites keep working.
--   returns     : the field_value of the non-null candidate with the smallest priority
--                 number, or NULL when there is no non-null candidate (or the array is NULL).
--
-- The fold starts from an empty seed and always accepts the first non-null candidate,
-- so a candidate whose priority equals the seed's sentinel (999, which is also the
-- priority given to 'curation' rows) or whose priority is NULL (provenance missing from
-- the priority table) can still win when nothing better exists. NULL priorities rank
-- last; ties keep the candidate seen first.
--
-- NOTE: because of IF NOT EXISTS an already-registered function is left untouched;
-- drop it (or run CREATE OR REPLACE once) for a changed body to take effect.
CREATE FUNCTION IF NOT EXISTS get_highest_priority_value(
    all_structs ARRAY<STRUCT<field_value: STRING, priority: INT>>, field_name STRING
  )
  RETURNS STRING
  RETURN
    (
      SELECT
        AGGREGATE(
          -- candidates without a value can never be selected
          FILTER(all_structs, y -> y.field_value IS NOT NULL),
          STRUCT(CAST(NULL AS STRING) AS field_value, 999 AS priority),
          (acc, x) -> CASE
            -- acc still holds the empty seed: take the first real candidate as-is
            WHEN acc.field_value IS NULL THEN x
            -- lower number wins; a NULL priority is treated as the worst possible rank
            WHEN COALESCE(x.priority, 2147483647) < COALESCE(acc.priority, 2147483647) THEN x
            ELSE acc
          END
        ).field_value
    );

-- Build openalex_works_base: one deduplicated row per work (input to CreateWorksEnriched)
CREATE
OR REPLACE TABLE identifier(
  'openalex' || :env_suffix || '.works.openalex_works_base'
) CLUSTER BY (id, updated_date)
TBLPROPERTIES (
  'delta.dataSkippingNumIndexedCols' = 36,
  'delta.deletedFileRetentionDuration' = '60 days',
  -- default is 7
  'delta.logRetentionDuration' = '60 days' -- default is 30
) AS (
  -- mat_sources: source attributes, with the display names of the linked
  -- institution and publisher (and the publisher's parent) joined on.
  -- Both lookups are LEFT JOINs, so a source with no match keeps NULLs there.
  WITH mat_sources AS (
    SELECT
      src.id AS source_id,
      src.display_name,
      src.issn AS issn_l,
      src.issns,
      src.is_in_doaj,
      src.is_core,
      src.publisher AS source_publisher,
      src.publisher_id,
      src.institution_id,
      src.repository_id,
      src.apc_prices,
      src.apc_usd,
      src.type AS source_type,
      inst.display_name AS institution_name,
      pub.display_name AS publisher_name,
      pub.parent_publisher,
      src.is_in_doaj_start_year,
      src.high_oa_rate_start_year,
      src.is_oa_high_oa_rate,
      src.is_fully_open_in_jstage,
      src.doaj_license
    FROM
      openalex.sources.sources AS src
      LEFT JOIN openalex.institutions.institutions AS inst
        ON inst.id = src.institution_id
      LEFT JOIN openalex.publishers.publishers AS pub
        ON pub.id = src.publisher_id
  ),
  priority_table AS (
    -- Pass-through of the provenance priority lookup; `base` joins to it on
    -- `provenance` to pick up the `priority` column.
    SELECT pt.* FROM openalex.system.priority_table AS pt
  ),
  openapc_paid AS (
    -- One row per work. When a work has several OpenAPC rows, both amounts are
    -- taken from the row with the greatest `year`.
    SELECT
      oapc.paper_id AS work_id,
      MAX_BY(CAST(oapc.apc_in_euro AS DOUBLE), oapc.year) AS apc_in_euro,
      MAX_BY(CAST(oapc.apc_in_usd AS DOUBLE), oapc.year) AS apc_in_usd
    FROM
      openalex.mid.work_openapc AS oapc
    GROUP BY
      oapc.paper_id
  ),
  base AS (
    -- One row per locations_mapped row, decorated with its provenance priority and
    -- the attributes of its source, plus derived open-access fields.
    --
    -- The column ORDER here is a contract: the cr_* CTEs reproduce it position by
    -- position and base_with_cr combines them with SELECT * ... UNION ALL.
    --
    -- Several columns are lateral aliases that later columns build on, in this order:
    --   is_oa_raw / is_in_doaj_raw -> is_in_doaj_stg / is_oa_high_rate
    --   -> source_is_oa -> composite_is_oa, and host_type -> oa_status.
    --
    -- oa_status codes: 1 diamond, 2 gold, 3 hybrid, 4 bronze, 5 green, 6 closed.
    SELECT
      a.work_id,
      a.provenance,
      a.native_id,
      a.native_id_namespace,
      a.best_doi,
      a.title,
      a.type,
      a.abstract,
      a.referenced_works_count,
      a.referenced_works,
      a.abstract_inverted_index,
      b.priority,
      a.openalex_created_dt,
      a.openalex_updated_dt,
      s.source_id,
      s.display_name,
      s.issn_l,
      s.issns,
      s.is_in_doaj,
      -- an apc_prices array with no first element is reported as NULL
      CASE
        WHEN GET(s.apc_prices, 0) IS NULL THEN NULL
        ELSE s.apc_prices
      END AS apc_prices,
      s.apc_usd,
      COALESCE(s.is_core, FALSE) AS is_core,
      a.is_oa,
      COALESCE(a.is_oa, FALSE) AS is_oa_raw,
      COALESCE(s.is_in_doaj, FALSE) AS is_in_doaj_raw,
      -- in DOAJ, and published in or after the DOAJ start year (when one is set)
      COALESCE(
        is_in_doaj_raw
        AND (
          ISNULL(s.is_in_doaj_start_year)
          OR YEAR(a.published_date) >= s.is_in_doaj_start_year
        ),
        FALSE
      ) AS is_in_doaj_stg,
      -- high-OA-rate source within its start year, or fully open in J-STAGE
      COALESCE(
        s.is_oa_high_oa_rate
        AND (
          ISNULL(s.high_oa_rate_start_year)
          OR YEAR(a.published_date) >= s.high_oa_rate_start_year
        )
        OR s.is_fully_open_in_jstage,
        FALSE
      ) AS is_oa_high_rate,
      (
        is_oa_high_rate
        or is_in_doaj_stg
      ) AS source_is_oa,
      (
        is_oa_raw
        OR source_is_oa
      ) AS composite_is_oa,
      COALESCE(s.is_in_doaj_start_year, 0) AS is_in_doaj_start_year,
      s.source_type,
      a.source_name,
      a.publisher,
      a.published_date,
      a.volume,
      a.issue,
      a.first_page,
      a.last_page,
      COALESCE(a.language_classification.language, a.language) as language, --prefer fasttext classified language
      a.authors,
      -- rewrite legacy dx.doi.org hosts to doi.org inside the urls array
      TRANSFORM(
        a.urls,
        x -> STRUCT(
          REGEXP_REPLACE(x.url, 'dx.doi.org', 'doi.org') AS url,
          x.content_type
        )
      ) AS urls,
      -- the DOAJ license wins over the location's own license when the DOAJ flag applies
      CASE
        WHEN is_in_doaj_stg
        AND s.doaj_license IS NOT NULL THEN s.doaj_license
        ELSE a.license
      END AS license,
      s.institution_name,
      s.publisher_name,
      s.parent_publisher,
      s.institution_id,
      s.publisher_id,
      a.version,
      -- arXiv rows: prefer the first 'html' entry of urls as the landing page
      CASE
        WHEN LOWER(a.native_id) LIKE '%arxiv.org%' THEN COALESCE(
          GET(
            FILTER(a.urls, x -> x.content_type = 'html').url,
            0
          ),
          a.landing_page_url
        )
        ELSE a.landing_page_url
      END AS landing_page_url,
      -- arXiv rows: build the PDF link from the third ':'-separated part of native_id.
      -- SPLIT_PART yields '' (not NULL) when that part is missing, so NULLIF turns it
      -- into NULL; CONCAT then returns NULL and COALESCE falls back to a.pdf_url
      -- instead of emitting the bare 'https://arxiv.org/pdf/' prefix.
      CASE
        WHEN LOWER(a.native_id) LIKE '%arxiv.org%' THEN COALESCE(
          CONCAT(
            'https://arxiv.org/pdf/',
            NULLIF(SPLIT_PART(a.native_id, ':', 3), '')
          ),
          a.pdf_url
        )
        ELSE a.pdf_url
      END AS pdf_url,
      a.is_retracted,
      a.raw_type,
      s.repository_id,
      a.pdf_s3_id,
      a.grobid_s3_id,
      -- recency rank within (work_id, provenance); the QUALIFY below keeps the newest 10
      ROW_NUMBER() OVER (
        PARTITION BY a.work_id,
        a.provenance
        ORDER BY
          a.created_date DESC
      ) AS row_num,
      -- NULL when no source matched; crossref rows on a non-repository source are
      -- 'publisher'; every other row with a source is 'repository'
      CASE
        WHEN s.source_id IS NULL THEN NULL
        WHEN (
          a.provenance = 'crossref'
          AND s.source_type != 'repository'
        ) THEN 'publisher'
        ELSE 'repository'
      END AS host_type,
      CASE
        WHEN composite_is_oa
        AND host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(s.apc_usd) = 0
          AND source_is_oa THEN 1
          WHEN source_is_oa THEN 2
          WHEN a.license IS NOT NULL
          AND a.license != 'publisher-specific-oa' THEN 3
          ELSE 4
        END
        WHEN host_type IS NULL
        AND (
          a.is_oa
          OR composite_is_oa
        ) THEN 2
        WHEN (
          a.is_oa
          OR composite_is_oa
        )
        AND host_type = 'repository' THEN 5
        ELSE 6
      END AS oa_status
    FROM
      identifier(
        'openalex' || :env_suffix || '.works.locations_mapped'
      ) a
      LEFT JOIN priority_table b USING (provenance)
      LEFT JOIN mat_sources s ON a.source_id = s.source_id
    QUALIFY row_num <= 10
  ),
  -- authors start
  deduplicated_authors_source AS (
    -- Picks, per work, the single non-empty `authors` array coming from the
    -- best-ranked provenance (smallest `priority`).
    --   is_corresponding_exists : TRUE when any author in that array is flagged corresponding
    --   author_count            : number of entries in that array
    -- The priority lookup is a LEFT JOIN, so `priority` is NULL for a provenance
    -- that is missing from the table. ASC ordering sorts NULLs first by default,
    -- which would let such a provenance beat every ranked one; NULLS LAST makes
    -- it a last resort instead.
    SELECT
        work_id,
        authors,
        EXISTS(authors.is_corresponding, x -> x = TRUE) AS is_corresponding_exists,
        ARRAY_SIZE(authors) AS author_count
    FROM identifier('openalex' || :env_suffix || '.works.locations_mapped')
    LEFT JOIN openalex.system.priority_table USING (provenance)
    WHERE authors IS NOT NULL
        AND SIZE(authors) > 0
    QUALIFY ROW_NUMBER() OVER (PARTITION BY work_id ORDER BY priority ASC NULLS LAST) = 1
  ),
  exploded_authors AS (
      -- One row per entry of the chosen `authors` array. original_author_order is
      -- the 0-based position of the entry within that array.
      SELECT
          das.work_id,
          das.author_count,
          das.is_corresponding_exists,
          original_author_order,
          author_data,
          TRIM(author_data.name) AS raw_author_name,
          author_data.orcid AS orcid,
          -- field access on an array of structs: yields the array of affiliation names
          author_data.affiliations.name AS raw_affiliation_strings,
          author_data.is_corresponding AS is_corresponding_from_source
      FROM deduplicated_authors_source AS das
      LATERAL VIEW POSEXPLODE(das.authors) exploded AS original_author_order, author_data
  ),
  -- Legacy affiliation strings (works_legacy.raw_affiliation_strings), gathered
  -- into one array per (work_id, author_sequence); NULL strings are left out.
  legacy_affiliations AS (
      SELECT
          ras.work_id,
          ras.author_sequence,
          COLLECT_LIST(ras.raw_affiliation_string) AS raw_affiliation_strings
      FROM identifier('openalex' || :env_suffix || '.works_legacy.raw_affiliation_strings') AS ras
      WHERE ras.raw_affiliation_string IS NOT NULL
      GROUP BY
          ras.work_id,
          ras.author_sequence
  ),
  -- Legacy per-author corresponding flag (works_legacy.work_authors), exposed
  -- under the name legacy_is_corresponding.
  legacy_work_authors AS (
      SELECT
          lwa.work_id,
          lwa.author_sequence,
          lwa.is_corresponding AS legacy_is_corresponding
      FROM identifier('openalex' || :env_suffix || '.works_legacy.work_authors') AS lwa
  ),
  authors_with_corresponding AS (
      -- Exploded authors joined to their legacy affiliation strings and legacy
      -- corresponding flag by (work_id, position). Authors whose trimmed name is
      -- NULL or empty are dropped.
      -- NOTE(review): the legacy joins equate the 0-based POSEXPLODE position with
      -- legacy author_sequence — confirm that author_sequence is 0-based as well.
      SELECT
          ea.work_id,
          ea.original_author_order,
          ea.author_count,
          ea.raw_author_name,
          ea.orcid,
          ea.is_corresponding_from_source,
          ea.is_corresponding_exists,
          -- source + legacy affiliation strings: NULLs removed, cleaned, de-duplicated
          ARRAY_DISTINCT(
              TRANSFORM(
                  ARRAY_COMPACT(
                      CONCAT(
                          COALESCE(ea.raw_affiliation_strings, ARRAY()),
                          COALESCE(la.raw_affiliation_strings, ARRAY())
                      )
                  ),
                  aff -> TRIM(TRAILING '.' FROM TRIM(REPLACE(aff, '\\n', '')))
              )
          ) AS raw_affiliation_strings,
          -- NULL when the author has no legacy record
          lwa.legacy_is_corresponding,
          -- 1 when any author of the work is corresponding in the legacy data, else 0
          MAX(
              CASE
                  WHEN lwa.legacy_is_corresponding = TRUE THEN 1
                  ELSE 0
              END
          ) OVER (PARTITION BY ea.work_id) AS legacy_has_corresponding,
          -- 1 when the source array flags a corresponding author among the kept rows, else 0
          CASE
              WHEN ea.is_corresponding_exists THEN
                  MAX(
                      CASE
                          WHEN ea.is_corresponding_from_source = TRUE THEN 1
                          ELSE 0
                      END
                  ) OVER (PARTITION BY ea.work_id)
              ELSE 0
          END AS work_has_corresponding
      FROM exploded_authors AS ea
      LEFT JOIN legacy_affiliations AS la
          ON la.work_id = ea.work_id
          AND la.author_sequence = ea.original_author_order
      LEFT JOIN legacy_work_authors AS lwa
          ON lwa.work_id = ea.work_id
          AND lwa.author_sequence = ea.original_author_order
      WHERE ea.raw_author_name IS NOT NULL
          AND TRIM(ea.raw_author_name) <> ''
  ),
  raw_authorships AS (
      -- Folds the per-author rows back into one `authorships` array per work.
      -- Each author is wrapped as (original_author_order, authorship) so the
      -- collected list can be sorted by position; the wrapper is then stripped.
      -- The field order and the explicit CASTs inside the authorship struct fix
      -- the struct's schema; affiliations, id, countries, institutions and
      -- parsed_name are emitted as typed empty/NULL placeholders here.
      SELECT
          work_id,
          TRANSFORM(
              ARRAY_SORT(
                  COLLECT_LIST(
                      STRUCT(
                          original_author_order,
                          STRUCT(
                              CAST(ARRAY() AS ARRAY<STRUCT<institution_ids: ARRAY<STRING>, raw_affiliation_string: STRING>>) AS affiliations,
                              NAMED_STRUCT(
                                  'display_name', TRIM(REPLACE(raw_author_name, '\\n', '')),
                                  'id', CAST(NULL AS STRING),
                                  -- ORCID is emitted as a full https://orcid.org/ URL
                                  'orcid', CASE
                                      WHEN orcid IS NULL THEN NULL
                                      WHEN orcid LIKE 'https://orcid.org/%' THEN orcid
                                      ELSE CONCAT('https://orcid.org/', orcid)
                                  END
                              ) AS author,
                              -- position 0 is 'first'; the position equal to author_count - 1 is 'last'
                              CASE
                                  WHEN original_author_order = 0 THEN 'first'
                                  WHEN original_author_order + 1 = author_count THEN 'last'
                                  ELSE 'middle'
                              END AS author_position,
                              original_author_order AS author_order_number,
                              CAST(ARRAY() AS ARRAY<STRING>) AS countries,
                              CAST(ARRAY() AS ARRAY<STRUCT<country_code: STRING, display_name: STRING, id: STRING, lineage: ARRAY<STRING>, ror: STRING, type: STRING>>) AS institutions,
                              -- precedence: source flag, then legacy flag, then first-author default
                              CASE
                                  -- Source says this author is corresponding
                                  WHEN is_corresponding_from_source = TRUE THEN TRUE
                                  -- Source has corresponding info and someone else is corresponding
                                  WHEN work_has_corresponding = 1 THEN FALSE
                                  -- No source info, use legacy is_corresponding if available
                                  WHEN legacy_is_corresponding = TRUE THEN TRUE
                                  -- Legacy has corresponding info and someone else is corresponding
                                  WHEN legacy_has_corresponding = 1 THEN FALSE
                                  -- No info from either source, default first author to corresponding
                                  WHEN original_author_order = 0 THEN TRUE
                                  ELSE FALSE
                              END AS is_corresponding,
                              raw_affiliation_strings,
                              TRIM(REPLACE(raw_author_name, '\\n', '')) AS raw_author_name,
                              CAST(NULL AS STRUCT<title: STRING, first: STRING, middle: STRING, last: STRING, suffix: STRING, nickname: STRING>) AS parsed_name
                          ) AS authorship
                      )
                  ),
                  -- comparator: ascending by original_author_order only
                  (left, right) -> CASE
                      WHEN left.original_author_order < right.original_author_order THEN -1
                      WHEN left.original_author_order > right.original_author_order THEN 1
                      ELSE 0
                  END
              ),
              -- drop the sort key, keep the authorship struct
              x -> x.authorship
          ) AS authorships
      FROM authors_with_corresponding
      GROUP BY work_id
  ),
  -- end authors
  -- CURATION BLOCK
  curation_requests_clean AS (
    -- Latest curation request per (normalised DOI, previous URL).
    --   doi      : lower-cased, trimmed, with any leading http(s)://(dx.)doi.org/ removed
    --   prev_url : trimmed previous_url (NULL means "add / mark", not "replace")
    --   new_url  : trimmed new_url (NULL means "remove the previous URL")
    -- The DOI is trimmed and lower-cased BEFORE the prefix is stripped: the regex
    -- is anchored and lower-case, so a leading space or an upper-case scheme/host
    -- would otherwise leave the prefix in place and the request would never join
    -- to best_doi. The outer TRIM removes any space left after the prefix.
    -- NOTE(review): an empty-string previous_url is not NULL and is treated
    -- downstream as a URL to match — confirm the feed never contains ''.
    WITH normalized AS (
      SELECT
        TRIM(
          REGEXP_REPLACE(LOWER(TRIM(doi)), '^https?://(dx\\.)?doi\\.org/', '')
        ) AS doi,
        TRIM(previous_url) AS prev_url,
        TRIM(new_url) AS new_url,
        ingestion_timestamp
      FROM
        openalex.unpaywall.curation_requests
    )
    SELECT
      doi,
      prev_url,
      new_url
    FROM
      normalized
    -- keep only the most recently ingested request of each (doi, prev_url) group
    QUALIFY ROW_NUMBER() OVER (
      PARTITION BY doi, prev_url
      ORDER BY
        ingestion_timestamp DESC
    ) = 1
  ),
  cr_matches AS (
    -- base rows hit by a curation request that names a previous URL: same DOI,
    -- and the previous URL equals the row's pdf_url, its landing_page_url, or any
    -- entry of its urls array. URLs are compared lower-cased with the leading
    -- http(s):// removed.
    SELECT
      bs.*,
      req.new_url
    FROM
      base AS bs
      INNER JOIN curation_requests_clean AS req
        ON req.doi = LOWER(bs.best_doi)
        AND req.prev_url IS NOT NULL
    WHERE
      REGEXP_REPLACE(LOWER(COALESCE(bs.pdf_url, '')), '^https?://', '')
        = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
      OR REGEXP_REPLACE(LOWER(COALESCE(bs.landing_page_url, '')), '^https?://', '')
        = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
      OR ARRAY_CONTAINS(
        TRANSFORM(
          bs.urls,
          loc -> REGEXP_REPLACE(LOWER(loc.url), '^https?://', '')
            = REGEXP_REPLACE(LOWER(req.prev_url), '^https?://', '')
        ),
        TRUE
      )
  ),
  cr_upserts AS (
    -- Curation REPLACEMENTS: base rows matched by a request that has both a
    -- previous URL and a new URL. The row is re-emitted with every occurrence of
    -- the previous URL (pdf_url, landing_page_url, entries of urls) swapped for
    -- the new URL; all other columns pass through unchanged.
    -- URLs are compared lower-cased with the leading http(s):// removed.
    -- The column list mirrors base position by position because base_with_cr
    -- combines these CTEs with SELECT * ... UNION ALL — keep the order intact.
    SELECT
      b.work_id,
      b.provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      b.priority,
      b.openalex_created_dt,
      b.openalex_updated_dt,
      b.source_id,
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      b.is_oa,
      b.is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      b.composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      /* urls: swap any entry equal to prev_url for new_url; content_type is kept */
      transform(
        b.urls,
        u -> struct(
          CASE
            WHEN regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN c.new_url
            ELSE u.url
          END as url,
          u.content_type
        )
      ) AS urls,
      b.license,
      b.institution_name,
      b.publisher_name,
      b.parent_publisher,
      b.institution_id,
      b.publisher_id,
      b.version,
      /* landing_page_url: replaced by new_url when it equals prev_url */
      CASE
        WHEN regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '') THEN c.new_url
        ELSE b.landing_page_url
      END AS landing_page_url,
      /* pdf_url: replaced by new_url when it equals prev_url */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN c.new_url
        ELSE b.pdf_url
      END AS pdf_url,
      /* trailing columns, in the same order as base */
      b.is_retracted,
      b.raw_type,
      b.repository_id,
      b.pdf_s3_id,
      b.grobid_s3_id,
      b.row_num,
      b.host_type,
      b.oa_status
    FROM
      base AS b
      -- same DOI, and prev_url appears in pdf_url, landing_page_url or urls
      JOIN curation_requests_clean c ON lower(b.best_doi) = c.doi
      AND (
        regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR array_contains(
          transform(
            b.urls,
            u -> regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
          ),
          true
        )
      )
    WHERE
      c.prev_url IS NOT NULL
      AND c.new_url IS NOT NULL -- replacement, not nullification
  ),
  cr_nullify AS (
    -- Curation REMOVALS: base rows matched by a request that has a previous URL
    -- but no new URL. The row is re-emitted with every occurrence of the previous
    -- URL set to NULL. When the removed URL is the row's pdf_url, the OA fields are
    -- downgraded as well: is_oa / is_oa_raw become FALSE, composite_is_oa falls
    -- back to source_is_oa, and oa_status becomes 6 (closed).
    -- URLs are compared lower-cased with the leading http(s):// removed.
    -- The column list mirrors base position by position because base_with_cr
    -- combines these CTEs with SELECT * ... UNION ALL — keep the order intact.
    SELECT
      b.work_id,
      b.provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      b.priority,
      b.openalex_created_dt,
      b.openalex_updated_dt,
      b.source_id,
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      /* Update is_oa to false if pdf_url is being nullified */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN false
        ELSE b.is_oa
      END AS is_oa,
      /* Update is_oa_raw to false if pdf_url is being nullified */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN false
        ELSE b.is_oa_raw
      END AS is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      /* Update composite_is_oa to account for nullified pdf_url */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN b.source_is_oa
        /* When pdf_url is nullified, composite_is_oa depends only on source_is_oa */
        ELSE b.composite_is_oa
      END AS composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      /* urls: a matched entry keeps its struct and content_type, only its url becomes NULL */
      transform(
        b.urls,
        u -> struct(
          CASE
            WHEN regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
            ELSE u.url
          END as url,
          u.content_type
        )
      ) AS urls,
      b.license,
      b.institution_name,
      b.publisher_name,
      b.parent_publisher,
      b.institution_id,
      b.publisher_id,
      b.version,
      /* landing_page_url: set to NULL when it equals prev_url */
      CASE
        WHEN regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
        ELSE b.landing_page_url
      END AS landing_page_url,
      /* pdf_url: set to NULL when it equals prev_url */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN null
        ELSE b.pdf_url
      END AS pdf_url,
      /* trailing columns, in the same order as base */
      b.is_retracted,
      b.raw_type,
      b.repository_id,
      b.pdf_s3_id,
      b.grobid_s3_id,
      b.row_num,
      b.host_type,
      /* Update oa_status to closed (6) if pdf_url is nullified */
      CASE
        WHEN regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '') THEN 6
        /* closed */
        ELSE b.oa_status
      END AS oa_status
    FROM
      base AS b
      -- same DOI, and prev_url appears in pdf_url, landing_page_url or urls
      JOIN curation_requests_clean c ON lower(b.best_doi) = c.doi
      AND (
        regexp_replace(lower(coalesce(b.pdf_url, '')), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR regexp_replace(
          lower(coalesce(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = regexp_replace(lower(c.prev_url), '^https?://', '')
        OR array_contains(
          transform(
            b.urls,
            u -> regexp_replace(lower(u.url), '^https?://', '') = regexp_replace(lower(c.prev_url), '^https?://', '')
          ),
          true
        )
      )
    WHERE
      c.prev_url IS NOT NULL
      AND c.new_url IS NULL -- nullification, not update
  ),
  cr_mark_oa AS (
    -- Curation "mark as OA": requests with no previous URL whose new URL is
    -- ALREADY present on a base row (as pdf_url, landing_page_url or an entry of
    -- urls). The row keeps its URLs and is re-emitted with is_oa / is_oa_raw
    -- forced to TRUE and oa_status recomputed.
    -- URLs are compared lower-cased with the leading http(s):// removed.
    -- The column list mirrors base position by position because base_with_cr
    -- combines these CTEs with SELECT * ... UNION ALL — keep the order intact.
    SELECT
      b.work_id,
      b.provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      b.priority,
      b.openalex_created_dt,
      b.openalex_updated_dt,
      b.source_id,
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      TRUE AS is_oa,
      TRUE AS is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      /* forced TRUE for publisher rows; other rows keep their base value */
      CASE
        WHEN b.host_type = 'publisher' THEN TRUE
        ELSE b.composite_is_oa
      END AS composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      b.urls,
      b.license,
      b.institution_name,
      b.publisher_name,
      b.parent_publisher,
      b.institution_id,
      b.publisher_id,
      b.version,
      b.landing_page_url,
      b.pdf_url,
      b.is_retracted,
      b.raw_type,
      b.repository_id,
      b.pdf_s3_id,
      b.grobid_s3_id,
      b.row_num,
      b.host_type,
      /* publisher rows use the same 1-4 ladder as base; unlike base, every other
         row — including rows with a NULL host_type — becomes 5 (green) */
      CASE
        WHEN b.host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(b.apc_usd) = 0
          AND b.source_is_oa THEN 1 -- diamond
          WHEN b.source_is_oa THEN 2 -- gold
          WHEN b.license IS NOT NULL
          AND b.license <> 'publisher-specific-oa' THEN 3 -- hybrid
          ELSE 4 -- bronze
        END
        ELSE 5 -- repository => green
      END AS oa_status
    FROM
      base b
      JOIN curation_requests_clean c ON LOWER(b.best_doi) = c.doi
    WHERE
      -- "addition" style request (no previous URL) ...
      c.prev_url IS NULL
      AND c.new_url IS NOT NULL
      -- ... whose URL this row already carries
      AND (
        REGEXP_REPLACE(
          LOWER(COALESCE(b.pdf_url, '')),
          '^https?://',
          ''
        ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
        OR REGEXP_REPLACE(
          LOWER(COALESCE(b.landing_page_url, '')),
          '^https?://',
          ''
        ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
        OR ARRAY_CONTAINS(
          TRANSFORM(
            b.urls,
            u -> REGEXP_REPLACE(LOWER(u.url), '^https?://', '') = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
          ),
          TRUE
        )
      )
  ),
  base_filtered AS (
    -- base minus the rows that a cr_* CTE re-emits in corrected form:
    --   * rows sharing work_id, provenance and either pdf_url or landing_page_url
    --     with a cr_matches row (URLs lower-cased, leading http(s):// removed,
    --     NULL compared as '');
    --   * rows sharing work_id and provenance with a cr_mark_oa row.
    -- NOTE(review): because NULL URLs compare as '', a matched row with a NULL
    -- pdf_url also removes sibling rows of the same work/provenance whose pdf_url
    -- is NULL — confirm this is intended.
    SELECT
      b.*
    FROM
      base AS b
      LEFT ANTI JOIN cr_matches AS m
        ON m.work_id = b.work_id
        AND m.provenance = b.provenance
        AND (
          REGEXP_REPLACE(LOWER(COALESCE(m.pdf_url, '')), '^https?://', '')
            = REGEXP_REPLACE(LOWER(COALESCE(b.pdf_url, '')), '^https?://', '')
          OR REGEXP_REPLACE(LOWER(COALESCE(m.landing_page_url, '')), '^https?://', '')
            = REGEXP_REPLACE(LOWER(COALESCE(b.landing_page_url, '')), '^https?://', '')
        )
      LEFT ANTI JOIN cr_mark_oa AS o
        ON o.work_id = b.work_id
        AND o.provenance = b.provenance
  ),
  cr_new_locations AS (
    -- Curation ADDITIONS: requests with no previous URL whose new URL is not yet
    -- present anywhere on the work (pdf_url, landing_page_url or urls of any base
    -- row with the same work_id). A synthetic location row is emitted with
    -- provenance 'curation', priority 999, version 'publishedVersion' and the OA
    -- flags forced to TRUE; bibliographic and source columns are copied from the
    -- joined base row.
    -- The join is on DOI only, so one row is produced per base row carrying that DOI.
    -- The new URL is classified as a PDF when it contains '.pdf' or '/pdf/'
    -- (case-insensitive), otherwise as an HTML landing page.
    -- The column list mirrors base position by position because base_with_cr
    -- combines these CTEs with SELECT * ... UNION ALL — keep the order intact.
    SELECT
      b.work_id,
      'curation' AS provenance,
      b.native_id,
      b.native_id_namespace,
      b.best_doi,
      b.title,
      b.type,
      b.abstract,
      b.referenced_works_count,
      b.referenced_works,
      b.abstract_inverted_index,
      999 AS priority,
      b.openalex_created_dt,
      CURRENT_TIMESTAMP() AS openalex_updated_dt,
      b.source_id,
      -- Use existing source
      b.display_name,
      b.issn_l,
      b.issns,
      b.is_in_doaj,
      b.apc_prices,
      b.apc_usd,
      b.is_core,
      TRUE AS is_oa,
      TRUE AS is_oa_raw,
      b.is_in_doaj_raw,
      b.is_in_doaj_stg,
      b.is_oa_high_rate,
      b.source_is_oa,
      TRUE AS composite_is_oa,
      b.is_in_doaj_start_year,
      b.source_type,
      b.source_name,
      b.publisher,
      b.published_date,
      b.volume,
      b.issue,
      b.first_page,
      b.last_page,
      b.language,
      b.authors,
      -- urls holds just the curated URL, tagged 'pdf' or 'html'
      ARRAY(
        STRUCT(
          c.new_url AS url,
          CASE
            WHEN LOWER(c.new_url) LIKE '%.pdf%'
            OR LOWER(c.new_url) LIKE '%/pdf/%' THEN 'pdf'
            ELSE 'html'
          END AS content_type
        )
      ) AS urls,
      NULL AS license,
      b.institution_name,
      b.publisher_name,
      b.parent_publisher,
      b.institution_id,
      b.publisher_id,
      'publishedVersion' AS version,
      -- Set landing_page_url or pdf_url based on URL type
      CASE
        WHEN LOWER(c.new_url) LIKE '%.pdf%'
        OR LOWER(c.new_url) LIKE '%/pdf/%' THEN NULL
        ELSE c.new_url
      END AS landing_page_url,
      CASE
        WHEN LOWER(c.new_url) LIKE '%.pdf%'
        OR LOWER(c.new_url) LIKE '%/pdf/%' THEN c.new_url
        ELSE NULL
      END AS pdf_url,
      b.is_retracted,
      b.raw_type,
      b.repository_id,
      CAST(NULL AS STRING) AS pdf_s3_id,
      CAST(NULL AS STRING) AS grobid_s3_id,
      1 AS row_num,
      b.host_type,
      -- oa_status is derived from the base row's own composite_is_oa / host_type:
      -- 1 or 2 for OA publisher rows on an OA source, 5 for repository rows,
      -- 4 in every other case
      CASE
        WHEN b.composite_is_oa
        AND b.host_type = 'publisher' THEN CASE
          WHEN ZEROIFNULL(b.apc_usd) = 0
          AND b.source_is_oa THEN 1
          WHEN b.source_is_oa THEN 2 -- gold
          ELSE 4
        END -- bronze
        WHEN b.host_type = 'repository' THEN 5
        ELSE 4
      END AS oa_status
    FROM
      curation_requests_clean c
      INNER JOIN base b ON LOWER(b.best_doi) = c.doi
    WHERE
      c.new_url IS NOT NULL
      AND c.prev_url IS NULL -- this indicates a new URL addition, not a replacement
      AND NOT EXISTS (
        -- Ensure this URL doesn't already exist for this work
        SELECT
          1
        FROM
          base b2
        WHERE
          b2.work_id = b.work_id
          AND (
            REGEXP_REPLACE(
              LOWER(COALESCE(b2.pdf_url, '')),
              '^https?://',
              ''
            ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
            OR REGEXP_REPLACE(
              LOWER(COALESCE(b2.landing_page_url, '')),
              '^https?://',
              ''
            ) = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
            OR ARRAY_CONTAINS(
              TRANSFORM(
                b2.urls,
                u -> REGEXP_REPLACE(LOWER(u.url), '^https?://', '') = REGEXP_REPLACE(LOWER(c.new_url), '^https?://', '')
              ),
              TRUE
            )
          )
      )
  ),
  base_with_cr AS (
    -- Untouched base rows plus every curated variant. The branches are combined
    -- by position (SELECT * ... UNION ALL), so all five inputs must expose the
    -- same columns in the same order as base.
    SELECT * FROM base_filtered
    UNION ALL
    SELECT * FROM cr_upserts
    UNION ALL
    SELECT * FROM cr_nullify
    UNION ALL
    SELECT * FROM cr_new_locations
    UNION ALL
    SELECT * FROM cr_mark_oa
  ),
  -- Ranks the rows that share a landing page within a work. The landing page key
  -- is landing_page_url, else the first 'html' entry of urls, else ''.
  -- landing_page_rank = 1 is the preferred row for that key.
  base_with_landing_page_rank AS (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY
          work_id,
          COALESCE(
            landing_page_url,
            GET(FILTER(urls, u -> u.content_type = 'html').url, 0),
            ''
          )
        ORDER BY
          -- 1) provenance / version tier: crossref (with DOI first), datacite,
          --    then published > accepted > submitted, rows with a PDF first
          CASE
            WHEN provenance = 'crossref' AND best_doi IS NOT NULL THEN 1
            WHEN provenance = 'crossref' THEN 2
            WHEN provenance = 'datacite' THEN 3
            WHEN version = 'publishedVersion' AND pdf_url IS NOT NULL THEN 4
            WHEN version = 'publishedVersion' THEN 5
            WHEN version = 'acceptedVersion' AND pdf_url IS NOT NULL THEN 6
            WHEN version = 'acceptedVersion' THEN 7
            WHEN version = 'submittedVersion' AND pdf_url IS NOT NULL THEN 8
            WHEN version = 'submittedVersion' THEN 9
            ELSE 10
          END,
          -- 2) URL quality score, judged on pdf_url when present, else landing_page_url
          CASE
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), 'europepmc.org') THEN 1
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), '/pmc/') THEN 2
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), 'arxiv') THEN 3
            WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), '.edu') THEN 4
            ELSE 5
          END,
          -- 3) provenance priority from priority_table
          priority,
          -- 4) native_id as a final key, for repo rows only
          CASE
            WHEN provenance IN ('repo', 'repo_backfill') THEN native_id
          END
      ) AS landing_page_rank
    FROM
      base_with_cr
  ),
  -- Collapse the rank-1 location rows into a single row per work_id.
  -- Scalar fields are gathered as arrays of (value, priority) structs so that
  -- set_fields can pick one value per field; locations are gathered as a set of
  -- structs carrying the sort keys used later to order them.
  collect_all_values AS (
    select
      work_id,
      collect_list(struct(best_doi, priority)) as best_dois,
      collect_list(struct(title, priority)) as titles,
      collect_list(struct(publisher, priority)) as publishers,
      collect_list(struct(abstract, priority)) as abstracts,
      -- union of the referenced_works arrays of all contributing rows, without duplicates
      array_distinct(flatten(collect_list(referenced_works))) as referenced_works,
      -- preserve natural order (whatever it may be), sort in JSON
      collect_list(struct(abstract_inverted_index, priority)) as abstract_inverted_indexes,
      collect_list(struct(volume, priority)) as volumes,
      collect_list(struct(issue, priority)) as issues,
      collect_list(struct(first_page, priority)) as first_pages,
      collect_list(struct(last_page, priority)) as last_pages,
      collect_list(struct(language, priority)) as languages,
      collect_list(struct(type, priority)) as types,
      -- date candidates: entries whose date is NULL are dropped up front
      filter(
        collect_list(struct(published_date, priority)),
        x -> x.published_date is not null
      ) as published_dates,
      -- plain values (no priority): set_fields takes the minimum of these
      filter(
        collect_list(openalex_created_dt),
        x -> x is not null
      ) as openalex_created_dts,
      filter(
        collect_list(struct(openalex_updated_dt, priority)),
        x -> x.openalex_updated_dt is not null
      ) as openalex_updated_dts,
      -- distinct (namespace, id) pairs, excluding the 'pmh' namespace (case-insensitive)
      filter(
        collect_set(struct(native_id_namespace, native_id)),
        x -> lower(x.native_id_namespace) != 'pmh'
      ) as ids,
      -- locations
      collect_set(
        struct(
          -- repo_backfill rows are reported under the 'repo' provenance
          case
            when provenance = 'repo_backfill' then 'repo'
            else provenance
          end as provenance,
          native_id,
          -- Primary ordering key for locations; lower sorts first.
          -- NOTE(review): the ranking CASE in base_with_landing_page_rank has a
          -- separate 'datacite' tier that this score does not have - confirm
          -- the difference is intentional.
          case
            when provenance = 'crossref'
            and best_doi is not null then 1 -- publisher with a doi
            when provenance = 'crossref' then 2 -- publisher without a doi
            when version = 'publishedVersion'
            and pdf_url is not null then 3 -- published version with a pdf url
            when version = 'publishedVersion' then 4 -- published version without a pdf url
            when version = 'acceptedVersion'
            and pdf_url is not null then 5 -- accepted version with a pdf url
            when version = 'acceptedVersion' then 6 -- accepted version without a pdf url
            when version = 'submittedVersion'
            and pdf_url is not null then 7 -- submitted version with a pdf url
            when version = 'submittedVersion' then 8 -- submitted version without a pdf url
            else 9
          end as sort_score,
          -- repository-hosted locations are OA if either flag says so; all
          -- other host types rely on composite_is_oa alone (result may be NULL)
          case
            when host_type = 'repository' then (
              is_oa_raw
              OR composite_is_oa
            )
            else composite_is_oa
          end as is_oa,
          -- explicit landing page, else the first urls entry with content_type "html"
          coalesce(
            landing_page_url,
            get(
              filter(urls, x -> x.content_type = "html").url,
              0
            )
          ) as landing_page_url,
          pdf_url,
          -- Secondary ordering key; lower sorts first. This reads the row's own
          -- landing_page_url column, not the html fallback computed just above.
          case
            when contains(
              coalesce(pdf_url, landing_page_url),
              "europepmc.org"
            ) then 1
            when contains(coalesce(pdf_url, landing_page_url), "/pmc/") then 2
            when contains(coalesce(pdf_url, landing_page_url), "arxiv") then 3
            when contains(coalesce(pdf_url, landing_page_url), ".edu") then 4
            else 5
          end as url_sort_score,
          oa_status,
          -- a NULL version yields FALSE for both flags
          COALESCE(version = 'publishedVersion', FALSE) as is_published,
          COALESCE(version in ('acceptedVersion','publishedVersion'), FALSE) as is_accepted,
          -- source is NULL when the row has no source_id
          CASE
            WHEN source_id IS NULL THEN NULL
            ELSE struct(
              concat("https://openalex.org/S", source_id) as id,
              display_name AS display_name,
              issn_l,
              issns as issn,
              COALESCE(source_is_oa, FALSE) as is_oa,
              COALESCE(is_in_doaj, FALSE) as is_in_doaj,
              COALESCE(is_core, FALSE) as is_core,
              -- repositories are hosted by an institution, everything else by a publisher
              case
                when source_type = 'repository' then concat('https://openalex.org/I', institution_id)
                else concat('https://openalex.org/P', publisher_id)
              end as host_organization,
              case
                when source_type = 'repository' then institution_name
                else publisher_name
              end as host_organization_name,
              -- NOTE(review): the lineage fields also treat source_type = 'metadata' as
              -- institution-hosted and require a non-NULL institution_id, while
              -- host_organization above tests only 'repository' - confirm intended.
              -- ARRAY_COMPACT drops NULL entries from the publisher lineage.
              IF ((source_type = 'repository' or source_type = 'metadata') and institution_id IS NOT NULL,
                ARRAY(CONCAT('https://openalex.org/I', institution_id)),
                ARRAY_COMPACT(ARRAY(CONCAT('https://openalex.org/P', publisher_id), parent_publisher.id))
              ) AS host_organization_lineage,
              IF ((source_type = 'repository' or source_type = 'metadata') and institution_id IS NOT NULL,
                CAST(ARRAY() AS ARRAY<STRING>),
                ARRAY_COMPACT(ARRAY(publisher_name, parent_publisher.display_name))
              ) AS host_organization_lineage_names,
              source_type as type
            )
          END as source,
          source_name as raw_source_name,
          raw_type,
          apc_prices,
          apc_usd,
          -- temp license filter, need to update normalize_license and re-run all DLTs to remove these upstream
          CASE
            WHEN license IS NOT NULL AND license NOT IN ('False', 'unspecified-oa', 'gpl', 'gpl-2', 'gpl-3', 'apache-2.0')
              THEN license
            ELSE NULL
          END as license,
          -- same filter as license above, rendered as an OpenAlex license URL
          CASE
            WHEN license IS NOT NULL AND license NOT IN ('False', 'unspecified-oa', 'gpl', 'gpl-2', 'gpl-3', 'apache-2.0')
              THEN CONCAT('https://openalex.org/licenses/', license)
            ELSE NULL
          END as license_id,
          version,
          host_type,
          -- endpoint_id and pmh_id are only populated for repo / repo_backfill rows
          case
            when provenance in ('repo', 'repo_backfill') then repository_id
          end as endpoint_id,
          case
            when provenance in ('repo', 'repo_backfill') then native_id
          end as pmh_id,
          provenance = 'crossref' as is_unpaywall_record,
          pdf_s3_id,
          grobid_s3_id,
          type as location_type,
          cast(openalex_updated_dt as timestamp) as updated
        )
      ) as locations,
      -- TRUE when at least one contributing row has is_retracted = TRUE
      exists(collect_set(is_retracted), x -> x = True) as is_retracted,
      -- TRUE when at least one contributing row has provenance 'crossref'
      array_contains(collect_set(provenance), 'crossref') as indexed_in_crossref
    from
      base_with_landing_page_rank
    where
      -- keep only the winning row of each (work_id, landing page) partition
      landing_page_rank = 1
    group by
      work_id
  ),
  -- Backfilled abstracts (plain text and inverted index) keyed by work_id,
  -- read from openalex.abstracts.abstracts_backfill. set_fields left-joins this
  -- on work_id and offers the values as extra candidates.
  -- NOTE(review): presumably one row per work_id - duplicates would fan out
  -- the join in set_fields; verify against the table.
  abstracts_backfill AS (
    SELECT
      src.work_id,
      src.abstract,
      src.abstract_inverted_index
    FROM openalex.abstracts.abstracts_backfill AS src
  ),
  -- Approved curations for works, folded into one {property -> property_value}
  -- map per work and limited to the 'type' and 'language' properties.
  -- work_id is entity_id with its first character removed, cast to BIGINT
  -- (presumably stripping a leading 'W' - verify against the table).
  -- NOTE(review): if a work has more than one approved row for the same
  -- property, MAP_FROM_ENTRIES receives a duplicate key; depending on
  -- spark.sql.mapKeyDedupPolicy this may raise an error - confirm upstream
  -- guarantees uniqueness.
  approved_curations AS (
    SELECT
      CAST(SUBSTRING(entity_id, 2) AS BIGINT) AS work_id,
      MAP_FROM_ENTRIES(
        COLLECT_LIST(
          STRUCT(property, property_value)
        )
      ) AS curations
    FROM openalex.curations.approved_curations
    WHERE status = 'approved'
      AND entity = 'works'
      AND property IN ('type', 'language')
    GROUP BY work_id
  ),
  -- Resolve one value per field for every work from the arrays built in
  -- collect_all_values, then attach authorships, backfilled abstracts and
  -- approved curations via left joins on work_id.
  -- Several entries reuse aliases defined earlier in this same select list
  -- (id, title, raw_best_doi, publisher, created_date, publication_date), so
  -- the order of those entries matters.
  set_fields AS (
    SELECT
      work_id AS id,
      get_highest_priority_value(titles, titles.title) AS title,
      get_highest_priority_value(best_dois, best_dois.best_doi) AS raw_best_doi,
      -- NULL when there is no DOI (CONCAT propagates NULL)
      CONCAT('https://doi.org/', raw_best_doi) AS best_doi,
      get_highest_priority_value(publishers, publishers.publisher) AS publisher,
      -- The backfilled abstract, when present, competes as one more candidate
      -- with priority 998.
      get_highest_priority_value(
        concat(
          abstracts,
          CASE
            WHEN bf.abstract IS NOT NULL THEN array(
              named_struct('abstract', bf.abstract, 'priority', 998)
            )
            ELSE array()
          END
        ),
        abstracts.abstract
      ) AS abstract,
      SIZE(referenced_works) AS referenced_works_count,
      referenced_works,
      -- The inverted index is withheld (NULL) for Springer/Elsevier works unless
      -- the first OA location has oa_status 1, 2 or 3 (diamond/gold/hybrid).
      -- NOTE(review): `locations` is the collect_set output, whose element order
      -- is not defined, so "first OA location" here is not guaranteed to be the
      -- same element as the first OA entry of locations_sorted - confirm whether
      -- the sorted array should be used instead.
      CASE
        WHEN (
          -- Springer/Elsevier by publisher string or host org id
          (
            publisher IS NOT NULL
            AND lower(publisher) RLIKE '(springer|elsevier)'
          )
          OR exists(
            locations,
            -- Compare the full host organization URL. The previous form,
            -- get(split(url, '/'), 4), was always NULL: get() is 0-based and
            -- 'https://openalex.org/P...' splits into only four parts (0-3),
            -- so the host-org branch could never match.
            x -> x.source.host_organization IS NOT NULL
            AND x.source.host_organization IN (
              'https://openalex.org/P4310320990',
              'https://openalex.org/P4310319965'
            )
          )
        ) -- Allow only if best OA is diamond/gold/hybrid (1/2/3)
        AND coalesce(
          try_element_at(
            transform(
              filter(locations, x -> x.is_oa),
              y -> y.oa_status
            ),
            1
          ),
          0
        ) NOT IN (1, 2, 3) THEN NULL
        ELSE get_highest_priority_value(
          concat(
            abstract_inverted_indexes,
            CASE
              WHEN bf.abstract_inverted_index IS NOT NULL THEN array(
                named_struct(
                  'abstract_inverted_index',
                  bf.abstract_inverted_index,
                  'priority',
                  998
                )
              )
              ELSE array()
            END
          ),
          abstract_inverted_indexes.abstract_inverted_index
        )
      END AS abstract_inverted_index,
      get_highest_priority_value(volumes, volumes.volume) AS volume,
      get_highest_priority_value(issues, issues.issue) AS issue,
      get_highest_priority_value(first_pages, first_pages.first_page) AS first_page,
      get_highest_priority_value(last_pages, last_pages.last_page) AS last_page,
      -- An approved curation, when present, overrides the collected value.
      COALESCE(
        ELEMENT_AT(ac.curations, 'language'),
        get_highest_priority_value(languages, languages.language)
      ) AS language,
      COALESCE(
        ELEMENT_AT(ac.curations, 'type'),
        CASE
          -- Override type to 'preprint' for known preprint server DOI prefixes
          WHEN raw_best_doi LIKE '10.48550/%'          -- arXiv
            OR raw_best_doi LIKE '10.1101/%'           -- bioRxiv/medRxiv
            OR raw_best_doi LIKE '10.21203/rs.%'       -- Research Square
            OR raw_best_doi LIKE '10.2139/ssrn.%'      -- SSRN
            OR raw_best_doi LIKE '10.20944/preprints%' -- Preprints.org
          THEN 'preprint'
          ELSE get_highest_priority_value(types, types.type)
        END
      ) AS type,
      -- Earliest creation timestamp seen for the work; NULL if it cannot be cast.
      TRY_CAST(
        ARRAY_MIN(openalex_created_dts) AS DATE
      ) AS created_date,
      -- FALSE as well when created_date is NULL (the comparison is then NULL).
      CASE
        WHEN id > 6600000000 AND created_date < to_date('2025-11-04') THEN TRUE
        ELSE FALSE
      END AS is_xpac,
      CAST(
        get_highest_priority_value(
          openalex_updated_dts,
          openalex_updated_dts.openalex_updated_dt
        ) AS TIMESTAMP
      ) AS updated_date,
      TRY_CAST(
        get_highest_priority_value(published_dates, published_dates.published_date) AS DATE
      ) AS publication_date,
      YEAR(publication_date) AS publication_year,
      -- Build a namespace -> id map. The accumulator starts with the OpenAlex
      -- URL and takes at most one id per namespace (the first one encountered),
      -- which also keeps the map keys unique.
      MAP_FROM_ENTRIES(
        AGGREGATE(
          ids,
          ARRAY(
            NAMED_STRUCT(
              'native_id_namespace',
              'openalex',
              'native_id',
              CONCAT(
                'https://openalex.org/W',
                CAST(work_id AS STRING)
              )
            )
          ),
          (acc, x) -> CASE
            WHEN SIZE(
              FILTER(
                acc,
                y -> y.native_id_namespace = x.native_id_namespace
              )
            ) = 0 THEN acc || ARRAY(x)
            ELSE acc
          END
        )
      ) AS ids,
      -- Order locations by (sort_score, url_sort_score, provenance, native_id,
      -- landing_page_url, pdf_url); NULL URLs compare as ''.
      ARRAY_SORT(
        locations,
        (x, y) -> CASE
          WHEN STRUCT(
            x.sort_score,
            x.url_sort_score,
            x.provenance,
            x.native_id,
            COALESCE(x.landing_page_url, ''),
            COALESCE(x.pdf_url, '')
          ) < STRUCT(
            y.sort_score,
            y.url_sort_score,
            y.provenance,
            y.native_id,
            COALESCE(y.landing_page_url, ''),
            COALESCE(y.pdf_url, '')
          ) THEN -1
          WHEN STRUCT(
            x.sort_score,
            x.url_sort_score,
            x.provenance,
            x.native_id,
            COALESCE(x.landing_page_url, ''),
            COALESCE(x.pdf_url, '')
          ) > STRUCT(
            y.sort_score,
            y.url_sort_score,
            y.provenance,
            y.native_id,
            COALESCE(y.landing_page_url, ''),
            COALESCE(y.pdf_url, '')
          ) THEN 1
          ELSE 0
        END
      ) AS locations_sorted,
      raw_authorships.authorships,
      -- Retracted if any source row says so or the title carries a retraction
      -- prefix. COALESCE keeps the flag a strict boolean: with a NULL title the
      -- LIKE tests are NULL, and FALSE OR NULL would otherwise yield NULL.
      COALESCE(
        is_retracted
        OR lower(title) LIKE 'retracted article%'
        OR lower(title) LIKE 'retracted: %',
        FALSE
      ) AS is_retracted,
      indexed_in_crossref
    FROM
      collect_all_values
      LEFT JOIN raw_authorships USING (work_id)
      LEFT JOIN abstracts_backfill bf USING (work_id)
      LEFT JOIN approved_curations ac USING (work_id)
  )
  -- Final projection: one output row per work in set_fields.
  -- Columns that this query does not compute are emitted as typed empty arrays,
  -- zeros or NULLs so the table has its full schema. Some entries reuse aliases
  -- defined earlier in this select list (doi, apc_list, best_oa_location).
  SELECT
    s.id,
    s.best_doi AS doi,
    s.title,
    COALESCE(s.authorships, ARRAY()) AS authorships,
    SIZE(COALESCE(s.authorships, ARRAY())) AS authors_count,
    CAST(
      ARRAY() AS ARRAY < STRUCT <
      id STRING,
      display_name STRING,
      ror STRING,
      country_code STRING,
      type STRING,
      lineage ARRAY<STRING> > >
    ) AS institutions,
    CAST(0 AS INT) AS institutions_distinct_count,
    CAST(0 AS INT) AS countries_distinct_count,
    -- @TODO make changes later to calculate them upstream - they will have to rely on Walden logic
    CAST(ARRAY() AS ARRAY<STRING>) AS corresponding_author_ids,
    CAST(ARRAY() AS ARRAY<STRING>) AS corresponding_institution_ids,
    s.publication_date,
    s.publication_year,
    s.created_date,
    s.updated_date,
    s.abstract,
    s.abstract IS NOT NULL AS has_abstract,
    s.referenced_works_count,
    s.referenced_works,
    CAST(ARRAY() AS ARRAY < STRING >) AS related_works,
    s.abstract_inverted_index,
    CONCAT(
      'https://api.openalex.org/works?filter=cites:W',
      CAST(s.id AS STRING)
    ) AS cited_by_api_url,
    CAST(0 AS INT) AS cited_by_count,
    CAST(
      ARRAY() AS ARRAY < STRUCT < year: INT,
      cited_by_count: INT > >
    ) AS counts_by_year,
    CAST(NULL AS DOUBLE) AS fwci,
    CAST(
      NULL AS STRUCT < value DOUBLE,
      is_in_top_1_percent BOOLEAN,
      is_in_top_10_percent BOOLEAN >
    ) AS citation_normalized_percentile,
    CAST(NULL AS STRUCT < min INT, max INT >) AS cited_by_percentile_year,
    -- ids map, post-processed in three steps:
    --   1) 'mag' is replaced: works with id < 4200000000 use their own id,
    --      otherwise any collected 'mag' id is kept;
    --   2) 'pmid' values are rewritten as PubMed URLs;
    --   3) entries whose value is NULL are dropped.
    MAP_FILTER(
      TRANSFORM_VALUES(
        MAP_CONCAT(
          MAP_FILTER(s.ids, (k, v) -> k != 'mag'),
          map('mag', COALESCE(
            IF(id < 4200000000, CAST(id AS STRING), NULL),
            s.ids['mag']
          ))
        ),
        (k, v) -> IF(k = 'pmid', CONCAT('https://pubmed.ncbi.nlm.nih.gov/', v), v)
      ),
      (k, v) -> v IS NOT NULL
    ) AS ids,
    -- Distinct, sorted list of indexes derived from each location's provenance,
    -- native_id prefix, DOAJ flag or (for 'mag') source display name.
    ARRAY_SORT(
      ARRAY_DISTINCT(
        ARRAY_COMPACT(
          FLATTEN(
            TRANSFORM(
              s.locations_sorted,
              loc -> CASE
                WHEN loc.provenance IN ('crossref', 'pubmed', 'datacite') THEN array(
                  loc.provenance,
                  IF(loc.source.is_in_doaj, 'doaj', NULL)
                )
                WHEN loc.provenance = 'repo'
                AND lower(loc.native_id) LIKE 'oai:arxiv.org%' THEN array('arxiv')
                WHEN loc.provenance = 'repo'
                AND lower(loc.native_id) LIKE 'oai:doaj.org/%' THEN array('doaj')
                WHEN loc.provenance = 'mag'
                AND lower(loc.source.display_name) = 'pubmed' THEN array('pubmed')
                ELSE array()
              END
            )
          )
        )
      )
    ) AS indexed_in,
    s.language,
    s.publisher,
    STRUCT(s.volume, s.issue, s.first_page, s.last_page) AS biblio,
    -- List price: the first apc_prices entry of the first (primary) location;
    -- NULL when that entry has no price.
    CASE
      WHEN GET(GET(locations_sorted, 0).apc_prices, 0).price IS NOT NULL THEN STRUCT(
        GET(GET(locations_sorted, 0).apc_prices, 0).price AS value,
        GET(GET(locations_sorted, 0).apc_prices, 0).currency,
        GET(locations_sorted, 0).apc_usd AS value_usd
      )
    END AS apc_list,
    CASE
      /* OpenAPC takes precedence whenever present */
      WHEN o.apc_in_euro IS NOT NULL OR o.apc_in_usd IS NOT NULL THEN STRUCT(
        o.apc_in_euro AS value,
        'EUR'         AS currency,
        o.apc_in_usd  AS value_usd
      )
      /* Otherwise: only relevant for gold/hybrid AND if a list price exists */
      WHEN GET(FILTER(s.locations_sorted, x -> x.is_oa), 0).oa_status IN (2, 3)
        AND GET(GET(s.locations_sorted, 0).apc_prices, 0).price IS NOT NULL
        THEN apc_list
        ELSE NULL
    END AS apc_paid,
    CAST(
      NULL AS STRUCT < id STRING,
      display_name STRING,
      score FLOAT,
      subfield STRUCT < id STRING,
      display_name STRING >,
      field STRUCT < id STRING,
      display_name STRING >,
      domain STRUCT < id STRING,
      display_name STRING > >
    ) AS primary_topic,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT,
      subfield STRUCT < id STRING,
      display_name STRING >,
      field STRUCT < id STRING,
      display_name STRING >,
      domain STRUCT < id STRING,
      display_name STRING > > >
    ) AS topics,
    -- Hash of doi, title, abstract and the primary location's source name
    -- (display name, else raw name). concat_ws skips NULL inputs.
    xxhash64(concat_ws('|', doi, title, abstract,
        COALESCE(
            GET(s.locations_sorted, 0).source.display_name,
            GET(s.locations_sorted, 0).raw_source_name
        )
    )) AS topics_key,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT > >
    ) AS keywords,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id BIGINT,
      wikidata STRING,
      display_name STRING,
      level INT,
      score FLOAT > >
    ) AS concepts,
    SIZE(s.locations_sorted) AS locations_count,
    -- Public shape of each location: the internal sort keys and APC fields are
    -- left out and the boolean flags are forced to non-NULL values.
    TRANSFORM(
      s.locations_sorted,
      x -> STRUCT(
        COALESCE(x.is_oa, FALSE) AS is_oa,
        x.landing_page_url,
        x.pdf_url,
        COALESCE(x.is_published, FALSE) AS is_published,
        COALESCE(x.is_accepted, FALSE) AS is_accepted,
        x.source,
        x.raw_source_name,
        x.raw_type,
        x.license,
        x.license_id,
        x.version,
        x.oa_status,
        x.host_type,
        x.endpoint_id,
        x.pmh_id,
        COALESCE(x.is_unpaywall_record, FALSE) AS is_unpaywall_record,
        x.location_type,
        x.pdf_s3_id,
        x.grobid_s3_id,
        CAST(x.updated AS TIMESTAMP) AS updated,
        x.provenance,
        x.native_id
      )
    ) AS locations,
    -- primary_location / best_oa_location are taken from locations_sorted as-is,
    -- i.e. with the full internal struct rather than the trimmed one above.
    GET(s.locations_sorted, 0) AS primary_location,
    GET(FILTER(s.locations_sorted, x -> x.is_oa), 0) AS best_oa_location,
    -- Content flags for the best OA location. Field access on a NULL struct is
    -- NULL, so both flags are FALSE when the work has no OA location.
    STRUCT(
      best_oa_location.pdf_s3_id IS NOT NULL AS pdf,
      best_oa_location.grobid_s3_id IS NOT NULL AS `grobid_xml`
    ) AS has_content,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      score FLOAT > >
    ) AS sustainable_development_goals,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      funder_award_id STRING,
      funder_id STRING,
      funder_display_name STRING > >
    ) AS awards,
    CAST(
      ARRAY() AS ARRAY < STRUCT < id STRING,
      display_name STRING,
      ror STRING > >
    ) AS funders,
    STRUCT(
      best_oa_location IS NOT NULL AS is_oa,
      CASE
        WHEN best_oa_location.oa_status = 1 THEN 'diamond'
        WHEN best_oa_location.oa_status = 2 THEN 'gold'
        WHEN best_oa_location.oa_status = 3 THEN 'hybrid'
        WHEN best_oa_location.oa_status = 4 THEN 'bronze'
        WHEN best_oa_location.oa_status = 5 THEN 'green'
        ELSE 'closed'
      END AS oa_status,
      COALESCE(
        best_oa_location.pdf_url,
        best_oa_location.landing_page_url
      ) AS oa_url,
      CAST(NULL AS BOOLEAN) AS any_repository_has_fulltext
    ) AS open_access,
    s.type,
    CAST(NULL AS STRING) AS type_crossref,
    -- COALESCE keeps the flag a strict boolean: a work with no type would
    -- otherwise get NULL from the comparison instead of FALSE.
    COALESCE(s.type = 'paratext', FALSE) AS is_paratext,
    s.is_retracted,
    s.indexed_in_crossref,
    s.is_xpac,
    m.mesh_formatted AS mesh,
    CAST(NULL AS STRING) AS fulltext
  FROM
    set_fields s
    LEFT JOIN openapc_paid o ON s.id = o.work_id
    -- MeSH headings per PMID, built from openalex.pubmed.pubmed_items and
    -- attached to works through the raw pmid stored in set_fields' ids map.
    LEFT JOIN (
      SELECT
        pmid,
        -- one struct per (descriptor, qualifier) pair, gathered per pmid
        COLLECT_LIST(
          STRUCT(
            descriptor_ui,
            descriptor_name,
            qualifiers._UI AS qualifier_ui,
            qualifiers._VALUE AS qualifier_name,
            -- 'Y' maps to TRUE; any other value, including NULL, maps to FALSE
            CASE
              WHEN is_major_topic = 'Y' THEN TRUE
              ELSE FALSE
            END AS is_major_topic
          )
        ) AS mesh_formatted
      FROM
        (
          SELECT
            pmid,
            -- one row per MeSH heading
            EXPLODE(mesh.MeshHeading) AS mesh_exploded,
            mesh_exploded.DescriptorName._UI AS descriptor_ui,
            mesh_exploded.DescriptorName._VALUE AS descriptor_name,
            -- one row per qualifier; EXPLODE_OUTER still emits a row (with a
            -- NULL qualifier) when a heading has no qualifiers
            EXPLODE_OUTER(
              ARRAYS_ZIP(
                mesh_exploded.QualifierName._UI,
                mesh_exploded.QualifierName._VALUE
              )
            ) AS qualifiers,
            -- the major-topic flag is read from the descriptor only
            mesh_exploded.DescriptorName._MajorTopicYN AS is_major_topic
          FROM
            (
              SELECT
                -- first ArticleId whose _IdType is 'pubmed'
                FILTER(
                  PubmedData.ArticleIdList.ArticleId,
                  x -> x._IdType = 'pubmed'
                )._VALUE [0] AS pmid,
                MedlineCitation.MeshHeadingList AS mesh
              FROM
                openalex.pubmed.pubmed_items
            )
        )
      GROUP BY
        pmid
    -- s.ids is set_fields' map (raw pmid), not the URL-formatted ids output column
    ) m ON s.ids.pmid = m.pmid
);

In [None]:
-- Run OPTIMIZE on openalex<env_suffix>.works.openalex_works_base; the catalog name is assembled from the :env_suffix parameter.
OPTIMIZE identifier('openalex' || :env_suffix || '.works.openalex_works_base');