In [0]:
-- Register get_highest_priority_value: given an array of (field_value, priority)
-- structs, return the field_value of the element with the smallest priority number,
-- skipping elements whose field_value is NULL.
--
-- Parameters:
--   all_structs  candidate values, each paired with its priority (lower number wins).
--   field_name   not referenced by the body; kept so existing call sites keep working.
-- Returns:
--   the winning field_value, or NULL when the array holds no non-NULL field_value.
--
-- Ties on priority keep the element that appears first in the array.
-- A NULL priority is treated as the lowest possible rank, so such a value is
-- still returned when nothing better-ranked exists instead of being dropped.
--
-- Note: IF NOT EXISTS leaves an already-registered definition untouched, so a
-- change to this body only takes effect once the existing function is dropped
-- or replaced.
CREATE FUNCTION IF NOT EXISTS get_highest_priority_value(
    all_structs ARRAY<STRUCT<field_value: STRING, priority: INT>>, field_name STRING
)
RETURNS STRING
RETURN (
    SELECT aggregate(
        filter(all_structs, y -> y.field_value IS NOT NULL),
        -- Sentinel accumulator: its NULL field_value marks "nothing chosen yet".
        struct(CAST(NULL AS STRING) AS field_value, 999 AS priority),
        (acc, x) -> CASE
            -- Every x has a non-NULL field_value (see filter above), so a NULL
            -- accumulator value can only be the sentinel: take the first candidate.
            WHEN acc.field_value IS NULL THEN x
            -- NULL priorities compare as the largest INT, i.e. they rank last.
            WHEN COALESCE(x.priority, 2147483647) < COALESCE(acc.priority, 2147483647) THEN x
            ELSE acc
        END
    ).field_value
);

-- Full CTE chain to final table.
-- Rebuilds the works table with one output row per work_id, combining every mapped
-- location record with source, institution, publisher, authorship and MeSH data.
-- The catalog name is 'openalex' followed by the env_suffix parameter.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.openalex_works') AS (
WITH mat_sources AS (
    -- Source attributes with the display names of the linked institution and publisher.
    SELECT
        s.id AS source_id,
        s.display_name,
        s.issn AS issn_l,
        s.issns,
        s.is_in_doaj,
        s.is_core,
        s.publisher AS source_publisher,
        s.publisher_id,
        s.institution_id,
        s.repository_id,
        -- apc_prices is parsed from JSON text into an array of (price, currency) structs.
        FROM_JSON(s.apc_prices, 'array<struct<price:int, currency:string>>') AS apc_prices,
        s.apc_usd,
        s.type AS source_type,
        i.display_name AS institution_name,
        p.display_name AS publisher_name,
        s.is_in_doaj_start_year,
        s.is_oa_high_oa_rate
    FROM openalex.sources.sources s
    LEFT JOIN openalex.institutions.institutions i ON s.institution_id = i.id
    LEFT JOIN openalex.publishers.publishers p ON s.publisher_id = p.id
),
priority_table AS (
    -- Only the join key and the rank are used downstream, so name them explicitly.
    SELECT provenance, priority FROM openalex.system.priority_table
),
base AS (
    -- Location records joined to their provenance priority and source attributes.
    -- Several expressions below reference aliases defined earlier in this same SELECT
    -- list (is_in_doaj_raw, is_in_doaj_stg, is_oa_high_rate, source_is_oa,
    -- composite_is_oa, host_type, row_num), so their relative order matters.
    SELECT
        a.work_id, a.provenance, a.native_id, a.native_id_namespace, a.best_doi, a.title, a.type,
        a.abstract, a.abstract_inverted_index, b.priority, a.openalex_created_dt, a.openalex_updated_dt,
        s.source_id, s.display_name, s.issn_l, s.issns, s.is_in_doaj,
        -- Treat a price list without a first element as NULL.
        CASE WHEN GET(s.apc_prices,0) IS NULL THEN NULL ELSE s.apc_prices END AS apc_prices,
        s.apc_usd, s.is_core, a.is_oa, COALESCE(a.is_oa, FALSE) AS is_oa_raw,
        COALESCE(s.is_in_doaj, FALSE) AS is_in_doaj_raw,
        -- DOAJ membership only counts when no start year is recorded or the work was
        -- published in or after that year.
        COALESCE(is_in_doaj_raw AND (s.is_in_doaj_start_year IS NULL OR YEAR(a.published_date) >= s.is_in_doaj_start_year), FALSE) AS is_in_doaj_stg,
        COALESCE(s.is_oa_high_oa_rate, FALSE) AS is_oa_high_rate,
        (is_in_doaj_stg OR is_oa_high_rate) AS source_is_oa,
        (is_oa_raw OR source_is_oa) AS composite_is_oa,
        s.is_in_doaj_start_year, s.source_type, a.source_name, a.publisher, a.published_date,
        a.volume, a.issue, a.first_page, a.last_page, a.language, a.authors,
        -- Rewrite the dx.doi.org host to doi.org. The dots are escaped so that they
        -- match literal dots only, not arbitrary characters.
        TRANSFORM(a.urls, x ->
          STRUCT(REGEXP_REPLACE(x.url, 'dx\\.doi\\.org', 'doi.org') AS url, x.content_type)
        ) AS urls,
        a.license, s.institution_name, s.publisher_name, s.institution_id, s.publisher_id, a.version,
        -- arXiv records: prefer the first 'html' url as the landing page.
        CASE WHEN LOWER(a.native_id) LIKE '%arxiv.org%'
          THEN COALESCE(GET(FILTER(a.urls, x -> x.content_type = 'html').url,0), a.landing_page_url)
          ELSE a.landing_page_url
        END AS landing_page_url,
        -- arXiv records: build the PDF link from the third ':'-separated part of native_id.
        -- SPLIT_PART yields '' when that part is missing; NULLIF turns it into NULL so the
        -- CONCAT becomes NULL and the original pdf_url is used instead of a bare prefix.
        CASE WHEN LOWER(a.native_id) LIKE '%arxiv.org%'
          THEN COALESCE(CONCAT('https://arxiv.org/pdf/', NULLIF(SPLIT_PART(a.native_id, ':', 3), '')), a.pdf_url)
          ELSE a.pdf_url
        END AS pdf_url,
        a.is_retracted, s.repository_id,
        -- Rank rows within each (work_id, provenance), newest created_date first.
        ROW_NUMBER() OVER (PARTITION BY a.work_id, a.provenance ORDER BY a.created_date DESC) AS row_num,
        CASE WHEN a.provenance = 'crossref' THEN 'publisher' ELSE 'repository' END AS host_type,
        -- Numeric OA status; the final SELECT maps it to a label:
        -- 1 diamond, 2 gold, 3 hybrid, 4 bronze, 5 green, anything else closed.
        CASE
            WHEN composite_is_oa AND host_type = 'publisher' THEN
                CASE
                    WHEN ZEROIFNULL(s.apc_usd) = 0 AND source_is_oa THEN 1
                    WHEN source_is_oa THEN 2
                    WHEN a.license IS NOT NULL AND a.license != 'publisher-specific-oa' THEN 3
                    ELSE 4  -- covers (a.license IS NULL OR a.license = 'publisher-specific-oa')
                END
            WHEN a.is_oa AND host_type = 'repository' THEN 5
            ELSE 6
        END AS oa_status
    FROM identifier('openalex' || :env_suffix || '.works.locations_mapped') a
    LEFT JOIN priority_table b USING(provenance)
    LEFT JOIN mat_sources s ON a.source_id = s.source_id
    -- Keep at most the 10 newest rows per (work_id, provenance).
    QUALIFY row_num <= 10
),
collect_all_values AS (
    -- Per work: gather every candidate value with its priority, plus the set of locations.
    SELECT
        work_id,
        COLLECT_LIST(STRUCT(best_doi, priority)) AS best_dois,
        COLLECT_LIST(STRUCT(title, priority)) AS titles,
        COLLECT_LIST(STRUCT(publisher, priority)) AS publishers,
        COLLECT_LIST(STRUCT(abstract, priority)) AS abstracts,
        COLLECT_LIST(STRUCT(abstract_inverted_index, priority)) AS abstract_inverted_indexes,
        COLLECT_LIST(STRUCT(volume, priority)) AS volumes,
        COLLECT_LIST(STRUCT(issue, priority)) AS issues,
        COLLECT_LIST(STRUCT(first_page, priority)) AS first_pages,
        COLLECT_LIST(STRUCT(last_page, priority)) AS last_pages,
        COLLECT_LIST(STRUCT(language, priority)) AS languages,
        COLLECT_LIST(STRUCT(type, priority)) AS types,
        FILTER(COLLECT_LIST(STRUCT(published_date, priority)), x -> x.published_date IS NOT NULL) AS published_dates,
        FILTER(COLLECT_LIST(STRUCT(openalex_created_dt, priority)), x -> x.openalex_created_dt IS NOT NULL) AS openalex_created_dts,
        FILTER(COLLECT_LIST(STRUCT(openalex_updated_dt, priority)), x -> x.openalex_updated_dt IS NOT NULL) AS openalex_updated_dts,
        -- Distinct (namespace, id) pairs, excluding the 'pmh' namespace.
        FILTER(COLLECT_SET(STRUCT(native_id_namespace, native_id)), x -> LOWER(x.native_id_namespace) != 'pmh') AS ids,
        COLLECT_SET(
            STRUCT(
                provenance,
                -- Primary ordering key for locations (lower sorts first): crossref, then
                -- published / accepted / submitted versions, each preferring rows with a PDF.
                CASE
                    WHEN provenance = 'crossref' AND best_doi IS NOT NULL THEN 1
                    WHEN provenance = 'crossref' THEN 2
                    WHEN version = 'publishedVersion' AND pdf_url IS NOT NULL THEN 3
                    WHEN version = 'publishedVersion' THEN 4
                    WHEN version = 'acceptedVersion' AND pdf_url IS NOT NULL THEN 5
                    WHEN version = 'acceptedVersion' THEN 6
                    WHEN version = 'submittedVersion' AND pdf_url IS NOT NULL THEN 7
                    WHEN version = 'submittedVersion' THEN 8
                    ELSE 9
                END AS sort_score,
                CASE WHEN host_type = 'repository' THEN is_oa_raw ELSE composite_is_oa END AS is_oa,
                -- Fall back to the first url of the matching content type.
                COALESCE(landing_page_url, GET(FILTER(urls, x -> x.content_type = 'html').url, 0)) AS landing_page_url,
                COALESCE(pdf_url, GET(FILTER(urls, x -> x.content_type = 'pdf').url, 0)) AS pdf_url,
                -- Secondary ordering key, based on substrings of the location's URL.
                CASE
                    WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), 'europepmc.org') THEN 1
                    WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), '/pmc/') THEN 2
                    WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), 'arxiv') THEN 3
                    WHEN CONTAINS(COALESCE(pdf_url, landing_page_url), '.edu') THEN 4
                    ELSE 5
                END AS url_sort_score,
                oa_status,
                STRUCT(
                    CONCAT('https://openalex.org/S', source_id) AS id,
                    COALESCE(display_name, source_name) AS display_name,
                    issn_l, issns,
                    source_is_oa AS is_oa,
                    is_in_doaj, is_core,
                    -- Repositories are hosted by an institution, other sources by a publisher.
                    CASE WHEN source_type = 'repository' THEN CONCAT('https://openalex.org/I', institution_id) ELSE CONCAT('https://openalex.org/P', publisher_id) END AS host_organization,
                    CASE
                        WHEN source_type = 'repository' THEN
                            -- 'Name (X)' becomes 'X - Name'; a name without a
                            -- parenthesised part is kept as is.
                            CASE
                                WHEN REGEXP_EXTRACT(display_name, '\\(([^)]+)\\)', 1) = '' THEN display_name
                                ELSE CONCAT(
                                    REGEXP_EXTRACT(display_name, '\\(([^)]+)\\)', 1),
                                    ' - ',
                                    REGEXP_REPLACE(display_name, '\\s*\\([^)]*\\)', '')
                                )
                            END
                        ELSE publisher_name
                    END AS host_organization_name,
                    source_type AS type
                ) AS source,
                apc_prices, apc_usd, license, version,
                host_type,
                CASE WHEN provenance IN ('repo', 'repo_backfill') THEN repository_id END AS endpoint_id,
                CASE WHEN provenance IN ('repo', 'repo_backfill') THEN native_id END AS pmh_id,
                provenance = 'crossref' AS is_unpaywall_record,
                type AS location_type,
                CAST(openalex_updated_dt AS TIMESTAMP) AS updated
            )
        ) AS locations,
        -- Retracted if any location says so.
        EXISTS(COLLECT_SET(is_retracted), x -> x = TRUE) AS is_retracted,
        ARRAY_CONTAINS(COLLECT_SET(provenance), 'crossref') AS indexed_in_crossref
    FROM base
    GROUP BY work_id
),
set_fields AS (
    -- Per work: pick one value per field by priority, build the ids map, order locations.
    SELECT
        CONCAT('https://openalex.org/W', work_id) AS id,
        get_highest_priority_value(titles, titles.title) AS title,
        CONCAT('https://doi.org/', get_highest_priority_value(best_dois, best_dois.best_doi)) AS best_doi,
        get_highest_priority_value(publishers, publishers.publisher) AS publisher,
        get_highest_priority_value(abstracts, abstracts.abstract) AS abstract,
        get_highest_priority_value(abstract_inverted_indexes, abstract_inverted_indexes.abstract_inverted_index) AS abstract_inverted_index,
        get_highest_priority_value(volumes, volumes.volume) AS volume,
        get_highest_priority_value(issues, issues.issue) AS issue,
        get_highest_priority_value(first_pages, first_pages.first_page) AS first_page,
        get_highest_priority_value(last_pages, last_pages.last_page) AS last_page,
        get_highest_priority_value(languages, languages.language) AS language,
        get_highest_priority_value(types, types.type) AS type,
        TRY_CAST(get_highest_priority_value(openalex_created_dts, openalex_created_dts.openalex_created_dt) AS DATE) AS created_date,
        CAST(get_highest_priority_value(openalex_updated_dts, openalex_updated_dts.openalex_updated_dt) AS TIMESTAMP) AS updated_date,
        TRY_CAST(get_highest_priority_value(published_dates, published_dates.published_date) AS DATE) AS publication_date,
        YEAR(publication_date) AS publication_year,
        -- Map of namespace -> id, seeded with the OpenAlex id; for each namespace only
        -- the first entry encountered is kept, because map keys must be unique.
        MAP_FROM_ENTRIES(
            AGGREGATE(ids, ARRAY(NAMED_STRUCT('native_id_namespace', 'openalex', 'native_id', CONCAT('https://openalex.org/W', CAST(work_id AS STRING)))),
            (acc, x) -> CASE WHEN SIZE(FILTER(acc, y -> y.native_id_namespace = x.native_id_namespace)) = 0 THEN acc || ARRAY(x) ELSE acc END)
        ) AS ids,
        -- Order locations by sort_score, then url_sort_score (both ascending).
        ARRAY_SORT(
            locations,
            (x, y) -> IF(x.sort_score < y.sort_score, -1,
                      IF(x.sort_score > y.sort_score, 1,
                      IF(x.url_sort_score < y.url_sort_score, -1,
                      IF(x.url_sort_score > y.url_sort_score, 1, 0))))
        ) AS locations_sorted,
        authorships,
        is_retracted,
        indexed_in_crossref
    FROM collect_all_values
    LEFT JOIN identifier('openalex' || :env_suffix || '.works.authors_and_affiliations') USING(work_id)
)
-- Final shape of the works table. locations, primary_location and best_oa_location
-- are aliases that later expressions in this SELECT list refer back to.
SELECT
    s.id, s.best_doi AS doi, s.title, s.authorships, s.publication_date, s.publication_year,
    s.abstract, s.abstract IS NOT NULL AS has_abstract, s.abstract_inverted_index, s.ids, s.language, s.publisher,
    STRUCT(s.volume, s.issue, s.first_page, s.last_page) AS biblio,
    -- APC taken from the first price entry of the first (best-ranked) location, if present.
    CASE
        WHEN GET(GET(locations_sorted,0).apc_prices,0).price IS NOT NULL THEN
            STRUCT(
                GET(GET(locations_sorted,0).apc_prices,0).price AS value,
                GET(GET(locations_sorted,0).apc_prices,0).currency,
                GET(locations_sorted,0).apc_usd AS value_usd
            )
    END AS apc_list,
    SIZE(s.locations_sorted) AS locations_count,
    -- Published location shape: drops the two sort keys and the APC fields.
    TRANSFORM(
        s.locations_sorted,
        x -> STRUCT(
            COALESCE(x.is_oa, FALSE) AS is_oa,
            x.landing_page_url,
            x.pdf_url,
            x.source,
            x.license,
            x.version,
            x.oa_status,
            x.host_type,
            x.endpoint_id,
            x.pmh_id,
            x.is_unpaywall_record,
            x.location_type,
            CAST(x.updated AS TIMESTAMP) AS updated,
            x.provenance
        )
    ) AS locations,
    GET(locations, 0) AS primary_location,
    -- First open-access location in sorted order, NULL when there is none.
    GET(FILTER(locations, x -> x.is_oa), 0) AS best_oa_location,
    STRUCT(
        best_oa_location IS NOT NULL AS is_oa,
        CASE
            WHEN best_oa_location.oa_status = 1 THEN 'diamond'
            WHEN best_oa_location.oa_status = 2 THEN 'gold'
            WHEN best_oa_location.oa_status = 3 THEN 'hybrid'
            WHEN best_oa_location.oa_status = 4 THEN 'bronze'
            WHEN best_oa_location.oa_status = 5 THEN 'green'
            ELSE 'closed'
        END AS oa_status,
        COALESCE(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS oa_url,
        -- Placeholder column: always NULL in this query.
        CAST(NULL AS BOOLEAN) AS any_repository_has_fulltext
    ) AS open_access,
    s.type, s.type = 'paratext' AS is_paratext, s.is_retracted, s.indexed_in_crossref, m.mesh_formatted AS mesh, s.created_date, s.updated_date
FROM set_fields s
LEFT JOIN (
    -- MeSH headings per PubMed id: one struct per (descriptor, qualifier) pair.
    SELECT
        pmid,
        COLLECT_LIST(STRUCT(
            descriptor_ui,
            descriptor_name,
            qualifiers._UI AS qualifier_ui,
            qualifiers._VALUE AS qualifier_name,
            CASE WHEN is_major_topic = 'Y' THEN TRUE ELSE FALSE END AS is_major_topic
        )) AS mesh_formatted
    FROM (
        SELECT
            pmid,
            EXPLODE(mesh.MeshHeading) AS mesh_exploded,
            mesh_exploded.DescriptorName._UI AS descriptor_ui,
            mesh_exploded.DescriptorName._VALUE AS descriptor_name,
            -- EXPLODE_OUTER keeps headings that have no qualifiers.
            EXPLODE_OUTER(ARRAYS_ZIP(mesh_exploded.QualifierName._UI, mesh_exploded.QualifierName._VALUE)) AS qualifiers,
            mesh_exploded.DescriptorName._MajorTopicYN AS is_major_topic
        FROM (
            -- The PubMed id is the first ArticleId whose _IdType is 'pubmed'.
            SELECT
                FILTER(PubmedData.ArticleIdList.ArticleId, x -> x._IdType = 'pubmed')._VALUE[0] AS pmid,
                MedlineCitation.MeshHeadingList AS mesh
            FROM openalex.pubmed.pubmed_items
        )
    )
    GROUP BY pmid
) m ON s.ids.pmid = m.pmid
);
