In [0]:
-- get_highest_priority_value
--   Returns the non-null field_value whose priority is numerically lowest
--   (priority 1 is the "best" record). Ties keep the earliest element in the array.
--
--   all_structs : array of (field_value, priority) pairs; entries whose
--                 field_value is NULL are ignored.
--   field_name  : not used by the body; kept so existing callers keep working.
--   returns     : the winning field_value, or NULL when there is no non-null value.
--
--   A NULL priority is ranked last rather than discarded, and no sentinel
--   priority is used, so a value is still returned when it is the only
--   candidate, whatever its priority.
--
--   NOTE(review): because of IF NOT EXISTS, an already-registered function keeps
--   its previous body; it has to be dropped/replaced once for a body change to
--   take effect.
CREATE FUNCTION IF NOT EXISTS get_highest_priority_value(
    all_structs ARRAY<STRUCT<field_value: STRING, priority: INT>>, field_name STRING
  )
  RETURNS STRING
  RETURN
    (
      SELECT
        aggregate(
          filter(all_structs, y -> y.field_value IS NOT NULL),
          -- empty accumulator: field_value stays NULL until the first candidate is seen
          struct(CAST(NULL AS STRING) AS field_value, CAST(NULL AS INT) AS priority),
          (acc, x) -> CASE
            -- the first non-null candidate always replaces the empty accumulator
            WHEN acc.field_value IS NULL THEN x
            -- strictly better priority wins; NULL priority sorts after every real priority
            WHEN coalesce(x.priority, 2147483647) < coalesce(acc.priority, 2147483647) THEN x
            ELSE acc
          END
        ).field_value
    );

-- Materialize sources: source rows with renamed columns, the APC price list
-- parsed from JSON, and the display names of the linked institution/publisher.
CREATE OR REPLACE TEMP VIEW mat_sources AS
SELECT
  src.id                 AS source_id,
  src.display_name,
  src.issn               AS issn_l,
  src.issns,
  src.is_in_doaj,
  src.is_core,
  src.publisher          AS source_publisher,
  src.publisher_id,
  src.institution_id,
  src.repository_id,
  -- apc_prices is stored as a string in openalex.sources.sources; parse it into an array of structs
  from_json(src.apc_prices, 'array<struct<price:int, currency:string>>') AS apc_prices,
  src.apc_usd,
  src.type               AS source_type,
  inst.display_name      AS institution_name,
  pub.display_name       AS publisher_name,
  src.is_in_doaj_start_year,
  src.is_oa_high_oa_rate
FROM openalex.sources.sources AS src
LEFT JOIN openalex.institutions.institutions AS inst
  ON src.institution_id = inst.id
LEFT JOIN openalex.publishers.publishers AS pub
  ON src.publisher_id = pub.id;

-- mesh_pmids: pair each PubMed item's PMID with its MeSH heading list.
CREATE OR REPLACE TEMP VIEW mesh_pmids AS
SELECT
  -- First ArticleId whose _IdType is 'pubmed'. get() returns NULL when the filtered
  -- array is empty, whereas the [0] index operator can raise an out-of-bounds error
  -- when ANSI mode is enabled; this also matches the get(..., 0) usage elsewhere in this file.
  get(FILTER(PubmedData.ArticleIdList.ArticleId, x -> x._IdType = 'pubmed')._VALUE, 0) AS pmid,
  MedlineCitation.MeshHeadingList AS mesh
FROM
  openalex.pubmed.pubmed_items;

-- Mesh view: flatten the MeSH heading list of each pmid into one row per
-- (heading, qualifier) pair, exposing the descriptor and qualifier attributes.
create or replace temp view mesh_base as
select
  pmid,
  -- one output row per MeshHeading entry; the columns below refer back to this alias
  explode(mesh.MeshHeading) as mesh_exploded,
  mesh_exploded.DescriptorName._UI as descriptor_ui,
  mesh_exploded.DescriptorName._VALUE as descriptor_name,
  -- zip each qualifier's _UI with its _VALUE, then explode the pairs;
  -- explode_outer still emits a row (qualifiers = NULL) for a heading without qualifiers
  explode_outer(
    arrays_zip(
      mesh_exploded.QualifierName._UI,
      mesh_exploded.QualifierName._VALUE
    )
  ) as qualifiers,
  -- raw _MajorTopicYN attribute of the descriptor; compared against "Y" in the mesh view
  mesh_exploded.DescriptorName._MajorTopicYN as is_major_topic
from
  mesh_pmids;

-- mesh: fold the flattened rows of mesh_base back into one array of
-- descriptor/qualifier structs per pmid.
CREATE OR REPLACE TEMP VIEW mesh AS
SELECT
  pmid,
  collect_list(
    struct(
      descriptor_ui,
      descriptor_name,
      qualifiers._UI    AS qualifier_ui,
      qualifiers._VALUE AS qualifier_name,
      -- True only for an explicit "Y"; NULL or any other value becomes False.
      -- TODO: confirm with Casey which is_major_topic to use from the original mesh column.
      coalesce(is_major_topic = "Y", False) AS is_major_topic
    )
  ) AS mesh_formatted
FROM mesh_base
GROUP BY pmid;

-- priority_table: session-scoped pass-through of openalex.system.priority_table
-- (joined on provenance further down to obtain each record's priority).
CREATE OR REPLACE TEMP VIEW priority_table AS
SELECT *
FROM openalex.system.priority_table;

-- base: one row per record of locations_mapped, enriched with
--   * the provenance priority (priority_table, joined on provenance),
--   * source metadata (mat_sources, joined on source_id),
--   * derived open-access flags, host_type and a numeric oa_status
--     (1 diamond, 2 gold, 3 hybrid, 4 bronze, 5 green, 6 closed).
-- Only the 10 most recently created rows per (work_id, provenance) are kept.
-- Several columns are lateral aliases that later columns of the same select
-- list build on (is_oa_raw, is_in_doaj_raw, is_in_doaj_stg, is_oa_high_rate,
-- source_is_oa, composite_is_oa, host_type, row_num).
create
or replace temp view base as (
  select
    a.work_id,
    a.provenance,
    a.native_id,
    a.native_id_namespace,
    a.best_doi,
    a.title,
    a.type,
    a.abstract,
    a.abstract_inverted_index,
    b.priority,
    a.openalex_created_dt,
    a.openalex_updated_dt,
    s.source_id as source_id,
    s.display_name as display_name,
    s.issn_l,
    s.issns,
    s.is_in_doaj,
    -- expose apc_prices as NULL when it has no first element
    case when get(s.apc_prices,0) is null then null else s.apc_prices end as apc_prices,
    s.apc_usd,
    s.is_core,
    a.is_oa,
    -- NULL-safe building blocks for the open-access decision
    coalesce(a.is_oa, False) as is_oa_raw,
    coalesce(s.is_in_doaj, False) as is_in_doaj_raw,
    -- in DOAJ, and either no DOAJ start year is recorded or the work was published in/after that year
    coalesce(is_in_doaj_raw and (isnull(s.is_in_doaj_start_year) or year(a.published_date) >= s.is_in_doaj_start_year), False) as is_in_doaj_stg,
    coalesce(s.is_oa_high_oa_rate, False) as is_oa_high_rate,
    (is_in_doaj_stg or is_oa_high_rate) as source_is_oa,
    (is_oa_raw or source_is_oa) as composite_is_oa,
    -- remaining source / location attributes
    s.is_in_doaj_start_year,
    s.source_type,
    a.source_name,
    a.publisher as publisher,
    a.published_date,
    a.volume,
    a.issue,
    a.first_page,
    a.last_page,
    a.language,
    a.authors,
    -- normalise the legacy dx.doi.org host to doi.org; replace() matches the text literally
    -- (a regex pattern would treat each "." as "any character")
    transform(a.urls, x -> struct(replace(x.url, "dx.doi.org", "doi.org") as url, x.content_type)) as urls,
    a.license,
    s.institution_name,
    s.publisher_name,
    s.institution_id,
    s.publisher_id,
    a.version,
    -- manually build arxiv landing_page and pdf urls.
    case 
      when lower(a.native_id) like "%arxiv.org%"
        then coalesce(get(filter(a.urls, x -> x.content_type = 'html').url,0), a.landing_page_url)
      else a.landing_page_url end as landing_page_url,
    case
      when lower(a.native_id) like "%arxiv.org%"
        then coalesce(concat("https://arxiv.org/pdf/", split_part(a.true_native_id, ':', 3)), a.pdf_url)
      else a.pdf_url end as pdf_url,
    a.is_retracted,
    s.repository_id,
    -- rank within (work_id, provenance), newest created_date first; used by the qualify below
    row_number() over (partition by a.work_id, a.provenance order by a.created_date desc) as row_num,
    -- crossref records are publisher-hosted; every other provenance is treated as a repository
    case when a.provenance = "crossref" then 'publisher' else 'repository' end as host_type,
    case
      when composite_is_oa and host_type = 'publisher' and zeroifnull(s.apc_usd) = 0 and source_is_oa then 1 -- diamond
      when composite_is_oa and host_type = 'publisher' and source_is_oa then 2 -- gold
      when composite_is_oa and host_type = 'publisher' and a.license is not null and a.license != "publisher-specific-oa" then 3 -- hybrid
      when composite_is_oa and host_type = 'publisher' and (a.license is null or a.license = 'publisher-specific-oa') then 4 -- bronze 
      when a.is_oa and host_type = 'repository' then 5 -- green
      else 6 -- closed
    end as oa_status
  from
    identifier('openalex' || :env_suffix || '.works.locations_mapped') a
    left join priority_table b using(provenance)
    left join mat_sources s on a.source_id = s.source_id 
    qualify row_num <= 10 -- only grab the most recent 10 locations per provenance. this is generally only relevant to repos & datacite.
);

-- collect_all_values: one row per work_id.
--   * For every scalar column, gather all location values, each paired with its
--     priority in a struct, so the best one can be chosen afterwards.
--   * Gather the distinct (namespace, id) pairs and the distinct location structs.
--   * Derive the work-level is_retracted / indexed_in_crossref flags.
CREATE OR REPLACE TEMP VIEW collect_all_values AS (
  SELECT
    work_id,

    -- (value, priority) candidates; NULL values are dropped later by get_highest_priority_value
    collect_list(struct(best_doi, priority))                AS best_dois,
    collect_list(struct(title, priority))                   AS titles,
    collect_list(struct(publisher, priority))               AS publishers,
    collect_list(struct(abstract, priority))                AS abstracts,
    collect_list(struct(abstract_inverted_index, priority)) AS abstract_inverted_indexes,
    collect_list(struct(volume, priority))                  AS volumes,
    collect_list(struct(issue, priority))                   AS issues,
    collect_list(struct(first_page, priority))              AS first_pages,
    collect_list(struct(last_page, priority))               AS last_pages,
    collect_list(struct(language, priority))                AS languages,
    collect_list(struct(type, priority))                    AS types,

    -- date-like candidates are stripped of NULL values right here
    filter(
      collect_list(struct(published_date, priority)),
      x -> x.published_date IS NOT NULL
    ) AS published_dates,
    filter(
      collect_list(struct(openalex_created_dt, priority)),
      x -> x.openalex_created_dt IS NOT NULL
    ) AS openalex_created_dts,
    filter(
      collect_list(struct(openalex_updated_dt, priority)),
      x -> x.openalex_updated_dt IS NOT NULL
    ) AS openalex_updated_dts,

    -- distinct identifiers, leaving out the 'pmh' namespace
    filter(
      collect_set(struct(native_id_namespace, native_id)),
      x -> lower(x.native_id_namespace) != 'pmh'
    ) AS ids,

    -- distinct location structs
    collect_set(
      struct(
        provenance, -- marked for deletion
        -- primary ordering key: lower is better
        CASE
          WHEN provenance = 'crossref' THEN
            CASE WHEN best_doi IS NOT NULL THEN 1 ELSE 2 END   -- publisher with / without a doi
          WHEN version = 'publishedVersion' THEN
            CASE WHEN pdf_url IS NOT NULL THEN 3 ELSE 4 END    -- published version with / without a pdf url
          WHEN version = 'acceptedVersion' THEN
            CASE WHEN pdf_url IS NOT NULL THEN 5 ELSE 6 END    -- accepted version with / without a pdf url
          WHEN version = 'submittedVersion' THEN
            CASE WHEN pdf_url IS NOT NULL THEN 7 ELSE 8 END    -- submitted version with / without a pdf url
          ELSE 9
        END AS sort_score,
        -- repositories rely on the location's own flag; publishers also count the source-level flag
        CASE WHEN host_type = 'repository' THEN is_oa_raw ELSE composite_is_oa END AS is_oa,
        -- open question: should we remove coalesce here and only take the landing_page_url?
        coalesce(landing_page_url, get(filter(urls, x -> x.content_type = "html").url, 0)) AS landing_page_url,
        -- open question: should we remove coalesce here and only take the pdf_url?
        coalesce(pdf_url, get(filter(urls, x -> x.content_type = "pdf").url, 0)) AS pdf_url,
        -- secondary ordering key, derived from the host found in the url
        CASE
          WHEN contains(coalesce(pdf_url, landing_page_url), "europepmc.org") THEN 1
          WHEN contains(coalesce(pdf_url, landing_page_url), "/pmc/") THEN 2
          WHEN contains(coalesce(pdf_url, landing_page_url), "arxiv") THEN 3
          WHEN contains(coalesce(pdf_url, landing_page_url), ".edu") THEN 4
          ELSE 5
        END AS url_sort_score,
        oa_status,
        struct(
          concat("https://openalex.org/S", source_id) AS id,
          -- falls back to the location's own source_name when no source row was matched
          coalesce(display_name, source_name) AS display_name,
          issn_l,
          issns,
          source_is_oa AS is_oa,
          is_in_doaj,
          is_core,
          CASE
            WHEN source_type = 'repository' THEN concat('https://openalex.org/I', institution_id)
            ELSE concat('https://openalex.org/P', publisher_id)
          END AS host_organization,
          -- repository names are reshaped to "Institution_Name - Repository_Name"
          -- (e.g. PubMed Central - Europe PMC) to match prod Unpaywall vs Wunpaywall
          CASE
            WHEN source_type = 'repository' THEN
              CASE
                WHEN REGEXP_EXTRACT(display_name, '\\(([^)]+)\\)', 1) = "" THEN display_name
                ELSE concat(
                  REGEXP_EXTRACT(display_name, '\\(([^)]+)\\)', 1),
                  " - ",
                  REGEXP_REPLACE(display_name, '\\s*\\([^)]*\\)', '')
                )
              END
            ELSE publisher_name
          END AS host_organization_name,
          source_type AS type
        ) AS source,
        apc_prices,
        apc_usd,
        license,
        version,
        host_type,
        CASE WHEN provenance IN ('repo', 'repo_backfill') THEN repository_id END AS endpoint_id,
        CASE WHEN provenance IN ('repo', 'repo_backfill') THEN native_id END AS pmh_id,
        provenance = 'crossref' AS is_unpaywall_record,
        type AS location_type,
        CAST(openalex_updated_dt AS TIMESTAMP) AS updated -- changed from: openalex_updated_dt as updated
      )
    ) AS locations,

    -- work-level flags
    exists(collect_set(is_retracted), x -> x = True) AS is_retracted,
    array_contains(collect_set(provenance), 'crossref') AS indexed_in_crossref
  FROM base
  GROUP BY work_id
);
-- set_fields: resolve each work to a single value per column.
--   For every STRING column, walk its collected (value, priority) candidates and
--   keep the one with the best priority, i.e. the numerically lowest one
--   (priority = 1 is the "best" record). Also builds the ids map, sorts the
--   locations and attaches authorships.
CREATE OR REPLACE TEMP VIEW set_fields AS (
  SELECT
    concat('https://openalex.org/W', work_id) AS id,
    get_highest_priority_value(titles, titles.title) AS title,
    concat(
      "https://doi.org/",
      get_highest_priority_value(best_dois, best_dois.best_doi)
    ) AS best_doi,
    get_highest_priority_value(publishers, publishers.publisher) AS publisher,
    get_highest_priority_value(abstracts, abstracts.abstract) AS abstract,
    get_highest_priority_value(
      abstract_inverted_indexes,
      abstract_inverted_indexes.abstract_inverted_index
    ) AS abstract_inverted_index,
    get_highest_priority_value(volumes, volumes.volume) AS volume,
    get_highest_priority_value(issues, issues.issue) AS issue,
    get_highest_priority_value(first_pages, first_pages.first_page) AS first_page,
    get_highest_priority_value(last_pages, last_pages.last_page) AS last_page,
    get_highest_priority_value(languages, languages.language) AS language,
    get_highest_priority_value(types, types.type) AS type,

    -- despite its name, openalex_created_dt is a date, not a timestamp, in locations_mapped
    try_cast(
      get_highest_priority_value(
        openalex_created_dts,
        openalex_created_dts.openalex_created_dt
      ) AS DATE
    ) AS created_date,
    -- plain cast here (changed from try_cast)
    CAST(
      get_highest_priority_value(
        openalex_updated_dts,
        openalex_updated_dts.openalex_updated_dt
      ) AS TIMESTAMP
    ) AS updated_date,
    try_cast(
      get_highest_priority_value(published_dates, published_dates.published_date) AS DATE
    ) AS publication_date,
    year(publication_date) AS publication_year,

    -- namespace -> id map: starts with the OpenAlex id, then adds each collected id
    -- whose namespace is not in the accumulator yet (first one seen per namespace wins)
    map_from_entries(
      aggregate(
        ids,
        array(
          named_struct(
            'native_id_namespace', 'openalex',
            'native_id', concat('https://openalex.org/W', CAST(work_id AS STRING))
          )
        ),
        (acc, x) -> CASE
          WHEN size(filter(acc, y -> y.native_id_namespace = x.native_id_namespace)) > 0 THEN acc
          ELSE concat(acc, array(x))
        END
      )
    ) AS ids,

    -- order locations by sort_score, then by url_sort_score (both ascending)
    array_sort(
      locations,
      (x, y) -> CASE
        WHEN x.sort_score < y.sort_score THEN -1
        WHEN x.sort_score > y.sort_score THEN 1
        WHEN x.url_sort_score < y.url_sort_score THEN -1
        WHEN x.url_sort_score > y.url_sort_score THEN 1
        ELSE 0
      END
    ) AS locations_sorted,
    authorships,
    is_retracted,
    indexed_in_crossref
  FROM collect_all_values
  LEFT JOIN identifier('openalex' || :env_suffix || '.works.authors_and_affiliations') USING (work_id)
);

-- openalex_works: final works table, built from set_fields plus the MeSH terms
-- matched through the pmid entry of the ids map.
-- (For debugging, the same select can be created as a temp view instead:
--  or replace temp view openalex_works_temp_view as ...)
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.openalex_works') AS (
  SELECT
    s.id,
    s.best_doi AS doi,
    s.title,
    s.authorships,
    s.publication_date,
    s.publication_year,
    s.abstract,
    s.abstract IS NOT NULL AS has_abstract,
    s.abstract_inverted_index,
    s.ids,
    s.language,
    s.publisher,
    struct(s.volume, s.issue, s.first_page, s.last_page) AS biblio,

    -- price, currency and usd value taken from the primary (first sorted) location;
    -- NULL when that location has no first price
    CASE
      WHEN get(get(locations_sorted,0).apc_prices,0).price IS NOT NULL THEN
        struct(
          get(get(locations_sorted,0).apc_prices,0).price AS value,
          get(get(locations_sorted,0).apc_prices,0).currency,
          get(locations_sorted,0).apc_usd AS value_usd
        )
    END AS apc_list,

    size(s.locations_sorted) AS locations_count,

    -- public shape of each location, in sorted order
    transform(
      s.locations_sorted,
      x -> struct(
        coalesce(x.is_oa, False) AS is_oa,
        x.landing_page_url,
        x.pdf_url,
        x.source,
        x.license,
        x.version,
        x.oa_status,
        x.host_type,
        x.endpoint_id,
        x.pmh_id,
        x.is_unpaywall_record,
        x.location_type,
        CAST(x.updated AS TIMESTAMP) AS updated,
        x.provenance -- marked for deletion
      )
    ) AS locations,
    get(locations, 0) AS primary_location,
    -- first location that is_oa; taking the first one works only because the
    -- locations are already sorted by sort_score and url_sort_score
    get(filter(locations, x -> x.is_oa), 0) AS best_oa_location,

    struct(
      best_oa_location IS NOT NULL AS is_oa,
      -- translate the numeric status of the best OA location into its label
      CASE best_oa_location.oa_status
        WHEN 1 THEN 'diamond'
        WHEN 2 THEN 'gold'
        WHEN 3 THEN 'hybrid'
        WHEN 4 THEN 'bronze'
        WHEN 5 THEN 'green'
        ELSE 'closed'
      END AS oa_status,
      coalesce(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS oa_url,
      CAST(NULL AS BOOLEAN) AS any_repository_has_fulltext
    ) AS open_access,

    s.type,
    s.type = 'paratext' AS is_paratext,
    s.is_retracted,
    s.indexed_in_crossref, -- so that unpaywall knows which works have a crossref location
    m.mesh_formatted AS mesh,
    s.created_date,
    s.updated_date
  FROM set_fields AS s
  LEFT JOIN mesh AS m
    ON s.ids.pmid = m.pmid
);
