### Creates `openalex.works.superlocations` in the Walden end-to-end workflow

In [0]:
-- (Re)creates the superlocations table. The catalog name is 'openalex'
-- concatenated with the :env_suffix parameter.
create or replace table
  identifier('openalex' || :env_suffix || '.works.superlocations')
as (
  with
  -- Best landing-page URL per DOI (joined onto Crossref records further down).
  -- Candidates are locations_parsed rows with provenance = 'landing_page' whose
  -- ids array contains a 'doi' entry; exactly one row per DOI survives.
  landing_page_urls_for_crossref as (
    select
      doi,
      landing_page_url
    from (
      -- Pull the DOI out of ids once so the ranking can refer to it by name.
      select
        get(filter(ids, x -> x.namespace = 'doi').id, 0) as doi,
        native_id as landing_page_url,
        updated_date
      from
        identifier('openalex' || :env_suffix || '.works.locations_parsed')
      where
        provenance = 'landing_page'
        and array_contains(ids.namespace, 'doi')
    ) candidates
    -- Keep the top-ranked URL of each DOI (lower rank = preferred).
    qualify row_number() over (
      partition by doi
      order by
        case
          -- Highest priority: official DOI links
          when contains(landing_page_url, 'doi.org/') then 1
          -- High priority: publisher domain patterns
          when contains(landing_page_url, 'elsevier.com')
            or contains(landing_page_url, 'springer.com')
            or contains(landing_page_url, 'wiley.com')
            or contains(landing_page_url, 'sagepub.com')
            or contains(landing_page_url, 'ieee.org')
            or contains(landing_page_url, 'tandfonline.com')
            or contains(landing_page_url, 'oup.com')
            or contains(landing_page_url, 'nature.com')
            or contains(landing_page_url, 'acs.org')
            then 2
          -- Medium priority: institutional repositories
          when contains(landing_page_url, 'edu/')
            or contains(landing_page_url, 'ac.uk')
            or contains(landing_page_url, 'gov/')
            then 3
          -- Demote PDF links.
          -- NOTE(review): CASE stops at the first matching arm, so a '.pdf' URL
          -- on one of the hosts above keeps that host's rank; only otherwise
          -- unmatched PDF links reach this arm. Confirm that is intended.
          when contains(lower(landing_page_url), '.pdf') then 10
          -- Everything else
          else 5
        end asc,
        -- Most recently updated row wins among equal ranks
        updated_date desc nulls last,
        -- Final alphabetical tiebreaker for determinism
        landing_page_url asc nulls last
    ) = 1
  ),

  -- Best PDF URL per DOI (joined onto Crossref records further down), together
  -- with the ids stored under the 'docs.pdf' and 'docs.parsed-pdf' namespaces.
  -- Candidates are locations_parsed rows with provenance = 'pdf' whose ids array
  -- contains a 'doi' entry; exactly one row per DOI survives.
  pdf_urls_for_crossref as (
    select
      doi,
      pdf_url,
      pdf_s3_id,
      grobid_s3_id
    from (
      -- Extract every id once so the ranking can refer to them by name.
      select
        get(filter(ids, x -> x.namespace = 'doi').id, 0) as doi,
        native_id as pdf_url,
        get(filter(ids, x -> x.namespace = 'docs.pdf').id, 0) as pdf_s3_id,
        get(filter(ids, x -> x.namespace = 'docs.parsed-pdf').id, 0) as grobid_s3_id,
        version,
        updated_date
      from
        identifier('openalex' || :env_suffix || '.works.locations_parsed')
      where
        provenance = 'pdf'
        and array_contains(ids.namespace, 'doi')
    ) candidates
    qualify row_number() over (
      partition by doi
      order by
        -- First prioritize by version
        case version
          when 'publishedVersion' then 1
          when 'acceptedVersion' then 2
          when 'submittedVersion' then 3
          else 4
        end asc,
        -- Then by whether it has a grobid id. Rank on presence only: sorting on
        -- the id value itself would let the alphabetically smallest id win and
        -- keep the URL-pattern and date keys below from ever being consulted.
        case when grobid_s3_id is not null then 0 else 1 end asc,
        -- Then by URL pattern - prioritize official publisher domains
        case
          when contains(pdf_url, 'elsevier.com')
            or contains(pdf_url, 'springer.com')
            or contains(pdf_url, 'wiley.com')
            or contains(pdf_url, 'sagepub.com')
            or contains(pdf_url, 'ieee.org')
            or contains(pdf_url, 'tandfonline.com')
            or contains(pdf_url, 'oup.com')
            or contains(pdf_url, 'nature.com')
            or contains(pdf_url, 'acs.org')
            then 1
          -- Lower priority for less reliable sources
          when contains(pdf_url, 'researchgate') then 3
          when contains(pdf_url, 'academia.edu') then 4
          else 2
        end asc,
        -- Date as tiebreaker
        updated_date desc nulls last,
        -- Alphabetical tiebreakers for complete determinism; the ids come last
        -- so rows sharing a URL are still ordered reproducibly.
        pdf_url asc nulls last,
        grobid_s3_id asc nulls last,
        pdf_s3_id asc nulls last
    ) = 1
  ),
  
  -- One row per crossref / landing_page / pdf location, keyed by the DOI it
  -- describes, with a per-field priority rank (lower = preferred) that the
  -- following CTEs use to pick the best license, references and authors.
  other_fields_for_crossref as (
    select
      -- Crossref rows hold the DOI in native_id; the other provenances hold it
      -- in their ids array under the 'doi' namespace.
      case
        when provenance = 'crossref' then native_id
        else get(filter(ids, x -> x.namespace = 'doi').id, 0)
      end as best_doi,
      provenance,
      license,
      is_oa,
      -- License preference: landing page, then pdf, then crossref.
      case provenance
        when 'landing_page' then 1
        when 'pdf' then 2
        when 'crossref' then 3
      end as license_priority,
      references,
      -- References preference: crossref, then pdf. landing_page rows get NULL.
      case provenance
        when 'crossref' then 1
        when 'pdf' then 2
      end as references_priority,
      authors,
      -- Authors preference: crossref, then landing page, then pdf.
      case provenance
        when 'crossref' then 1
        when 'landing_page' then 2
        when 'pdf' then 3
      end as authors_priority,
      updated_date,
      native_id  -- carried along as a deterministic sort tiebreaker
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed')
    where
      provenance in ('crossref', 'landing_page', 'pdf')
  ),
  
  -- Per-DOI aggregation of the candidate rows: one output row per best_doi with
  -- the open-access flag and, for each enriched field, its ordered candidate list.
  --
  -- The comparators below return 0 for ties (as array_sort expects) and sort
  -- NULL keys last, so each one is a consistent total order and the result does
  -- not depend on the order in which collect_list gathered the rows.
  staging_other_fields_for_crossref as (
    select
      best_doi,

      -- TRUE when any contributing row is flagged open access. exists() does not
      -- depend on element order, so the list is not sorted first. The flag is
      -- wrapped in a struct so that rows with a NULL flag stay in the list.
      exists(
        collect_list(struct(is_oa)).is_oa,
        x -> x = true
      ) as is_oa,

      -- Licenses: best priority first, then newest, then native_id;
      -- NULL / empty licenses are removed after sorting.
      filter(
        array_sort(
          collect_list(struct(license, license_priority, updated_date, native_id)),
          (a, b) -> case
                      when a.license_priority < b.license_priority then -1
                      when a.license_priority > b.license_priority then 1
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ),
        x -> nullif(x.license, '') is not null
      ) as license_list,

      -- Authors: best priority first, then newest, then native_id.
      array_sort(
        collect_list(struct(authors, authors_priority, updated_date, native_id)),
        (a, b) -> case
                    when a.authors_priority < b.authors_priority then -1
                    when a.authors_priority > b.authors_priority then 1
                    when a.updated_date > b.updated_date then -1
                    when a.updated_date < b.updated_date then 1
                    when a.updated_date is not null and b.updated_date is null then -1
                    when a.updated_date is null and b.updated_date is not null then 1
                    when a.native_id < b.native_id then -1
                    when a.native_id > b.native_id then 1
                    when a.native_id is not null and b.native_id is null then -1
                    when a.native_id is null and b.native_id is not null then 1
                    else 0
                  end
      ) as authors_list,

      -- References: best priority first, then newest, then native_id;
      -- rows without references are removed after sorting.
      filter(
        array_sort(
          collect_list(struct(references, references_priority, updated_date, native_id)),
          (a, b) -> case
                      when a.references_priority < b.references_priority then -1
                      when a.references_priority > b.references_priority then 1
                      -- landing_page rows carry a NULL rank: place them after ranked rows
                      when a.references_priority is not null and b.references_priority is null then -1
                      when a.references_priority is null and b.references_priority is not null then 1
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ),
        x -> x.references is not null
      ) as references_list
    from
      other_fields_for_crossref
    where
      best_doi is not null
    group by
      best_doi
  ),
  
  -- Picks, per DOI, the single best license, authors and references value:
  -- only entries holding the lowest priority rank present in the list are kept,
  -- then the newest one wins (NULL dates last), then an alphabetical tiebreak.
  -- Comparators return 0 for ties so that array_sort gets a consistent order.
  best_other_fields_for_crossref as (
    select
      best_doi,
      is_oa,

      -- First license after the deterministic sort
      get(
        array_sort(
          filter(
            license_list,
            x -> x.license_priority = array_min(license_list.license_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.license < b.license then -1
                      when a.license > b.license then 1
                      else 0
                    end
        ).license,
        0
      ) as license,

      -- First authors value after the deterministic sort.
      -- NOTE(review): crossref_superlocations does not read this column (it
      -- takes authors from crossref_super_authorships or the record itself);
      -- confirm whether it is still needed.
      get(
        array_sort(
          filter(
            authors_list,
            x -> x.authors_priority = array_min(authors_list.authors_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ).authors,
        0
      ) as authors,

      -- First references value after the deterministic sort
      get(
        array_sort(
          filter(
            references_list,
            x -> x.references_priority = array_min(references_list.references_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ).references,
        0
      ) as references
    from
      staging_other_fields_for_crossref
  ),
  
  -- Crossref records enriched with the best PDF URL, landing-page URL, license,
  -- references and open-access flag found for the same DOI, plus authorships
  -- from crossref_super_authorships when a row exists there.
  -- The select list is positional: superlocations_unioned stacks this CTE with
  -- `select *` branches, so the column order here has to match theirs.
  -- NOTE(review): assumes crossref_super_authorships has at most one row per
  -- doi; otherwise this join multiplies Crossref rows - confirm.
  crossref_superlocations as (
    select
      c.provenance,
      c.native_id,
      c.native_id_namespace,
      c.title,
      c.normalized_title,
      coalesce(a.authorships, c.authors) as authors,
      c.ids,
      c.raw_native_type,
      c.type,
      c.version,
      nullif(o.license, '') as license,
      c.language,
      c.published_date,
      c.created_date,
      c.updated_date,
      c.issue,
      c.volume,
      c.first_page,
      c.last_page,
      c.is_retracted,
      c.abstract,
      c.source_name,
      c.publisher,
      c.funders,
      o.references,
      c.urls,
      c.mesh,
      o.is_oa,
      c.abstract_inverted_index,
      c.authors_exist,
      c.affiliations_exist,
      c.is_corresponding_exists,
      c.best_doi,
      c.merge_key, -- 34th column (debugging)
      p.pdf_url,
      l.landing_page_url,
      p.pdf_s3_id,
      p.grobid_s3_id
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed') c
      left join pdf_urls_for_crossref p
        on c.native_id = p.doi
      left join landing_page_urls_for_crossref l
        on c.native_id = l.doi
      left join best_other_fields_for_crossref o
        on c.native_id = o.best_doi
      left join identifier('openalex' || :env_suffix || '.works.crossref_super_authorships') a
        on c.native_id = a.doi
    where
      -- Qualified so the filter cannot become ambiguous if a joined relation
      -- ever exposes its own provenance column.
      c.provenance = 'crossref'
  ),
  
  -- Best landing-page URL per PMH id (joined onto repository records further
  -- down). Candidates are locations_parsed rows with provenance = 'landing_page'
  -- whose ids array contains a 'pmh' entry; exactly one row per PMH id survives.
  landing_page_urls_for_repo as (
    select
      pmh,
      landing_page_url
    from (
      -- Pull the PMH id out of ids once so the ranking can refer to it by name.
      select
        get(filter(ids, x -> x.namespace = 'pmh').id, 0) as pmh,
        native_id as landing_page_url,
        updated_date
      from
        identifier('openalex' || :env_suffix || '.works.locations_parsed')
      where
        provenance = 'landing_page'
        and array_contains(ids.namespace, 'pmh')
    ) candidates
    -- Keep the top-ranked URL of each PMH id (lower rank = preferred).
    qualify row_number() over (
      partition by pmh
      order by
        case
          -- Preprint servers first
          when contains(landing_page_url, 'arxiv.org')
            or contains(landing_page_url, 'biorxiv.org')
            or contains(landing_page_url, 'medrxiv.org')
            then 1
          -- Then general-purpose repositories
          when contains(landing_page_url, 'ssrn.com')
            or contains(landing_page_url, 'zenodo.org')
            or contains(landing_page_url, 'figshare.com')
            then 2
          -- Then institutional / government hosts
          when contains(landing_page_url, 'edu/')
            or contains(landing_page_url, 'ac.uk')
            or contains(landing_page_url, 'gov/')
            then 3
          -- PDF links that matched none of the hosts above go last
          when contains(lower(landing_page_url), '.pdf') then 10
          else 5
        end asc,
        -- Newest row, then alphabetical URL, as tiebreakers
        updated_date desc nulls last,
        landing_page_url asc nulls last
    ) = 1
  ),
  
  -- Best PDF URL per PMH id (joined onto repository records further down),
  -- together with the ids stored under the 'docs.pdf' and 'docs.parsed-pdf'
  -- namespaces. Candidates are locations_parsed rows with provenance = 'pdf'
  -- whose ids array contains a 'pmh' entry; exactly one row per PMH id survives.
  pdf_urls_for_repo as (
    select
      pmh,
      pdf_url,
      pdf_s3_id,
      grobid_s3_id
    from (
      -- Extract every id once so the ranking can refer to them by name.
      select
        get(filter(ids, x -> x.namespace = 'pmh').id, 0) as pmh,
        native_id as pdf_url,
        get(filter(ids, x -> x.namespace = 'docs.pdf').id, 0) as pdf_s3_id,
        get(filter(ids, x -> x.namespace = 'docs.parsed-pdf').id, 0) as grobid_s3_id,
        version,
        updated_date
      from
        identifier('openalex' || :env_suffix || '.works.locations_parsed')
      where
        provenance = 'pdf'
        and array_contains(ids.namespace, 'pmh')
    ) candidates
    qualify row_number() over (
      partition by pmh
      order by
        -- First by version
        case version
          when 'publishedVersion' then 1
          when 'acceptedVersion' then 2
          when 'submittedVersion' then 3
          else 4
        end asc,
        -- Then by whether it has a grobid id. Rank on presence only: sorting on
        -- the id value itself would let the alphabetically smallest id win and
        -- keep the URL-pattern and date keys below from ever being consulted.
        case when grobid_s3_id is not null then 0 else 1 end asc,
        -- Then by URL pattern
        case
          when contains(pdf_url, 'arxiv.org')
            or contains(pdf_url, 'biorxiv.org')
            or contains(pdf_url, 'medrxiv.org')
            then 1
          when contains(pdf_url, 'edu/')
            or contains(pdf_url, 'ac.uk')
            or contains(pdf_url, 'gov/')
            then 2
          else 3
        end asc,
        -- Date as tiebreaker
        updated_date desc nulls last,
        -- Alphabetical tiebreakers for complete determinism; the ids come last
        -- so rows sharing a URL are still ordered reproducibly.
        pdf_url asc nulls last,
        grobid_s3_id asc nulls last,
        pdf_s3_id asc nulls last
    ) = 1
  ),
  
  -- One row per repo / repo_backfill / landing_page / pdf location, keyed by the
  -- PMH id it describes, with a per-field priority rank (lower = preferred) that
  -- the following CTEs use to pick the best version, license, references and
  -- authors.
  other_fields_for_repo as (
    select
      -- Repository rows hold the PMH id in native_id; the other provenances
      -- hold it in their ids array under the 'pmh' namespace.
      case
        when provenance in ('repo', 'repo_backfill') then native_id
        else get(filter(ids, x -> x.namespace = 'pmh').id, 0)
      end as best_pmh,
      provenance,
      is_oa,
      version,
      -- NOTE(review): submittedVersion ranks first here, the reverse of the
      -- version order used when ranking PDF URLs in pdf_urls_for_repo -
      -- confirm that preferring the earliest version is intended.
      case version
        when 'submittedVersion' then 1
        when 'acceptedVersion' then 2
        when 'publishedVersion' then 3
        else 4
      end as version_priority,
      license,
      -- License preference: landing page, then pdf, then the repository record.
      case
        when provenance = 'landing_page' then 1
        when provenance = 'pdf' then 2
        when provenance in ('repo', 'repo_backfill') then 3
      end as license_priority,
      references,
      -- References preference: repository record, then pdf.
      -- landing_page rows get NULL.
      case
        when provenance in ('repo', 'repo_backfill') then 1
        when provenance = 'pdf' then 2
      end as references_priority,
      authors,
      -- Authors preference: repository record, then landing page, then pdf.
      case
        when provenance in ('repo', 'repo_backfill') then 1
        when provenance = 'landing_page' then 2
        when provenance = 'pdf' then 3
      end as authors_priority,
      updated_date,
      native_id  -- carried along as a deterministic sort tiebreaker
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed')
    where
      provenance in ('repo', 'repo_backfill', 'landing_page', 'pdf')
  ),
  
  -- Per-PMH-id aggregation of the candidate rows: one output row per best_pmh
  -- with the open-access flag and, for each enriched field, its ordered
  -- candidate list.
  --
  -- The comparators below return 0 for ties (as array_sort expects) and sort
  -- NULL keys last, so each one is a consistent total order and the result does
  -- not depend on the order in which collect_list gathered the rows.
  staging_other_fields_for_repo as (
    select
      best_pmh,

      -- TRUE when any contributing row is flagged open access. exists() does not
      -- depend on element order, so the list is not sorted first. The flag is
      -- wrapped in a struct so that rows with a NULL flag stay in the list.
      exists(
        collect_list(struct(is_oa)).is_oa,
        x -> x = true
      ) as is_oa,

      -- Licenses: best priority first, then newest, then native_id;
      -- NULL / empty licenses are removed after sorting.
      filter(
        array_sort(
          collect_list(struct(license, license_priority, updated_date, native_id)),
          (a, b) -> case
                      when a.license_priority < b.license_priority then -1
                      when a.license_priority > b.license_priority then 1
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ),
        x -> nullif(x.license, '') is not null
      ) as license_list,

      -- Authors: best priority first, then newest, then native_id.
      array_sort(
        collect_list(struct(authors, authors_priority, updated_date, native_id)),
        (a, b) -> case
                    when a.authors_priority < b.authors_priority then -1
                    when a.authors_priority > b.authors_priority then 1
                    when a.updated_date > b.updated_date then -1
                    when a.updated_date < b.updated_date then 1
                    when a.updated_date is not null and b.updated_date is null then -1
                    when a.updated_date is null and b.updated_date is not null then 1
                    when a.native_id < b.native_id then -1
                    when a.native_id > b.native_id then 1
                    when a.native_id is not null and b.native_id is null then -1
                    when a.native_id is null and b.native_id is not null then 1
                    else 0
                  end
      ) as authors_list,

      -- Versions: best priority first, then newest, then native_id;
      -- NULL / empty versions are removed after sorting.
      filter(
        array_sort(
          collect_list(struct(version, version_priority, updated_date, native_id)),
          (a, b) -> case
                      when a.version_priority < b.version_priority then -1
                      when a.version_priority > b.version_priority then 1
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ),
        x -> nullif(x.version, '') is not null
      ) as version_list,

      -- References: best priority first, then newest, then native_id;
      -- rows without references are removed after sorting.
      filter(
        array_sort(
          collect_list(struct(references, references_priority, updated_date, native_id)),
          (a, b) -> case
                      when a.references_priority < b.references_priority then -1
                      when a.references_priority > b.references_priority then 1
                      -- landing_page rows carry a NULL rank: place them after ranked rows
                      when a.references_priority is not null and b.references_priority is null then -1
                      when a.references_priority is null and b.references_priority is not null then 1
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ),
        x -> x.references is not null
      ) as references_list
    from
      other_fields_for_repo
    where
      best_pmh is not null
    group by
      best_pmh
  ),
  
  -- Picks, per PMH id, the single best license, version, authors and references
  -- value: only entries holding the lowest priority rank present in the list are
  -- kept, then the newest one wins (NULL dates last), then an alphabetical
  -- tiebreak. Comparators return 0 for ties so that array_sort gets a
  -- consistent order.
  best_other_fields_for_repo as (
    select
      best_pmh,
      is_oa,

      -- First license after the deterministic sort
      get(
        array_sort(
          filter(
            license_list,
            x -> x.license_priority = array_min(license_list.license_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.license < b.license then -1
                      when a.license > b.license then 1
                      else 0
                    end
        ).license,
        0
      ) as license,

      -- First version after the deterministic sort
      get(
        array_sort(
          filter(
            version_list,
            x -> x.version_priority = array_min(version_list.version_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.version < b.version then -1
                      when a.version > b.version then 1
                      else 0
                    end
        ).version,
        0
      ) as version,

      -- First authors value after the deterministic sort.
      -- NOTE(review): repo_superlocations does not read this column (it takes
      -- authors from repo_super_authorships or the record itself); confirm
      -- whether it is still needed.
      get(
        array_sort(
          filter(
            authors_list,
            x -> x.authors_priority = array_min(authors_list.authors_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ).authors,
        0
      ) as authors,

      -- First references value after the deterministic sort
      get(
        array_sort(
          filter(
            references_list,
            x -> x.references_priority = array_min(references_list.references_priority)
          ),
          (a, b) -> case
                      when a.updated_date > b.updated_date then -1
                      when a.updated_date < b.updated_date then 1
                      when a.updated_date is not null and b.updated_date is null then -1
                      when a.updated_date is null and b.updated_date is not null then 1
                      when a.native_id < b.native_id then -1
                      when a.native_id > b.native_id then 1
                      when a.native_id is not null and b.native_id is null then -1
                      when a.native_id is null and b.native_id is not null then 1
                      else 0
                    end
        ).references,
        0
      ) as references
    from
      staging_other_fields_for_repo
  ),
  
  -- Repository records (provenance repo / repo_backfill) enriched with the best
  -- PDF URL, landing-page URL, version, license, references and open-access
  -- flag found for the same PMH id, plus authorships from
  -- repo_super_authorships when a row exists there.
  -- The select list is positional: superlocations_unioned stacks this CTE with
  -- `select *` branches, so the column order here has to match theirs.
  -- NOTE(review): assumes repo_super_authorships has at most one row per
  -- pmh_id; otherwise this join multiplies repository rows - confirm.
  repo_superlocations as (
    select
      c.provenance,
      c.native_id,
      c.native_id_namespace,
      c.title,
      c.normalized_title,
      coalesce(a.authorships, c.authors) as authors,
      c.ids,
      c.raw_native_type,
      c.type,
      nullif(o.version, '') as version,
      nullif(o.license, '') as license,
      c.language,
      c.published_date,
      c.created_date,
      c.updated_date,
      c.issue,
      c.volume,
      c.first_page,
      c.last_page,
      c.is_retracted,
      c.abstract,
      c.source_name,
      c.publisher,
      c.funders,
      o.references,
      c.urls,
      c.mesh,
      o.is_oa,
      c.abstract_inverted_index,
      c.authors_exist,
      c.affiliations_exist,
      c.is_corresponding_exists,
      c.best_doi,
      c.merge_key,
      p.pdf_url,
      l.landing_page_url,
      p.pdf_s3_id,
      p.grobid_s3_id
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed') c
      left join pdf_urls_for_repo p
        on c.native_id = p.pmh
      left join landing_page_urls_for_repo l
        on c.native_id = l.pmh
      left join best_other_fields_for_repo o
        on c.native_id = o.best_pmh
      left join identifier('openalex' || :env_suffix || '.works.repo_super_authorships') a
        on c.native_id = a.pmh_id
    where
      -- Qualified so the filter cannot become ambiguous if a joined relation
      -- ever exposes its own provenance column.
      c.provenance in ('repo', 'repo_backfill')
  ),
  
  -- DataCite records passed through unchanged, with the four URL / id columns
  -- appended: the landing page is the doi.org link built from native_id and the
  -- PDF-related columns are typed NULLs.
  datacite_superlocations as (
    select
      d.*,
      cast(null as string) as pdf_url,
      concat('https://doi.org/', d.native_id) as landing_page_url,
      cast(null as string) as pdf_s3_id,
      cast(null as string) as grobid_s3_id
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed') d
    where
      d.provenance = 'datacite'
  ),
  
  -- MAG records passed through unchanged, with the four URL / id columns
  -- appended: the landing page is the first url found in the urls array and the
  -- PDF-related columns are typed NULLs.
  mag_superlocations as (
    select
      m.*,
      cast(null as string) as pdf_url,
      get(m.urls.url, 0) as landing_page_url,
      cast(null as string) as pdf_s3_id,
      cast(null as string) as grobid_s3_id
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed') m
    where
      m.provenance = 'mag'
  ),
  
  -- PubMed records passed through unchanged, with the four URL / id columns
  -- appended: the landing page is the pubmed.ncbi.nlm.nih.gov link built from
  -- native_id and the PDF-related columns are typed NULLs.
  pubmed_superlocations as (
    select
      pm.*,
      cast(null as string) as pdf_url,
      concat('https://pubmed.ncbi.nlm.nih.gov/', pm.native_id) as landing_page_url,
      cast(null as string) as pdf_s3_id,
      cast(null as string) as grobid_s3_id
    from
      identifier('openalex' || :env_suffix || '.works.locations_parsed') pm
    where
      pm.provenance = 'pubmed'
  ),
  
  -- Stacks the five per-provenance result sets into one relation.
  -- Columns are matched by position, so every branch must expose the same
  -- columns in the same order; output names come from the first branch.
  -- `union` (as opposed to `union all`) also drops fully identical rows.
  -- NOTE(review): each branch filters on different provenance values, so
  -- duplicates can only arise inside a single branch; if that de-duplication is
  -- not relied upon, `union all` would avoid the extra distinct step - confirm.
  superlocations_unioned as (
    select * from crossref_superlocations
    union
    select * from datacite_superlocations
    union
    select * from pubmed_superlocations
    union
    select * from repo_superlocations
    union
    select * from mag_superlocations
  )
  
  -- Table contents: every unioned row, sorted by provenance, then native_id,
  -- then most recently updated first (rows without an update date last).
  select *
  from superlocations_unioned
  order by
    provenance,
    native_id,
    updated_date desc nulls last
)

In [0]:
-- Read back the superlocations table for the same :env_suffix.
select *
from identifier('openalex' || :env_suffix || '.works.superlocations')