### Creates `openalex.works.locations_w_sources` in the Walden end-to-end workflow

In [0]:
-- Rebuilds the works.locations_w_sources table in the catalog named 'openalex' || :env_suffix
-- (:env_suffix is a named parameter supplied at run time). Each location row read from
-- works.superlocations is written out with two extra columns: is_oa_source and source_id.
create or replace table identifier('openalex' || :env_suffix || '.works.locations_w_sources')
cluster by (best_doi, provenance, native_id)
as (
with
-- All rows and columns of the superlocations table; every later CTE reads locations from here.
all_locations as (
  select *
  from identifier('openalex' || :env_suffix || '.works.superlocations')
),
-- One row per (source, ISSN): explodes the issns array of openalex.sources.sources.
-- explode() emits no row for a null or empty array, so sources without ISSNs do not appear here.
-- NOTE(review): this table name is hard-coded and ignores :env_suffix — confirm that is intended.
sources_intermediate as (
  select
    id,
    explode(issns) as issn,
    repository_id,
    sample_pmh_record, -- previously selected twice; the redundant duplicate column is removed
    merge_into_id,
    is_oa as is_oa_source
  from
    openalex.sources.sources
),
-- Candidate sources for ISSN matching: rows of sources_intermediate that have an id and
-- no merge_into_id. Two row numbers are attached, both ordered by ascending id:
--   id_row_number  - position of the source among sources sharing the same issn
--   pmh_row_number - position among sources sharing the same pmh matching string
-- The pmh matching string is the second ":"-separated field of sample_pmh_record.
existing_sources_for_other_works as (
  select
    si.id,
    row_number() over (partition by si.issn order by si.id asc) as id_row_number,
    si.issn,
    si.is_oa_source,
    si.repository_id as repository_id_sources,
    si.sample_pmh_record,
    split_part(si.sample_pmh_record, ":", 2) as pmh_record_matching_string,
    -- Original author's note: "this is a bug that Casey will need to fix in sources.sources table."
    -- (presumably the duplicated pmh strings this numbering works around — confirm)
    row_number() over (
      partition by split_part(si.sample_pmh_record, ":", 2)
      order by si.id asc
    ) as pmh_row_number
  from sources_intermediate as si
  where si.merge_into_id is null
    and si.id is not null
),
-- Candidate sources for repository (PMH) matching, read straight from openalex.sources.sources
-- (no ISSN explode). Keeps only sources that have an id and no merge_into_id.
-- pmh_record_matching_string is the second ":"-separated field of sample_pmh_record;
-- pmh_row_number numbers the sources sharing that string by ascending id.
existing_sources_for_repos as (
  select
    src.id,
    row_number() over (partition by src.issn order by src.id asc) as id_row_number,
    src.issn,
    src.repository_id as repository_id_sources,
    src.sample_pmh_record,
    src.is_oa as is_oa_source,
    split_part(src.sample_pmh_record, ":", 2) as pmh_record_matching_string,
    -- Original author's note: "this is a bug that Casey will need to fix in sources.sources table."
    -- (presumably the duplicated pmh strings this numbering works around — confirm)
    row_number() over (
      partition by split_part(src.sample_pmh_record, ":", 2)
      order by src.id asc
    ) as pmh_row_number
  from openalex.sources.sources as src
  where src.merge_into_id is null
    and src.id is not null
),
-- Candidate sources for DataCite matching: one row per (source, datacite id), produced by
-- exploding datacite_ids. Only sources with an id, no merge_into_id and a non-empty
-- datacite_ids array are kept. datacite_row_number numbers the sources that share one
-- datacite id, lowest source id first.
existing_sources_for_datacite as (
  select
    id,
    exploded_datacite_id as datacite_id,
    is_oa as is_oa_source,
    row_number() over (
      partition by exploded_datacite_id
      order by id asc
    ) as datacite_row_number
  from openalex.sources.sources
    lateral view explode(datacite_ids) t as exploded_datacite_id
  where merge_into_id is null
    and id is not null
    and datacite_ids is not null
    and size(datacite_ids) > 0
),
-- Locations whose provenance is none of repo / repo_backfill / datacite / pubmed; these are
-- matched to sources by ISSN further down. Rows with a null provenance are not selected
-- here, because `not in` on a null value never evaluates to true.
other_works as (
  select *
  from all_locations
  where provenance not in ('repo', 'repo_backfill', 'datacite', 'pubmed')
),
-- One row per relevant ISSN-type identifier of each "other" work.
-- An entry of `ids` is relevant when its namespace contains 'issn' and its relationship is
-- either null or one of the listed same-work relations. explode_outer keeps works with no
-- relevant identifier as a single row whose relevant_id (and namespace) is null.
other_works_exploded as (
  select
    *,
    explode_outer(
      filter(
        ids,
        x -> contains(x.namespace, 'issn')
        and (
          x.relationship in (
            "self",
            "IsVersionOf",
            "IsNewVersionOf",
            "IsPreviousVersionOf",
            "IsVariantFormOf",
            "IsOriginalFormOf",
            "IsIdenticalTo", -- correct spelling; the list previously only had the typo below
            "IsIdentialTo",  -- misspelling kept in case upstream data carries it — TODO confirm and drop
            "IsTranslationOf",
            "HasTranslation"
          )
          or x.relationship is null
        )
      )
    ) as relevant_id,
    relevant_id.namespace as namespace -- refers to the relevant_id alias defined just above
  from
    other_works
),
-- Ranks the exploded ISSN identifiers of each native_id: eissn first, then lissn, then
-- pissn, then every other namespace; ties are broken by the identifier value.
other_works_ranked as (
  select
    *,
    row_number() over (
        partition by native_id
        order by
          case
            when namespace = 'eissn' then 1
            when namespace = 'lissn' then 2
            when namespace = 'pissn' then 3
            -- Without this branch the case returns NULL for any other (or missing) namespace,
            -- and NULL sorts FIRST in an ascending order, outranking eissn.
            else 4
          end,
          relevant_id.id
      ) as native_id_row_number
  from
    other_works_exploded
),
-- Picks one source per native_id for the "other" works by joining each ranked ISSN
-- identifier to the candidate sources on the ISSN value. Only works with at least one
-- matching source appear in the output.
other_works_with_sources as (
  select
    native_id,
    id,
    is_oa_source,
    provenance,
    row_number() over (
        partition by native_id
        order by
          native_id_row_number asc,
          -- tie-breaker: one ISSN can belong to several sources, which all share the same
          -- native_id_row_number; the lowest source id wins so reruns give the same result
          existing_sources_for_other_works.id asc
      ) as best_source_match
  from
    other_works_ranked
      -- inner join: the previous "left join ... where <source>.id is not null" kept
      -- exactly the matched rows, which is what an inner join expresses directly
      inner join existing_sources_for_other_works
        on other_works_ranked.relevant_id.id = existing_sources_for_other_works.issn
  qualify
    best_source_match = 1
),
-- Final shape for the "other" works: the location columns followed by is_oa_source and
-- source_id from the ISSN match. Left join, so works without a match keep nulls there.
-- The column order must stay aligned with the other *_final CTEs (they are unioned by position).
other_works_final as (
  select
    loc.provenance, loc.native_id, loc.native_id_namespace,
    loc.title, loc.normalized_title, loc.authors, loc.ids,
    loc.type, loc.version, loc.license, loc.language,
    loc.published_date, loc.created_date, loc.updated_date,
    loc.issue, loc.volume, loc.first_page, loc.last_page,
    loc.is_retracted, loc.abstract, loc.source_name, loc.publisher,
    loc.funders, loc.references, loc.urls, loc.mesh,
    loc.is_oa, loc.abstract_inverted_index,
    loc.authors_exist, loc.affiliations_exist, loc.is_corresponding_exists,
    loc.best_doi, loc.merge_key,
    loc.pdf_url, loc.landing_page_url, loc.pdf_s3_id, loc.grobid_s3_id,
    matched.is_oa_source,
    matched.id as source_id
  from other_works as loc
    left join other_works_with_sources as matched
      on loc.native_id = matched.native_id
),
-- DataCite locations, one row per 'datacite_client' identifier whose relationship is
-- 'self' or null. explode_outer keeps locations without such an identifier as a single
-- row with a null datacite_client_id.
datacite_exploded as (
  select
    *,
    explode_outer(
      filter(
        ids,
        id_entry ->
          id_entry.namespace = 'datacite_client'
          and (id_entry.relationship = 'self' or id_entry.relationship is null)
      )
    ) as datacite_client_id
  from all_locations
  where provenance = "datacite"
),
-- Attaches a source to each DataCite location via its datacite_client id and keeps exactly
-- one row per native_id. Left join, so locations without a matching source are kept with
-- null is_oa_source / source_id.
datacite_with_sources as (
  select
    d.*,
    s.is_oa_source,
    s.id as source_id,
    row_number() over (
        partition by d.native_id
        -- nulls last: ascending order puts NULLs first by default, which let an unmatched
        -- exploded row win over a matched one for the same native_id
        order by s.id asc nulls last
      ) as best_datacite_match
  from
    datacite_exploded d
      left join existing_sources_for_datacite s
        on d.datacite_client_id.id = s.datacite_id
  qualify
    -- row_number() is never null, so the former "or best_datacite_match is null" was dead code
    best_datacite_match = 1
),
-- Final shape for DataCite locations: the location columns followed by is_oa_source and
-- source_id. Drops the helper columns of datacite_with_sources (datacite_client_id,
-- best_datacite_match). Column order must stay aligned with the other *_final CTEs,
-- which are unioned by position.
datacite_final as (
  select
    provenance, native_id, native_id_namespace,
    title, normalized_title, authors, ids,
    type, version, license, language,
    published_date, created_date, updated_date,
    issue, volume, first_page, last_page,
    is_retracted, abstract, source_name, publisher,
    funders, references, urls, mesh,
    is_oa, abstract_inverted_index,
    authors_exist, affiliations_exist, is_corresponding_exists,
    best_doi, merge_key,
    pdf_url, landing_page_url, pdf_s3_id, grobid_s3_id,
    is_oa_source,
    source_id
  from datacite_with_sources
),
-- Final shape for PubMed locations: every pubmed row gets the same hard-coded source_id and
-- is_oa_source = False.
-- The location columns are listed explicitly, in the same order as the other *_final CTEs,
-- instead of `select *`: the branches are combined with a positional UNION, so relying on
-- the physical column order of the superlocations table could silently misalign columns.
pubmed_final as (
  select
    provenance, native_id, native_id_namespace,
    title, normalized_title, authors, ids,
    type, version, license, language,
    published_date, created_date, updated_date,
    issue, volume, first_page, last_page,
    is_retracted, abstract, source_name, publisher,
    funders, references, urls, mesh,
    is_oa, abstract_inverted_index,
    authors_exist, affiliations_exist, is_corresponding_exists,
    best_doi, merge_key,
    pdf_url, landing_page_url, pdf_s3_id, grobid_s3_id,
    False as is_oa_source,
    4306525036 as source_id -- presumably the source record for PubMed itself — TODO confirm
  from
    all_locations
  where
    provenance = 'pubmed'
),
-- Repository locations (provenance repo or repo_backfill) plus native_id_matching_string,
-- the second ":"-separated field of native_id, used to join against source pmh records.
repo as (
  select
    *,
    split_part(native_id, ":", 2) as native_id_matching_string
  from all_locations
  where provenance in ("repo", "repo_backfill")
),
-- Keeps the first-ranked source (pmh_row_number = 1, i.e. lowest id) for each pmh matching
-- string, so the join in repo_final cannot multiply location rows.
repo_sources as (
  select *
  from existing_sources_for_repos
  where pmh_row_number = 1
),
-- Final shape for repository locations: the location columns followed by is_oa_source and
-- source_id of the source whose pmh matching string equals the location's
-- native_id_matching_string. Left join, so unmatched locations keep nulls there.
-- Column order must stay aligned with the other *_final CTEs (they are unioned by position).
repo_final as (
  select
    loc.provenance, loc.native_id, loc.native_id_namespace,
    loc.title, loc.normalized_title, loc.authors, loc.ids,
    loc.type, loc.version, loc.license, loc.language,
    loc.published_date, loc.created_date, loc.updated_date,
    loc.issue, loc.volume, loc.first_page, loc.last_page,
    loc.is_retracted, loc.abstract, loc.source_name, loc.publisher,
    loc.funders, loc.references, loc.urls, loc.mesh,
    loc.is_oa, loc.abstract_inverted_index,
    loc.authors_exist, loc.affiliations_exist, loc.is_corresponding_exists,
    loc.best_doi, loc.merge_key,
    loc.pdf_url, loc.landing_page_url, loc.pdf_s3_id, loc.grobid_s3_id,
    src.is_oa_source,
    src.id as source_id
  from repo as loc
    left join repo_sources as src
      on loc.native_id_matching_string = src.pmh_record_matching_string
),
-- All four provenance branches stacked into one result. Columns are matched by position,
-- and plain `union` (not `union all`) also removes rows that are identical in every column.
unioned as (
  select * from repo_final
  union
  select * from datacite_final
  union
  select * from pubmed_final
  union
  select * from other_works_final
),
-- Everything below until the "DISPLAY_NAME END COMMENT" is to match on display_name <--> source_name for non-matched locations.
-- Sources whose display_name occurs exactly once in openalex.sources.sources. Because each
-- kept group holds a single row, max(id) and max(is_oa) are simply that row's values.
-- NOTE(review): unlike the other source CTEs, this one does not filter on merge_into_id — confirm.
sources_unique_display_names as (
  select
    display_name,
    max(id) as id,
    max(is_oa) as is_oa_source,
    count(*) as rwcnt
  from openalex.sources.sources
  group by display_name
  having count(*) = 1 -- a display name shared by several sources is ambiguous and is skipped
),
-- Locations that already have a source after the provenance-specific matching.
matched_records as (
  select *
  from unioned
  where source_id is not null
),
-- Locations still without a source; these go through the display-name fallback.
unmatched_records as (
  select *
  from unioned
  where source_id is null
),
-- Display-name fallback: tries to give each still-unmatched location the source whose
-- (unique) display_name equals the location's source_name exactly.
-- Left join, so locations without a display-name match are kept with null
-- is_oa_source / source_id.
try_display_name_match_on_unmatched_records as (
  select
    -- Keeps every column of `a` except its (null) source_id and its is_oa_source, and drops
    -- b's display_name, id and rwcnt; the only column of `b` that survives the star is
    -- is_oa_source. The result is therefore: location columns, b.is_oa_source, then
    -- source_id — the same column order as `unioned`, which the later positional union needs.
    * except (a.source_id, a.is_oa_source, b.display_name, b.id, b.rwcnt),
    b.id as source_id
  from
    unmatched_records a
      left join sources_unique_display_names b
        on a.source_name = b.display_name
),
-- Already-matched locations plus the previously unmatched ones after the display-name
-- attempt. Positional `union`, which also removes fully identical rows.
unioned_with_display_name_matches as (
  select * from matched_records
  union
  select * from try_display_name_match_on_unmatched_records
),
-- Debug helper: select * from unioned_with_display_name_matches
-- DISPLAY_NAME END COMMENT of matching on display_name <--> source_name for unmatched locations.

-- Everything below until the "URL END COMMENT" is to match on url <--> webpage for non-matched locations.
-- Locations that have a source after the display-name fallback.
matched_records_2 as (
  select *
  from unioned_with_display_name_matches
  where source_id is not null
),
-- Locations still without a source, plus extracted_base_url: capture group 2 of the regex,
-- i.e. the part after "http://" or "https://" (and after an optional "www.") up to the
-- next "/". The landing page URL is used when present, otherwise the PDF URL.
unmatched_records_2 as (
  select
    *,
    regexp_extract(
      coalesce(landing_page_url, pdf_url),
      'https?://(www\\.)?([^/]+)',
      2
    ) as extracted_base_url
  from unioned_with_display_name_matches
  where source_id is null
),
-- Hand-maintained fallback: assigns a fixed source_id from the location's host name, from a
-- "hal.science" substring in its URL, or from its DOI prefix. The first matching branch wins;
-- rows matching none keep their incoming source_id. The helper column extracted_base_url is
-- dropped and source_id is re-appended last, so the column order of the input is preserved.
-- NOTE(review): the numeric ids are presumably OpenAlex source ids — verify against
-- sources.sources. is_oa_source is left untouched by this step.
manual_url_matching as (
  select
    * except (extracted_base_url, source_id),
    case
      -- exact host matches
      when extracted_base_url = 'europepmc.org'          then 4306400806
      when extracted_base_url = 'ci.nii.ac.jp'           then 4210197683
      when extracted_base_url = 'dialnet.unirioja.es'    then 4306401293
      when extracted_base_url = 'osti.gov'               then 4306402487
      when extracted_base_url = "cqvip.com"              then 4306500507
      when extracted_base_url = "books.openedition.org"  then 4210222637
      when extracted_base_url = "cyberleninka.ru"        then 4306401404
      when extracted_base_url = "ntrs.nasa.gov"          then 4306402118
      when extracted_base_url = "scopus.com"             then 4306400063
      -- substring match anywhere in the URL
      when contains(coalesce(landing_page_url, pdf_url), "hal.science") then 4306402512
      -- DOI prefix matches
      when best_doi like '10.20944/preprints%' then 6309402219
      when best_doi like '10.2139/ssrn.%'      then 4210172589
      else source_id
    end as source_id
  from unmatched_records_2
),
-- Final result: locations matched before the URL step plus the rest after manual URL matching.
unioned_with_url_matching as (
  -- unmatched count: 137,005,916 --> 131,355,836 matched an additional 5.5M records with manual url_matching
  select * from matched_records_2
  union
  select * from manual_url_matching
)
select *
from unioned_with_url_matching
-- select extracted_base_url, count(*) from unmatched_records_2 group by extracted_base_url order by count(*) desc
-- END COMMENT FOR URL MATCHING
);

-- Add the (initially empty) normalized_source_name column directly after source_name.
alter table identifier('openalex' || :env_suffix || '.works.locations_w_sources')
  add column normalized_source_name string after source_name;

-- Fill normalized_source_name for every row (intentionally no WHERE clause): source_name
-- with every character that is not a Unicode letter or number removed, then lower-cased,
-- so it works for names in any language.
-- Per the original note this can add 1-2 minutes on the 600M+ rows; it was requested by
-- Jason so that a normalized source display name can eventually be carried forward.
update identifier('openalex' || :env_suffix || '.works.locations_w_sources')
set normalized_source_name = lower(regexp_replace(source_name, '[^\\p{L}\\p{N}]', ''));

-- Last-resort match: for locations that still have no source_id, assign the source whose
-- normalized display_name (non letter/number characters removed, lower-cased) equals the
-- location's normalized_source_name. Only sources whose normalized name is unique are
-- used, so each target row can match at most one source row.
-- Idea for later (from the original note): check if the name is non-numeric and longer
-- than N characters; more sources dedup is coming.
-- NOTE(review): only source_id is updated; is_oa_source stays as it was — confirm intended.
WITH final_display_name_fallback_sources AS (
  SELECT
    id AS source_id,
    LOWER(REGEXP_REPLACE(display_name, '[^\\p{L}\\p{N}]', '')) AS normalized_source_name,
    COUNT(*) OVER (
      PARTITION BY LOWER(REGEXP_REPLACE(display_name, '[^\\p{L}\\p{N}]', ''))
    ) AS name_count
  FROM openalex.sources.sources
  -- Skip names that normalize to the empty string (punctuation/symbol-only display names):
  -- otherwise a single such source would be attached to every location whose source_name
  -- also normalizes to ''. Null display names are dropped too; they could never match
  -- through the equality in the ON clause anyway.
  WHERE LOWER(REGEXP_REPLACE(display_name, '[^\\p{L}\\p{N}]', '')) <> ''
  QUALIFY name_count = 1
)
MERGE INTO identifier('openalex' || :env_suffix || '.works.locations_w_sources') AS target
USING final_display_name_fallback_sources AS source
ON target.source_id IS NULL
   AND target.normalized_source_name = source.normalized_source_name
WHEN MATCHED THEN
  UPDATE SET target.source_id = source.source_id;



In [0]:
-- Preview of the finished table (all columns).
select *
from identifier('openalex' || :env_suffix || '.works.locations_w_sources')