### Creates `openalex.works.repo_super_authorships` in the Walden end-to-end workflow

In [0]:
-- pmh_base: every location record that carries authors and comes from a
-- repository-related provenance, keyed by its PMH identifier.
--   pmh_id : native_id for 'repo' / 'repo_backfill' records; otherwise the first
--            id in `ids` whose namespace is "pmh" (NULL when there is none).
--   r      : position of the record within its pmh_id, ordered by priority
--            ascending, so r = 1 is the record with the lowest priority value.
-- `priority` comes from a LEFT JOIN and is NULL for a provenance missing from the
-- priority table. Spark sorts NULLs first for ASC by default, which would let an
-- unranked source win r = 1, so NULLs are explicitly sorted last.
-- NOTE(review): records that resolve to a NULL pmh_id all share one partition;
-- confirm whether they should be filtered out here.
create or replace temp view pmh_base as
(
  select
    pmh_id,
    authors,
    priority,
    affiliations_exist,
    is_corresponding_exists,
    row_number() over (
      partition by pmh_id
      order by priority asc nulls last
    ) as r
  from
    (
      -- Compute pmh_id once so the select list and the window share one definition.
      select
        -- native_id, nice for tracking purposes
        case
          when provenance in ('repo', 'repo_backfill') then native_id
          else get(filter(ids, x -> x.namespace = "pmh").id, 0)
        end as pmh_id,
        authors,
        priority,
        affiliations_exist,
        is_corresponding_exists
      from
        openalex.works.locations left join openalex.system.priority_table using (provenance)
      where
        authors_exist
        and provenance in ('repo', 'repo_backfill', 'pdf', 'landing_page')
    ) as keyed_locations
);

-- pmh_best_authors_exploded: one row per author of the top-ranked (r = 1) record
-- of each pmh_id.
--   original_author_order     : zero-based position of the author in `authors`
--   best_author_list_exploded : the author struct itself
--   author_key                : key used to join affiliations / corresponding flags
create or replace temp view pmh_best_authors_exploded as
(
  select
    best.pmh_id,
    author_rows.original_author_order,
    author_rows.best_author_list_exploded,
    author_rows.best_author_list_exploded.author_key as author_key
  from
    pmh_base as best
    lateral view posexplode(best.authors) author_rows
      as original_author_order, best_author_list_exploded
  where
    best.r = 1
);

-- pmh_affiliations_base: one row per author for every record flagged with
-- affiliations_exist (all ranks, not only r = 1), carrying the author's
-- affiliations array and the record's priority.
create or replace temp view pmh_affiliations_base as
(
  select
    src.pmh_id,
    authors_exploded,
    authors_exploded.affiliations as affiliations,
    authors_exploded.author_key as author_key,
    src.priority
  from
    pmh_base as src
    lateral view explode(src.authors) author_rows as authors_exploded
  where
    src.affiliations_exist
);

-- pmh_affiliations_staging: one row per (pmh_id, author, affiliation).
-- Authors whose first affiliation has no name are dropped.
--   exploded_affiliations : a single affiliation struct from `affiliations`
--   r                     : rank of the row within (pmh_id, author_key) by
--                           priority ascending; rows with equal priority tie, so
--                           every affiliation row sharing the best priority gets
--                           r = 1.
-- NULL priorities are sorted last so that an unranked source cannot outrank a
-- ranked one (Spark's default for ASC is NULLS FIRST).
create or replace temp view pmh_affiliations_staging as
(
  select
    pmh_id,
    authors_exploded,
    affiliations,
    author_key,
    priority,
    explode(affiliations) as exploded_affiliations,
    rank() over (partition by pmh_id, author_key order by priority asc nulls last) as r
  from
    pmh_affiliations_base
  where
    get(affiliations.name, 0) is not null
);

-- pmh_affiliations: the best-ranked (r = 1) affiliations of each author,
-- gathered back into a single array per (pmh_id, author_key).
create or replace temp view pmh_affiliations as
(
  select
    staged.pmh_id,
    staged.author_key,
    collect_list(staged.exploded_affiliations) as affiliations
  from
    pmh_affiliations_staging as staged
  where
    staged.r = 1
  -- An author_key can carry several affiliations, hence the aggregation.
  group by
    staged.pmh_id,
    staged.author_key
);

-- pmh_is_corresponding_base: one row per author with a non-null is_corresponding
-- value, taken from every record flagged with is_corresponding_exists.
-- Per the original author's note, these records come mainly, if not exclusively,
-- from landing_page_works.
create or replace temp view pmh_is_corresponding_base as
(
  select
    src.pmh_id,
    src.authors,
    corresponding_author,
    corresponding_author.author_key as author_key,
    corresponding_author.is_corresponding as is_corresponding_landing_page
  from
    pmh_base as src
    lateral view explode(
      filter(src.authors, a -> a.is_corresponding is not null)
    ) flagged as corresponding_author
  where
    src.is_corresponding_exists
);

-- pmh_is_corresponding: exactly one is_corresponding value per
-- (pmh_id, author_key), used as a lookup by the author/affiliation join.
-- pmh_base can hold several source records for the same pmh_id, so the same
-- author may be flagged more than once. Without collapsing them the downstream
-- LEFT JOIN fans out, and when sources disagree the same author ends up twice in
-- the final authorships array. max() keeps a single value per key; for booleans
-- true sorts after false, so an author is corresponding if any source says so.
create or replace temp view pmh_is_corresponding as
(
  select
    pmh_id,
    author_key,
    max(is_corresponding_landing_page) as is_corresponding_landing_page
  from
    pmh_is_corresponding_base
  group by
    pmh_id,
    author_key
);

-- pmh_authors_and_affiliations_base: the best author list enriched with each
-- author's affiliations and corresponding-author flag. Both lookups are LEFT
-- JOINs, so authors without a match keep NULL in the added columns.
create or replace temp view pmh_authors_and_affiliations_base as
(
  select
    pmh_id,
    author_key,
    original_author_order,
    best_author_list_exploded,
    affiliations,
    is_corresponding_landing_page
  from
    pmh_best_authors_exploded
      left join pmh_affiliations using (pmh_id, author_key)
      left join pmh_is_corresponding using (pmh_id, author_key)
);

-- pmh_authors_and_affiliations_staging: packs each author row into a single
-- `authorships` struct. original_author_order is kept inside the struct so the
-- final aggregation can restore the author order.
create or replace temp view pmh_authors_and_affiliations_staging as
(
  select
    joined.pmh_id,
    named_struct(
      'original_author_order', joined.original_author_order,
      'given', joined.best_author_list_exploded.given,
      'family', joined.best_author_list_exploded.family,
      'name', joined.best_author_list_exploded.name,
      'orcid', joined.best_author_list_exploded.orcid,
      'affiliations', joined.affiliations,
      'is_corresponding', joined.is_corresponding_landing_page,
      'author_key', joined.best_author_list_exploded.author_key
    ) as authorships
  from
    pmh_authors_and_affiliations_base as joined
);

-- openalex.works.repo_super_authorships: one row per pmh_id with its ordered
-- array of authorship structs.
-- Steps:
--   1. collect_set gathers the distinct authorship structs of the pmh_id.
--   2. array_sort restores the original author order.
--   3. transform drops original_author_order (needed only for the ordering and
--      not wanted in the API) and trims the name fields.
-- Possible follow-up: remove bad authors here, as in doi = 10.32870/rmip.v10i1.453,
-- which has author names like "Universidad de Colima". There may be a list
-- somewhere that tells which authors to filter out programmatically, but maybe not.
create or replace table openalex.works.repo_super_authorships as (
  select
    pmh_id,
    transform(
      array_sort(
        collect_set(authorships),
        (a, b) -> case
          when a.original_author_order > b.original_author_order then 1
          when a.original_author_order < b.original_author_order then -1
          else 0
        end
      ),
      author -> named_struct(
        'given', trim(author.given),
        'family', trim(author.family),
        'name', trim(author.name),
        'orcid', author.orcid,
        'affiliations', transform(
          author.affiliations,
          aff -> named_struct(
            'name', trim(aff.name),
            'department', aff.department,
            'ror_id', aff.ror_id
          )
        ),
        'is_corresponding', author.is_corresponding,
        'author_key', author.author_key
      )
    ) as authorships
  from
    pmh_authors_and_affiliations_staging
  group by
    pmh_id
);

In [0]:
-- Preview the table written by the previous cell.
select
  pmh_id,
  authorships
from
  openalex.works.repo_super_authorships