### Creates `openalex.works.authors_and_affiliations` in the Walden end-to-end workflow

In [0]:
-- One row per locations_mapped record that has authors (authors_exist), ranked
-- within its work: r = 1 marks the record with the lowest `priority` value.
-- The left join leaves `priority` NULL for any provenance that has no row in
-- priority_table, and Spark sorts NULLs first under ASC by default, so
-- NULLS LAST is spelled out to keep such records from outranking every
-- provenance that does have a priority.
-- NOTE(review): `priority` is presumably supplied by priority_table - confirm.
create or replace temp view base as (
  select
    native_id,
    work_id,
    authors,
    priority,
    -- only index 0 of authors.affiliations.name is inspected here
    get(authors.affiliations.name, 0) is not null as affiliations_exist,
    -- true when at least one author carries is_corresponding = true
    exists(authors.is_corresponding, x -> x = True) as is_corresponding_exists,
    row_number() OVER (
      PARTITION BY work_id
      ORDER BY
        priority ASC NULLS LAST,
        hash(to_json(authors)) ASC -- stable tie-breaker between equal priorities
    ) AS r
  from
    openalex.works.locations_mapped
    left join openalex.system.priority_table using(provenance)
  where
     authors_exist
);

-- Author list of the top-ranked record (r = 1) of each work, one row per
-- author. original_author_order is the 0-based position emitted by posexplode
-- and best_author_list_len is the size of the whole author array.
create or replace temp view best_authors_exploded as (
  select
    base.work_id,
    array_size(base.authors) as best_author_list_len,
    exploded.original_author_order,
    exploded.best_author_list_exploded,
    exploded.best_author_list_exploded.author_key as author_key
  from
    base
    lateral view posexplode(base.authors) exploded
      as original_author_order, best_author_list_exploded
  where
    base.r = 1
);

-- Explode the author array of every record flagged affiliations_exist:
-- one row per author, carrying that author's affiliations and author_key
-- together with the priority of the record it came from.
create or replace temp view affiliations_base as (
  select
    base.work_id,
    exploded.authors_exploded,
    exploded.authors_exploded.affiliations as affiliations,
    exploded.authors_exploded.author_key as author_key,
    base.priority
  from
    base
    lateral view explode(base.authors) exploded as authors_exploded
  where
    base.affiliations_exist
);

-- Rank the candidate affiliation rows of each (work_id, author_key); r = 1 is
-- the row kept by the `affiliations` view. Lower `priority` wins.
--
-- row_number() is used instead of rank(): rank() gives every row tied on
-- `priority` the value 1, so several rows per key would survive the r = 1
-- filter and multiply rows in the later join on (work_id, author_key).
-- author_key cannot break ties because it is a partition key, so a hash of the
-- exploded author struct is used as a stable tie-breaker instead.
-- NULLS LAST keeps rows without a priority from sorting ahead of ranked ones
-- (Spark puts NULLs first under ASC by default).
create or replace temp view affiliations_staging as (
  select
    *,
    row_number() OVER (
      PARTITION BY work_id,
      author_key
      ORDER BY
        priority ASC NULLS LAST,
        hash(to_json(authors_exploded)) ASC
    ) AS r
  from
    affiliations_base
);

-- Keep only the top-ranked (r = 1) affiliation rows from affiliations_staging,
-- reduced to the join keys and the affiliations value.
create or replace temp view affiliations as (
  select
    ranked.work_id,
    ranked.author_key,
    ranked.affiliations
  from
    affiliations_staging as ranked
  where
    ranked.r = 1
);

-- For every record with at least one corresponding author, emit one row per
-- author whose is_corresponding flag is true. These records are mainly, if not
-- exclusively, from landing_page_works.
create or replace temp view is_corresponding_base as (
  select
    base.work_id,
    base.authors,
    flagged.corresponding_author,
    flagged.corresponding_author.author_key as author_key,
    flagged.corresponding_author.is_corresponding as is_corresponding_landing_page
  from
    base
    lateral view explode(
      filter(base.authors, x -> x.is_corresponding = true)
    ) flagged as corresponding_author
  where
    base.is_corresponding_exists
);

-- Table that matches to the work_id AND author_key.
-- is_corresponding_base emits one row per flagged author per record, so the
-- same (work_id, author_key) pair appears once for every record of a work that
-- flags that author. DISTINCT collapses those repeats so that a left join on
-- (work_id, author_key) cannot multiply rows.
create or replace temp view is_corresponding as (
  select distinct
    work_id,
    author_key,
    is_corresponding_landing_page
  from
    is_corresponding_base
);

-- Work-level flag, joined on work_id only: true when any row of
-- is_corresponding for the work has is_corresponding_landing_page = true.
-- Downstream it decides whether the first author should be assigned
-- is_corresponding as a fallback.
create or replace temp view work_has_corresponding_author as (
  select
    flags.work_id,
    -- bool_or skips NULLs and yields NULL when there is nothing left, hence the coalesce
    coalesce(bool_or(flags.is_corresponding_landing_page), false) as work_has_corresponding_author
  from
    is_corresponding as flags
  group by
    flags.work_id
);

-- Consolidate the authors and affiliations info: start from the best author
-- list and attach, per author, the selected affiliations and the
-- corresponding-author flag, plus the work-level corresponding-author flag.
-- Left joins keep every author even when nothing matches.
create or replace temp view authors_and_affiliations_base as (
  select
    auth.work_id,
    auth.author_key,
    auth.best_author_list_len,
    auth.original_author_order,
    auth.best_author_list_exploded,
    aff.affiliations,
    corr.is_corresponding_landing_page,
    work_flag.work_has_corresponding_author
  from
    best_authors_exploded as auth
    left join affiliations as aff
      on auth.work_id = aff.work_id
      and auth.author_key = aff.author_key
    left join is_corresponding as corr
      on auth.work_id = corr.work_id
      and auth.author_key = corr.author_key
    left join work_has_corresponding_author as work_flag
      on auth.work_id = work_flag.work_id
);

-- Build one `authorships` struct per author row. original_author_order is kept
-- both as a column and inside the struct.
create or replace temp view authors_and_affiliations_staging as (
  select
    work_id,
    original_author_order,
    named_struct(
      "author_position",
      case
        when original_author_order = 0 then "first"
        when original_author_order + 1 = best_author_list_len then "last"
        else "additional"
      end,
      "raw_author_name",
      trim(best_author_list_exploded.name),
      "is_corresponding",
      case
        -- this author is flagged as corresponding in is_corresponding
        when is_corresponding_landing_page then true
        -- the work has a flagged corresponding author, but it is not this one
        when work_has_corresponding_author then false
        -- no flagged corresponding author on the work: fall back to the first author
        when original_author_order = 0 then true
        else false
      end,
      "raw_affiliation_strings",
      affiliations.name,
      "original_author_order",
      original_author_order
    ) as authorships
  from
    authors_and_affiliations_base
);

-- Final table: one row per work with its authorships array.
--   1. collect_set gathers the distinct authorship structs of the work.
--   2. array_sort orders them by original_author_order.
--   3. transform rebuilds each struct without original_author_order (it was
--      needed to order the authors, but we don't want it in the API) and strips
--      "\n" plus surrounding whitespace from the author name and from every
--      affiliation string.
create or replace table openalex.works.authors_and_affiliations as (
  select
    work_id,
    transform(
      array_sort(
        collect_set(authorships),
        (a, b) -> case
          when a.original_author_order < b.original_author_order then -1
          when a.original_author_order > b.original_author_order then 1
          else 0
        end
      ),
      entry -> struct(
        entry.author_position,
        trim(replace(entry.raw_author_name, "\n", "")) as raw_author_name,
        entry.is_corresponding,
        transform(
          entry.raw_affiliation_strings,
          affiliation -> trim(replace(affiliation, "\n", ""))
        ) as raw_affiliation_strings
      )
    ) as authorships
  from
    authors_and_affiliations_staging
  group by
    work_id
);

In [0]:
-- Display the contents of the output table.
select
  *
from
  openalex.works.authors_and_affiliations;