### Create `openalex.works.authors_and_affiliations` in the Walden end-to-end workflow

In [0]:
-- Step 1: Create authors_and_affiliations table without institutions
--
-- Produces one row per work_id with an `authorships` array ordered by the author's
-- position in the chosen ("best") author list. Each element carries:
--   author_position, raw_author_name, is_corresponding, raw_affiliation_strings,
--   original_author_order (0-based index in the best author list).
-- Institution matching is NOT done here; later steps read this temp table.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_temp')
CLUSTER BY (work_id) AS (

-- One row per location record that has authors, ranked within its work.
-- r = 1 marks the record whose author list becomes the work's author list.
WITH base AS (
    SELECT
        native_id,
        work_id,
        authors,
        priority,
        -- Only the FIRST author's affiliation names are inspected here.
        -- NOTE(review): assumes a record either carries affiliations for its first
        -- author or carries none that matter -- confirm against locations_mapped.
        get(authors.affiliations.name, 0) IS NOT NULL AS affiliations_exist,
        EXISTS(authors.is_corresponding, x -> x = TRUE) AS is_corresponding_exists,
        ROW_NUMBER() OVER (
            PARTITION BY work_id
            -- Lowest priority value wins. `priority` comes from a LEFT JOIN, so it is
            -- NULL for a provenance missing from priority_table; Spark sorts NULLs
            -- first under ASC, which would let unmapped sources outrank every mapped
            -- one. NULLS LAST keeps them as a last resort instead.
            -- The hash of the serialized author list is a deterministic tiebreak.
            ORDER BY priority ASC NULLS LAST, hash(to_json(authors)) ASC
        ) AS r
    FROM identifier('openalex' || :env_suffix || '.works.locations_mapped')
    LEFT JOIN openalex.system.priority_table USING (provenance)
    WHERE authors_exist
),

-- The winning author list per work, one row per author with its 0-based position.
best_authors_exploded AS (
    SELECT
        work_id,
        array_size(authors) AS best_author_list_len,
        posexplode(authors) AS (original_author_order, best_author_list_exploded),
        best_author_list_exploded.author_key AS author_key
    FROM base
    WHERE r = 1
),

-- Every author from every record flagged as having affiliations (not just the
-- winning record), so affiliations can be borrowed from other sources.
affiliations_base AS (
    SELECT
        work_id,
        explode(authors) AS authors_exploded,
        authors_exploded.affiliations,
        authors_exploded.author_key AS author_key,
        priority
    FROM base
    WHERE affiliations_exist
),

-- Rank candidate affiliations per (work, author).
-- ROW_NUMBER (not RANK) guarantees exactly one winner: with RANK, two sources of
-- equal priority both got r = 1, which fanned out the join below and produced
-- duplicate authorship entries for the same author. author_key is a partition key
-- and therefore useless as a tiebreak; the serialized affiliations are used instead.
affiliations_staging AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY work_id, author_key
            ORDER BY priority ASC NULLS LAST, hash(to_json(affiliations)) ASC
        ) AS r
    FROM affiliations_base
),

-- Exactly one affiliations value per (work, author).
affiliations AS (
    SELECT
        work_id,
        author_key,
        affiliations
    FROM affiliations_staging
    WHERE r = 1
),

-- Authors explicitly flagged as corresponding in ANY record of the work.
-- DISTINCT collapses the same author being flagged by several records so the join
-- below does not multiply rows.
is_corresponding AS (
    SELECT DISTINCT
        work_id,
        corresponding_author.author_key AS author_key,
        corresponding_author.is_corresponding AS is_corresponding_landing_page
    FROM (
        SELECT
            work_id,
            explode(filter(authors, x -> x.is_corresponding = TRUE)) AS corresponding_author
        FROM base
        WHERE is_corresponding_exists
    )
),

-- TRUE for works where at least one author is explicitly flagged as corresponding.
work_has_corresponding_author AS (
    SELECT
        work_id,
        EXISTS(
            collect_list(is_corresponding_landing_page),
            x -> x == TRUE
        ) AS work_has_corresponding_author
    FROM is_corresponding
    GROUP BY work_id
),

-- Best author list enriched with affiliations and corresponding-author flags.
authors_and_affiliations_base AS (
    SELECT
        *
    FROM best_authors_exploded
    LEFT JOIN affiliations USING (work_id, author_key)
    LEFT JOIN is_corresponding USING (work_id, author_key)
    LEFT JOIN work_has_corresponding_author USING (work_id)
),

authors_and_affiliations_staging AS (
    SELECT
        work_id,
        original_author_order,
        STRUCT(
            -- Index 0 is always "first" (so a single-author work is "first", not "last").
            CASE
                WHEN original_author_order == 0 THEN "first"
                WHEN original_author_order + 1 == best_author_list_len THEN "last"
                ELSE "additional"
            END AS author_position,
            TRIM(best_author_list_exploded.name) AS raw_author_name,
            -- Explicit flag wins; if the work has some other explicit corresponding
            -- author this one is FALSE; with no explicit flag at all, the first
            -- author is assumed to be corresponding.
            CASE
                WHEN is_corresponding_landing_page THEN TRUE
                WHEN work_has_corresponding_author THEN FALSE
                WHEN original_author_order == 0 THEN TRUE
                ELSE FALSE
            END AS is_corresponding,
            affiliations.name AS raw_affiliation_strings,
            original_author_order
        ) AS authorships
    FROM authors_and_affiliations_base
)

-- Collapse to one row per work. COLLECT_SET drops byte-identical structs, the
-- comparator restores author order, and the final TRANSFORM strips newlines and
-- surrounding whitespace from names and affiliation strings.
SELECT
    work_id,
    TRANSFORM(
        ARRAY_SORT(
            COLLECT_SET(authorships),
            (left, right) -> CASE
                WHEN left.original_author_order < right.original_author_order THEN -1
                WHEN left.original_author_order > right.original_author_order THEN 1
                ELSE 0
            END
        ),
        x -> STRUCT(
            x.author_position,
            TRIM(REPLACE(x.raw_author_name, "\n", "")) AS raw_author_name,
            x.is_corresponding,
            TRANSFORM(x.raw_affiliation_strings, y -> TRIM(REPLACE(y, "\n", ""))) AS raw_affiliation_strings,
            x.original_author_order AS original_author_order
        )
    ) AS authorships
FROM authors_and_affiliations_staging
GROUP BY work_id

);

### Merge new affiliation strings into the `affiliation_strings_lookup` table

In [0]:
-- Step 2: Merge new raw affiliation strings into lookup table
--
-- Registers every non-empty affiliation string seen in the Step 1 temp table that
-- the lookup table does not know yet. New rows are inserted with NULL
-- institution_ids / institution_ids_override; existing rows are never modified.
MERGE INTO openalex.institutions.affiliation_strings_lookup AS target
USING (
    -- All distinct, non-empty affiliation strings across every authorship.
    WITH candidate_strings AS (
        SELECT DISTINCT
            aff_string AS raw_affiliation_string
        FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_temp')
        LATERAL VIEW explode(authorships) AS one_authorship
        LATERAL VIEW explode(one_authorship.raw_affiliation_strings) AS aff_string
        WHERE aff_string IS NOT NULL
          AND aff_string <> ""
    )
    -- Keep only strings absent from the lookup table; this pre-filter shrinks the
    -- MERGE source before the MERGE's own NOT MATCHED check runs.
    SELECT
        cs.raw_affiliation_string,
        CAST(NULL AS ARRAY<BIGINT>) AS institution_ids,
        CAST(NULL AS ARRAY<BIGINT>) AS institution_ids_override,
        CURRENT_TIMESTAMP() AS created_datetime
    FROM candidate_strings AS cs
    LEFT ANTI JOIN openalex.institutions.affiliation_strings_lookup AS known
        ON cs.raw_affiliation_string = known.raw_affiliation_string
) AS source
ON target.raw_affiliation_string = source.raw_affiliation_string
WHEN NOT MATCHED THEN
    INSERT (
        raw_affiliation_string,
        institution_ids,
        institution_ids_override,
        created_datetime
    )
    VALUES (
        source.raw_affiliation_string,
        source.institution_ids,
        source.institution_ids_override,
        source.created_datetime
    );

### Create and populate `openalex.authors.author_institutions` mapping

In [0]:
-- Step 3: author_institutions mapping
--
-- (Re)creates the mapping table EMPTY on every run -- a full refresh for now.
-- NOTE(review): an incremental variant would presumably use CREATE TABLE IF NOT EXISTS
-- instead of CREATE OR REPLACE; confirm before switching.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.authors.author_institutions') (
    work_id                BIGINT,
    author_sequence        BIGINT,
    institution_id         BIGINT,
    raw_author_name        STRING,
    raw_affiliation_string STRING,
    raw_countries          ARRAY<STRING>
)
USING DELTA
CLUSTER BY (work_id)
TBLPROPERTIES (
    'delta.autoOptimize.optimizeWrite' = 'true',
    'delta.autoOptimize.autoCompact' = 'true'
);

-- Execute merge
--
-- Fills author_institutions with one row per
--   (work_id, author_sequence, raw_affiliation_string, institution_id)
-- where author_sequence is the author's 0-based position in the Step 1 `authorships`
-- array. Only affiliation strings that resolve to at least one institution id in
-- affiliation_strings_lookup produce rows.
--
-- The target is rebuilt empty just before this statement, so every work in the temp
-- table is processed. NOTE(review): if the table becomes incremental, restrict
-- exploded_authorships to works that are not already present in the target.
MERGE INTO identifier('openalex' || :env_suffix || '.authors.author_institutions') AS target
USING (
    -- One row per author of each work.
    WITH exploded_authorships AS (
        SELECT
            work_id,
            posexplode(authorships) AS (author_sequence, authorship)
        FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_temp')
        -- Rows with a NULL work_id could never match or be matched on the merge key.
        WHERE work_id IS NOT NULL
    ),

    -- One row per (author, raw affiliation string). Authors without affiliation
    -- strings are dropped here: they cannot yield an institution row anyway.
    exploded_affiliations AS (
        SELECT
            work_id,
            author_sequence,
            authorship.raw_author_name,
            explode(authorship.raw_affiliation_strings) AS raw_affiliation_string
        FROM exploded_authorships
    ),

    -- Resolve each string to institution ids via the lookup table.
    with_institutions AS (
        SELECT
            ea.work_id,
            ea.author_sequence,
            ea.raw_author_name,
            ea.raw_affiliation_string,
            asl.countries AS raw_countries,
            CASE
                -- A non-empty manual override always wins.
                WHEN asl.institution_ids_override IS NOT NULL
                    AND SIZE(asl.institution_ids_override) > 0
                    THEN asl.institution_ids_override
                -- Otherwise use the matched ids, unless the array is exactly [-1].
                -- NOTE(review): [-1] looks like a "no match" sentinel -- confirm.
                WHEN asl.institution_ids IS NOT NULL
                    AND SIZE(asl.institution_ids) > 0
                    AND NOT (SIZE(asl.institution_ids) = 1 AND asl.institution_ids[0] = -1)
                    THEN asl.institution_ids
                ELSE NULL
            END AS institution_ids
        FROM exploded_affiliations AS ea
        -- A NULL string never satisfies the equality, so it simply finds no lookup row.
        LEFT JOIN openalex.institutions.affiliation_strings_lookup AS asl
            ON ea.raw_affiliation_string = asl.raw_affiliation_string
    )

    -- DISTINCT: an insert-only MERGE does not deduplicate within its source, so a
    -- string repeated for one author (or an id repeated inside institution_ids)
    -- would otherwise insert several rows with the same merge key.
    SELECT DISTINCT
        work_id,
        author_sequence,
        institution_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    FROM with_institutions
    LATERAL VIEW explode(institution_ids) AS institution_id
    WHERE institution_id IS NOT NULL
) AS source
ON target.work_id = source.work_id
   AND target.author_sequence = source.author_sequence
   AND COALESCE(target.institution_id, -1) = COALESCE(source.institution_id, -1)
   AND COALESCE(target.raw_affiliation_string, '') = COALESCE(source.raw_affiliation_string, '')
WHEN NOT MATCHED THEN
    INSERT (work_id, author_sequence, institution_id, raw_author_name, raw_affiliation_string, raw_countries)
    VALUES (source.work_id, source.author_sequence, source.institution_id,
            source.raw_author_name, source.raw_affiliation_string, source.raw_countries);

### Add institutions arrays to authorships

In [0]:
%run ../utils/variables

In [0]:
-- Step 4: Merge institution objects back into authorships
--
-- Builds the final authors_and_affiliations table: one row per work_id with an
-- `authorships` array. Each authorship from the temp table is re-emitted with
-- institution details, per-string institution ids and countries attached, all taken
-- from author_institutions and joined on (work_id, author_sequence).
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.authors_and_affiliations')
CLUSTER BY (work_id) AS (
-- One row per author; author_sequence is the 0-based position in the temp table's
-- authorships array (the same key author_institutions was populated with).
WITH existing_authorships AS (
    SELECT
        work_id,
        posexplode(authorships) AS (author_sequence, authorship),
        authorship.original_author_order AS original_author_order
    FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_temp')
),
-- Ancestor ids per institution, skipping any ancestor listed in SUPER_SYSTEM_INSTITUTIONS.
-- NOTE(review): SUPER_SYSTEM_INSTITUTIONS is not defined in this cell; presumably it is
-- provided by `%run ../utils/variables` -- confirm.
institution_lineage AS (
  SELECT
    institution_id,
    COLLECT_LIST(ancestor_id) AS lineage_ids
  FROM openalex.mid.institution_ancestors
  WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
  GROUP BY institution_id
),
-- Per (work, author): the distinct set of institution objects plus the distinct,
-- flattened list of raw countries carried on the author_institutions rows.
author_institutions_with_details AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        array_distinct(flatten(collect_set(ai.raw_countries))) as raw_parsed_countries,
        COLLECT_SET(
            STRUCT(
                inst.iso3166_code as country_code,
                inst.display_name,
                CONCAT('https://openalex.org/I', ai.institution_id) AS id,
                -- lineage = the institution itself plus its ancestors, rendered as
                -- OpenAlex URLs and sorted as strings (NULL ids are removed first).
                ARRAY_SORT(
                    TRANSFORM(
                    ARRAY_COMPACT(CONCAT(ARRAY(ai.institution_id), COALESCE(il.lineage_ids, ARRAY()))), id -> CONCAT('https://openalex.org/I', id)
                    )
                ) AS lineage,
                CONCAT('https://ror.org/', inst.ror_id) AS ror,
                inst.type,
                CAST(ARRAY(inst.type) as array<string>) as type_list
            )
        ) AS institutions
    FROM identifier('openalex' || :env_suffix || '.authors.author_institutions') ai
    -- LEFT JOINs: an institution_id with no row in institutions still yields a struct,
    -- with NULL display_name / country_code / ror / type.
    LEFT JOIN openalex.institutions.institutions inst ON inst.id = ai.institution_id
    LEFT JOIN institution_lineage il USING (institution_id)
    WHERE ai.institution_id IS NOT NULL
    GROUP BY ai.work_id, ai.author_sequence
),
-- Map raw_affiliation_string -> institution_ids per (work_id, author_sequence)
-- (ids are rendered as OpenAlex URLs and de-duplicated).
affiliations_map_ids AS (
    SELECT
        work_id,
        author_sequence,
        raw_affiliation_string,
        array_distinct(
            array_compact(
                COLLECT_LIST(CONCAT('https://openalex.org/I', institution_id))
            )
        ) AS institution_ids
    FROM identifier('openalex' || :env_suffix || '.authors.author_institutions')
    WHERE institution_id IS NOT NULL
      AND raw_affiliation_string IS NOT NULL
    GROUP BY work_id, author_sequence, raw_affiliation_string
),
-- Fold the per-string rows into one MAP per author. Keys are unique because the
-- previous CTE groups by raw_affiliation_string.
affiliations_map AS (
    SELECT
        work_id,
        author_sequence,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(NAMED_STRUCT('key', raw_affiliation_string, 'value', institution_ids))
        ) AS aff_map
    FROM affiliations_map_ids
    GROUP BY work_id, author_sequence
),
authorships_with_institutions AS (
    SELECT
        ea.work_id,
        ea.author_sequence,
        ea.original_author_order,
        -- Rebuilding the authorship struct to strictly match the target schema
        -- (field ORDER inside this STRUCT is part of that schema; do not reorder).
        STRUCT(
            -- 1. 'affiliations' is built from raw_strings, populate institution_ids from author_institutions per string
            --    (institution_ids is looked up in aff_map by the raw string).
            TRANSFORM(
                COALESCE(ea.authorship.raw_affiliation_strings, ARRAY()),
                s -> STRUCT(
                    element_at(am.aff_map, s) as institution_ids,
                    s as raw_affiliation_string
                )
            ) as affiliations,
            -- 2. 'author' is filled with a correctly typed NULL as it's not in the source
            CAST(NULL AS STRUCT<display_name: STRING, id: STRING, orcid: STRING>) AS author,
            -- 3. All other fields are populated from the available data
            ea.authorship.author_position,
            ea.original_author_order AS author_order_number,
            -- countries: prefer the non-empty country codes of the matched institutions
            -- (distinct, sorted); otherwise fall back to the raw parsed countries unless
            -- that array is exactly [""]; otherwise an empty array.
            CASE
                WHEN size(FILTER(aid.institutions.country_code, c -> c IS NOT NULL AND c <> '')) > 0
                    THEN array_sort(array_distinct(FILTER(aid.institutions.country_code, c -> c IS NOT NULL AND c <> '')))
                WHEN aid.raw_parsed_countries IS NOT NULL
                    AND aid.raw_parsed_countries <> ARRAY("")
                THEN aid.raw_parsed_countries
                ELSE ARRAY()
            END AS countries,
            COALESCE(aid.institutions, ARRAY()) AS institutions,
            ea.authorship.is_corresponding,
            ea.authorship.raw_affiliation_strings,
            ea.authorship.raw_author_name
        ) AS authorship_with_institutions
    FROM existing_authorships ea
    -- LEFT JOINs keep authors that have no matched institution at all.
    LEFT JOIN author_institutions_with_details aid
        ON ea.work_id = aid.work_id
        AND ea.author_sequence = aid.author_sequence
    LEFT JOIN affiliations_map am
        ON ea.work_id = am.work_id
        AND ea.author_sequence = am.author_sequence
)
-- Reassemble one authorships array per work, ordered by original_author_order; the
-- ordering key is carried in a wrapper struct and stripped by the final TRANSFORM.
SELECT
    work_id,
    TRANSFORM(
        ARRAY_SORT(
            COLLECT_LIST(
                STRUCT(
                    original_author_order,
                    authorship_with_institutions
                )
            ),
            (left, right) -> CASE
                WHEN left.original_author_order < right.original_author_order THEN -1
                WHEN left.original_author_order > right.original_author_order THEN 1
                ELSE 0
            END
        ),
        x -> x.authorship_with_institutions
    ) AS authorships
FROM authorships_with_institutions
GROUP BY work_id
);