### Incrementally updates `openalex.works.authors_and_affiliations` in the Walden end-to-end workflow

In [0]:
-- Incremental watermark: the latest updated_datetime already present in the
-- target table, or 1900-01-01 when the table holds no rows.
DECLARE OR REPLACE VARIABLE max_updated_date TIMESTAMP DEFAULT to_timestamp('1900-01-01');

SET VARIABLE max_updated_date = (
    -- A global aggregate always yields exactly one row, so the COALESCE can live
    -- inside the scalar subquery.
    SELECT COALESCE(MAX(updated_datetime), to_timestamp('1900-01-01'))
    FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations')
);

-- Echo the watermark so it is visible in the cell output.
SELECT max_updated_date;

In [0]:
-- Step 1: Create updates table directly (combined stage1 + updates)
--
-- Builds the `authors_and_affiliations_updates` staging table: one row per work_id
-- carrying an `authorships` array, derived from `works.locations_mapped` rows whose
-- openalex_updated_dt is later than the `max_updated_date` session variable.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
CLUSTER BY (work_id) AS (
-- Keep a single source row per work_id: the one whose provenance has the lowest
-- `priority` value in openalex.system.priority_table.
WITH deduplicated_works AS (
    SELECT
        work_id,
        authors,
        openalex_updated_dt AS updated_datetime,
        -- NOTE: affiliations_exist is carried into exploded_authors but is not
        -- referenced by any later CTE in this statement.
        GET(authors.affiliations.name, 0) IS NOT NULL AS affiliations_exist,
        EXISTS(authors.is_corresponding, x -> x = TRUE) AS is_corresponding_exists,
        array_size(authors) AS author_count
    FROM identifier('openalex' || :env_suffix || '.works.locations_mapped')
    LEFT JOIN openalex.system.priority_table USING (provenance)
    WHERE authors_exist
      -- NOTE(review): the watermark filter runs before the per-work dedup, so a work
      -- whose only recently-updated row comes from a lower-priority provenance is
      -- rebuilt from that row. Confirm this is intended.
      AND openalex_updated_dt > max_updated_date
    -- NULLS LAST: the LEFT JOIN leaves `priority` NULL for provenances missing from
    -- priority_table, and ascending sorts place NULLs first by default, which would
    -- let an unprioritised row beat every prioritised one.
    -- The openalex_updated_dt tie-breaker prefers the most recently updated row when
    -- several rows share the same priority, instead of an arbitrary pick.
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY work_id
        ORDER BY priority ASC NULLS LAST, openalex_updated_dt DESC
    ) = 1
),
-- One row per author; original_author_order is the 0-based array position.
exploded_authors AS (
    SELECT
        work_id,
        updated_datetime,
        author_count,
        affiliations_exist,
        is_corresponding_exists,
        posexplode(authors) AS (original_author_order, author_data),
        -- Pre-compute normalized name for efficient join
        openalex.works.normalize_affiliation_string(TRIM(author_data.name)) AS normalized_author_name
    FROM deduplicated_works
),
-- Attach author metadata by normalized name.
enriched_authors AS (
    SELECT
        work_id,
        updated_datetime,
        original_author_order,
        author_count,
        TRIM(author_data.name) AS raw_author_name,
        author_data.affiliations.name AS raw_affiliation_strings,
        author_data.is_corresponding AS is_corresponding_from_source,
        is_corresponding_exists,
        -- Join author metadata using pre-computed normalized name
        a.display_name,
        a.author_id,
        a.orcid
    FROM exploded_authors
    LEFT JOIN (
        -- Collapse openalex.mid.author to one row per normalized_name so the join
        -- cannot multiply author rows. display_name, author_id and orcid are
        -- aggregated independently, so when several authors share a normalized
        -- name the three values may originate from different author rows.
        SELECT 
            a.normalized_name,
            FIRST(a.display_name, true) AS display_name,
            MIN(a.author_id) AS author_id,
            MIN(ao.orcid) AS orcid
        FROM openalex.mid.author a
        LEFT JOIN (
            -- One (first non-null) ORCID per author_id.
            SELECT author_id, FIRST(orcid, true) AS orcid
            FROM openalex.mid.author_orcid
            GROUP BY author_id
        ) ao ON a.author_id = ao.author_id
        WHERE a.normalized_name IS NOT NULL
        GROUP BY a.normalized_name
    ) a ON exploded_authors.normalized_author_name = a.normalized_name
),
-- Collapse to exactly one row per (work_id, original_author_order) and clean the
-- affiliation strings.
merged_duplicates AS (
    SELECT
        work_id,
        FIRST(updated_datetime) AS updated_datetime,
        original_author_order,
        author_count,
        MIN_BY(raw_author_name, original_author_order) AS raw_author_name,
        MAX_BY(is_corresponding_from_source, original_author_order) AS is_corresponding_from_source,
        FIRST(is_corresponding_exists, true) AS is_corresponding_exists,
        FIRST(display_name, true) AS display_name,
        FIRST(author_id, true) AS author_id,
        FIRST(orcid, true) AS orcid,
        -- Flatten, drop NULLs, strip the '\\n' sequence, trim, then de-duplicate.
        -- NOTE(review): with default string-literal escaping '\\n' denotes a
        -- backslash followed by "n", not a line-feed character - confirm that is
        -- the sequence meant to be removed.
        ARRAY_DISTINCT(
            TRANSFORM(
                ARRAY_COMPACT(FLATTEN(COLLECT_LIST(raw_affiliation_strings))),
                s -> TRIM(REPLACE(s, '\\n', ''))
            )
        ) AS raw_affiliation_strings,
        -- Compute work_has_corresponding directly using window function on grouped data:
        -- 1 when any author of the work is flagged corresponding by the source, else 0.
        CASE
            WHEN FIRST(is_corresponding_exists, true) THEN
                MAX(CASE WHEN MAX_BY(is_corresponding_from_source, original_author_order) = TRUE THEN 1 ELSE 0 END) 
                    OVER (PARTITION BY work_id)
            ELSE 0
        END AS work_has_corresponding
    FROM enriched_authors
    GROUP BY work_id, original_author_order, author_count
)
-- Re-assemble one `authorships` array per work, ordered by original_author_order.
SELECT
    work_id,
    updated_datetime,
    TRANSFORM(
        ARRAY_SORT(
            COLLECT_LIST(
                -- The outer struct carries the sort key; only `authorship` is kept.
                STRUCT(
                    original_author_order,
                    STRUCT(
                        CASE
                            WHEN original_author_order == 0 THEN 'first'
                            WHEN original_author_order + 1 == author_count THEN 'last'
                            ELSE 'middle'
                        END AS author_position,
                        TRIM(REPLACE(raw_author_name, '\\n', '')) AS raw_author_name,
                        -- The source flag wins; if some other author of the work is
                        -- flagged, this one is not corresponding; when nobody is
                        -- flagged the first author is treated as corresponding.
                        CASE
                            WHEN is_corresponding_from_source = TRUE THEN TRUE
                            WHEN work_has_corresponding = 1 THEN FALSE
                            WHEN original_author_order == 0 THEN TRUE
                            ELSE FALSE
                        END AS is_corresponding,
                        raw_affiliation_strings,
                        original_author_order,
                        NAMED_STRUCT(
                            -- Fall back to the cleaned raw name when no author matched.
                            'display_name', COALESCE(display_name, TRIM(REPLACE(raw_author_name, '\\n', ''))),
                            'id', CASE WHEN author_id IS NOT NULL THEN CONCAT('https://openalex.org/A', author_id) ELSE NULL END,
                            'orcid', CASE WHEN orcid IS NOT NULL THEN CONCAT('https://orcid.org/', orcid) ELSE NULL END
                        ) AS author
                    ) AS authorship
                )
            ),
            (left, right) -> CASE
                WHEN left.original_author_order < right.original_author_order THEN -1
                WHEN left.original_author_order > right.original_author_order THEN 1
                ELSE 0
            END
        ),
        x -> x.authorship
    ) AS authorships
FROM merged_duplicates
GROUP BY work_id, updated_datetime);

### Merge new affiliation strings into `affiliation_strings_lookup` table

In [0]:
-- Step 2: Merge new raw affiliation strings into lookup table
--
-- Every distinct, non-empty raw affiliation string found in the updates table that
-- the lookup does not contain yet is inserted with NULL institution id arrays.
MERGE INTO openalex.institutions.affiliation_strings_lookup AS target
USING (
    WITH authorship_rows AS (
        -- One row per authorship struct.
        SELECT explode(authorships) AS authorship
        FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
    ),
    affiliation_rows AS (
        -- One row per raw affiliation string of each authorship.
        SELECT explode(authorship.raw_affiliation_strings) AS raw_affiliation_string
        FROM authorship_rows
    ),
    new_affiliation_strings AS (
        SELECT DISTINCT raw_affiliation_string
        FROM affiliation_rows
        WHERE raw_affiliation_string IS NOT NULL
          AND raw_affiliation_string != ''
    )
    SELECT
        nas.raw_affiliation_string,
        CAST(NULL AS ARRAY<BIGINT>) AS institution_ids,
        CAST(NULL AS ARRAY<BIGINT>) AS institution_ids_override,
        CURRENT_TIMESTAMP() AS created_datetime
    FROM new_affiliation_strings AS nas
    -- Drop strings the lookup already knows before they reach the MERGE.
    WHERE NOT EXISTS (
        SELECT 1
        FROM openalex.institutions.affiliation_strings_lookup AS existing
        WHERE existing.raw_affiliation_string = nas.raw_affiliation_string
    )
) AS source
    ON source.raw_affiliation_string = target.raw_affiliation_string
WHEN NOT MATCHED THEN
    INSERT (
        raw_affiliation_string,
        institution_ids,
        institution_ids_override,
        created_datetime
    )
    VALUES (
        source.raw_affiliation_string,
        source.institution_ids,
        source.institution_ids_override,
        source.created_datetime
    );

### Populate the `openalex.authors.author_institutions` mapping

In [0]:
-- Step 3: author_institutions mapping
--
-- Expands the updates table to one row per (work, author position, raw affiliation
-- string, institution id) and inserts the combinations that the target table does
-- not hold yet.
-- NOTE(review): the statement only inserts. Rows written for an earlier version of
-- a work (different author order, affiliation strings or resolved institutions)
-- are neither updated nor deleted here - confirm that is intended.
MERGE INTO identifier('openalex' || :env_suffix || '.authors.author_institutions') AS target
USING (
    WITH authorship_rows AS (
        -- author_sequence is the 0-based position inside the authorships array.
        SELECT
            work_id,
            posexplode(authorships) AS (author_sequence, authorship)
        FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
    ),
    affiliation_rows AS (
        -- explode_outer keeps authors without affiliation strings (NULL string).
        SELECT
            work_id,
            author_sequence,
            authorship.raw_author_name,
            explode_outer(authorship.raw_affiliation_strings) AS raw_affiliation_string
        FROM authorship_rows
    ),
    resolved_affiliations AS (
        SELECT
            ar.work_id,
            ar.author_sequence,
            ar.raw_author_name,
            ar.raw_affiliation_string,
            lkp.countries AS raw_countries,
            -- A non-empty override array wins; otherwise use the regular ids unless
            -- that array is exactly [-1]; everything else resolves to NULL.
            CASE
                WHEN ar.raw_affiliation_string IS NULL
                    THEN NULL
                WHEN lkp.institution_ids_override IS NOT NULL
                     AND SIZE(lkp.institution_ids_override) > 0
                    THEN lkp.institution_ids_override
                WHEN lkp.institution_ids IS NOT NULL
                     AND SIZE(lkp.institution_ids) > 0
                     AND NOT (SIZE(lkp.institution_ids) = 1 AND lkp.institution_ids[0] = -1)
                    THEN lkp.institution_ids
                ELSE NULL
            END AS institution_ids
        FROM affiliation_rows AS ar
        LEFT JOIN openalex.institutions.affiliation_strings_lookup AS lkp
            ON ar.raw_affiliation_string IS NOT NULL
            AND lkp.raw_affiliation_string = ar.raw_affiliation_string
    )
    -- Only affiliations that resolved to at least one institution id are emitted.
    SELECT
        work_id,
        author_sequence,
        explode_outer(institution_ids) AS institution_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    FROM resolved_affiliations
    WHERE institution_ids IS NOT NULL
      AND SIZE(institution_ids) > 0
) AS source
    -- NULL institution ids / affiliation strings are compared via sentinels
    -- (-1 and '') so that two NULLs count as equal.
    ON source.work_id = target.work_id
    AND source.author_sequence = target.author_sequence
    AND COALESCE(source.institution_id, -1) = COALESCE(target.institution_id, -1)
    AND COALESCE(source.raw_affiliation_string, '') = COALESCE(target.raw_affiliation_string, '')
WHEN NOT MATCHED THEN
    INSERT (
        work_id,
        author_sequence,
        institution_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    )
    VALUES (
        source.work_id,
        source.author_sequence,
        source.institution_id,
        source.raw_author_name,
        source.raw_affiliation_string,
        source.raw_countries
    );

### Add institutions arrays to authorships

In [0]:
%run ../utils/variables

In [0]:
-- Step 4: Enrich updates with full institution details using array operations (no explode!)
--
-- Rewrites the updates table in place: every authorship gains `affiliations`,
-- `countries` and `institutions`, looked up from `authors.author_institutions` by
-- (work_id, position of the authorship inside the array).
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
CLUSTER BY (work_id) AS (
WITH base_authorships AS (
    SELECT
        work_id,
        updated_datetime,
        authorships
    FROM identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates')
),
-- Ancestor ids per institution, excluding ids listed in SUPER_SYSTEM_INSTITUTIONS.
-- SUPER_SYSTEM_INSTITUTIONS is not defined in this notebook; presumably it comes
-- from the `%run ../utils/variables` cell - verify.
institution_lineage AS (
  SELECT
    institution_id,
    COLLECT_LIST(ancestor_id) AS lineage_ids
  FROM openalex.mid.institution_ancestors
  WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
  GROUP BY institution_id
),
-- Institution detail structs and parsed countries per (work_id, author_sequence).
author_institutions_with_details AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        array_distinct(flatten(collect_set(ai.raw_countries))) as raw_parsed_countries,
        COLLECT_SET(
            STRUCT(
                inst.iso3166_code as country_code,
                inst.display_name,
                CONCAT('https://openalex.org/I', ai.institution_id) AS id,
                -- lineage = the institution itself plus its ancestors, as sorted URLs
                ARRAY_SORT(
                    TRANSFORM(
                    ARRAY_COMPACT(CONCAT(ARRAY(ai.institution_id), COALESCE(il.lineage_ids, ARRAY()))), id -> CONCAT('https://openalex.org/I', id)
                    )
                ) AS lineage,
                CONCAT('https://ror.org/', inst.ror_id) AS ror,
                -- a `type_list` array attribute is intentionally not emitted (no longer wanted)
                inst.type
            )
        ) AS institutions
    FROM identifier('openalex' || :env_suffix || '.authors.author_institutions') ai
    LEFT JOIN openalex.institutions.institutions inst ON inst.id = ai.institution_id
    LEFT JOIN institution_lineage il USING (institution_id)
    WHERE ai.institution_id IS NOT NULL
      -- Only works present in the updates table are consumed by the final LEFT JOIN,
      -- so restrict the aggregation to them instead of grouping the whole table.
      AND EXISTS (
          SELECT 1
          FROM base_authorships upd
          WHERE upd.work_id = ai.work_id
      )
    GROUP BY ai.work_id, ai.author_sequence
),
-- Map raw_affiliation_string -> institution_ids per (work_id, author_sequence)
affiliations_map_ids AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        ai.raw_affiliation_string,
        array_distinct(
            array_compact(
                COLLECT_LIST(CONCAT('https://openalex.org/I', ai.institution_id))
            )
        ) AS institution_ids
    FROM identifier('openalex' || :env_suffix || '.authors.author_institutions') ai
    WHERE ai.institution_id IS NOT NULL
      AND ai.raw_affiliation_string IS NOT NULL
      -- Same restriction as above: skip works that are not being updated.
      AND EXISTS (
          SELECT 1
          FROM base_authorships upd
          WHERE upd.work_id = ai.work_id
      )
    GROUP BY ai.work_id, ai.author_sequence, ai.raw_affiliation_string
),
-- One map per (work_id, author_sequence); keys are unique because the previous CTE
-- groups by raw_affiliation_string.
affiliations_map AS (
    SELECT
        work_id,
        author_sequence,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(NAMED_STRUCT('key', raw_affiliation_string, 'value', institution_ids))
        ) AS aff_map
    FROM affiliations_map_ids
    GROUP BY work_id, author_sequence
),
-- Pre-aggregate institution details per work_id as a map[author_sequence -> details]
author_institution_lookup AS (
    SELECT
        work_id,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(
                STRUCT(
                    author_sequence,
                    STRUCT(
                        institutions,
                        raw_parsed_countries,
                        aff_map
                    )
                )
            )
        ) as author_lookup
    FROM (
        SELECT
            aid.work_id,
            aid.author_sequence,
            aid.institutions,
            aid.raw_parsed_countries,
            am.aff_map
        FROM author_institutions_with_details aid
        LEFT JOIN affiliations_map am ON aid.work_id = am.work_id AND aid.author_sequence = am.author_sequence
    )
    GROUP BY work_id
)
SELECT
    ba.work_id,
    ba.updated_datetime,
    -- Use TRANSFORM to enrich each authorship in-place, no explode/re-aggregate!
    -- `idx` is the 0-based position of the authorship in the array and is used as
    -- the author_sequence key into author_lookup.
    TRANSFORM(
        ba.authorships,
        (auth, idx) -> STRUCT(
            -- affiliations: map raw_affiliation_strings to institution_ids
            -- (empty array when the string has no resolved institution)
            TRANSFORM(
                COALESCE(auth.raw_affiliation_strings, ARRAY()),
                s -> STRUCT(
                    COALESCE(element_at(element_at(ail.author_lookup, idx).aff_map, s), ARRAY()) as institution_ids,
                    s as raw_affiliation_string
                )
            ) as affiliations,
            -- Preserve author field
            auth.author,
            -- All other fields
            auth.author_position,
            auth.original_author_order AS author_order_number,
            -- countries: prefer the non-empty country codes of the matched
            -- institutions, fall back to the parsed raw countries, else empty
            CASE
                WHEN element_at(ail.author_lookup, idx).institutions IS NOT NULL 
                     AND size(FILTER(element_at(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')) > 0
                    THEN array_sort(array_distinct(FILTER(element_at(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')))
                WHEN element_at(ail.author_lookup, idx).raw_parsed_countries IS NOT NULL
                    THEN element_at(ail.author_lookup, idx).raw_parsed_countries
                ELSE ARRAY()
            END AS countries,
            COALESCE(element_at(ail.author_lookup, idx).institutions, ARRAY()) AS institutions,
            auth.is_corresponding,
            auth.raw_affiliation_strings,
            auth.raw_author_name
        )
    ) AS authorships
FROM base_authorships ba
LEFT JOIN author_institution_lookup ail ON ba.work_id = ail.work_id);

In [0]:
-- Step 5: Merge enriched updates into final authors_and_affiliations table
--
-- Upsert keyed on work_id: existing works get their authorships and timestamp
-- overwritten, unseen works are inserted.
MERGE INTO identifier('openalex' || :env_suffix || '.works.authors_and_affiliations') AS target
USING identifier('openalex' || :env_suffix || '.works.authors_and_affiliations_updates') AS source
    ON source.work_id = target.work_id
WHEN MATCHED THEN
    UPDATE SET
        target.updated_datetime = source.updated_datetime,
        target.authorships = source.authorships
WHEN NOT MATCHED THEN
    INSERT (work_id, updated_datetime, authorships)
    VALUES (source.work_id, source.updated_datetime, source.authorships);


In [0]:
-- SELECT * FROM openalex.works.authors_and_affiliations where work_id = 4415178126