# Create `openalex.works.work_authors` table

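Creates a denormalized work authors table from `openalex.works.work_author_affiliations`, with one row per work, author sequence and author ID. Each row carries the aggregated, de-duplicated affiliation strings and an `is_corresponding` flag taken from the `authorships` array in `openalex.works.openalex_works_base`.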

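Removes duplicate authors within each `work_id`: when `author_id` is NULL and the same `raw_author_name` (compared case-insensitively, ignoring leading and trailing whitespace) appears multiple times, only the occurrence with the lowest `author_sequence` is kept.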

In [None]:
%run ../utils/variables

### Create work_authors table

In [None]:
-- Builds the denormalized work_authors table:
--   * affiliation strings are aggregated per (work_id, author_sequence, author_id),
--   * is_corresponding is looked up from the authorships array of openalex_works_base,
--   * authors without an author_id that repeat the same raw_author_name inside a
--     work are reduced to their first occurrence (lowest author_sequence).
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_authors') AS
WITH authorships_is_corresponding AS (
    -- One row per element of each work's authorships array, with its
    -- is_corresponding flag. POSEXPLODE numbers array positions starting at 0.
    -- NOTE(review): the join in with_is_corresponding assumes that
    -- work_author_affiliations.author_sequence uses the same 0-based numbering;
    -- confirm against the job that builds that table.
    SELECT
        id AS work_id,
        author_sequence,
        authorship.is_corresponding
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    LATERAL VIEW POSEXPLODE(authorships) t AS author_sequence, authorship
),
aggregated_affiliations AS (
    -- Collapse the per-affiliation rows into one row per work/author position,
    -- gathering the distinct non-NULL affiliation strings into an array.
    SELECT
        work_id,
        author_sequence,
        author_id,
        -- If the name differs between affiliation rows, keep the greatest value.
        MAX(raw_author_name) AS raw_author_name,
        ARRAY_DISTINCT(ARRAY_COMPACT(COLLECT_LIST(raw_affiliation_string))) AS raw_affiliation_strings,
        MIN(created_at) AS created_at,
        MAX(updated_at) AS updated_at
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations')
    GROUP BY
        work_id,
        author_sequence,
        author_id
),
with_is_corresponding AS (
    -- Attach the corresponding-author flag; authors with no matching authorship
    -- entry (or a NULL flag) are treated as not corresponding.
    SELECT
        aa.work_id,
        aa.author_sequence,
        aa.author_id,
        aa.raw_author_name,
        aa.raw_affiliation_strings,
        COALESCE(aic.is_corresponding, FALSE) AS is_corresponding,
        aa.created_at,
        aa.updated_at
    FROM aggregated_affiliations AS aa
    LEFT JOIN authorships_is_corresponding AS aic
        ON aa.work_id = aic.work_id
        AND aa.author_sequence = aic.author_sequence
),
deduplicated AS (
    -- Rank rows so that duplicates can be filtered out with rn = 1.
    -- Only rows with a NULL author_id AND a non-NULL raw_author_name are
    -- de-duplicated, by normalized (trimmed, lower-cased) name; the lowest
    -- author_sequence wins. Every other row is keyed by its own author_sequence
    -- and therefore always kept.
    SELECT
        work_id,
        author_sequence,
        author_id,
        raw_author_name,
        raw_affiliation_strings,
        is_corresponding,
        created_at,
        updated_at,
        ROW_NUMBER() OVER (
            PARTITION BY
                work_id,
                CASE
                    -- The 'name:' / 'seq:' prefixes keep the two kinds of key in
                    -- separate namespaces, so a raw name such as '3' can never
                    -- collide with the author at sequence 3. Rows with a NULL
                    -- name fall through to the per-sequence key instead of all
                    -- sharing a single NULL partition.
                    WHEN author_id IS NULL AND raw_author_name IS NOT NULL
                        THEN 'name:' || LOWER(TRIM(raw_author_name))
                    ELSE 'seq:' || CAST(author_sequence AS STRING)
                END
            ORDER BY author_sequence ASC
        ) AS rn
    FROM with_is_corresponding
)
SELECT
    work_id,
    author_sequence,
    author_id,
    raw_author_name,
    raw_affiliation_strings,
    is_corresponding,
    created_at,
    updated_at
FROM deduplicated
WHERE rn = 1;