# Create `openalex.works.work_authors` table

Creates a denormalized work authors table by:
1. Copying records from `work_author_affiliations` where `author_id IS NOT NULL` (matched authors)
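2. Ingesting fresh data from `openalex_works_base` for works whose `work_id` has no matched-author rows from step 1
3. Aggregating affiliation strings into one deduplicated array per author row and adding the `is_corresponding` flag (defaulting to `FALSE` when no matching authorship is found)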

In [None]:
%run ../utils/variables

### Create work_authors table

In [None]:
-- Rebuild the env-suffixed works.work_authors table from two sources:
--   * rows of work_author_affiliations that already carry an author_id, and
--   * authorships of openalex_works_base for works that have no such rows.
-- Affiliation strings are collapsed into one array per
-- (work_id, author_sequence, author_id), and is_corresponding is looked up
-- from the base authorships by (work_id, author_sequence).
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_authors') AS
WITH
-- Source A: affiliation rows whose author has already been matched.
resolved_rows AS (
    SELECT
        work_id,
        author_sequence,
        author_id,
        raw_author_name,
        raw_affiliation_string,
        created_at,
        updated_at
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations')
    WHERE author_id IS NOT NULL
),
-- Works covered by source A, one row per work_id.
-- NOTE(review): the exclusion below is per work, not per author, so a work
-- with at least one matched author contributes nothing from the base table,
-- including any co-authors that have no matched row. Confirm this is intended.
resolved_work_ids AS (
    SELECT work_id
    FROM resolved_rows
    GROUP BY work_id
),
-- Source B, stage 1: one row per authorship, keyed by its position in the
-- authorships array (POSEXPLODE positions start at 0).
base_authorships AS (
    SELECT
        id AS work_id,
        author_sequence,
        authorship
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    LATERAL VIEW POSEXPLODE(authorships) auth AS author_sequence, authorship
    WHERE authorships IS NOT NULL
      AND SIZE(authorships) > 0
),
-- Source B, stage 2: one row per affiliation string. The OUTER variant keeps
-- an authorship that has no affiliation strings as a single row with NULL.
base_affiliations AS (
    SELECT
        work_id,
        author_sequence,
        authorship.raw_author_name AS raw_author_name,
        raw_affiliation_string
    FROM base_authorships
    LATERAL VIEW OUTER EXPLODE(authorship.raw_affiliation_strings) aff AS raw_affiliation_string
),
-- Source A plus the source-B rows of works that source A does not cover.
-- Base rows get a NULL author_id and are stamped with the current time.
all_rows AS (
    SELECT
        work_id,
        author_sequence,
        author_id,
        raw_author_name,
        raw_affiliation_string,
        created_at,
        updated_at
    FROM resolved_rows
    UNION ALL
    SELECT
        ba.work_id,
        ba.author_sequence,
        CAST(NULL AS BIGINT) AS author_id,
        ba.raw_author_name,
        ba.raw_affiliation_string,
        current_timestamp() AS created_at,
        current_timestamp() AS updated_at
    FROM base_affiliations ba
    WHERE NOT EXISTS (
        SELECT 1
        FROM resolved_work_ids rw
        WHERE rw.work_id = ba.work_id
    )
),
-- Corresponding-author flag for every authorship in the base table.
corresponding_flags AS (
    SELECT
        id AS work_id,
        author_sequence,
        authorship.is_corresponding
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    LATERAL VIEW POSEXPLODE(authorships) flag_src AS author_sequence, authorship
),
-- Collapse the affiliation rows to one row per (work, position, author).
authors_rolled_up AS (
    SELECT
        work_id,
        author_sequence,
        author_id,
        MAX(raw_author_name) AS raw_author_name,
        -- ARRAY_COMPACT strips NULL entries; ARRAY_DISTINCT removes repeats.
        ARRAY_DISTINCT(ARRAY_COMPACT(COLLECT_LIST(raw_affiliation_string))) AS raw_affiliation_strings,
        MIN(created_at) AS created_at,
        MAX(updated_at) AS updated_at
    FROM all_rows
    GROUP BY
        work_id,
        author_sequence,
        author_id
)
SELECT
    au.work_id,
    au.author_sequence,
    au.author_id,
    au.raw_author_name,
    au.raw_affiliation_strings,
    -- Authors with no matching base authorship are treated as non-corresponding.
    COALESCE(cf.is_corresponding, FALSE) AS is_corresponding,
    au.created_at,
    au.updated_at
FROM authors_rolled_up au
LEFT JOIN corresponding_flags cf
    ON au.work_id = cf.work_id
    AND au.author_sequence = cf.author_sequence;