# Migrate `openalex.works.work_author_affiliations`

This notebook:
1. Creates a migration copy of the existing table
2. Deletes records that don't have an author_id from the migration table
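3. Inserts affiliations from the backfill table for works that are not already in the migration table (fills gaps with curated data)
4. Inserts remaining affiliations from `openalex_works_base` for works still missing from the migration table after the backfill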

In [None]:
%run ../utils/variables

### Step 1: Create migration copy of existing table

In [None]:
-- Snapshot the current work_author_affiliations table into a "_migration" copy.
-- CREATE OR REPLACE discards any earlier migration copy, so re-running this
-- cell starts again from the current contents of the source table.
-- NOTE(review): :env_suffix is presumably supplied by ../utils/variables — confirm.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
AS
SELECT
    *
FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations');

### Step 2: Delete records without author_id from migration table

In [None]:
-- Remove every migration row that has no author_id.
-- Only the migration copy is touched; the source table is not modified here.
DELETE FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
WHERE isnull(author_id);

### Step 3: Insert affiliations from backfill table

In [None]:
-- Insert affiliations from the backfill table for works that have no rows in
-- the migration table yet.
--
-- Stages (one CTE each):
--   existing_works         work_ids already present in the migration table
--   backfill_data          backfill rows whose work is not in existing_works
--   resolved_affiliations  raw affiliation strings mapped to institution ids
--                          through affiliation_strings_lookup
--   final_rows             one row per (work, author position, institution);
--                          rows with no resolved institution are kept with a
--                          NULL institution_id
--
-- Every inserted row has author_id = NULL and created_at/updated_at set to the
-- statement's current_timestamp().
INSERT INTO identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') (
    work_id, author_sequence, institution_id, author_id, 
    raw_author_name, raw_affiliation_string, raw_countries, 
    created_at, updated_at
)
WITH existing_works AS (
    -- Get work_ids that already exist in migration table
    SELECT DISTINCT work_id
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
),
backfill_data AS (
    SELECT 
        bf.work_id,
        bf.author_sequence,
        bf.raw_author_name,
        bf.raw_affiliation_string
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_backfill') bf
    -- NOT EXISTS instead of NOT IN: NOT IN over a subquery is null-aware, so a
    -- single NULL work_id in the migration table would make the predicate
    -- non-TRUE for every backfill row and this step would insert nothing.
    -- The explicit IS NOT NULL keeps NULL work_ids out, as NOT IN did whenever
    -- the migration table was non-empty.
    WHERE bf.work_id IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM existing_works ew
          WHERE ew.work_id = bf.work_id
      )
),
resolved_affiliations AS (
    SELECT 
        bf.work_id,
        bf.author_sequence,
        bf.raw_author_name,
        bf.raw_affiliation_string,
        -- Institution resolution priority:
        --   1. no affiliation string           -> NULL
        --   2. non-empty institution_ids_override
        --   3. non-empty institution_ids, unless it is exactly [-1]
        --   4. otherwise                       -> NULL
        -- NOTE(review): [-1] looks like a "no match" sentinel — confirm.
        CASE 
            WHEN bf.raw_affiliation_string IS NULL THEN NULL
            WHEN asl.institution_ids_override IS NOT NULL AND SIZE(asl.institution_ids_override) > 0 
                THEN asl.institution_ids_override
            WHEN asl.institution_ids IS NOT NULL AND SIZE(asl.institution_ids) > 0 
                AND NOT (SIZE(asl.institution_ids) = 1 AND asl.institution_ids[0] = -1) 
                THEN asl.institution_ids
            ELSE NULL
        END AS institution_ids,
        asl.countries AS raw_countries
    FROM backfill_data bf
    -- LEFT JOIN so affiliation strings with no lookup entry are still inserted
    -- (with NULL institution_ids and NULL raw_countries).
    LEFT JOIN identifier('openalex' || :env_suffix || '.institutions.affiliation_strings_lookup') asl
        ON bf.raw_affiliation_string = asl.raw_affiliation_string
),
final_rows AS (
    -- DISTINCT collapses identical rows, e.g. the same institution id appearing
    -- more than once for one affiliation string.
    SELECT DISTINCT
        work_id,
        author_sequence,
        exploded_inst_id AS institution_id,
        CAST(NULL AS BIGINT) AS author_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    FROM resolved_affiliations
    -- OUTER keeps rows whose institution_ids is NULL or empty (institution_id = NULL).
    LATERAL VIEW OUTER EXPLODE(institution_ids) t AS exploded_inst_id
)
SELECT 
    work_id,
    author_sequence,
    institution_id,
    author_id,
    raw_author_name,
    raw_affiliation_string,
    raw_countries,
    current_timestamp() AS created_at,
    current_timestamp() AS updated_at
FROM final_rows;

### Step 4: Insert remaining affiliations from openalex_works_base

In [None]:
-- Insert affiliations derived from openalex_works_base for works that still
-- have no rows in the migration table (i.e. not in the original copy and not
-- covered by the backfill insert).
--
-- Stages (one CTE each):
--   existing_works         work_ids already present in the migration table
--   raw_exploded           one row per authorship of each not-yet-migrated work
--   exploded_affiliations  one row per raw affiliation string of each authorship
--   resolved_affiliations  raw affiliation strings mapped to institution ids
--                          through affiliation_strings_lookup
--   final_rows             one row per (work, author position, institution);
--                          rows with no resolved institution are kept with a
--                          NULL institution_id
--
-- Every inserted row has author_id = NULL and created_at/updated_at set to the
-- statement's current_timestamp().
INSERT INTO identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') (
    work_id, author_sequence, institution_id, author_id, 
    raw_author_name, raw_affiliation_string, raw_countries, 
    created_at, updated_at
)
WITH existing_works AS (
    -- Get work_ids that already exist in migration table (including backfill)
    SELECT DISTINCT work_id
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
),
raw_exploded AS (
    SELECT 
        wb.id AS work_id,
        -- POSEXPLODE positions start at 0, so author_sequence is zero-based here.
        -- NOTE(review): confirm this matches the author_sequence convention of
        -- the rows already in the table and of the backfill table.
        POSEXPLODE(wb.authorships) AS (author_sequence, authorship)
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base') wb
    WHERE wb.authorships IS NOT NULL 
      AND SIZE(wb.authorships) > 0
      -- Skip works that already exist in migration table. Filtering here,
      -- before the explode, avoids exploding authorships of skipped works.
      -- NOT EXISTS instead of NOT IN: NOT IN over a subquery is null-aware, so
      -- a single NULL work_id in the migration table would make the predicate
      -- non-TRUE for every work and this step would insert nothing.
      -- The explicit IS NOT NULL keeps NULL ids out, as NOT IN did whenever the
      -- migration table was non-empty.
      AND wb.id IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM existing_works ew
          WHERE ew.work_id = wb.id
      )
),
exploded_affiliations AS (
    SELECT 
        work_id,
        author_sequence,
        authorship.raw_author_name,
        -- EXPLODE_OUTER keeps authorships without any affiliation string
        -- (raw_affiliation_string = NULL) instead of dropping them.
        EXPLODE_OUTER(authorship.raw_affiliation_strings) AS raw_affiliation_string
    FROM raw_exploded
),
resolved_affiliations AS (
    SELECT 
        ea.work_id,
        ea.author_sequence,
        ea.raw_author_name,
        ea.raw_affiliation_string,
        -- Institution resolution priority:
        --   1. no affiliation string           -> NULL
        --   2. non-empty institution_ids_override
        --   3. non-empty institution_ids, unless it is exactly [-1]
        --   4. otherwise                       -> NULL
        -- NOTE(review): [-1] looks like a "no match" sentinel — confirm.
        CASE 
            WHEN ea.raw_affiliation_string IS NULL THEN NULL
            WHEN asl.institution_ids_override IS NOT NULL AND SIZE(asl.institution_ids_override) > 0 
                THEN asl.institution_ids_override
            WHEN asl.institution_ids IS NOT NULL AND SIZE(asl.institution_ids) > 0 
                AND NOT (SIZE(asl.institution_ids) = 1 AND asl.institution_ids[0] = -1) 
                THEN asl.institution_ids
            ELSE NULL
        END AS institution_ids,
        asl.countries AS raw_countries
    FROM exploded_affiliations ea
    -- LEFT JOIN so affiliation strings with no lookup entry are still inserted
    -- (with NULL institution_ids and NULL raw_countries).
    LEFT JOIN identifier('openalex' || :env_suffix || '.institutions.affiliation_strings_lookup') asl
        ON ea.raw_affiliation_string = asl.raw_affiliation_string
),
final_rows AS (
    -- DISTINCT collapses identical rows, e.g. the same institution id appearing
    -- more than once for one affiliation string.
    SELECT DISTINCT
        work_id,
        author_sequence,
        exploded_inst_id AS institution_id,
        CAST(NULL AS BIGINT) AS author_id,
        raw_author_name,
        raw_affiliation_string,
        raw_countries
    FROM resolved_affiliations
    -- OUTER keeps rows whose institution_ids is NULL or empty (institution_id = NULL).
    LATERAL VIEW OUTER EXPLODE(institution_ids) t AS exploded_inst_id
)
SELECT 
    work_id,
    author_sequence,
    institution_id,
    author_id,
    raw_author_name,
    raw_affiliation_string,
    raw_countries,
    current_timestamp() AS created_at,
    current_timestamp() AS updated_at
FROM final_rows;