# Remove duplicates from `openalex.works.work_author_affiliations` and rebuild `work_authorships`. One-off migration script.

This notebook:
1. Creates a migration copy of the existing work_author_affiliations table
2. Deletes records that don't have an author_id from the migration table
3. Inserts affiliations from backfill table (fills gaps with curated data)
4. Inserts remaining affiliations from openalex_works_base (only those not already present after the backfill insert)
5. Creates a migration copy of the existing work_authorships table
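6. Rebuilds authorships for every work in openalex_works_base that has authorships, enriched with data from work_author_affiliations_migration (with institution lookup), into work_authorships_migration_updates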
7. Merges rebuilt authorships into the migration table

In [None]:
%run ../utils/variables

### Step 1: Create migration copy of existing table

In [None]:
-- Snapshot the current affiliations table into a working copy so every later
-- step operates on the copy and leaves the source table untouched.
-- Only the seven columns listed here are carried over.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') AS
SELECT
    src.work_id,
    src.author_sequence,
    src.author_id,
    src.raw_author_name,
    src.raw_affiliation_string,
    src.created_at,
    src.updated_at
FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations') AS src;

### Step 2: Delete records without author_id from migration table

In [None]:
-- Drop every affiliation row in the working copy that has no author_id.
-- Only the migration copy is modified.
DELETE FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') AS aff
WHERE aff.author_id IS NULL;

### Step 3: Insert affiliations from backfill table

In [None]:
-- Add backfill affiliations that the migration table does not contain yet.
--
-- A row is identified by (work_id, author_sequence, raw_affiliation_string),
-- with a NULL affiliation treated the same as an empty string. Both the
-- anti-join against existing rows and the de-duplication of the new rows use
-- that same key, so this step inserts at most one row per key. (A plain
-- SELECT DISTINCT over all selected columns would also compare raw_author_name
-- and keep NULL and '' apart, letting rows that differ only in those through
-- as duplicates of the same key.)
--
-- Inserted rows get a NULL author_id and the current timestamp for both
-- created_at and updated_at.
INSERT INTO identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') (
    work_id, author_sequence, author_id,
    raw_author_name, raw_affiliation_string,
    created_at, updated_at
)
WITH existing_affiliations AS (
    -- Keys already present in the migration table
    SELECT DISTINCT
        work_id,
        author_sequence,
        COALESCE(raw_affiliation_string, '') AS affiliation_key
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
),
backfill_data AS (
    -- Backfill rows whose key has no counterpart in the migration table
    SELECT
        bf.work_id,
        bf.author_sequence,
        bf.raw_author_name,
        bf.raw_affiliation_string
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_backfill') AS bf
    LEFT ANTI JOIN existing_affiliations AS ea
        ON bf.work_id = ea.work_id
        AND bf.author_sequence = ea.author_sequence
        AND COALESCE(bf.raw_affiliation_string, '') = ea.affiliation_key
),
ranked_backfill AS (
    -- Rank candidate rows within each key. The ordering is only there to make
    -- the surviving row deterministic; it prefers non-NULL values.
    SELECT
        work_id,
        author_sequence,
        raw_author_name,
        raw_affiliation_string,
        ROW_NUMBER() OVER (
            PARTITION BY work_id, author_sequence, COALESCE(raw_affiliation_string, '')
            ORDER BY raw_author_name NULLS LAST, raw_affiliation_string NULLS LAST
        ) AS key_rank
    FROM backfill_data
)
SELECT
    work_id,
    author_sequence,
    CAST(NULL AS BIGINT) AS author_id,
    raw_author_name,
    raw_affiliation_string,
    current_timestamp() AS created_at,
    current_timestamp() AS updated_at
FROM ranked_backfill
WHERE key_rank = 1;

### Step 4: Insert remaining affiliations from openalex_works_base

In [None]:
-- Add affiliations found in openalex_works_base that the migration table
-- (after the backfill insert) still does not contain.
--
-- A row is identified by (work_id, author_sequence, raw_affiliation_string),
-- with a NULL affiliation treated the same as an empty string. Both the
-- anti-join against existing rows and the de-duplication of the new rows use
-- that same key, so this step inserts at most one row per key. (A plain
-- SELECT DISTINCT over all selected columns would also compare raw_author_name
-- and keep NULL and '' apart, letting rows that differ only in those through
-- as duplicates of the same key.)
--
-- Inserted rows get a NULL author_id and the current timestamp for both
-- created_at and updated_at.
INSERT INTO identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') (
    work_id, author_sequence, author_id,
    raw_author_name, raw_affiliation_string,
    created_at, updated_at
)
WITH existing_affiliations AS (
    -- Keys already present in the migration table
    SELECT DISTINCT
        work_id,
        author_sequence,
        COALESCE(raw_affiliation_string, '') AS affiliation_key
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
),
raw_exploded AS (
    -- One row per authorship; author_sequence is the authorship's position
    -- in the work's authorships array as reported by POSEXPLODE
    SELECT
        id AS work_id,
        POSEXPLODE(authorships) AS (author_sequence, authorship)
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    WHERE authorships IS NOT NULL
      AND SIZE(authorships) > 0
),
exploded_affiliations AS (
    -- One row per raw affiliation string. EXPLODE_OUTER keeps authorships
    -- whose raw_affiliation_strings is NULL or empty as a single row with a
    -- NULL raw_affiliation_string.
    SELECT
        work_id,
        author_sequence,
        authorship.raw_author_name,
        EXPLODE_OUTER(authorship.raw_affiliation_strings) AS raw_affiliation_string
    FROM raw_exploded
),
new_affiliations AS (
    -- Only keep affiliations whose key is not in the migration table yet
    SELECT
        ea.work_id,
        ea.author_sequence,
        ea.raw_author_name,
        ea.raw_affiliation_string
    FROM exploded_affiliations AS ea
    LEFT ANTI JOIN existing_affiliations AS ex
        ON ea.work_id = ex.work_id
        AND ea.author_sequence = ex.author_sequence
        AND COALESCE(ea.raw_affiliation_string, '') = ex.affiliation_key
),
ranked_affiliations AS (
    -- Rank candidate rows within each key. The ordering is only there to make
    -- the surviving row deterministic; it prefers non-NULL values.
    SELECT
        work_id,
        author_sequence,
        raw_author_name,
        raw_affiliation_string,
        ROW_NUMBER() OVER (
            PARTITION BY work_id, author_sequence, COALESCE(raw_affiliation_string, '')
            ORDER BY raw_author_name NULLS LAST, raw_affiliation_string NULLS LAST
        ) AS key_rank
    FROM new_affiliations
)
SELECT
    work_id,
    author_sequence,
    CAST(NULL AS BIGINT) AS author_id,
    raw_author_name,
    raw_affiliation_string,
    current_timestamp() AS created_at,
    current_timestamp() AS updated_at
FROM ranked_affiliations
WHERE key_rank = 1;

### Step 5: Create migration copy of work_authorships table

In [None]:
-- Copy the current work_authorships table, all columns, into a working table
-- so the rebuilt authorships can be merged into the copy rather than the
-- source table.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_authorships_migration') AS
SELECT current_authorships.*
FROM identifier('openalex' || :env_suffix || '.works.work_authorships') AS current_authorships;

### Step 6: Build enriched authorships from migration affiliations table (with institution lookup)

In [None]:
-- Build work_authorships_migration_updates: one row per work in
-- openalex_works_base that has a non-empty authorships array, holding
-- (work_id, updated_datetime, authorships).
--
-- Each rebuilt authorship keeps the raw fields from openalex_works_base
-- (author_position, author_order_number, is_corresponding,
-- raw_affiliation_strings, raw_author_name) and adds:
--   * affiliations  - raw affiliation string -> institution ids, via affiliation_strings_lookup
--   * author        - id / display_name / orcid, preferring the author_id stored in
--                     work_author_affiliations_migration
--   * institutions  - institution details (name, country, ROR, lineage, type)
--   * countries     - derived from institutions, else from the lookup's parsed countries
--   * parsed_name   - from parsed_names_lookup
--
-- NOTE(review): several reference tables below (openalex.mid.institution_ancestors,
-- openalex.institutions.institutions, openalex.authors.openalex_authors,
-- openalex.authors.author_registry) use the hard-coded `openalex` catalog rather
-- than 'openalex' || :env_suffix - confirm that is intended.
CREATE OR REPLACE TABLE identifier('openalex' || :env_suffix || '.works.work_authorships_migration_updates')
CLUSTER BY (work_id) AS (
WITH base_works AS (
    -- Read raw authorships from openalex_works_base (ALL works that have a
    -- non-NULL, non-empty authorships array); updated_date is exposed as updated_datetime
    SELECT
        id AS work_id,
        authorships,
        updated_date AS updated_datetime
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works_base')
    WHERE authorships IS NOT NULL
      AND SIZE(authorships) > 0
),
-- Look up institution_ids from affiliation_strings_lookup for every row of the
-- migration affiliations table (matched on the exact raw_affiliation_string)
affiliation_institution_lookup AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        ai.raw_affiliation_string,
        -- Resolution order: no string -> NULL; non-empty override list wins;
        -- otherwise the regular list, unless it is empty or exactly [-1]
        -- (presumably a "no match" sentinel - TODO confirm); otherwise NULL
        CASE 
            WHEN ai.raw_affiliation_string IS NULL THEN NULL
            WHEN asl.institution_ids_override IS NOT NULL AND SIZE(asl.institution_ids_override) > 0 
                THEN asl.institution_ids_override
            WHEN asl.institution_ids IS NOT NULL AND SIZE(asl.institution_ids) > 0 
                AND NOT (SIZE(asl.institution_ids) = 1 AND asl.institution_ids[0] = -1) 
                THEN asl.institution_ids
            ELSE NULL
        END AS institution_ids,
        asl.countries AS raw_countries
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration') ai
    LEFT JOIN identifier('openalex' || :env_suffix || '.institutions.affiliation_strings_lookup') asl
        ON ai.raw_affiliation_string = asl.raw_affiliation_string
),
-- Explode institution_ids for joins: one row per (affiliation row, institution id).
-- OUTER keeps affiliation rows whose institution_ids is NULL, with a NULL institution_id.
affiliation_institutions_exploded AS (
    SELECT
        work_id,
        author_sequence,
        raw_affiliation_string,
        exploded_inst_id AS institution_id,
        raw_countries
    FROM affiliation_institution_lookup
    LATERAL VIEW OUTER EXPLODE(institution_ids) t AS exploded_inst_id
),
-- Ancestor ids per institution, skipping ancestors listed in SUPER_SYSTEM_INSTITUTIONS.
-- NOTE(review): SUPER_SYSTEM_INSTITUTIONS is not defined in this notebook;
-- presumably it comes from `%run ../utils/variables` - confirm.
institution_lineage AS (
    SELECT
        institution_id,
        COLLECT_LIST(ancestor_id) AS lineage_ids
    FROM openalex.mid.institution_ancestors
    WHERE NOT ARRAY_CONTAINS(SUPER_SYSTEM_INSTITUTIONS, ancestor_id)
    GROUP BY institution_id
),
-- 1. Get Institution Details (Grouped by Work/Seq) - using looked up institution_ids
-- NOTE(review): the WHERE clause drops rows without a resolved institution, so
-- raw_parsed_countries only contains countries from affiliation strings that
-- resolved to at least one institution, and authors with no resolved
-- institution get no row here at all. Confirm this is intended, given the
-- raw_parsed_countries fallback in the final SELECT.
author_institutions_with_details AS (
    SELECT
        ai.work_id,
        ai.author_sequence,
        ARRAY_DISTINCT(FLATTEN(COLLECT_SET(ai.raw_countries))) AS raw_parsed_countries,
        COLLECT_SET(
            STRUCT(
                inst.iso3166_code AS country_code,
                inst.display_name,
                CONCAT('https://openalex.org/I', ai.institution_id) AS id,
                -- lineage = the institution itself plus its ancestors, as sorted OpenAlex URLs
                ARRAY_SORT(
                    TRANSFORM(
                        ARRAY_COMPACT(CONCAT(ARRAY(ai.institution_id), COALESCE(il.lineage_ids, ARRAY()))), 
                        id -> CONCAT('https://openalex.org/I', id)
                    )
                ) AS lineage,
                -- Normalise ror_id to a full https://ror.org/ URL (left as-is if already one)
                CASE 
                    WHEN inst.ror_id IS NULL THEN NULL
                    WHEN inst.ror_id LIKE 'https://ror.org/%' THEN inst.ror_id
                    ELSE CONCAT('https://ror.org/', inst.ror_id) 
                END AS ror,
                inst.type
            )
        ) AS institutions
    FROM affiliation_institutions_exploded ai
    LEFT JOIN openalex.institutions.institutions inst ON inst.id = ai.institution_id
    LEFT JOIN institution_lineage il USING (institution_id)
    WHERE ai.institution_id IS NOT NULL 
    GROUP BY ai.work_id, ai.author_sequence
),
-- 2. Get Author IDs (Grouped by Work/Seq) - using migration table
-- MAX skips NULLs, so a non-NULL author_id on any row of the group is used
author_id_lookup AS (
    SELECT 
        work_id, 
        author_sequence, 
        MAX(author_id) as author_id
    FROM identifier('openalex' || :env_suffix || '.works.work_author_affiliations_migration')
    GROUP BY work_id, author_sequence
),
-- 3. Enrich Author IDs with Profile Data (OpenAlex Authors + Registry)
author_data_enriched AS (
    SELECT 
        ail.work_id,
        ail.author_sequence,
        ail.author_id,
        -- Priority 1: Existing OpenAlex Author
        -- Priority 2: Newly Minted Registry Author
        COALESCE(oa.display_name, ar.display_name) as best_display_name,
        -- ORCID is taken from openalex_authors only (the registry is not consulted)
        oa.orcid as best_orcid
    FROM author_id_lookup ail
    -- Join to Main Table (Existing Authors)
    LEFT JOIN openalex.authors.openalex_authors oa 
        ON ail.author_id = oa.id
    -- Join to Registry (New Authors)
    LEFT JOIN openalex.authors.author_registry ar 
        ON ail.author_id = ar.id
),
-- Distinct institution URLs per (work, author, raw affiliation string);
-- rows without an institution or without a raw string are skipped
affiliations_map_ids AS (
    SELECT
        work_id,
        author_sequence,
        raw_affiliation_string,
        ARRAY_DISTINCT(
            ARRAY_COMPACT(
                COLLECT_LIST(CONCAT('https://openalex.org/I', institution_id))
            )
        ) AS institution_ids
    FROM affiliation_institutions_exploded
    WHERE institution_id IS NOT NULL
      AND raw_affiliation_string IS NOT NULL
    GROUP BY work_id, author_sequence, raw_affiliation_string
),
-- Per (work, author): map of raw affiliation string -> institution URLs
affiliations_map AS (
    SELECT
        work_id,
        author_sequence,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(NAMED_STRUCT('key', raw_affiliation_string, 'value', institution_ids))
        ) AS aff_map
    FROM affiliations_map_ids
    GROUP BY work_id, author_sequence
),
-- 4. Build Final Lookup Map: per work, author_sequence -> struct of author and institution data
-- NOTE(review): the map needs one entry per author_sequence, which assumes the
-- joins to openalex_authors and author_registry above match at most one row
-- per author_id - confirm.
author_institution_lookup AS (
    SELECT
        ade.work_id,
        MAP_FROM_ENTRIES(
            COLLECT_LIST(
                STRUCT(
                    ade.author_sequence,
                    STRUCT(
                        -- Enriched Author Data
                        ade.author_id,
                        ade.best_display_name,
                        ade.best_orcid,
                        
                        -- Institution Data
                        details.institutions,
                        details.raw_parsed_countries,
                        am.aff_map
                    )
                )
            )
        ) AS author_lookup
    FROM author_data_enriched ade
    LEFT JOIN author_institutions_with_details details
        ON ade.work_id = details.work_id 
        AND ade.author_sequence = details.author_sequence
    LEFT JOIN affiliations_map am 
        ON ade.work_id = am.work_id 
        AND ade.author_sequence = am.author_sequence
    GROUP BY ade.work_id
),
-- One row per authorship; author_idx is the position reported by POSEXPLODE
exploded_for_parsed_names AS (
    SELECT
        work_id,
        updated_datetime,
        POSEXPLODE(authorships) AS (author_idx, authorship)
    FROM base_works
),
-- Attach parsed_name by matching the trimmed raw author name against parsed_names_lookup
with_parsed_names AS (
    SELECT
        e.work_id,
        e.updated_datetime,
        e.author_idx,
        e.authorship,
        pn.parsed_name
    FROM exploded_for_parsed_names e
    LEFT JOIN identifier('openalex' || :env_suffix || '.authors.parsed_names_lookup') pn
        ON TRIM(e.authorship.raw_author_name) = pn.raw_author_name
),
-- Re-assemble the authorships array per work, ordered by author_idx, with
-- parsed_name added to each authorship struct
authorships_with_parsed_names AS (
    SELECT
        work_id,
        updated_datetime,
        TRANSFORM(
            ARRAY_SORT(
                COLLECT_LIST(
                    STRUCT(
                        author_idx,
                        STRUCT(
                            authorship.affiliations AS affiliations,
                            authorship.author AS author,
                            authorship.author_position AS author_position,
                            authorship.author_order_number AS author_order_number,
                            authorship.countries AS countries,
                            authorship.institutions AS institutions,
                            authorship.is_corresponding AS is_corresponding,
                            authorship.raw_affiliation_strings AS raw_affiliation_strings,
                            authorship.raw_author_name AS raw_author_name,
                            parsed_name AS parsed_name
                        ) AS authorship
                    )
                ),
                -- COLLECT_LIST order is not relied on; sort explicitly by original position
                (left, right) -> CASE
                    WHEN left.author_idx < right.author_idx THEN -1
                    WHEN left.author_idx > right.author_idx THEN 1
                    ELSE 0
                END
            ),
            -- Drop the sort key, keeping only the authorship struct
            x -> x.authorship
        ) AS authorships
    FROM with_parsed_names
    GROUP BY work_id, updated_datetime
)
-- Final assembly: rebuild every authorship, looking up enrichment data in
-- author_lookup by the authorship's position (idx) in the array.
-- NOTE(review): this assumes author_sequence in the migration table uses the
-- same numbering as the array position supplied by TRANSFORM (Step 4 of this
-- notebook writes POSEXPLODE positions) and that the array built above has
-- exactly one entry per original position - confirm for pre-existing rows and
-- for uniqueness of raw_author_name in parsed_names_lookup.
SELECT
    ba.work_id,
    ba.updated_datetime,
    TRANSFORM(
        ba.authorships,
        (auth, idx) -> STRUCT(
            -- One entry per raw affiliation string; institution_ids is empty when
            -- the string has no entry in the author's aff_map
            TRANSFORM(
                COALESCE(auth.raw_affiliation_strings, ARRAY()),
                s -> STRUCT(
                    COALESCE(ELEMENT_AT(ELEMENT_AT(ail.author_lookup, idx).aff_map, s), ARRAY()) AS institution_ids,
                    s AS raw_affiliation_string
                )
            ) AS affiliations,
            
            STRUCT(
                -- Use the author_id from the migration table when present,
                -- otherwise keep the id from the raw work data
                CASE 
                   WHEN ELEMENT_AT(ail.author_lookup, idx).author_id IS NOT NULL 
                   THEN CONCAT('https://openalex.org/A', CAST(ELEMENT_AT(ail.author_lookup, idx).author_id AS STRING))
                   ELSE auth.author.id 
                END as id,
                -- Use Display Name from OA/Registry, fallback to raw work data
                COALESCE(
                    ELEMENT_AT(ail.author_lookup, idx).best_display_name, 
                    auth.author.display_name
                ) as display_name,
                -- Use ORCID from OA when an author_id was found (even if that ORCID
                -- is NULL); fall back to raw work data only when there is no author_id
                CASE 
                    WHEN ELEMENT_AT(ail.author_lookup, idx).author_id IS NOT NULL 
                    THEN ELEMENT_AT(ail.author_lookup, idx).best_orcid
                    ELSE auth.author.orcid
                END as orcid
            ) as author,
            
            auth.author_position,
            auth.author_order_number,
            -- Countries: sorted distinct non-empty institution country codes when
            -- any exist; otherwise the parsed countries from the lookup; otherwise empty
            CASE
                WHEN ELEMENT_AT(ail.author_lookup, idx).institutions IS NOT NULL 
                     AND SIZE(FILTER(ELEMENT_AT(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')) > 0
                    THEN ARRAY_SORT(ARRAY_DISTINCT(FILTER(ELEMENT_AT(ail.author_lookup, idx).institutions.country_code, c -> c IS NOT NULL AND c <> '')))
                WHEN ELEMENT_AT(ail.author_lookup, idx).raw_parsed_countries IS NOT NULL
                    THEN ELEMENT_AT(ail.author_lookup, idx).raw_parsed_countries
                ELSE ARRAY()
            END AS countries,
            COALESCE(ELEMENT_AT(ail.author_lookup, idx).institutions, ARRAY()) AS institutions,
            auth.is_corresponding,
            auth.raw_affiliation_strings,
            auth.raw_author_name,
            auth.parsed_name
        )
    ) AS authorships
FROM authorships_with_parsed_names ba
-- LEFT JOIN: works with no rows in the migration affiliations table keep their
-- raw author data and get empty affiliations/institutions/countries
LEFT JOIN author_institution_lookup ail ON ba.work_id = ail.work_id);

### Step 7: Merge updates into migration table

In [None]:
-- Upsert the rebuilt authorships into the work_authorships migration copy,
-- matching on work_id: rows that already exist get their authorships and
-- updated_datetime overwritten, rows that do not exist are inserted.
MERGE INTO identifier('openalex' || :env_suffix || '.works.work_authorships_migration') AS tgt
USING identifier('openalex' || :env_suffix || '.works.work_authorships_migration_updates') AS src
    ON tgt.work_id = src.work_id
WHEN MATCHED THEN
    UPDATE SET
        tgt.authorships = src.authorships,
        tgt.updated_datetime = src.updated_datetime
WHEN NOT MATCHED THEN
    INSERT (work_id, authorships, updated_datetime)
    VALUES (src.work_id, src.authorships, src.updated_datetime);