# Apply Works-Magnet Curations

One-time batch notebook to apply ~165K approved affiliation corrections from the French Ministry's Works-Magnet tool.

**Job:** #63 apply-works-magnet

## Approach

Write corrected institution IDs to `institution_ids_override` on `affiliation_strings_lookup`. This is the existing batch curation mechanism:
- Highest priority in the MV
- Safe from inference overwrites (`institution_batch_inference` filters `WHERE institution_ids_override is null`)
- No pipeline changes needed

## Steps

1. Load CSV into staging table (keep only approved rows with a non-empty raw affiliation string)
2. Deduplicate (latest github_issue_id wins per raw affiliation string, abbreviated RAS below)
3. Convert ROR IDs to OpenAlex institution IDs
4. Sanity checks
5. MERGE into affiliation_strings_lookup
6. Verify MERGE results
7. Refresh MV
8. Populate pending sync table with affected work_ids
9. Report pending sync count

In [None]:
%python
# ============================================
# Cell 1: Load CSV into staging table
# ============================================
# Use PySpark to load CSV (avoids Spark SQL analyzer bug with read_csv)

df = (spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/openalex/works/magnet/works_magnet_final.csv")
)

# Filter to approved rows with non-empty RAS
from pyspark.sql.functions import trim, col

df_filtered = (df
    .filter(col("approved") == "yes")
    .filter(trim(col("raw_affiliation_name")) != "")
    .filter(col("raw_affiliation_name").isNotNull())
    .select("github_issue_id", "raw_affiliation_name", "new_rors")
)

df_filtered.write.mode("overwrite").saveAsTable("openalex.works.magnet_staging")
print(f"Loaded {df_filtered.count()} approved rows into magnet_staging")

In [None]:
-- ============================================
-- Cell 2: Deduplicate (latest issue wins per RAS)
-- ============================================
-- Some RAS appear in multiple issues with different corrections.
-- Take the latest github_issue_id as the most recent human decision.
--
-- Output: temp view magnet_deduped with exactly one row per
-- TRIM(raw_affiliation_name). raw_affiliation_name itself is passed through
-- untrimmed.

CREATE OR REPLACE TEMP VIEW magnet_deduped AS
SELECT raw_affiliation_name, new_rors
FROM (
  SELECT
    raw_affiliation_name,
    new_rors,
    ROW_NUMBER() OVER (
      PARTITION BY TRIM(raw_affiliation_name)
      -- github_issue_id alone does not give a total order: the same RAS can
      -- occur more than once within one issue. Without tie-breakers the
      -- winner is arbitrary, and because a temp view is re-evaluated by every
      -- cell that reads it, different cells could see different corrections.
      -- The extra keys make the choice repeatable (with DESC, a non-NULL
      -- new_rors sorts ahead of a NULL one).
      ORDER BY github_issue_id DESC, new_rors DESC, raw_affiliation_name DESC
    ) AS rn
  FROM openalex.works.magnet_staging
) ranked
WHERE rn = 1

In [None]:
-- ============================================
-- Cell 3: Convert ROR IDs to institution IDs
-- ============================================
-- CSV has ROR IDs (e.g. 003vg9w96), semicolon-separated.
-- We need OpenAlex institution IDs (BIGINTs) from mid.institution.
-- If a ROR ID doesn't exist in OpenAlex, it's dropped from that correction.
--
-- Output: temp view magnet_with_institution_ids with columns
--   raw_affiliation_string  trimmed RAS
--   new_institution_ids     distinct institution IDs resolved for that RAS
--   original_rors           the unmodified new_rors value from the CSV
--
-- A RAS only appears here if at least one of its ROR IDs resolves: the join
-- below is an inner join, so a correction whose ROR IDs all fail to resolve
-- (or whose new_rors is NULL/empty) yields no row at all, not an empty array.

CREATE OR REPLACE TEMP VIEW magnet_with_institution_ids AS
WITH exploded_rors AS (
  -- One row per (RAS, ROR token). Exploding in its own step gives the token a
  -- qualified name (e.ror_id) that cannot be confused with institution.ror_id,
  -- and avoids chaining a JOIN directly after LATERAL VIEW.
  SELECT
    TRIM(m.raw_affiliation_name) AS raw_affiliation_string,
    m.new_rors AS original_rors,
    TRIM(r.ror_token) AS ror_id
  FROM magnet_deduped m
  LATERAL VIEW EXPLODE(SPLIT(m.new_rors, ';')) r AS ror_token
)
SELECT
  e.raw_affiliation_string,
  COLLECT_SET(i.affiliation_id) AS new_institution_ids,
  e.original_rors
FROM exploded_rors e
-- Explicit INNER JOIN: the previous LEFT JOIN was always followed by an
-- IS NOT NULL filter on the right side, which made it an inner join anyway.
INNER JOIN openalex.mid.institution i
  ON e.ror_id = i.ror_id
WHERE i.affiliation_id IS NOT NULL
GROUP BY e.raw_affiliation_string, e.original_rors

In [None]:
-- ============================================
-- Cell 4: Sanity checks
-- ============================================

-- How many RAS total, how many have resolved institution IDs?
--
-- Counts are taken against magnet_deduped (every approved RAS), not against
-- magnet_with_institution_ids alone. A RAS that is missing from
-- magnet_with_institution_ids has no resolved institution and is reported in
-- no_institutions; counting only the resolved view would hide those rows.
-- Both views hold at most one row per trimmed RAS, so the join cannot
-- multiply rows.
SELECT
  COUNT(*) AS total_ras,
  COUNT(CASE WHEN SIZE(r.new_institution_ids) > 0 THEN 1 END) AS has_institutions,
  COUNT(
    CASE
      WHEN r.raw_affiliation_string IS NULL OR SIZE(r.new_institution_ids) = 0 THEN 1
    END
  ) AS no_institutions
FROM magnet_deduped d
LEFT JOIN magnet_with_institution_ids r
  ON TRIM(d.raw_affiliation_name) = r.raw_affiliation_string

In [None]:
-- ============================================
-- Cell 4b: Check for unresolved ROR IDs
-- ============================================
-- Lists the 20 most frequent ROR tokens from the deduplicated corrections
-- that have no institution row with a non-NULL affiliation_id.

WITH exploded_rors AS (
  -- One row per ROR token. Exploding in its own step gives the token a
  -- qualified name (e.ror_id) that cannot be confused with institution.ror_id.
  SELECT TRIM(r.ror_token) AS ror_id
  FROM magnet_deduped d
  LATERAL VIEW EXPLODE(SPLIT(d.new_rors, ';')) r AS ror_token
)
SELECT
  e.ror_id AS missing_ror,
  COUNT(*) AS cnt
FROM exploded_rors e
LEFT JOIN openalex.mid.institution i
  ON e.ror_id = i.ror_id
-- Anti-join: keep tokens without a usable institution match. Blank tokens
-- (e.g. from a trailing ';') are not real ROR IDs, so skip them.
WHERE i.affiliation_id IS NULL
  AND e.ror_id != ''
GROUP BY e.ror_id
-- Secondary sort key keeps the top-20 cut stable when counts tie.
ORDER BY cnt DESC, missing_ror
LIMIT 20

In [None]:
-- ============================================
-- Cell 4c: Sample some corrections for spot-checking
-- ============================================
-- Shows ten resolved corrections next to their original ROR string so a
-- human can eyeball the ROR -> institution ID mapping. No ORDER BY, so the
-- ten rows are whichever ones the engine returns first.

SELECT
  resolved.raw_affiliation_string,
  resolved.new_institution_ids,
  resolved.original_rors
FROM magnet_with_institution_ids AS resolved
LIMIT 10

In [None]:
-- ============================================
-- Cell 5: MERGE into affiliation_strings_lookup
-- ============================================
-- Write corrected institution IDs to institution_ids_override.
-- MATCHED: update override + timestamp
-- NOT MATCHED: insert new row with source='magnet'
--
-- Rows are matched on the exact raw_affiliation_string; the source side is
-- already trimmed by magnet_with_institution_ids.

MERGE INTO openalex.institutions.affiliation_strings_lookup AS target
USING (
  -- Guard: never write an empty or NULL override. The source view is not
  -- expected to produce one, but this statement overwrites curated data, so
  -- it should not depend on a filter that lives in another cell.
  -- NOTE(review): presumably an empty override would blank out a string's
  -- institutions downstream -- confirm against the MV definition.
  SELECT raw_affiliation_string, new_institution_ids
  FROM magnet_with_institution_ids
  WHERE SIZE(new_institution_ids) > 0
) AS source
ON target.raw_affiliation_string = source.raw_affiliation_string
-- Existing row: only the override and the timestamp change; institution_ids,
-- countries and source are left as they are.
WHEN MATCHED THEN UPDATE SET
  target.institution_ids_override = source.new_institution_ids,
  target.updated_datetime = current_timestamp()
-- New RAS: insert with empty institution_ids/countries so the override is the
-- only institution data on the row.
WHEN NOT MATCHED THEN INSERT (
  raw_affiliation_string, institution_ids, institution_ids_override,
  countries, source, created_datetime, updated_datetime
) VALUES (
  source.raw_affiliation_string, array(), source.new_institution_ids,
  array(), 'magnet', current_timestamp(), current_timestamp()
)

In [None]:
-- ============================================
-- Cell 6: Verify MERGE results
-- ============================================
-- Counts lookup rows that carry a non-empty override and were touched within
-- the last hour. The window is time-based, not source-based, so any other
-- override written in that hour is counted as well.

SELECT
  COUNT(*) AS rows_with_override
FROM openalex.institutions.affiliation_strings_lookup AS lookup
WHERE SIZE(lookup.institution_ids_override) > 0  -- non-NULL and non-empty
  AND lookup.updated_datetime >= current_timestamp() - INTERVAL 1 HOUR

In [None]:
-- ============================================
-- Cell 7: Refresh MV
-- ============================================
-- Recompute the materialized view after the MERGE in Cell 5.
-- NOTE(review): presumably this is what makes the new
-- institution_ids_override values visible to consumers of the MV -- confirm
-- that the MV is defined over affiliation_strings_lookup.

REFRESH MATERIALIZED VIEW openalex.institutions.raw_affiliation_strings_institutions_mv

In [None]:
-- ============================================
-- Cell 8: Populate pending sync table
-- ============================================
-- Critical: without this, old works won't get their authorships
-- updated or synced to ES. UpdateWorkAuthorships uses this table
-- to find works that need reprocessing.
--
-- Queues every work that carries a curated RAS, one row per distinct
-- (work_id, RAS) pair. Works that already have any row in the pending table
-- are skipped entirely, which keeps a re-run from queueing them again.

INSERT INTO openalex.institutions.curated_work_ids_pending_sync (work_id, curated_ras, added_datetime)
SELECT DISTINCT
  affected.work_id,
  affected.raw_affiliation_string AS curated_ras,
  current_timestamp() AS added_datetime
FROM openalex.works.work_author_affiliations_mv AS affected
INNER JOIN magnet_with_institution_ids AS curated
  ON curated.raw_affiliation_string = affected.raw_affiliation_string
-- Anti-join replaces the NOT EXISTS check: keep only works not yet queued.
LEFT ANTI JOIN openalex.institutions.curated_work_ids_pending_sync AS queued
  ON queued.work_id = affected.work_id

In [None]:
-- ============================================
-- Cell 9: Report pending sync count
-- ============================================
-- pending_work_ids counts rows (a work can appear once per curated RAS);
-- unique_works counts distinct works. Both cover the whole pending table,
-- not just the rows added by this notebook.

SELECT
  COUNT(*) AS pending_work_ids,
  COUNT(DISTINCT queued.work_id) AS unique_works
FROM openalex.institutions.curated_work_ids_pending_sync AS queued