# Create/Update Institutions from ROR

This notebook extracts institution data from the ROR registry and prepares staging tables for:
1. **New institutions** - ROR records without matching OpenAlex institutions
2. **Updated institutions** - Existing institutions whose ROR record was last modified after the institution's own `updated_date`

## Output Tables
- `openalex.institutions.institutions_from_ror_staging` - All ROR records whose status is not `withdrawn`, with extracted fields
- `openalex.institutions.institutions_from_ror_new` - New institutions to be created (with minted IDs)
- `openalex.institutions.institutions_from_ror_updates` - Updates for existing institutions

## Usage
After running this notebook, review the staging tables before applying changes to production.

In [None]:
%sql
-- Flatten the nested ROR records into the staging table
CREATE OR REPLACE TABLE openalex.institutions.institutions_from_ror_staging AS

-- Shared projection: strip the URL prefix from the ROR id once and keep
-- only the fields the later steps read
WITH ror_keyed AS (
  SELECT
    id,
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    names,
    locations[0] AS first_location,
    links,
    external_ids,
    types,
    status,
    admin
  FROM openalex.institutions.ror
),

-- Name entry tagged 'ror_display' (MAX picks one if several are tagged)
preferred_name AS (
  SELECT
    ror_id,
    MAX(name_entry.value) FILTER (
      WHERE ARRAY_CONTAINS(name_entry.types, 'ror_display')
    ) AS display_name
  FROM ror_keyed
  LATERAL VIEW EXPLODE(names) AS name_entry
  GROUP BY id, ror_id
),

-- Geography taken from the first entry of the locations array
primary_location AS (
  SELECT
    ror_id,
    first_location.geonames_details.country_code AS iso3166_code,
    first_location.geonames_details.lat AS latitude,
    first_location.geonames_details.lng AS longitude,
    first_location.geonames_details.name AS city,
    first_location.geonames_details.country_subdivision_name AS region,
    first_location.geonames_details.country_name AS country,
    first_location.geonames_id AS geonames_city_id
  FROM ror_keyed
),

-- Website and Wikipedia URLs pivoted out of the links array
web_links AS (
  SELECT
    ror_id,
    MAX(link_entry.value) FILTER (WHERE link_entry.type = 'website') AS official_page,
    MAX(link_entry.value) FILTER (WHERE link_entry.type = 'wikipedia') AS wiki_page
  FROM ror_keyed
  LATERAL VIEW EXPLODE(links) AS link_entry
  GROUP BY id, ror_id
),

-- Preferred GRID and Wikidata identifiers pivoted out of external_ids
external_identifiers AS (
  SELECT
    ror_id,
    MAX(ext_entry.preferred) FILTER (WHERE ext_entry.type = 'grid') AS grid_id,
    MAX(ext_entry.preferred) FILTER (WHERE ext_entry.type = 'wikidata') AS wikidata_id
  FROM ror_keyed
  LATERAL VIEW EXPLODE(external_ids) AS ext_entry
  GROUP BY id, ror_id
),

-- Single lower-cased type per record: the greatest non-'funder' type,
-- falling back to the greatest type when 'funder' is the only one
resolved_type AS (
  SELECT
    ror_id,
    LOWER(COALESCE(
      MAX(type_name) FILTER (WHERE LOWER(type_name) != 'funder'),
      MAX(type_name)
    )) AS type
  FROM ror_keyed
  LATERAL VIEW EXPLODE(types) AS type_name
  GROUP BY id, ror_id
),

-- All name values tagged 'acronym'
acronym_names AS (
  SELECT
    ror_id,
    COLLECT_LIST(name_entry.value) AS display_name_acronyms
  FROM ror_keyed
  LATERAL VIEW EXPLODE(names) AS name_entry
  WHERE ARRAY_CONTAINS(name_entry.types, 'acronym')
  GROUP BY id, ror_id
),

-- All name values tagged 'alias'
alias_names AS (
  SELECT
    ror_id,
    COLLECT_LIST(name_entry.value) AS display_name_alternatives
  FROM ror_keyed
  LATERAL VIEW EXPLODE(names) AS name_entry
  WHERE ARRAY_CONTAINS(name_entry.types, 'alias')
  GROUP BY id, ror_id
)

-- Assemble the flattened row; LEFT JOINs keep records that lack a given
-- attribute, and missing name lists become empty arrays
SELECT
  base.ror_id,
  pn.display_name,
  ty.type,
  wl.official_page,
  wl.wiki_page,
  pl.iso3166_code,
  pl.latitude,
  pl.longitude,
  pl.city,
  pl.region,
  pl.country,
  pl.geonames_city_id,
  xi.grid_id,
  xi.wikidata_id,
  COALESCE(ac.display_name_acronyms, ARRAY()) AS display_name_acronyms,
  COALESCE(an.display_name_alternatives, ARRAY()) AS display_name_alternatives,
  base.admin.last_modified.date AS ror_updated_date
FROM ror_keyed AS base
LEFT JOIN preferred_name AS pn ON base.ror_id = pn.ror_id
LEFT JOIN primary_location AS pl ON base.ror_id = pl.ror_id
LEFT JOIN web_links AS wl ON base.ror_id = wl.ror_id
LEFT JOIN external_identifiers AS xi ON base.ror_id = xi.ror_id
LEFT JOIN resolved_type AS ty ON base.ror_id = ty.ror_id
LEFT JOIN acronym_names AS ac ON base.ror_id = ac.ror_id
LEFT JOIN alias_names AS an ON base.ror_id = an.ror_id
WHERE base.status <> 'withdrawn'

In [None]:
%sql
-- Sanity check: how many rows the staging step wrote
SELECT
  COUNT(1) AS total_active_ror_records
FROM openalex.institutions.institutions_from_ror_staging AS staged

In [None]:
%sql
-- Preview a handful of staged rows (every staging column, in table order)
SELECT
  ror_id,
  display_name,
  type,
  official_page,
  wiki_page,
  iso3166_code,
  latitude,
  longitude,
  city,
  region,
  country,
  geonames_city_id,
  grid_id,
  wikidata_id,
  display_name_acronyms,
  display_name_alternatives,
  ror_updated_date
FROM openalex.institutions.institutions_from_ror_staging
LIMIT 5

## Create New Institutions Table

For ROR records that don't have a matching OpenAlex institution yet.

In [None]:
%sql
-- Create table with new institutions (ROR records without OpenAlex match).
-- Each new row gets an id minted as (current production max id + row number),
-- with row numbers assigned in ror_id order.
CREATE OR REPLACE TABLE openalex.institutions.institutions_from_ror_new AS
WITH max_id AS (
  -- COALESCE: MAX over an empty production table is NULL, which would make
  -- every minted id NULL; start from 0 in that case
  SELECT COALESCE(MAX(id), 0) AS current_max_id
  FROM openalex.institutions.institutions
),
new_ror AS (
  -- Anti-join: keep staged records that match no production institution
  -- once the URL prefix is stripped from the production ror_id
  SELECT
    s.*,
    ROW_NUMBER() OVER (ORDER BY s.ror_id) AS row_num
  FROM openalex.institutions.institutions_from_ror_staging AS s
  LEFT JOIN openalex.institutions.institutions AS i
    ON s.ror_id = REPLACE(i.ror_id, 'https://ror.org/', '')
  WHERE i.id IS NULL
)
SELECT
  (SELECT current_max_id FROM max_id) + row_num AS id,
  -- Restore the full ROR URL form for the ror_id column
  CONCAT('https://ror.org/', ror_id) AS ror_id,
  display_name,
  type,
  official_page,
  wiki_page,
  iso3166_code,
  CAST(latitude AS DOUBLE) AS latitude,
  CAST(longitude AS DOUBLE) AS longitude,
  city,
  region,
  country,
  geonames_city_id,
  grid_id,
  wikidata_id,
  display_name_acronyms,
  display_name_alternatives,
  -- Typed NULLs so the table is not created with untyped (VOID) columns
  CAST(NULL AS STRING) AS image_url,
  CAST(NULL AS STRING) AS image_thumbnail_url,
  -- assumes merge_into_id is BIGINT like a numeric id — TODO confirm against production schema
  CAST(NULL AS BIGINT) AS merge_into_id,
  CURRENT_TIMESTAMP() AS created_date,
  CURRENT_TIMESTAMP() AS updated_date
FROM new_ror

In [None]:
%sql
-- Size and id range of the batch of institutions that would be created
SELECT
  COUNT(1) AS new_institutions_count,
  MIN(pending.id) AS first_new_id,
  MAX(pending.id) AS last_new_id
FROM openalex.institutions.institutions_from_ror_new AS pending

In [None]:
%sql
-- Preview a few of the institutions that would be created
SELECT
  pending.id,
  pending.ror_id,
  pending.display_name,
  pending.type,
  pending.country,
  pending.city
FROM openalex.institutions.institutions_from_ror_new AS pending
LIMIT 10

## Create Updates Table

For existing, non-merged institutions whose ROR record was last modified after the institution's `updated_date`.

In [None]:
%sql
-- Stage proposed updates: pair each existing institution with its staged ROR
-- record and keep the pairs where the ROR record is newer
CREATE OR REPLACE TABLE openalex.institutions.institutions_from_ror_updates AS
SELECT
  inst.id,
  inst.ror_id,
  -- Values currently stored in production, kept for side-by-side review
  inst.display_name AS current_display_name,
  inst.type AS current_type,
  inst.official_page AS current_official_page,
  inst.wiki_page AS current_wiki_page,
  inst.iso3166_code AS current_iso3166_code,
  inst.city AS current_city,
  inst.region AS current_region,
  inst.country AS current_country,
  -- Replacement values taken from the staged ROR record
  stg.display_name AS new_display_name,
  stg.type AS new_type,
  stg.official_page AS new_official_page,
  stg.wiki_page AS new_wiki_page,
  stg.iso3166_code AS new_iso3166_code,
  CAST(stg.latitude AS DOUBLE) AS new_latitude,
  CAST(stg.longitude AS DOUBLE) AS new_longitude,
  stg.city AS new_city,
  stg.region AS new_region,
  stg.country AS new_country,
  stg.geonames_city_id AS new_geonames_city_id,
  stg.grid_id AS new_grid_id,
  stg.wikidata_id AS new_wikidata_id,
  stg.display_name_acronyms AS new_display_name_acronyms,
  stg.display_name_alternatives AS new_display_name_alternatives,
  -- Audit timestamps: existing ones plus the value updated_date would take
  inst.created_date,
  inst.updated_date AS current_updated_date,
  stg.ror_updated_date,
  CURRENT_TIMESTAMP() AS new_updated_date
FROM openalex.institutions.institutions AS inst
INNER JOIN openalex.institutions.institutions_from_ror_staging AS stg
  ON REPLACE(inst.ror_id, 'https://ror.org/', '') = stg.ror_id
-- Skip institutions that were merged into another one; an institution with
-- no updated_date is treated as last updated on 1970-01-01
WHERE inst.merge_into_id IS NULL
  AND COALESCE(inst.updated_date, '1970-01-01') < stg.ror_updated_date

In [None]:
%sql
-- Number of existing institutions with a staged update
SELECT
  COUNT(1) AS institutions_to_update
FROM openalex.institutions.institutions_from_ror_updates AS pending

In [None]:
%sql
-- Sample updates - show what changed.
-- Uses the null-safe equality operator (<=>) so a change from or to NULL is
-- reported as a change; a plain != yields NULL there and would fall through
-- to 'unchanged'. NULL sides are rendered as the text 'NULL' because CONCAT
-- returns NULL when any argument is NULL.
SELECT
  id,
  ror_id,
  CASE
    WHEN current_display_name <=> new_display_name THEN 'unchanged'
    ELSE CONCAT(COALESCE(current_display_name, 'NULL'), ' -> ', COALESCE(new_display_name, 'NULL'))
  END AS display_name_change,
  CASE
    WHEN current_type <=> new_type THEN 'unchanged'
    ELSE CONCAT(COALESCE(current_type, 'NULL'), ' -> ', COALESCE(new_type, 'NULL'))
  END AS type_change,
  CASE
    WHEN current_city <=> new_city THEN 'unchanged'
    ELSE CONCAT(COALESCE(current_city, 'NULL'), ' -> ', COALESCE(new_city, 'NULL'))
  END AS city_change,
  ror_updated_date,
  current_updated_date
FROM openalex.institutions.institutions_from_ror_updates
LIMIT 10

## Summary Report

In [None]:
%sql
-- One-row overview of production and the three staging tables.
-- Each CTE is an aggregate without GROUP BY, so it yields exactly one row and
-- the cross joins produce a single summary row.
WITH production AS (
  SELECT
    COUNT(*) AS current_institutions,
    MAX(id) AS current_max_id
  FROM openalex.institutions.institutions
),
staged AS (
  SELECT COUNT(*) AS active_ror_records
  FROM openalex.institutions.institutions_from_ror_staging
),
pending_new AS (
  SELECT COUNT(*) AS new_institutions_to_create
  FROM openalex.institutions.institutions_from_ror_new
),
pending_updates AS (
  SELECT COUNT(*) AS institutions_to_update
  FROM openalex.institutions.institutions_from_ror_updates
)
SELECT
  production.current_institutions,
  production.current_max_id,
  staged.active_ror_records,
  pending_new.new_institutions_to_create,
  pending_updates.institutions_to_update
FROM production
CROSS JOIN staged
CROSS JOIN pending_new
CROSS JOIN pending_updates

## Verification Queries

Run these to validate the data before applying to production.

In [None]:
%sql
-- Minted ids that occur on more than one new row; expect no rows back
SELECT
  pending.id,
  COUNT(*) AS cnt
FROM openalex.institutions.institutions_from_ror_new AS pending
GROUP BY pending.id
HAVING COUNT(*) >= 2

In [None]:
%sql
-- Minted ids that already exist in production; expect no rows back
SELECT
  candidate.id,
  candidate.ror_id,
  candidate.display_name
FROM openalex.institutions.institutions AS existing
INNER JOIN openalex.institutions.institutions_from_ror_new AS candidate
  ON existing.id = candidate.id

In [None]:
%sql
-- New institutions whose display name is NULL or the empty string; expect 0
SELECT
  COUNT(1) AS missing_display_name
FROM openalex.institutions.institutions_from_ror_new AS pending
WHERE COALESCE(pending.display_name, '') = ''

In [None]:
%sql
-- How the new institutions break down by type, most common first
SELECT
  pending.type,
  COUNT(1) AS cnt
FROM openalex.institutions.institutions_from_ror_new AS pending
GROUP BY pending.type
ORDER BY COUNT(1) DESC

In [None]:
%sql
-- Distribution of countries for new institutions (top 20 by count).
-- country is a secondary sort key so that countries tied on count at the
-- cutoff are chosen deterministically instead of arbitrarily.
SELECT
  country,
  COUNT(*) AS cnt
FROM openalex.institutions.institutions_from_ror_new
GROUP BY country
ORDER BY cnt DESC, country
LIMIT 20

## Apply to Production (DO NOT RUN YET)

Once the staging tables have been reviewed and approved, uncomment and run these cells.

In [None]:
-- %sql
-- -- INSERT new institutions into production table
-- -- Copies every row of institutions_from_ror_new into production.
-- -- NOTE(review): the INSERT has no target column list, so the SELECT columns
-- -- are presumably matched to the production table by position — confirm the
-- -- production column order and count match this SELECT before running.
-- INSERT INTO openalex.institutions.institutions
-- SELECT
--   id,
--   ror_id,
--   display_name,
--   type,
--   official_page,
--   wiki_page,
--   iso3166_code,
--   latitude,
--   longitude,
--   city,
--   region,
--   country,
--   geonames_city_id,
--   grid_id,
--   wikidata_id,
--   display_name_acronyms,
--   display_name_alternatives,
--   image_url,
--   image_thumbnail_url,
--   merge_into_id,
--   created_date,
--   updated_date
-- FROM openalex.institutions.institutions_from_ror_new

In [None]:
-- %sql
-- -- MERGE updates into production table
-- -- Overwrites the ROR-derived columns of each production row whose id appears
-- -- in institutions_from_ror_updates. Only WHEN MATCHED is specified, so source
-- -- rows without a matching target id are ignored and no rows are inserted.
-- -- Columns not named in the SET list (e.g. ror_id, created_date) are left as
-- -- they are.
-- -- NOTE(review): MERGE presumably fails if two source rows share the same id —
-- -- check institutions_from_ror_updates for duplicate ids before running.
-- MERGE INTO openalex.institutions.institutions AS target
-- USING openalex.institutions.institutions_from_ror_updates AS source
-- ON target.id = source.id
-- WHEN MATCHED THEN UPDATE SET
--   target.display_name = source.new_display_name,
--   target.type = source.new_type,
--   target.official_page = source.new_official_page,
--   target.wiki_page = source.new_wiki_page,
--   target.iso3166_code = source.new_iso3166_code,
--   target.latitude = source.new_latitude,
--   target.longitude = source.new_longitude,
--   target.city = source.new_city,
--   target.region = source.new_region,
--   target.country = source.new_country,
--   target.geonames_city_id = source.new_geonames_city_id,
--   target.grid_id = source.new_grid_id,
--   target.wikidata_id = source.new_wikidata_id,
--   target.display_name_acronyms = source.new_display_name_acronyms,
--   target.display_name_alternatives = source.new_display_name_alternatives,
--   target.updated_date = source.new_updated_date