# Create/Update Institutions from ROR

This notebook syncs institution data from ROR to `openalex.institutions.institutions`:
1. **Insert** new institutions for ROR records without an OpenAlex match
2. **Update** existing institutions where ROR data has changed

New institution IDs are minted sequentially from `MAX(id) + 1`.

In [None]:
%sql
-- Baseline snapshot taken before any writes: row count and highest id
-- currently present in the institutions table.
SELECT
  COUNT(1) AS current_institutions,
  MAX(inst.id) AS current_max_id
FROM openalex.institutions.institutions AS inst

## Insert New Institutions

Create institutions for ROR records that don't have an OpenAlex match. ROR records with status `withdrawn` are skipped.

In [None]:
%sql
-- Insert new institutions from ROR.
--
-- Every non-withdrawn ROR record that has no matching row in
-- openalex.institutions.institutions (matched on the bare ROR id, i.e. with the
-- 'https://ror.org/' prefix stripped from both sides) becomes a new institution.
-- New ids continue from the current MAX(id), assigned in ROR-id order.
--
-- NOTE(review): this INSERT has no column list, so the final SELECT must match
-- the target table's column order exactly -- confirm against the table DDL
-- before adding or reordering output columns.
-- NOTE(review): ids are derived from MAX(id) read inside this statement;
-- presumably only one run of this notebook is active at a time -- confirm.
INSERT INTO openalex.institutions.institutions

-- Preferred display name: the name entry tagged 'ror_display'.
WITH ror_display_name AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    MAX(CASE WHEN ARRAY_CONTAINS(n.types, 'ror_display') THEN n.value END) AS display_name
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(names) AS n
  GROUP BY id
),

-- Geography is taken from the first entry of the locations array only.
ror_location AS (
  SELECT
    REPLACE(r.id, 'https://ror.org/', '') AS ror_id,
    r.locations[0].geonames_details.country_code AS iso3166_code,
    r.locations[0].geonames_details.lat AS latitude,
    r.locations[0].geonames_details.lng AS longitude,
    r.locations[0].geonames_details.name AS city,
    r.locations[0].geonames_details.country_subdivision_name AS region,
    r.locations[0].geonames_details.country_name AS country,
    r.locations[0].geonames_id AS geonames_city_id
  FROM openalex.institutions.ror r
),

-- One website link and one wikipedia link per record (MAX picks a single
-- value when several links share a type).
ror_links AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    MAX(CASE WHEN l.type = 'website' THEN l.value END) AS official_page,
    MAX(CASE WHEN l.type = 'wikipedia' THEN l.value END) AS wiki_page
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(links) AS l
  GROUP BY id
),

-- Preferred GRID and Wikidata identifiers.
ror_external_ids AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    MAX(CASE WHEN e.type = 'grid' THEN e.preferred END) AS grid_id,
    MAX(CASE WHEN e.type = 'wikidata' THEN e.preferred END) AS wikidata_id
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(external_ids) AS e
  GROUP BY id
),

-- Single lower-cased type per record: any type other than 'funder' wins;
-- 'funder' is used only when it is the sole type.
ror_type AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    LOWER(COALESCE(
      MAX(CASE WHEN LOWER(t) != 'funder' THEN t END),
      MAX(t)
    )) AS type
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(types) AS t
  GROUP BY id
),

-- All name entries tagged 'acronym'.
ror_acronyms AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    COLLECT_LIST(n.value) AS display_name_acronyms
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(names) AS n
  WHERE ARRAY_CONTAINS(n.types, 'acronym')
  GROUP BY id
),

-- All name entries tagged 'alias'.
ror_aliases AS (
  SELECT
    REPLACE(id, 'https://ror.org/', '') AS ror_id,
    COLLECT_LIST(n.value) AS display_name_alternatives
  FROM openalex.institutions.ror
  LATERAL VIEW EXPLODE(names) AS n
  WHERE ARRAY_CONTAINS(n.types, 'alias')
  GROUP BY id
),

-- Highest id already in use. MAX over an empty table is NULL, which would turn
-- every minted id into NULL, so fall back to 0 and start numbering at 1.
max_id AS (
  SELECT COALESCE(MAX(id), 0) AS current_max_id
  FROM openalex.institutions.institutions
),

-- ROR records with no matching institution, numbered 1..N in ROR-id order so
-- that id assignment is deterministic for a given input.
new_ror AS (
  SELECT
    REPLACE(r.id, 'https://ror.org/', '') AS ror_id,
    ROW_NUMBER() OVER (ORDER BY r.id) AS row_num
  FROM openalex.institutions.ror r
  LEFT JOIN openalex.institutions.institutions i
    ON REPLACE(r.id, 'https://ror.org/', '') = REPLACE(i.ror_id, 'https://ror.org/', '')
  WHERE i.id IS NULL
    AND r.status != 'withdrawn'
)

SELECT
  (SELECT current_max_id FROM max_id) + nr.row_num AS id,
  CONCAT('https://ror.org/', nr.ror_id) AS ror_id,
  dn.display_name,
  rt.type,
  rl.official_page,
  rl.wiki_page,
  loc.iso3166_code,
  CAST(loc.latitude AS DOUBLE) AS latitude,
  CAST(loc.longitude AS DOUBLE) AS longitude,
  loc.city,
  loc.region,
  loc.country,
  loc.geonames_city_id,
  ei.grid_id,
  ei.wikidata_id,
  -- Records without acronyms/aliases get an empty array rather than NULL.
  COALESCE(acr.display_name_acronyms, ARRAY()) AS display_name_acronyms,
  COALESCE(al.display_name_alternatives, ARRAY()) AS display_name_alternatives,
  NULL AS image_url,
  NULL AS image_thumbnail_url,
  NULL AS merge_into_id,
  CURRENT_TIMESTAMP() AS created_date,
  CURRENT_TIMESTAMP() AS updated_date
FROM new_ror nr
LEFT JOIN ror_display_name dn ON nr.ror_id = dn.ror_id
LEFT JOIN ror_location loc ON nr.ror_id = loc.ror_id
LEFT JOIN ror_links rl ON nr.ror_id = rl.ror_id
LEFT JOIN ror_external_ids ei ON nr.ror_id = ei.ror_id
LEFT JOIN ror_type rt ON nr.ror_id = rt.ror_id
LEFT JOIN ror_acronyms acr ON nr.ror_id = acr.ror_id
LEFT JOIN ror_aliases al ON nr.ror_id = al.ror_id

In [None]:
%sql
-- Post-insert check: total rows, the new highest id, and how many rows carry
-- today's date in created_date.
SELECT
  COUNT(1) AS total_institutions,
  MAX(inst.id) AS new_max_id,
  COUNT(*) FILTER (WHERE DATE(inst.created_date) = CURRENT_DATE()) AS inserted_today
FROM openalex.institutions.institutions AS inst

## Update Existing Institutions

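Update institutions whose ROR record has an `updated_date` later than the institution's own `updated_date`. Withdrawn ROR records and institutions that have been merged into another (`merge_into_id` set) are skipped.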

In [None]:
%sql
-- Refresh existing institutions from ROR.
--
-- The source of the MERGE is one row per institution that (a) matches a
-- non-withdrawn ROR record on the bare ROR id, (b) has not been merged into
-- another institution, and (c) was last updated before the ROR record's
-- updated_date. Matched rows have all ROR-derived columns overwritten and
-- updated_date stamped with the current time.
MERGE INTO openalex.institutions.institutions AS tgt
USING (
  WITH
  -- Institutions due for a refresh, paired with the bare ROR id they match on.
  stale AS (
    SELECT
      inst.id AS institution_id,
      REPLACE(rec.id, 'https://ror.org/', '') AS ror_key
    FROM openalex.institutions.ror AS rec
    INNER JOIN openalex.institutions.institutions AS inst
      ON REPLACE(rec.id, 'https://ror.org/', '') = REPLACE(inst.ror_id, 'https://ror.org/', '')
    WHERE rec.status != 'withdrawn'
      -- Institutions with no updated_date are treated as older than any ROR record.
      AND rec.updated_date > COALESCE(inst.updated_date, '1970-01-01')
      AND inst.merge_into_id IS NULL
  ),

  -- Name entry tagged 'ror_display'.
  preferred_name AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      MAX(n.value) FILTER (WHERE ARRAY_CONTAINS(n.types, 'ror_display')) AS display_name
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(names) AS n
    GROUP BY id
  ),

  -- First entry of the locations array; its fields are unpacked in the final SELECT.
  first_location AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      locations[0] AS place
    FROM openalex.institutions.ror
  ),

  -- One website link and one wikipedia link per record.
  web_links AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      MAX(l.value) FILTER (WHERE l.type = 'website') AS official_page,
      MAX(l.value) FILTER (WHERE l.type = 'wikipedia') AS wiki_page
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(links) AS l
    GROUP BY id
  ),

  -- Preferred GRID and Wikidata identifiers.
  external_keys AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      MAX(e.preferred) FILTER (WHERE e.type = 'grid') AS grid_id,
      MAX(e.preferred) FILTER (WHERE e.type = 'wikidata') AS wikidata_id
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(external_ids) AS e
    GROUP BY id
  ),

  -- Single lower-cased type; 'funder' is used only when no other type exists.
  primary_type AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      LOWER(COALESCE(
        MAX(t) FILTER (WHERE LOWER(t) != 'funder'),
        MAX(t)
      )) AS type
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(types) AS t
    GROUP BY id
  ),

  -- All name entries tagged 'acronym'.
  acronyms AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      COLLECT_LIST(n.value) AS display_name_acronyms
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(names) AS n
    WHERE ARRAY_CONTAINS(n.types, 'acronym')
    GROUP BY id
  ),

  -- All name entries tagged 'alias'.
  aliases AS (
    SELECT
      REPLACE(id, 'https://ror.org/', '') AS ror_key,
      COLLECT_LIST(n.value) AS display_name_alternatives
    FROM openalex.institutions.ror
    LATERAL VIEW EXPLODE(names) AS n
    WHERE ARRAY_CONTAINS(n.types, 'alias')
    GROUP BY id
  )

  SELECT
    s.institution_id AS id,
    pn.display_name,
    pt.type,
    wl.official_page,
    wl.wiki_page,
    fl.place.geonames_details.country_code AS iso3166_code,
    CAST(fl.place.geonames_details.lat AS DOUBLE) AS latitude,
    CAST(fl.place.geonames_details.lng AS DOUBLE) AS longitude,
    fl.place.geonames_details.name AS city,
    fl.place.geonames_details.country_subdivision_name AS region,
    fl.place.geonames_details.country_name AS country,
    fl.place.geonames_id AS geonames_city_id,
    ek.grid_id,
    ek.wikidata_id,
    -- Records without acronyms/aliases get an empty array rather than NULL.
    COALESCE(ac.display_name_acronyms, ARRAY()) AS display_name_acronyms,
    COALESCE(ali.display_name_alternatives, ARRAY()) AS display_name_alternatives
  FROM stale AS s
  LEFT JOIN preferred_name AS pn ON s.ror_key = pn.ror_key
  LEFT JOIN first_location AS fl ON s.ror_key = fl.ror_key
  LEFT JOIN web_links AS wl ON s.ror_key = wl.ror_key
  LEFT JOIN external_keys AS ek ON s.ror_key = ek.ror_key
  LEFT JOIN primary_type AS pt ON s.ror_key = pt.ror_key
  LEFT JOIN acronyms AS ac ON s.ror_key = ac.ror_key
  LEFT JOIN aliases AS ali ON s.ror_key = ali.ror_key
) AS src
ON tgt.id = src.id
WHEN MATCHED THEN UPDATE SET
  tgt.display_name = src.display_name,
  tgt.type = src.type,
  tgt.official_page = src.official_page,
  tgt.wiki_page = src.wiki_page,
  tgt.iso3166_code = src.iso3166_code,
  tgt.latitude = src.latitude,
  tgt.longitude = src.longitude,
  tgt.city = src.city,
  tgt.region = src.region,
  tgt.country = src.country,
  tgt.geonames_city_id = src.geonames_city_id,
  tgt.grid_id = src.grid_id,
  tgt.wikidata_id = src.wikidata_id,
  tgt.display_name_acronyms = src.display_name_acronyms,
  tgt.display_name_alternatives = src.display_name_alternatives,
  tgt.updated_date = CURRENT_TIMESTAMP()

In [None]:
%sql
-- Verify updates.
-- updated_today counts rows whose updated_date falls on today but whose
-- created_date does not, i.e. rows touched by the MERGE rather than by the
-- INSERT. The created_date comparison is NULL-safe: with a plain `!=`, a row
-- whose created_date is NULL would evaluate to NULL and be left out of the
-- count even though it was updated today.
SELECT
  COUNT(*) AS total_institutions,
  COUNT(
    CASE
      WHEN DATE(updated_date) = CURRENT_DATE()
        AND (DATE(created_date) IS DISTINCT FROM CURRENT_DATE())
      THEN 1
    END
  ) AS updated_today
FROM openalex.institutions.institutions

In [None]:
%sql
-- Final summary of the table after both the insert and the update steps.
--   created_today: rows whose created_date falls on today.
--   updated_today: rows whose updated_date falls on today but whose
--                  created_date does not. The created_date comparison is
--                  NULL-safe so that updated rows with no created_date are
--                  still counted (a plain `!=` would yield NULL and drop them).
SELECT
  COUNT(*) AS total_institutions,
  MAX(id) AS max_id,
  COUNT(CASE WHEN DATE(created_date) = CURRENT_DATE() THEN 1 END) AS created_today,
  COUNT(
    CASE
      WHEN DATE(updated_date) = CURRENT_DATE()
        AND (DATE(created_date) IS DISTINCT FROM CURRENT_DATE())
      THEN 1
    END
  ) AS updated_today
FROM openalex.institutions.institutions