# Create Institution Ancestors

Computes institution ancestry (lineage) with a recursive CTE that walks ROR `Parent` relationships upward from each institution.

Uses:
- `openalex.institutions.ror_relationships` - ROR parent/child relationships (from CreateRorDerivedTables)
- `openalex.institutions.institutions` - Institution master table with ROR IDs

Creates:
- `openalex.institutions.institution_ancestors` - One row per institution that has at least one ancestor, holding its ancestor IDs in `lineage_ids`

In [None]:
%sql
-- Create institution_ancestors table via recursive CTE.
-- Walks up the ROR 'Parent' hierarchy from every non-merged institution and
-- gathers each ancestor reached into a single row per institution.
--
-- Output columns:
--   institution_id - id of the institution the walk started from
--   lineage_ids    - distinct ancestor ids, sorted ascending so that rebuilds
--                    produce the same array for the same input
--
-- Institutions without a resolvable parent get no row, because the base case
-- uses INNER JOINs.
CREATE OR REPLACE TABLE openalex.institutions.institution_ancestors AS
WITH RECURSIVE ancestorpath AS (
  -- Base case: direct parent relationships
  SELECT
    i.id AS institution_id,
    REPLACE(i.ror_id, 'https://ror.org/', '') AS ror_id,
    parent_i.id AS ancestor_id,
    rr.related_ror_id AS ancestor_ror_id,
    1 AS generation,
    -- Seed the visited set with BOTH the institution and its direct parent.
    -- If the parent were left out, a cycle that leads back to it
    -- (A -> B -> C -> B) would emit B a second time before being detected.
    ARRAY(i.id, parent_i.id) AS path
  FROM openalex.institutions.institutions i
  INNER JOIN openalex.institutions.ror_relationships rr
    -- ror_relationships stores bare ROR ids; institutions stores full URLs
    ON REPLACE(i.ror_id, 'https://ror.org/', '') = rr.ror_id
  INNER JOIN openalex.institutions.institutions parent_i
    ON rr.related_ror_id = REPLACE(parent_i.ror_id, 'https://ror.org/', '')
  WHERE rr.relationship_type = 'Parent'
    AND i.merge_into_id IS NULL
    AND parent_i.merge_into_id IS NULL
    -- An institution is never its own ancestor; the recursive step already
    -- enforces this through the path check, so enforce it here as well.
    AND parent_i.id <> i.id

  UNION ALL

  -- Recursive case: grandparents and beyond
  SELECT
    p.institution_id,
    p.ror_id,
    grandparent.id AS ancestor_id,
    rel.related_ror_id AS ancestor_ror_id,
    p.generation + 1 AS generation,
    ARRAY_UNION(p.path, ARRAY(grandparent.id)) AS path
  FROM ancestorpath p
  INNER JOIN openalex.institutions.ror_relationships rel
    ON p.ancestor_ror_id = rel.ror_id
  INNER JOIN openalex.institutions.institutions grandparent
    ON rel.related_ror_id = REPLACE(grandparent.ror_id, 'https://ror.org/', '')
  WHERE rel.relationship_type = 'Parent'
    -- Hard depth cap: ancestors more than 10 generations up are not collected
    AND p.generation < 10
    -- Cycle detection: never revisit a node already on this path
    AND NOT ARRAY_CONTAINS(p.path, grandparent.id)
    AND grandparent.merge_into_id IS NULL
)
SELECT
  institution_id,
  -- COLLECT_SET removes ancestors reached through more than one parent path;
  -- ARRAY_SORT fixes the element order, which COLLECT_* leaves unspecified.
  ARRAY_SORT(COLLECT_SET(ancestor_id)) AS lineage_ids
FROM ancestorpath
GROUP BY institution_id

In [None]:
%sql
-- Verify: how many institutions received a lineage row, plus the average and
-- maximum number of entries in lineage_ids
WITH lineage_sizes AS (
  SELECT
    SIZE(lineage_ids) AS ancestor_count
  FROM openalex.institutions.institution_ancestors
)
SELECT
  COUNT(*) AS institutions_with_lineage,
  AVG(ancestor_count) AS avg_lineage_depth,
  MAX(ancestor_count) AS max_lineage_depth
FROM lineage_sizes

In [None]:
%sql
-- Verification: UBC (I141945490) is expected to return zero rows here.
-- Per the original note, its ROR "parent" relationships were changed to
-- "related", so the ancestor walk should no longer pick anything up.
SELECT
  anc.institution_id,
  anc.lineage_ids
FROM openalex.institutions.institution_ancestors AS anc
WHERE anc.institution_id = 141945490

In [None]:
%sql
-- Sample: the ten institutions with the largest lineage arrays, shown with
-- their display names for eyeballing
SELECT
  anc.institution_id,
  inst.display_name,
  SIZE(anc.lineage_ids) AS lineage_depth,
  anc.lineage_ids
FROM openalex.institutions.institution_ancestors AS anc
INNER JOIN openalex.institutions.institutions AS inst
  ON inst.id = anc.institution_id
ORDER BY lineage_depth DESC
LIMIT 10