In [0]:
%sql
-- Peek at the raw data: explode each work's authorships array so that every
-- authorship becomes its own row, and show that authorship's array of raw
-- affiliation strings. There is no ORDER BY, so the 20 rows are arbitrary.
SELECT
    oa_work.id AS work_id,
    authorship.raw_affiliation_strings
FROM openalex.works.OpenAlex_works AS oa_work
LATERAL VIEW EXPLODE(oa_work.authorships) authorship_view AS authorship
LIMIT 20


In [0]:
%sql
-- Preview of the per-string work counts: flatten works down to one row per
-- raw affiliation string, then count the distinct works each string occurs in.
-- There is no ORDER BY, so the 100 groups returned are arbitrary.
WITH work_affiliation_strings AS (
    -- One row per (work, authorship, raw affiliation string).
    SELECT
        oa_work.id AS work_id,
        raw_aff_string
    FROM openalex.works.OpenAlex_works AS oa_work
    LATERAL VIEW EXPLODE(oa_work.authorships) authorship_view AS authorship
    LATERAL VIEW EXPLODE(authorship.raw_affiliation_strings) aff_view AS raw_aff_string
)
SELECT
    raw_aff_string,
    -- DISTINCT: a work is counted once per string even when several of its
    -- authorships carry the same string.
    COUNT(DISTINCT work_id) AS works_count
FROM work_affiliation_strings
GROUP BY raw_aff_string
LIMIT 100

In [0]:
%sql
-- Materialize, for every raw affiliation string found in
-- works.authorships[].raw_affiliation_strings, the number of distinct works
-- that string appears in.
--
-- CREATE OR REPLACE keeps this cell re-runnable: the table is fully derived
-- from openalex.works.OpenAlex_works, so a re-run rebuilds it in place
-- instead of failing because the table already exists.
CREATE OR REPLACE TABLE openalex.institutions.affiliation_string_works_counts AS
SELECT
    raw_aff_string,
    -- DISTINCT: a work is counted once per string even when several of its
    -- authorships carry the same string.
    COUNT(DISTINCT w.id) AS works_count
FROM openalex.works.OpenAlex_works w
-- First explode: one row per authorship of each work.
LATERAL VIEW EXPLODE(authorships) AS authorship
-- Second explode: one row per raw affiliation string of each authorship.
LATERAL VIEW EXPLODE(authorship.raw_affiliation_strings) AS raw_aff_string
GROUP BY raw_aff_string

In [0]:
%sql
-- Spot-check the materialized counts table (arbitrary 20 rows; no ORDER BY).
SELECT
    string_counts.raw_aff_string,
    string_counts.works_count
FROM openalex.institutions.affiliation_string_works_counts AS string_counts
LIMIT 20

In [0]:
%sql
-- Observation: more than 100M rows in affiliation_strings_lookup do not seem
-- to appear in any work.authorship.raw_affiliation_strings, which is super odd.
-- Guess: the strings were normalized at some point, but the non-normalized
-- versions were left behind in the lookup table. The orphans are mostly
-- jankier versions (ending with a period, using "&" instead of "and") of
-- strings that do exist in works.

-- Sample of lookup rows whose string has no entry in the works-count table
-- (anti-join; arbitrary 1000 rows because there is no ORDER BY).
SELECT
    lookup.raw_affiliation_string,
    lookup.institution_ids,
    lookup.institution_ids_override
FROM openalex.institutions.affiliation_strings_lookup AS lookup
WHERE NOT EXISTS (
    SELECT 1
    FROM openalex.institutions.affiliation_string_works_counts AS string_counts
    WHERE string_counts.raw_aff_string = lookup.raw_affiliation_string
)
LIMIT 1000



In [0]:
%sql
-- Create a version of affiliation_strings_lookup that includes works counts.
-- Joins the lookup table to the precomputed counts in
-- openalex.institutions.affiliation_string_works_counts, which were derived
-- from openalex.works.OpenAlex_works by exploding
-- authorships.raw_affiliation_strings.
-- Only keeps affiliation strings that appear in at least one work (INNER JOIN);
-- lookup rows with no matching count row are dropped.
--
-- CREATE OR REPLACE keeps this cell re-runnable: the table is fully derived
-- from the two inputs, so a re-run rebuilds it in place instead of failing
-- because the table already exists.
-- NOTE(review): a rebuild resets the table's schema to this SELECT; any later
-- ALTER TABLE changes (e.g. column defaults) must be re-applied afterwards.

CREATE OR REPLACE TABLE openalex.institutions.affiliation_strings_lookup_with_counts
-- Enables the Delta column-defaults table feature on the new table.
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported')
AS
SELECT
    l.raw_affiliation_string,
    l.institution_ids,
    l.institution_ids_override,
    l.countries,
    l.source,
    l.created_datetime,
    l.updated_datetime,
    c.works_count
FROM openalex.institutions.affiliation_strings_lookup AS l
INNER JOIN openalex.institutions.affiliation_string_works_counts AS c
    ON l.raw_affiliation_string = c.raw_aff_string