In [0]:
-- Builds the API-shaped institutions table: one row per non-merged institution
-- with work/citation totals, topics, topic_share, counts_by_year and
-- summary_stats derived from openalex.works.openalex_works.
CREATE OR REPLACE TABLE openalex.institutions.institutions_api AS
WITH
-- explode authorships, then institutions (no lateral views)
-- wa: one row per (work, authorship); EXPLODE_OUTER keeps works whose
-- authorships array is NULL/empty (a is NULL for those rows).
wa AS (
  SELECT
    w.id,
    w.publication_year,
    w.publication_date,
    w.cited_by_count,
    w.open_access,
    w.topics,
    w.updated_date,
    w.created_date,
    EXPLODE_OUTER(w.authorships) AS a
  FROM openalex.works.openalex_works w
),
-- wai: one row per (work, authorship, institution).
wai AS (
  SELECT
    id,
    publication_year,
    publication_date,
    cited_by_count,
    open_access,
    topics,
    updated_date,
    created_date,
    EXPLODE_OUTER(a.institutions) AS inst
  FROM wa
),
-- per-work, per-institution base
-- Still at (work, authorship, institution) grain: a work with several authors
-- at the same institution appears several times here.
works_base AS (
  SELECT
    -- numeric institution id parsed from the trailing ".../I<digits>" of inst.id
    CAST(REGEXP_EXTRACT(inst.id, '/I([0-9]+)$', 1) AS BIGINT) AS institution_id,
    id AS work_id,
    COALESCE(publication_year, YEAR(publication_date)) AS pub_year,
    CAST(cited_by_count AS INT) AS cited_by_count,
    CAST(open_access.is_oa AS BOOLEAN) AS is_oa,
    topics AS work_topics,
    updated_date,
    created_date
  FROM wai
  WHERE inst.id IS NOT NULL
),
-- dedupe to unique (work, institution) grain for counting works
-- Every selected column is a work-level attribute, so DISTINCT collapses the
-- per-authorship duplicates from works_base.
-- NOTE(review): this assumes work ids are unique in openalex_works - confirm.
unique_work_institution_pairs AS (
  SELECT DISTINCT
    work_id,
    institution_id,
    pub_year,
    cited_by_count,
    is_oa,
    updated_date,
    created_date
  FROM works_base
),
-- compact per-institution rollup for totals and summary_stats helpers
-- Aggregates over the deduped pairs (not works_base) so that a work with
-- multiple authors at one institution contributes its citations / OA flag /
-- h-index entry exactly once.
main_agg_pre AS (
  SELECT
    institution_id,
    CAST(COUNT(DISTINCT work_id) AS INT) AS works_count,
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count,
    CAST(SUM(CASE WHEN is_oa THEN 1 ELSE 0 END) AS INT) AS oa_works_count,
    -- mean citations of works published in the current year or the two before it
    CAST(AVG(CASE WHEN pub_year >= YEAR(current_date()) - 2 THEN cited_by_count END) AS DOUBLE) AS two_year_mean,
    CAST(COUNT_IF(cited_by_count >= 10) AS INT) AS i10_index,
    -- non-NULL citation counts, highest first; input to the h_index computation
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations,
    MAX(updated_date) AS updated_date,
    CAST(MIN(created_date) AS DATE) AS created_date
  FROM unique_work_institution_pairs
  GROUP BY institution_id
),
-- topics + topic_share (same template as authors/sources)
topics_and_share AS (
  -- awt: distinct (institution, work, topic) rows with the topic attributes
  WITH awt AS (
    SELECT DISTINCT
      institution_id,
      work_id,
      CAST(t.id AS STRING) AS topic_id,
      t.display_name, t.subfield, t.field, t.domain, t.score
    FROM (
      SELECT institution_id, work_id, EXPLODE_OUTER(work_topics) AS t
      FROM works_base
      WHERE work_id IS NOT NULL
    )
    WHERE t.id IS NOT NULL
  ),
  -- counts: distinct works per (institution, topic) plus, via the second
  -- grouping set, per topic overall (institution_id is NULL on those rows)
  counts AS (
    SELECT
      institution_id,
      topic_id,
      MAX_BY(display_name, score) AS display_name,
      MAX_BY(subfield, score) AS subfield,
      MAX_BY(field, score) AS field,
      MAX_BY(domain, score) AS domain,
      MAX(score) AS score,
      COUNT(DISTINCT work_id) AS cnt
    FROM awt
    GROUP BY GROUPING SETS ((institution_id, topic_id), (topic_id))
  ),
  -- with_totals: attach the per-topic overall count to every row of that topic
  with_totals AS (
    SELECT
      institution_id,
      topic_id,
      display_name, subfield, field, domain, score,
      cnt AS topic_count,
      MAX(CASE WHEN institution_id IS NULL THEN cnt END) OVER (PARTITION BY topic_id) AS topic_total
    FROM counts
  )
  SELECT
    institution_id,
    -- topics: ordered by count descending, then id ascending
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          topic_id AS id,
          display_name,
          CAST(topic_count AS INT) AS count,
          score,
          subfield,
          field,
          domain
        )
      ),
      (l, r) -> CASE
        WHEN l.count > r.count THEN -1
        WHEN l.count < r.count THEN 1
        WHEN l.id < r.id THEN -1
        WHEN l.id > r.id THEN 1
        ELSE 0
      END
    ) AS topics,
    -- topic_share: institution's count divided by the topic's overall count,
    -- ordered by that share descending, then id ascending
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          topic_id AS id,
          display_name,
          ROUND(CAST(topic_count AS DOUBLE) / NULLIF(topic_total, 0), 7) AS value,
          subfield,
          field,
          domain
        )
      ),
      (l, r) -> CASE
        WHEN l.value > r.value THEN -1
        WHEN l.value < r.value THEN 1
        WHEN l.id < r.id THEN -1
        WHEN l.id > r.id THEN 1
        ELSE 0
      END
    ) AS topic_share
  FROM with_totals
  WHERE institution_id IS NOT NULL
  GROUP BY institution_id
),

-- per-institution, per-year counts (authors-style; includes OA)
yearly_counts AS (
  SELECT
    institution_id,
    CAST(pub_year AS INT) AS year,
    CAST(COUNT(DISTINCT work_id) AS INT) AS works_count,
    CAST(SUM(CASE WHEN is_oa THEN 1 ELSE 0 END) AS INT) AS oa_works_count,
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count
  FROM unique_work_institution_pairs
  WHERE pub_year IS NOT NULL
  GROUP BY institution_id, CAST(pub_year AS INT)
),
-- counts_by_year array per institution, newest year first; computed once and
-- joined below instead of being evaluated as a correlated subquery per row
counts_by_year_agg AS (
  SELECT
    institution_id,
    SORT_ARRAY(
      COLLECT_LIST(
        STRUCT(
          year,
          works_count,
          oa_works_count,
          cited_by_count
        )
      ),
      false
    ) AS counts_by_year
  FROM yearly_counts
  GROUP BY institution_id
),

-- finalize summary_stats struct from helpers
main_agg AS (
  SELECT
    institution_id AS id,
    NAMED_STRUCT(
      '2yr_mean_citedness', COALESCE(two_year_mean, 0.0),
      -- h_index: pair each citation count (descending) with its 1-based rank
      -- and keep the largest rank whose citation count is >= that rank
      'h_index',
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              sorted_citations,
              SEQUENCE(1, SIZE(sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
      'i10_index', COALESCE(i10_index, 0)
    ) AS summary_stats,
    works_count,
    cited_by_count,
    oa_works_count,
    updated_date,
    created_date
  FROM main_agg_pre
)

SELECT
  CONCAT('https://openalex.org/I', i.id) AS id,
  i.ror_id AS ror,
  i.display_name,
  i.iso3166_code AS country_code,
  i.type,
  CONCAT('https://openalex.org/institution-types/', i.type) AS type_id,
  i.official_page AS homepage_url,
  i.image_url,
  i.image_thumbnail_url,
  i.display_name_acronyms,
  i.display_name_alternatives,

  -- institutions with no works get 0 rather than NULL
  COALESCE(ma.works_count, 0) AS works_count,
  COALESCE(ma.cited_by_count, 0) AS cited_by_count,

  STRUCT(
    CONCAT('https://openalex.org/I', i.id) AS openalex,
    i.ror_id AS ror,
    i.wiki_page AS wikipedia,
    i.wikidata_id AS wikidata
  ) AS ids,

  STRUCT(
    i.city,
    i.geonames_city_id,
    i.region,
    i.iso3166_code AS country_code,
    i.country,
    i.latitude,
    i.longitude
  ) AS geo,

  i.wikidata_id,
  i.wiki_page,

  /* topics & concepts */
  COALESCE(ts.topics, ARRAY()) AS topics,
  COALESCE(ts.topic_share, ARRAY()) AS topic_share,

  /* counts_by_year (authors-style; includes OA); empty array when no dated works */
  COALESCE(cby.counts_by_year, ARRAY()) AS counts_by_year,

  /* summary_stats */
  COALESCE(ma.summary_stats,
           NAMED_STRUCT('2yr_mean_citedness', CAST(0.0 AS DOUBLE),
                        'h_index', CAST(0 AS INT),
                        'i10_index', CAST(0 AS INT))) AS summary_stats,

  /* very simple lineage (self only; expand later if ancestors are available) */
  ARRAY(CONCAT('https://openalex.org/I', i.id)) AS lineage,

  CONCAT('https://api.openalex.org/works?filter=institutions.id:I', CAST(i.id AS STRING)) AS works_api_url,

  -- latest updated_date among the institution's works; NULL when it has none
  ma.updated_date AS updated_date,
  TO_TIMESTAMP(i.created_date) AS created_date

FROM openalex.institutions.institutions i
LEFT JOIN main_agg ma ON i.id = ma.id
LEFT JOIN topics_and_share ts ON i.id = ts.institution_id
LEFT JOIN counts_by_year_agg cby ON i.id = cby.institution_id
-- skip institutions that have been merged into another record
WHERE i.merge_into_id IS NULL;


In [0]:
CREATE OR REPLACE FUNCTION openalex.common.http_get(url STRING)
  RETURNS STRING
  LANGUAGE PYTHON
  AS $$
# Best-effort HTTP GET.
#   url: address to fetch; a NULL url yields NULL.
#   Returns the response body as text, or NULL when the response's `ok` flag
#   is false or when any exception is raised while fetching.
import requests

# NULL in, NULL out - nothing to fetch.
if url is None:
    return None

try:
    response = requests.get(
        url,
        headers={
            "User-Agent": "OpenAlex-Databricks/1.0 (+https://openalex.org)",
            "Accept": "*/*",
        },
        timeout=(5, 30),  # seconds: (connect, read)
        allow_redirects=True,
    )
    return response.text if response.ok else None
except Exception:
    # Deliberately swallow every failure so one bad URL does not fail the query.
    return None
$$;

In [0]:
-- Spot check: Wikidata id and stored JSON for a single institution.
SELECT
  wikidata_id,
  wikidata_json
FROM openalex.mid.institution
WHERE affiliation_id = 1294671590

In [0]:
-- Number of institution rows that have both an affiliation_id and a wikidata_id.
SELECT COUNT(*)
FROM openalex.mid.institution
WHERE affiliation_id IS NOT NULL
  AND wikidata_id IS NOT NULL;

In [0]:
-- Fetches the Wikidata EntityData JSON for one hash shard of institutions and
-- stores it, keyed by affiliation_id, in openalex.institutions.wikidata.
CREATE OR REPLACE TABLE openalex.institutions.wikidata AS
SELECT
  affiliation_id,
  display_name,
  ror_id,
  -- Strip the wiki page prefix from wikidata_id and request
  -- Special:EntityData/<remainder>.json; http_get returns NULL on failure.
  openalex.common.http_get(
    CONCAT_WS('', 'https://www.wikidata.org/wiki/Special:EntityData/', replace(wikidata_id, 'https://www.wikidata.org/wiki/', ''), '.json')
  ) AS wikidata_json
FROM
  openalex.mid.institution
WHERE
  wikidata_id IS NOT NULL
  AND affiliation_id IS NOT NULL
  -- Shard 10 of 23. xxhash64 returns a signed BIGINT and `%` keeps the sign of
  -- the dividend, so with `%` rows whose hash is negative would never fall in
  -- any shard 0..22; PMOD always yields a value in 0..22.
  AND PMOD(xxhash64(affiliation_id), 23) = 10;

-- Copy freshly fetched Wikidata JSON into openalex.mid.institution.
-- Only fills rows whose wikidata_json is still NULL, and only when the fetch
-- actually produced a value, so failed fetches (NULL source) do not trigger
-- no-op NULL -> NULL updates.
MERGE INTO openalex.mid.institution AS target
USING openalex.institutions.wikidata AS source
ON target.affiliation_id = source.affiliation_id
WHEN MATCHED
  AND target.wikidata_json IS NULL
  AND source.wikidata_json IS NOT NULL
THEN
  UPDATE SET target.wikidata_json = source.wikidata_json;

In [0]:
-- Spot check: full row for a single institution (ad-hoc inspection only).
SELECT *
FROM openalex.mid.institution
WHERE affiliation_id = 1294671590

In [0]:
%sql
-- PREPARE institutions_api table with selective year filtering:
-- totals cover all publication years, counts_by_year only covers 2014 onwards.
CREATE OR REPLACE TABLE openalex.institutions.institutions_api AS
-- One row per (work, authorship, institution id URL)
WITH works_institution_ids AS (
  SELECT w.id,
    w.publication_year,
    w.cited_by_count,
    explode(a.institutions.id) as institution_id_url
  FROM openalex.works.openalex_works w
  LATERAL VIEW explode(w.authorships) as a
),
-- Dedupe to one row per (work, institution) so a work with several authors at
-- the same institution is counted once
unique_work_institution_pairs AS (
  SELECT DISTINCT 
    id,
    CAST(replace(institution_id_url, 'https://openalex.org/I', '') AS BIGINT) as institution_id,
    publication_year,
    cited_by_count
  FROM works_institution_ids
),
-- Per-institution, per-year counts, restricted to publication_year >= 2014
institution_yearly_counts AS (
  SELECT 
    institution_id, 
    publication_year,
    COUNT(id) as works_count,
    SUM(cited_by_count) as cited_by_count
  FROM unique_work_institution_pairs
  WHERE publication_year >= 2014
  GROUP BY institution_id, publication_year
),
-- Calculate total counts (ALL years)
institution_total_counts AS (
  SELECT 
    institution_id,
    COUNT(id) as total_works_count,
    SUM(cited_by_count) as total_cited_by_count
  FROM unique_work_institution_pairs
  GROUP BY institution_id
),
-- Combine yearly and total counts
work_counts_by_institution_id AS (
  SELECT 
    t.institution_id as id,
    t.total_works_count as works_count,
    t.total_cited_by_count as cited_by_count,
    SORT_ARRAY(
      COLLECT_LIST(
        -- The LEFT JOIN yields an all-NULL y row for institutions with no
        -- works since 2014. A NAMED_STRUCT of NULLs is itself non-NULL and
        -- would be collected as a phantom entry, so emit a NULL struct
        -- instead; COLLECT_LIST skips NULL inputs.
        CASE
          WHEN y.institution_id IS NOT NULL THEN
            NAMED_STRUCT(
              'year', y.publication_year,
              'works_count', y.works_count,
              'cited_by_count', y.cited_by_count
            )
        END
      ),
      false
    ) AS counts_by_year
  FROM institution_total_counts t
  LEFT JOIN institution_yearly_counts y ON t.institution_id = y.institution_id
  GROUP BY t.institution_id, t.total_works_count, t.total_cited_by_count
)
SELECT 
  CONCAT('https://openalex.org/I', i.id) as id,
  i.ror_id as ror,
  i.display_name,
  i.iso3166_code as country_code,
  i.type,
  CONCAT('https://openalex.org/institution-types/', i.type) as type_id,
  i.official_page as homepage_url,
  i.image_url,
  i.image_thumbnail_url,
  i.display_name_acronyms,
  i.display_name_alternatives,
  -- institutions with no works get 0 / empty array rather than NULL
  COALESCE(c.works_count, 0) as works_count,
  COALESCE(c.cited_by_count, 0) as cited_by_count,
  struct(
    CONCAT('https://openalex.org/I', i.id) as openalex,
    i.ror_id as ror,
    i.wiki_page as wikipedia,
    i.wikidata_id as wikidata
  ) as ids,
  struct(
    i.city,
    i.geonames_city_id,
    i.region,
    i.iso3166_code as country_code,
    i.country,
    i.latitude,
    i.longitude
  ) as geo,
  i.wikidata_id,
  i.wiki_page,
  COALESCE(c.counts_by_year, ARRAY()) as counts_by_year,
  CONCAT('https://api.openalex.org/works?data-version=2&filter=institutions.id:I', i.id) as works_api_url,
  i.updated_date,
  to_timestamp(i.created_date) as created_date
FROM openalex.institutions.institutions i
LEFT JOIN work_counts_by_institution_id c ON i.id = c.id
-- skip institutions that have been merged into another record
WHERE i.merge_into_id IS NULL;