In [0]:
-- Ad-hoc look at the funders API table (note: it has a works_count column).
SELECT *
FROM openalex.funders.funders_api


In [0]:
-- Ad-hoc lookup: over every entity_link row where P4310320990 appears in
-- either id column, return the de-duplicated union of all id_1 and id_2 values.
SELECT array_union(collect_set(id_1), collect_set(id_2))
FROM openalex.mid.entity_link
WHERE 'P4310320990' IN (id_1, id_2)

In [0]:
%sql
-- Rebuilds openalex.publishers.publishers_api: one row per non-merged publisher
-- from openalex.publishers.publishers, enriched with work/citation totals,
-- per-year counts (2014 onwards) and summary statistics derived from
-- openalex.works.openalex_works.
CREATE OR REPLACE TABLE openalex.publishers.publishers_api AS
WITH
-- One row per (work, location) whose source is hosted by a publisher, i.e.
-- whose host_organization URL starts with 'https://openalex.org/P'.
-- Works with a NULL (or negative) cited_by_count are excluded here and
-- therefore do not contribute to any count below.
works_publisher_ids AS (
  SELECT w.id,
         w.publication_year,
         w.cited_by_count,
         l.source.host_organization AS publisher_id_url
  FROM openalex.works.openalex_works w
  LATERAL VIEW explode(w.locations) AS l
  WHERE w.cited_by_count >= 0
    AND l.source.host_organization IS NOT NULL
    AND l.source.host_organization LIKE 'https://openalex.org/P%'
),
-- De-duplicate so a work with several locations at the same publisher is
-- counted once for that publisher; the URL prefix is stripped to obtain the
-- numeric publisher id.
unique_work_publisher_pairs AS (
  SELECT DISTINCT
    id,
    CAST(replace(publisher_id_url, 'https://openalex.org/P', '') AS BIGINT) AS publisher_id,
    publication_year,
    cited_by_count
  FROM works_publisher_ids
),
-- Works and citations per publisher per publication year, 2014 onwards only.
publisher_yearly_counts AS (
  SELECT
    publisher_id,
    publication_year,
    COUNT(id) AS works_count,
    SUM(cited_by_count) AS cited_by_count
  FROM unique_work_publisher_pairs
  WHERE publication_year >= 2014
  GROUP BY publisher_id, publication_year
),
-- All-time works and citations per publisher (no year restriction).
publisher_total_counts AS (
  SELECT
    publisher_id,
    COUNT(id) AS total_works_count,
    SUM(cited_by_count) AS total_cited_by_count
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Precomputed inputs for summary_stats, one row per publisher.
publisher_stats AS (
  SELECT
    publisher_id,
    -- Mean cited_by_count over works published in (current year - 2) or later.
    -- NOTE(review): confirm this matches the intended 2yr_mean_citedness definition.
    CAST(AVG(CASE WHEN publication_year >= YEAR(current_date()) - 2
                  THEN cited_by_count END) AS DOUBLE) AS two_year_mean,
    -- Number of works with at least 10 citations.
    CAST(COUNT_IF(cited_by_count >= 10) AS INT)        AS i10_index,
    -- Citation counts in descending order; consumed by the h-index
    -- computation in the final SELECT.
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Combine yearly and total counts
work_counts_by_publisher_id AS (
  SELECT
    t.publisher_id AS id,
    t.total_works_count AS works_count,
    t.total_cited_by_count AS cited_by_count,
    -- Structs sort by their first field, so this orders by year descending.
    SORT_ARRAY(
      COLLECT_LIST(
        -- A publisher whose works all pre-date 2014 has no yearly row, so the
        -- LEFT JOIN yields NULLs for y.*. A struct of NULL fields is itself
        -- non-NULL and would be collected as a bogus entry; emit NULL instead,
        -- which COLLECT_LIST skips, leaving an empty array.
        CASE
          WHEN y.publisher_id IS NOT NULL THEN
            NAMED_STRUCT(
              'year', y.publication_year,
              'works_count', y.works_count,
              'cited_by_count', y.cited_by_count
            )
        END
      ),
      false
    ) AS counts_by_year
  FROM publisher_total_counts t
  LEFT JOIN publisher_yearly_counts y ON t.publisher_id = y.publisher_id
  GROUP BY t.publisher_id, t.total_works_count, t.total_cited_by_count
)
SELECT
  p.id AS original_id,
  CONCAT('https://openalex.org/P', p.id) AS id,
  ARRAY(CONCAT('https://openalex.org/P', p.id)) AS lineage, --@TODO add the logic
  p.display_name,
  -- alternate_titles and country_codes are parsed from JSON text into string arrays.
  from_json(p.alternate_titles, 'ARRAY<STRING>') AS alternate_titles,
  from_json(p.country_codes, 'ARRAY<STRING>') AS country_codes,
  p.hierarchy_level,
  p.parent_publisher,
  struct(
    CONCAT('https://openalex.org/P', p.id) AS openalex,
    p.ror_id AS ror,
    p.wikidata_id AS wikidata
  ) AS ids,
  p.ror_id,
  p.image_url,
  p.image_thumbnail_url,
  p.wikidata_id,
  p.homepage_url,
  -- Publishers with no matching works get zero counts rather than NULL.
  COALESCE(c.works_count, 0) AS works_count,
  COALESCE(c.cited_by_count, 0) AS cited_by_count,
  /* summary_stats built from precomputed helpers */
  named_struct(
    '2yr_mean_citedness', COALESCE(ps.two_year_mean, 0.0),
    'h_index',
      -- h-index: pair each citation count (sorted descending) with its
      -- 1-based rank and keep the largest rank whose citation count is >= rank.
      -- Publishers without any works have no sorted_citations row, which makes
      -- the expression NULL; default to 0 like the other summary fields.
      COALESCE(
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              ps.sorted_citations,
              SEQUENCE(1, SIZE(ps.sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
        0
      ),
    'i10_index', COALESCE(ps.i10_index, 0)
  ) AS summary_stats,
  COALESCE(c.counts_by_year, array()) AS counts_by_year,
  CONCAT('https://api.openalex.org/sources?data-version=2&filter=host_organization.id:P', p.id) AS sources_api_url,
  to_date(p.created_date) AS created_date,
  to_timestamp(p.updated_date) AS updated_date
FROM openalex.publishers.publishers p
LEFT JOIN work_counts_by_publisher_id c ON c.id = p.id
LEFT JOIN publisher_stats ps ON ps.publisher_id = p.id
-- Skip publishers that have been merged into another record.
WHERE p.merge_into_id IS NULL;

In [0]:
-- Ad-hoc look at the publishers source table that feeds publishers_api.
SELECT *
FROM openalex.publishers.publishers