In [0]:
-- Ad-hoc look at the funders API table; note that it has a works_count column.
SELECT *
FROM openalex.funders.funders_api


In [0]:
-- Ad-hoc check: collect, as one de-duplicated array, every id found on either
-- side of the entity_link rows that involve P4310320990.
SELECT array_union(collect_set(id_1), collect_set(id_2))
FROM openalex.mid.entity_link
WHERE id_1 = 'P4310320990'
   OR id_2 = 'P4310320990'

In [0]:
%sql
-- Rebuilds openalex.publishers.publishers_api: one row per publisher that has
-- not been merged into another (merge_into_id IS NULL), enriched with work and
-- citation counts, per-year counts (2014 onward) and summary statistics that
-- are derived from the publisher ids found in each work's locations.
CREATE OR REPLACE TABLE openalex.publishers.publishers_api AS
WITH
-- One row per (work, location) whose source's host organization is a publisher
-- URL. The cited_by_count >= 0 predicate also drops works with a NULL count.
works_publisher_ids AS (
  SELECT
    w.id,
    w.publication_year,
    w.cited_by_count,
    l.source.host_organization AS publisher_id_url
  FROM openalex.works.openalex_works w
  LATERAL VIEW explode(w.locations) AS l
  WHERE w.cited_by_count >= 0
    AND l.source.host_organization IS NOT NULL
    AND l.source.host_organization LIKE 'https://openalex.org/P%'
),
-- Collapse repeated locations so each work counts once per publisher; the URL
-- prefix is stripped to obtain the numeric publisher id.
unique_work_publisher_pairs AS (
  SELECT DISTINCT
    id,
    CAST(replace(publisher_id_url, 'https://openalex.org/P', '') AS BIGINT) AS publisher_id,
    publication_year,
    cited_by_count
  FROM works_publisher_ids
),
-- Works and citations per publisher per publication year, 2014 onward only.
publisher_yearly_counts AS (
  SELECT
    publisher_id,
    publication_year,
    COUNT(id) AS works_count,
    SUM(cited_by_count) AS cited_by_count
  FROM unique_work_publisher_pairs
  WHERE publication_year >= 2014
  GROUP BY publisher_id, publication_year
),
-- All-time works and citations per publisher.
publisher_total_counts AS (
  SELECT
    publisher_id,
    COUNT(id) AS total_works_count,
    SUM(cited_by_count) AS total_cited_by_count
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Per-publisher inputs for summary_stats: mean citations of works published in
-- the current year or the two before it, the number of works with at least 10
-- citations, and the citation counts sorted descending (for the h-index).
publisher_stats AS (
  SELECT
    publisher_id,
    CAST(AVG(CASE WHEN publication_year >= YEAR(current_date()) - 2
                  THEN cited_by_count END) AS DOUBLE) AS two_year_mean,
    CAST(COUNT_IF(cited_by_count >= 10) AS INT) AS i10_index,
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Totals plus the per-year breakdown (newest year first) for each publisher.
work_counts_by_publisher_id AS (
  SELECT
    t.publisher_id AS id,
    t.total_works_count AS works_count,
    t.total_cited_by_count AS cited_by_count,
    SORT_ARRAY(
      COLLECT_LIST(
        -- A publisher whose works all predate 2014 has no yearly rows, so the
        -- LEFT JOIN yields NULL y.* columns. Emitting NULL here (which
        -- COLLECT_LIST skips) gives an empty array instead of one struct
        -- whose fields are all NULL.
        CASE
          WHEN y.publisher_id IS NOT NULL THEN
            NAMED_STRUCT(
              'year', y.publication_year,
              'works_count', y.works_count,
              'cited_by_count', y.cited_by_count
            )
        END
      ),
      false
    ) AS counts_by_year
  FROM publisher_total_counts t
  LEFT JOIN publisher_yearly_counts y ON t.publisher_id = y.publisher_id
  GROUP BY t.publisher_id, t.total_works_count, t.total_cited_by_count
)
SELECT
  p.id AS original_id,
  CONCAT('https://openalex.org/P', p.id) AS id,
  ARRAY(CONCAT('https://openalex.org/P', p.id)) AS lineage, --@TODO add the logic
  p.display_name,
  from_json(p.alternate_titles, 'ARRAY<STRING>') AS alternate_titles,
  from_json(p.country_codes, 'ARRAY<STRING>') AS country_codes,
  p.hierarchy_level,
  p.parent_publisher,
  struct(
    CONCAT('https://openalex.org/P', p.id) AS openalex,
    p.ror_id AS ror,
    p.wikidata_id AS wikidata
  ) AS ids,
  p.ror_id,
  p.image_url,
  p.image_thumbnail_url,
  p.wikidata_id,
  p.homepage_url,
  COALESCE(c.works_count, 0) AS works_count,
  COALESCE(c.cited_by_count, 0) AS cited_by_count,
  /* summary_stats built from the publisher_stats helpers; every field falls
     back to 0 for publishers that have no matching works. */
  named_struct(
    '2yr_mean_citedness', COALESCE(ps.two_year_mean, 0.0),
    'h_index',
      -- h-index = largest rank r (1-based, citations sorted descending) whose
      -- work has at least r citations. The COALESCE covers publishers with no
      -- publisher_stats row, where sorted_citations is NULL.
      COALESCE(
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              ps.sorted_citations,
              SEQUENCE(1, SIZE(ps.sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
        0
      ),
    'i10_index', COALESCE(ps.i10_index, 0)
  ) AS summary_stats,
  COALESCE(c.counts_by_year, array()) AS counts_by_year,
  CONCAT('https://api.openalex.org/sources?data-version=2&filter=host_organization.id:P', p.id) AS sources_api_url,
  to_date(p.created_date) AS created_date,
  to_timestamp(p.updated_date) AS updated_date
FROM openalex.publishers.publishers p
LEFT JOIN work_counts_by_publisher_id c ON c.id = p.id
LEFT JOIN publisher_stats ps ON ps.publisher_id = p.id
WHERE p.merge_into_id IS NULL;

In [0]:
-- Ad-hoc look at the raw publishers table that the publishers_api build reads.
SELECT *
FROM openalex.publishers.publishers

In [0]:
-- Rebuilds openalex.publishers.publishers_api: one row per publisher that has
-- not been merged into another (merge_into_id IS NULL), enriched with work and
-- citation counts, per-year counts (2014 onward), summary statistics, and the
-- list of roles (publisher / institution / funder) linked to the publisher
-- through openalex.mid.entity_link.
CREATE OR REPLACE TABLE openalex.publishers.publishers_api AS
WITH
-- One row per (work, location) whose source's host organization is a publisher
-- URL. The cited_by_count >= 0 predicate also drops works with a NULL count.
works_publisher_ids AS (
  SELECT
    w.id,
    w.publication_year,
    w.cited_by_count,
    l.source.host_organization AS publisher_id_url
  FROM openalex.works.openalex_works w
  LATERAL VIEW explode(w.locations) AS l
  WHERE w.cited_by_count >= 0
    AND l.source.host_organization IS NOT NULL
    AND l.source.host_organization LIKE 'https://openalex.org/P%'
),
-- Collapse repeated locations so each work counts once per publisher; the URL
-- prefix is stripped to obtain the numeric publisher id.
unique_work_publisher_pairs AS (
  SELECT DISTINCT
    id,
    CAST(replace(publisher_id_url, 'https://openalex.org/P', '') AS BIGINT) AS publisher_id,
    publication_year,
    cited_by_count
  FROM works_publisher_ids
),
-- Works and citations per publisher per publication year, 2014 onward only.
publisher_yearly_counts AS (
  SELECT
    publisher_id,
    publication_year,
    COUNT(id) AS works_count,
    SUM(cited_by_count) AS cited_by_count
  FROM unique_work_publisher_pairs
  WHERE publication_year >= 2014
  GROUP BY publisher_id, publication_year
),
-- All-time works and citations per publisher.
publisher_total_counts AS (
  SELECT
    publisher_id,
    COUNT(id) AS total_works_count,
    SUM(cited_by_count) AS total_cited_by_count
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Per-publisher inputs for summary_stats: mean citations of works published in
-- the current year or the two before it, the number of works with at least 10
-- citations, and the citation counts sorted descending (for the h-index).
publisher_stats AS (
  SELECT
    publisher_id,
    CAST(AVG(CASE WHEN publication_year >= YEAR(current_date()) - 2
                  THEN cited_by_count END) AS DOUBLE) AS two_year_mean,
    CAST(COUNT_IF(cited_by_count >= 10) AS INT) AS i10_index,
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations
  FROM unique_work_publisher_pairs
  GROUP BY publisher_id
),
-- Totals plus the per-year breakdown (newest year first) for each publisher.
work_counts_by_publisher_id AS (
  SELECT
    t.publisher_id AS id,
    t.total_works_count AS works_count,
    t.total_cited_by_count AS cited_by_count,
    SORT_ARRAY(
      COLLECT_LIST(
        -- A publisher whose works all predate 2014 has no yearly rows, so the
        -- LEFT JOIN yields NULL y.* columns. Emitting NULL here (which
        -- COLLECT_LIST skips) gives an empty array instead of one struct
        -- whose fields are all NULL.
        CASE
          WHEN y.publisher_id IS NOT NULL THEN
            NAMED_STRUCT(
              'year', y.publication_year,
              'works_count', y.works_count,
              'cited_by_count', y.cited_by_count
            )
        END
      ),
      false
    ) AS counts_by_year
  FROM publisher_total_counts t
  LEFT JOIN publisher_yearly_counts y ON t.publisher_id = y.publisher_id
  GROUP BY t.publisher_id, t.total_works_count, t.total_cited_by_count
),

-- Roles per publisher: the publisher itself plus any institution / funder
-- entity linked to it in openalex.mid.entity_link, each with a works_count.
publisher_roles AS (
  WITH
  -- Every entity_link row that mentions the publisher on either side.
  entity_links_expanded AS (
    SELECT
      p.id AS publisher_id,
      el.id_1,
      el.id_2
    FROM openalex.publishers.publishers p
    INNER JOIN openalex.mid.entity_link el
      ON (el.id_1 = CONCAT('P', p.id) OR el.id_2 = CONCAT('P', p.id))
  ),
  all_entity_ids AS (
    -- Publisher's own role
    SELECT
      p.id AS publisher_id,
      CONCAT('P', p.id) AS entity_id,
      'publisher' AS role
    FROM openalex.publishers.publishers p

    UNION ALL

    -- Linked institution/funder roles: entity_id is whichever side of the link
    -- is not the 'P…' id; the role follows the I / F prefix found on the link.
    -- NOTE(review): repeated or mirrored entity_link rows would yield duplicate
    -- institution roles here — confirm entity_link holds each pair only once.
    SELECT
      publisher_id,
      CASE
        WHEN id_1 LIKE 'P%' THEN id_2
        ELSE id_1
      END AS entity_id,
      CASE
        WHEN id_1 LIKE 'I%' OR id_2 LIKE 'I%' THEN 'institution'
        WHEN id_1 LIKE 'F%' OR id_2 LIKE 'F%' THEN 'funder'
      END AS role
    FROM entity_links_expanded
    WHERE (id_1 LIKE 'I%' OR id_1 LIKE 'F%' OR id_2 LIKE 'I%' OR id_2 LIKE 'F%')
  ),
  -- Attach a works_count to each role from the table that matches its type.
  roles_with_counts AS (
    SELECT
      ae.publisher_id,
      ae.role,
      CONCAT('https://openalex.org/', ae.entity_id) AS id,
      CASE
        -- For publisher: use works_count from work_counts_by_publisher_id
        WHEN ae.role = 'publisher' THEN wc.works_count
        -- For institution: use works_count from the institutions_api table
        WHEN ae.role = 'institution' THEN i_api.works_count
        -- For funder: use works_count from the funders_api table
        WHEN ae.role = 'funder' THEN f_api.works_count
      END AS works_count
    FROM all_entity_ids ae
    -- Each join is gated on the role so only the relevant table can match.
    LEFT JOIN work_counts_by_publisher_id wc
      ON ae.role = 'publisher' AND ae.publisher_id = wc.id
    LEFT JOIN openalex.institutions.institutions_api i_api
      ON ae.role = 'institution' AND ae.entity_id = REPLACE(i_api.id, 'https://openalex.org/', '')
    LEFT JOIN openalex.funders.funders_api f_api
      ON ae.role = 'funder' AND ae.entity_id = REPLACE(f_api.id, 'https://openalex.org/', '')
  ),
  -- Rank roles within (publisher, role type) so that only the funder with the
  -- highest works_count is kept; ties are broken by id.
  deduplicated_roles AS (
    SELECT
      publisher_id,
      role,
      id,
      CAST(COALESCE(works_count, 0) AS INT) AS works_count,
      ROW_NUMBER() OVER (
        PARTITION BY publisher_id, role
        -- NULLS LAST is already the default for DESC; spelled out because the
        -- ordering column can be NULL when no count was found for the entity.
        ORDER BY works_count DESC NULLS LAST, id
      ) AS rn
    FROM roles_with_counts
  )
  SELECT
    publisher_id,
    COLLECT_LIST(
      STRUCT(role, id, works_count)
    ) AS roles
  FROM deduplicated_roles
  WHERE rn = 1 OR role != 'funder'  -- Keep all non-funders, but only top funder
  GROUP BY publisher_id
)

SELECT
  p.id AS original_id,
  CONCAT('https://openalex.org/P', p.id) AS id,
  ARRAY(CONCAT('https://openalex.org/P', p.id)) AS lineage, --@TODO add the logic
  p.display_name,
  from_json(p.alternate_titles, 'ARRAY<STRING>') AS alternate_titles,
  from_json(p.country_codes, 'ARRAY<STRING>') AS country_codes,
  p.hierarchy_level,
  p.parent_publisher,
  struct(
    CONCAT('https://openalex.org/P', p.id) AS openalex,
    p.ror_id AS ror,
    p.wikidata_id AS wikidata
  ) AS ids,
  p.ror_id,
  p.image_url,
  p.image_thumbnail_url,
  p.wikidata_id,
  p.homepage_url,
  COALESCE(c.works_count, 0) AS works_count,
  COALESCE(c.cited_by_count, 0) AS cited_by_count,
  /* summary_stats built from the publisher_stats helpers; every field falls
     back to 0 for publishers that have no matching works. */
  named_struct(
    '2yr_mean_citedness', COALESCE(ps.two_year_mean, 0.0),
    'h_index',
      -- h-index = largest rank r (1-based, citations sorted descending) whose
      -- work has at least r citations. The COALESCE covers publishers with no
      -- publisher_stats row, where sorted_citations is NULL.
      COALESCE(
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              ps.sorted_citations,
              SEQUENCE(1, SIZE(ps.sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
        0
      ),
    'i10_index', COALESCE(ps.i10_index, 0)
  ) AS summary_stats,
  -- Roles gathered in publisher_roles; empty array when none were found.
  COALESCE(pr.roles, ARRAY()) AS roles,
  COALESCE(c.counts_by_year, array()) AS counts_by_year,
  CONCAT('https://api.openalex.org/sources?data-version=2&filter=host_organization.id:P', p.id) AS sources_api_url,
  to_date(p.created_date) AS created_date,
  to_timestamp(p.updated_date) AS updated_date
FROM openalex.publishers.publishers p
LEFT JOIN work_counts_by_publisher_id c ON c.id = p.id
LEFT JOIN publisher_stats ps ON ps.publisher_id = p.id
LEFT JOIN publisher_roles pr ON p.id = pr.publisher_id
WHERE p.merge_into_id IS NULL;