In [0]:
%sql
-- Ad hoc inspection: returns every column and every row of the raw sources table.
SELECT *
FROM openalex.sources.sources AS src

In [0]:
%sql
-- Row count per (country, country_code) pair, restricted to rows that have a
-- country and a two-character country_code; output is ordered by country.
SELECT
  src.country,
  src.country_code,
  COUNT(*) AS count
FROM openalex.sources.sources AS src
WHERE src.country IS NOT NULL
  AND LENGTH(src.country_code) = 2
GROUP BY
  src.country,
  src.country_code
ORDER BY src.country


In [0]:
%sql
-- Rebuilds openalex.sources.sources_api from the rows of openalex.sources.sources
-- whose merge_into_id IS NULL, enriched with metrics aggregated from
-- openalex.works.openalex_works. A work is attributed to a source through the
-- numeric suffix of its primary_location.source.id.
CREATE OR REPLACE TABLE openalex.sources.sources_api
CLUSTER BY AUTO AS
WITH
-- Source attributes, renamed to the output field names, plus the first/last
-- publication years from the precomputed metrics table.
src AS (
  SELECT
    s.id,
    s.display_name,
    s.issn AS issn_l,
    s.issns AS issn,
    s.publisher_id,
    s.publisher AS host_organization_name,
    s.wikidata_id,
    s.datacite_ids,
    s.is_in_doaj,
    s.is_in_doaj_start_year AS is_in_doaj_since_year,
    s.is_oa_high_oa_rate AS is_high_oa_rate,
    s.high_oa_rate_start_year AS is_high_oa_rate_since_year,
    s.is_in_scielo,
    s.is_ojs,
    s.is_core,
    -- One year before the first non-NULL of (high-OA-rate start year, DOAJ start
    -- year); NULL when both are NULL because the subtraction propagates NULL.
    -- NOTE(review): the unqualified names resolve to the aliases defined just
    -- above only if neither joined table has a column of the same name - confirm.
    CAST(
      COALESCE(is_high_oa_rate_since_year, is_in_doaj_since_year) - 1 AS INT
    ) AS oa_flip_year,
    sm.first_publication_year,
    sm.last_publication_year,
    s.is_oa,
    s.webpage AS homepage_url,
    s.type,
    s.apc_prices,
    s.apc_usd,
    s.country,
    s.country_code,
    s.societies,
    s.alternate_titles
  FROM openalex.sources.sources s
  LEFT JOIN openalex.sources.source_metrics_precomputed sm ON s.id = sm.id
  WHERE s.merge_into_id IS NULL
),
-- One row per work that has a primary-location source, carrying only the
-- fields the aggregations below need.
works_base AS (
  SELECT
    -- Numeric part of an id ending in '/S<digits>'.
    CAST(REGEXP_EXTRACT(w.primary_location.source.id, '/S([0-9]+)$', 1) AS BIGINT) AS source_id,
    w.id AS work_id,
    -- Fall back to the year of publication_date when publication_year is NULL.
    COALESCE(w.publication_year, YEAR(w.publication_date)) AS pub_year,
    CAST(w.cited_by_count AS INT) AS cited_by_count,
    CAST(w.open_access.is_oa AS BOOLEAN) AS is_oa,
    w.updated_date,
    w.created_date,
    w.topics AS work_topics
  FROM openalex.works.openalex_works w
  WHERE w.primary_location.source.id IS NOT NULL
),
-- Per-source totals plus the descending citation list used for the h-index.
main_agg_pre AS (
  SELECT
    source_id,
    CAST(COUNT(*) AS INT) AS works_count,
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count,
    CAST(SUM(CASE WHEN is_oa THEN 1 ELSE 0 END) AS INT) AS oa_works_count,
    -- Mean cited_by_count over works with pub_year >= (current year - 2).
    CAST(AVG(CASE WHEN pub_year >= YEAR(current_date()) - 2 THEN cited_by_count END) AS DOUBLE) AS two_year_mean,
    -- Number of works with at least 10 citations.
    CAST(COUNT_IF(cited_by_count >= 10) AS INT) AS i10_index,
    -- Non-NULL citation counts, sorted descending.
    SORT_ARRAY(
      TRANSFORM(
        FILTER(COLLECT_LIST(cited_by_count), x -> x IS NOT NULL),
        x -> CAST(x AS INT)
      ),
      false
    ) AS sorted_citations,
    MAX(updated_date) AS updated_date,
    CAST(MIN(created_date) AS DATE) AS created_date
  FROM works_base
  GROUP BY source_id
),
-- Per-source topic arrays:
--   topics      - distinct-work count per topic, sorted by count desc, then id.
--   topic_share - the source's count divided by the topic's count across all
--                 sources, sorted by that share desc, then id.
topics_and_share AS (
  -- Distinct (source, work, topic) rows.
  WITH awt AS (
    SELECT DISTINCT
      source_id,
      work_id,
      CAST(t.id AS STRING) AS topic_id,
      t.display_name, t.subfield, t.field, t.domain, t.score
    FROM (
      -- EXPLODE (not EXPLODE_OUTER): the NULL rows the outer variant would add
      -- for empty/NULL topic arrays are removed by the t.id filter below anyway.
      SELECT source_id, work_id, EXPLODE(work_topics) AS t
      FROM works_base
      WHERE work_id IS NOT NULL
    )
    WHERE t.id IS NOT NULL
  ),
  -- Counts at two levels in a single pass: per (source, topic) and per topic
  -- overall; the per-topic rows carry a NULL source_id.
  counts AS (
    SELECT
      source_id,
      topic_id,
      -- Descriptive fields are taken from the row with the highest score.
      MAX_BY(display_name, score) AS display_name,
      MAX_BY(subfield, score) AS subfield,
      MAX_BY(field, score) AS field,
      MAX_BY(domain, score) AS domain,
      MAX(score) AS score,
      COUNT(DISTINCT work_id) AS cnt
    FROM awt
    GROUP BY GROUPING SETS ((source_id, topic_id), (topic_id))
  ),
  -- Attach the topic-wide total (taken from the NULL-source_id rows) to each row.
  with_totals AS (
    SELECT
      source_id,
      topic_id,
      display_name, subfield, field, domain, score,
      cnt AS topic_count,
      MAX(CASE WHEN source_id IS NULL THEN cnt END) OVER (PARTITION BY topic_id) AS topic_total
    FROM counts
  )
  SELECT
    source_id,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          topic_id AS id,
          display_name,
          CAST(topic_count AS INT) AS count,
          score,
          subfield,
          field,
          domain
        )
      ),
      (l, r) -> CASE
        WHEN l.count > r.count THEN -1
        WHEN l.count < r.count THEN 1
        WHEN l.id < r.id THEN -1
        WHEN l.id > r.id THEN 1
        ELSE 0
      END
    ) AS topics,
    ARRAY_SORT(
      COLLECT_LIST(
        STRUCT(
          topic_id AS id,
          display_name,
          -- NULLIF guards against division by a zero total.
          ROUND(CAST(topic_count AS DOUBLE) / NULLIF(topic_total, 0), 7) AS value,
          subfield,
          field,
          domain
        )
      ),
      (l, r) -> CASE
        WHEN l.value > r.value THEN -1
        WHEN l.value < r.value THEN 1
        WHEN l.id < r.id THEN -1
        WHEN l.id > r.id THEN 1
        ELSE 0
      END
    ) AS topic_share
  FROM with_totals
  WHERE source_id IS NOT NULL  -- drop the topic-wide total rows
  GROUP BY source_id
),
-- Per-source, per-publication-year counts (works without a pub_year are skipped).
yearly_counts AS (
  SELECT
    source_id,
    CAST(pub_year AS INT) AS year,
    CAST(COUNT(*) AS INT) AS works_count,
    CAST(SUM(CASE WHEN is_oa THEN 1 ELSE 0 END) AS INT) AS oa_works_count,
    CAST(SUM(cited_by_count) AS INT) AS cited_by_count
  FROM works_base
  WHERE source_id IS NOT NULL
    AND pub_year IS NOT NULL
  GROUP BY source_id, CAST(pub_year AS INT)
),
-- The yearly rows collapsed into one array per source, sorted descending
-- (year is the first struct field, so the most recent year comes first).
-- Computed once and joined, instead of a correlated subquery per output row.
counts_by_year_agg AS (
  SELECT
    source_id,
    SORT_ARRAY(
      COLLECT_LIST(
        STRUCT(
          year,
          works_count,
          oa_works_count,
          cited_by_count
        )
      ),
      false
    ) AS counts_by_year
  FROM yearly_counts
  GROUP BY source_id
),
-- Packs the citation metrics into the summary_stats struct.
main_agg AS (
  SELECT
    source_id AS id,
    NAMED_STRUCT(
      '2yr_mean_citedness', COALESCE(two_year_mean, 0.0),
      -- h-index: pair each descending citation count with its 1-based rank and
      -- keep the largest rank whose citation count is >= that rank.
      -- NOTE(review): for an empty sorted_citations, SEQUENCE(1, 0) counts down
      -- rather than being empty; the result is expected to still be 0 - confirm.
      'h_index',
        CAST(
          ARRAY_MAX(
            ZIP_WITH(
              sorted_citations,
              SEQUENCE(1, SIZE(sorted_citations)),
              (citation, rank) -> IF(citation >= rank, rank, 0)
            )
          ) AS INT
        ),
      'i10_index', COALESCE(i10_index, 0)
    ) AS summary_stats,
    works_count,
    cited_by_count,
    oa_works_count,
    updated_date,
    created_date
  FROM main_agg_pre
)
SELECT
  CONCAT('https://openalex.org/S', s.id) AS id,
  s.issn_l,
  s.issn,
  s.display_name,

  CONCAT('https://openalex.org/P', s.publisher_id) AS host_organization,
  s.host_organization_name AS host_organization_name,
  -- CONCAT yields NULL when publisher_id is NULL; filter it out so a source
  -- without a publisher gets an empty lineage rather than [NULL].
  FILTER(
    ARRAY(CONCAT('https://openalex.org/P', s.publisher_id)),
    x -> x IS NOT NULL
  ) AS host_organization_lineage,

  -- Sources with no attributed works have no main_agg row; default to zeros.
  COALESCE(ma.works_count, 0) AS works_count,
  COALESCE(ma.oa_works_count, 0) AS oa_works_count,
  COALESCE(ma.cited_by_count, 0) AS cited_by_count,
  COALESCE(ma.summary_stats, NAMED_STRUCT('2yr_mean_citedness', CAST(0.0 AS DOUBLE), 'h_index', CAST(0 AS INT), 'i10_index', CAST(0 AS INT))) AS summary_stats,

  -- NULL flags are reported as FALSE.
  COALESCE(s.is_oa, FALSE) AS is_oa,
  COALESCE(s.is_in_doaj, FALSE) AS is_in_doaj,
  s.is_in_doaj_since_year,
  COALESCE(s.is_high_oa_rate, FALSE) AS is_high_oa_rate,
  s.is_high_oa_rate_since_year,
  COALESCE(s.is_in_scielo, FALSE) AS is_in_scielo,
  COALESCE(s.is_ojs, FALSE) AS is_ojs,
  COALESCE(s.is_core, FALSE) AS is_core,
  s.oa_flip_year,
  s.first_publication_year,
  s.last_publication_year,
  NAMED_STRUCT(
    'openalex', CONCAT('https://openalex.org/S', s.id),
    'issn_l', s.issn_l,
    'issn', s.issn,
    'mag', CAST(s.id AS STRING),
    'wikidata', s.wikidata_id
  ) AS ids,
  s.homepage_url,
  s.apc_prices,
  s.apc_usd,
  s.country_code,
  COALESCE(s.societies, ARRAY()) AS societies,
  COALESCE(s.alternate_titles, ARRAY()) AS alternate_titles,
  s.type,
  COALESCE(ts.topics, ARRAY()) AS topics,
  COALESCE(ts.topic_share, ARRAY()) AS topic_share,
  -- Empty array for sources with no dated works.
  COALESCE(cby.counts_by_year, ARRAY()) AS counts_by_year,
  CONCAT('https://api.openalex.org/works?filter=primary_location.source.id:S', CAST(s.id AS STRING)) AS works_api_url,
  -- Both dates come from the works aggregation, so they are NULL for sources
  -- with no attributed works.
  ma.updated_date AS updated_date,
  ma.created_date AS created_date
FROM src s
LEFT JOIN main_agg ma ON s.id = ma.id
LEFT JOIN topics_and_share ts ON s.id = ts.source_id
LEFT JOIN counts_by_year_agg cby ON s.id = cby.source_id;
