In [8]:
# Name of the BigQuery dataset queried by this notebook. The SQL cells refer to
# it through the `{dataset}` placeholder inside their (non f-string) query text,
# so the substitution is presumably done by the Colab SQL runner — TODO confirm.
dataset = "oag_v10_6_0"

## Language diversity across research products

In [22]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research product ids per languageCode over the union of the
# publications, others, datasets and software tables, skipping rows whose
# languageCode is "und", and lists the most frequent language first.
# start _sql
_sql = """
WITH products as (
  SELECT id, languageCode FROM {dataset}.publications
  UNION ALL
  SELECT id, languageCode FROM {dataset}.others
  UNION ALL
  SELECT id, languageCode FROM {dataset}.datasets
  UNION ALL
  SELECT id, languageCode FROM {dataset}.software
)
SELECT languageCode, COUNT( DISTINCT id) AS productCount
FROM products
  WHERE languageCode != "und"
GROUP BY languageCode
ORDER BY productCount DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed13245510>

## Geographic presence of the most prominent languages

In [23]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (languageCode, countryCode). A product
# is attached to a country through a 'hasAuthorInstitution' relation whose
# target is an organization carrying that countryCode.
# NOTE(review): unlike the other language breakdowns in this notebook, rows with
# languageCode "und" are not filtered out here, and the `type` column selected
# in the `products` CTE is never used — confirm both are intended.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, languageCode FROM {dataset}.publications
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.others
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.software
),
country_rel AS (
	SELECT r.source, r.target, o.countryCode
	FROM {dataset}.relations r
	JOIN {dataset}.organizations o ON r.target = o.id
		WHERE r.relationName = 'hasAuthorInstitution'
)
SELECT p.languageCode, r.countryCode, COUNT(DISTINCT p.id) AS productCount
	FROM products p
	JOIN country_rel r ON p.id = r.source

	GROUP BY languageCode, countryCode
	ORDER BY productCount DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed133acd60>

## Top-level research product formats

In [24]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research product ids per top-level `type` over the union of
# the publications, others, datasets and software tables. The query has no
# ORDER BY, so the row order of the result is not specified.
# start _sql
_sql = """
WITH products as (
  SELECT id, type FROM {dataset}.publications
  UNION ALL
  SELECT id, type FROM {dataset}.others
  UNION ALL
  SELECT id, type FROM {dataset}.datasets
  UNION ALL
  SELECT id, type FROM {dataset}.software
)
SELECT type, COUNT(DISTINCT id) AS productCount
FROM products

GROUP BY type
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed1354d360>

## Share of top-level research product formats across languages



In [27]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (type, languageCode) over the four
# product tables, skipping rows whose languageCode is "und"; largest groups
# come first.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, languageCode FROM {dataset}.publications
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.others
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, languageCode FROM {dataset}.software
)
SELECT type, languageCode, COUNT(DISTINCT id) AS productCount
FROM products
where languageCode != "und"
GROUP BY type, languageCode
ORDER BY productCount desc
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec846fea10>

## Fine-grained bibliotyping of research products

In [26]:
# sql_engine: bigquery
# output_variable: df
# Expands the `instances` array of every research product and counts distinct
# product ids per instance-level "$.type" value (exposed as fineGrainedType),
# largest groups first. A product with instances of several types is counted
# once under each of them.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, instances FROM {dataset}.publications
  UNION ALL
  SELECT id, type, instances FROM {dataset}.others
  UNION ALL
  SELECT id, type, instances FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, instances FROM {dataset}.software
)
SELECT  JSON_VALUE(inst, "$.type") AS fineGrainedType, COUNT(DISTINCT id) AS productCount
FROM products,
	UNNEST(JSON_EXTRACT_ARRAY(instances)) AS inst
GROUP BY fineGrainedType
ORDER BY productCount desc
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec877b3880>

## Geographic coverage of research products

In [28]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per countryCode, where the country is that
# of an organization reached from the product through a 'hasAuthorInstitution'
# relation; largest countries first. The `type` column selected in the
# `products` CTE is not used by the final SELECT.
# start _sql
_sql = """
WITH products as (
  SELECT id, type FROM {dataset}.publications
  UNION ALL
  SELECT id, type FROM {dataset}.others
  UNION ALL
  SELECT id, type FROM {dataset}.datasets
  UNION ALL
  SELECT id, type FROM {dataset}.software
),
country_rel AS (
	SELECT r.source, r.target, o.countryCode
	FROM {dataset}.relations r
	JOIN {dataset}.organizations o ON r.target = o.id
		WHERE r.relationName = 'hasAuthorInstitution'
)
SELECT r.countryCode, count(distinct p.id) AS productCount
FROM products p
JOIN country_rel r ON p.id = r.source

GROUP BY countryCode
ORDER BY productCount desc
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed133553c0>

## Geographic coverage of organisations

In [29]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct organizations per countryCode, ignoring organizations without
# a country, and lists the countries with the most organizations first.
# Identifiers are written with the same casing as the column (`id`) and the
# alias declared in the SELECT list (`organisationCount`).
# start _sql
_sql = """
SELECT countryCode, COUNT(DISTINCT id) AS organisationCount
FROM {dataset}.organizations
	WHERE countryCode IS NOT NULL

GROUP BY countryCode
ORDER BY organisationCount DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed1354e380>

## Number of organisations across organisation types

In [31]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct organizations per organization type, leaving out the
# "Unknown" type. The organization type comes from the companion
# `organization-types` table, joined to the organizations table on `id`.
# The organizations table is addressed through the `{dataset}` placeholder like
# in the rest of the notebook; the companion dataset is still referenced by its
# literal name `oag_v10_6_0_additional`.
# start _sql
_sql = """
WITH organizations AS (
    SELECT o.id, org.organizationtype
    FROM {dataset}.organizations o
    JOIN `oag_v10_6_0_additional.organization-types` org ON o.id = org.id
)
SELECT organizationtype, COUNT(DISTINCT id) AS nOrganisation
FROM organizations
  WHERE organizationtype != "Unknown"
GROUP BY organizationtype
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec84742f20>

## Share of top-level research product formats across organisation types

In [36]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (organization type, product type).
# Products are linked to organizations through 'hasAuthorInstitution'
# relations; the organization type comes from the table
# `oag_v10_6_0_additional.organization-types` (referenced by literal name, not
# through `{dataset}`), and the "Unknown" type is excluded. Largest groups first.
# start _sql
_sql = """
WITH products as (
  SELECT id, type FROM {dataset}.publications
  UNION ALL
  SELECT id, type FROM {dataset}.others
  UNION ALL
  SELECT id, type FROM {dataset}.datasets
  UNION ALL
  SELECT id, type FROM {dataset}.software
),
organizations AS (
	SELECT o.id, org.organizationtype
	FROM {dataset}.organizations o
	JOIN `oag_v10_6_0_additional.organization-types` org ON o.id = org.id
)
SELECT o.organizationtype, p.type, COUNT(DISTINCT p.id) AS productCount
FROM {dataset}.relations r
JOIN products p ON r.source = p.id
JOIN organizations o ON r.target = o.id
	WHERE r.relationName = 'hasAuthorInstitution' AND o.organizationtype != "Unknown"

GROUP BY o.organizationtype, p.type
ORDER BY productCount DESC

""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bed132bc6a0>

## Number of research products per top-level format across Fields of Science

In [38]:
# sql_engine: bigquery
# output_variable: df
# Expands the `subjects` array of every research product, keeps the entries
# whose subject scheme is "FOS", and counts distinct products per
# (subject value, product type). Rows are sorted by fos, then type.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, subjects FROM {dataset}.publications
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.others
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.software
)
SELECT JSON_VALUE(subject.subject.value) AS fos, type, COUNT(DISTINCT id) AS nProducts
FROM products,
	UNNEST(JSON_EXTRACT_ARRAY(subjects)) AS subject
	WHERE JSON_VALUE(subject.subject.scheme) = "FOS"

GROUP BY fos, type
ORDER BY fos, type ASC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec847818d0>

## Number of research products per top-level format across Sustainable Development Goals

In [39]:
# sql_engine: bigquery
# output_variable: df
# Expands the `subjects` array of every research product, keeps the entries
# whose subject scheme is "SDG", and counts distinct products per
# (subject value, product type). Rows are sorted by SDG, then type.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, subjects FROM {dataset}.publications
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.others
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.software
)
SELECT JSON_VALUE(subject.subject.value) AS SDG, type, count(distinct id) AS nProducts
FROM products,
	UNNEST(JSON_EXTRACT_ARRAY(subjects)) AS subject

	WHERE JSON_VALUE(subject.subject.scheme) = "SDG"
GROUP BY SDG, type
ORDER BY SDG, type ASC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec847827d0>

## Share of research products per Sustainable Development Goals across countries

In [40]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (SDG subject value, countryCode).
# Subjects come from expanding each product's `subjects` array and keeping the
# entries whose scheme is "SDG"; the country is that of an organization linked
# to the product by a 'hasAuthorInstitution' relation.
# The `type` column of the `products` CTE is not used, and the query has no
# ORDER BY, so the row order of the result is not specified.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, subjects FROM {dataset}.publications
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.others
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.software
),
country_rel AS (
	SELECT r.source, r.target, o.countryCode
	FROM {dataset}.relations r
	JOIN {dataset}.organizations o ON r.target = o.id
	WHERE r.relationName = 'hasAuthorInstitution'
)
SELECT JSON_VALUE(subject.subject.value) AS SDG, r.countryCode, count(distinct id) AS nProducts
FROM products p,
UNNEST(JSON_EXTRACT_ARRAY(subjects)) AS subject
JOIN country_rel r ON p.id = r.source
WHERE JSON_VALUE(subject.subject.scheme) = "SDG"
GROUP BY SDG, r.countryCode
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec800290c0>

## Data providers collected by the OpenAIRE Graph

In [41]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct data sources per `type`. GROUP BY already yields exactly one
# row per type, so no DISTINCT is needed on the SELECT list. The query has no
# ORDER BY, so the row order of the result is not specified.
# start _sql
_sql = """
SELECT type, count(distinct id) AS dataSourcesCount
FROM {dataset}.datasources
GROUP BY type
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec8005dea0>

## Top-10 countries for managed journals

In [43]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct data sources of type "Journal" per country. A journal is
# attached to a country through an 'isProvidedBy' relation whose target is an
# organization (targetType = 'organization') with a non-null countryCode.
# Countries with the most journals come first.
# NOTE(review): the query returns every country; no LIMIT is applied here, so
# any top-N cut must happen downstream — confirm.
# start _sql
_sql = """
with filtered_relations AS (
  SELECT r.source, r.relationName, r.target
  FROM {dataset}.datasources d
  JOIN {dataset}.relations r ON d.id = r.source
    WHERE r.relationName = 'isProvidedBy' and r.targetType = 'organization' and d.type = "Journal"
)
SELECT o.countryCode, count(distinct r.source) AS nJournals
FROM filtered_relations r
JOIN {dataset}.organizations o ON r.target = o.id
WHERE o.countryCode IS NOT NULL
GROUP BY o.countryCode
ORDER BY nJournals DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec84703b50>

## Funders and respective amount of funded projects

In [45]:
# sql_engine: bigquery
# output_variable: df
# Expands the `fundings` array of every project and counts distinct project ids
# per funder, identified by the "$.shortName" and "$.name" values of each
# funding entry. Funders with the most projects come first.
# start _sql
_sql = """
SELECT JSON_VALUE(funding, '$.shortName') AS funderId, JSON_VALUE(funding, '$.name') AS funderName, count(distinct id) AS nProjects
FROM {dataset}.projects,
	UNNEST(JSON_QUERY_ARRAY(fundings)) AS funding
GROUP BY funderId, funderName
ORDER BY nProjects DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec877b31f0>

## Geographic coverage of funders jurisdiction

In [46]:
# sql_engine: bigquery
# output_variable: df
# Expands the `fundings` array of every project and counts, per funding
# jurisdiction, the number of distinct funder short names.
# The array is expanded with JSON_QUERY_ARRAY, the same function the other
# funder queries of this notebook apply to the `fundings` column, instead of
# the legacy JSON_EXTRACT_ARRAY. The query has no ORDER BY, so the row order of
# the result is not specified.
# start _sql
_sql = """
SELECT JSON_VALUE(funding.jurisdiction) AS fundingJurisdiction, COUNT(DISTINCT JSON_VALUE(funding.shortName)) AS nFunding
FROM {dataset}.projects,
	UNNEST(JSON_QUERY_ARRAY(fundings)) AS funding
GROUP BY fundingJurisdiction
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec844f8a00>

## Top-level research product format across funders

In [47]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (funder, product type).
#   * `products` unions the four product tables, keeping only the columns the
#     query needs (id, type).
#   * `enriched_products` pairs each product with the `fundings` of a project
#     that is linked to it by a 'produces' relation (project = relation source,
#     product = relation target).
#   * The final SELECT expands `fundings` with JSON_QUERY_ARRAY and groups by
#     the funder's shortName / name and the product type.
# Rows are sorted by funder name, then by descending product count.
# start _sql
_sql = """
WITH products AS (
  SELECT id, type FROM {dataset}.publications
  UNION ALL
  SELECT id, type FROM {dataset}.others
  UNION ALL
  SELECT id, type FROM {dataset}.datasets
  UNION ALL
  SELECT id, type FROM {dataset}.software
),
enriched_products AS (
	SELECT r.target AS productId, pr.fundings, p.type
	FROM {dataset}.relations r
	JOIN {dataset}.projects pr ON r.source = pr.id
	JOIN products p ON r.target = p.id
	WHERE r.relationName = 'produces'
)
SELECT JSON_VALUE(funder.shortName) AS funderId, JSON_VALUE(funder.name) AS funderName, type, COUNT(DISTINCT productId) AS nProduct
FROM enriched_products,
	UNNEST(JSON_QUERY_ARRAY(fundings)) AS funder
GROUP BY funderId, funderName, type
ORDER BY funderName ASC, nProduct DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec844f9510>

## Language diversity in research products across funders

In [49]:
# sql_engine: bigquery
# output_variable: df
# For every funder, counts the distinct language codes and the distinct
# research products among the products of the projects it funds.
#   * `products` unions the four product tables, keeping only the columns the
#     query needs (id, languageCode).
#   * `enriched_products` pairs each product with the `fundings` of a project
#     that is linked to it by a 'produces' relation (project = relation source,
#     product = relation target).
#   * The final SELECT expands `fundings` and groups by the funder's
#     shortName / name.
# Funders with the most products come first.
# NOTE(review): languageCode "und" is counted as a language here, whereas the
# plain language breakdowns in this notebook filter it out — confirm intended.
# start _sql
_sql = """
WITH products AS (
  SELECT id, languageCode FROM {dataset}.publications
  UNION ALL
  SELECT id, languageCode FROM {dataset}.others
  UNION ALL
  SELECT id, languageCode FROM {dataset}.datasets
  UNION ALL
  SELECT id, languageCode FROM {dataset}.software
),
enriched_products AS (
	SELECT r.target AS productId, pr.fundings, p.languageCode
	FROM {dataset}.relations r
	JOIN {dataset}.projects pr ON r.source = pr.id
	JOIN products p ON r.target = p.id
	WHERE r.relationName = 'produces'
)
SELECT JSON_VALUE(funding.shortName) AS funderId, JSON_VALUE(funding.name) AS funderName, count(distinct languageCode) AS languageCount, count(distinct productId) AS productsCount
FROM enriched_products,
	UNNEST(JSON_QUERY_ARRAY(fundings)) AS funding
GROUP BY funderId, funderName
ORDER BY productsCount DESC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec84513df0>

## Number of research products per Sustainable Development Goals across Funders

In [51]:
# sql_engine: bigquery
# output_variable: df
# Counts distinct research products per (funder, SDG subject value).
# `enriched_products` pairs each product (relation target) with the `fundings`
# of a project (relation source) linked to it by a 'produces' relation; the
# final SELECT expands both the `fundings` and the `subjects` arrays and keeps
# the subject entries whose scheme is "SDG". Rows are sorted by funderId,
# funderName and SDG.
# start _sql
_sql = """
WITH products as (
  SELECT id, type, subjects FROM {dataset}.publications
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.others
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.datasets
  UNION ALL
  SELECT id, type, subjects FROM {dataset}.software
),
enriched_products AS (
	SELECT r.source AS projectId, r.target AS productId, r.relationName, pr.fundings, p.subjects
	FROM {dataset}.relations r
	JOIN {dataset}.projects pr ON r.source = pr.id
	JOIN products p ON r.target = p.id
		WHERE r.relationName = 'produces'
)
SELECT JSON_VALUE(funding, "$.shortName") AS funderId, JSON_VALUE(funding, "$.name") AS funderName, JSON_VALUE(subject.subject.value) AS SDG, count(distinct productId) AS n_products
FROM enriched_products,
	UNNEST(JSON_QUERY_ARRAY(fundings)) AS funding,
	UNNEST(JSON_QUERY_ARRAY(subjects)) AS subject

	WHERE JSON_VALUE(subject.subject.scheme) = "SDG"
GROUP BY funderId, funderName, SDG
ORDER BY funderId, funderName, SDG ASC
""" # end _sql
from google.colab.sql import bigquery as _bqsqlcell
# Run the query; the result is bound to `df` and displayed as the cell output.
df = _bqsqlcell.run(_sql)
df

<bigframes.display.anywidget.TableWidget object at 0x7bec8457a6b0>