In [1]:
import requests

import pandas as pd

In [2]:
# Query OpenAlex for works published in 2024 that are linked to the
# institution with ROR id 04dkp9463.
url = "https://api.openalex.org/works?filter=institutions.ror:04dkp9463,publication_year:2024"

# requests waits indefinitely by default; a timeout keeps the cell from
# hanging forever if the API does not answer.
response = requests.get(url, timeout=30)

In [3]:
# Stop here with a descriptive HTTPError on a 4xx/5xx response. Merely printing
# the status code would leave `data` undefined (or stale from an earlier run),
# and the following cells would fail later in a confusing way.
response.raise_for_status()
data = response.json()

In [4]:
# Inspect the response metadata (total result count, current page, page size).
data["meta"]

{'count': 8005,
 'db_response_time_ms': 185,
 'page': 1,
 'per_page': 25,
 'groups_count': None}

We only get the first 25 of the 8,005 publications (`per_page` is 25)!

To get them all, we need cursor paging: https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging#cursor-paging

In [5]:
# Collect every matching work with OpenAlex cursor paging: start with the
# cursor "*", then keep requesting with the `next_cursor` value of each
# response until the API returns no further cursor.
data = []

url = "https://api.openalex.org/works?filter=institutions.ror:04dkp9463,publication_year:2024"
cursor = "*"

# One session reuses the HTTP connection across all page requests.
with requests.Session() as session:
    while cursor:
        response = session.get(
            url,
            # `params` URL-encodes the cursor; 200 is the largest page size the
            # API accepts, which cuts the number of requests considerably
            # compared with the default of 25.
            params={"per-page": 200, "cursor": cursor},
            timeout=30,
        )
        # Abort on an HTTP error instead of silently leaving `data` truncated,
        # which would make the later exports incomplete without any warning.
        response.raise_for_status()
        page_data = response.json()

        data.extend(page_data["results"])
        cursor = page_data["meta"]["next_cursor"]

In [6]:
# Number of works collected by the paging loop.
len(data)

8005

In [7]:
# Turn the collected works into a table and keep only the fields used below.
selected_columns = [
    "id",
    "title",
    "abstract_inverted_index",
    "concepts",
    "countries_distinct_count",
    "institutions_distinct_count",
    "grants",
    "keywords",
    "open_access",
    "primary_topic",
    "sustainable_development_goals",
]
df = pd.DataFrame(data)[selected_columns]

In [8]:
# single values
def _pick_field(value, key):
    """Return ``value[key]`` for a non-empty dict; pass anything else through.

    Passing non-dict values through keeps this cell safe to re-run (the column
    then already holds plain values) and tolerates missing entries such as
    ``None`` or ``NaN``; a plain truthiness check would try to index ``NaN``
    because ``NaN`` is truthy.
    """
    if isinstance(value, dict) and value:
        return value[key]
    return value


df["open_access"] = df.open_access.apply(lambda oa: _pick_field(oa, "oa_status"))
df["primary_topic"] = df.primary_topic.apply(lambda topic: _pick_field(topic, "display_name"))

In [9]:
# lists: separate by comma
def _join_names(items, key):
    """Join ``entry[key]`` of every entry in ``items`` into one comma-separated string.

    Returns ``None`` for an empty list or a missing value (``None``/``NaN``).
    A string is returned unchanged, so re-running this cell on columns that
    were already converted is a no-op instead of a ``TypeError``.
    """
    if isinstance(items, str):
        return items or None
    if not isinstance(items, (list, tuple)) or not items:
        return None
    return ",".join(entry[key] for entry in items)


# NOTE(review): names that themselves contain a comma become ambiguous after
# joining — confirm this is acceptable for whoever reads the CSV.
_name_key_by_column = {
    "concepts": "display_name",
    "grants": "funder_display_name",
    "keywords": "display_name",
    "sustainable_development_goals": "display_name",
}
for _column, _name_key in _name_key_by_column.items():
    df[_column] = df[_column].apply(_join_names, args=(_name_key,))


In [None]:
# The inverted abstracts are left out of the CSV export.
publications = df.drop(columns=["abstract_inverted_index"])
publications.to_csv("publications2024.csv")

Inverted abstracts: these have to be processed individually (copyright).

For now, save them separately as JSON.

In [11]:
import json

In [None]:
# Pair each work's id with its inverted abstract index and store the list as JSON.
abstracts_inverted = []
for work in data:
    abstracts_inverted.append({"id": work["id"], "abstract": work["abstract_inverted_index"]})

with open("abstracts_inverted.json", "w") as out_file:
    json.dump(abstracts_inverted, out_file)