In [1]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd
import random

In [2]:
# Load the records stored in the first batch file; they seed the list of
# referenced works built in the following cell.
with open('clean_data/papers/batch_1.json', 'r', encoding='utf-8') as f:
    existing_data = json.load(f)
# Show how many records were loaded.
len(existing_data)

122

In [3]:
# Ids of the papers that are already stored in existing_data.
existing_papers_id = [rec.get('paper').get('id') for rec in existing_data]

# Every work referenced by the stored papers. A missing or null
# 'referencedWorks' entry is treated as "no references" instead of raising.
referenced_ids = {
    work_id
    for rec in existing_data
    for work_id in (rec.get('referencedWorks') or [])
}

# Drop the works we already have and sort the remainder. Iteration order of a
# set of strings differs between interpreter sessions, so a plain
# list(set(...)) would yield a different order after every kernel restart and
# any position-based slicing of papers_id would no longer be reproducible.
papers_id = sorted(referenced_ids - set(existing_papers_id))
len(papers_id)

6570

In [5]:
async def get_data(url, parse_func):
    """Fetch ``url`` as JSON and run ``parse_func`` on the decoded payload.

    A dedicated ``httpx.AsyncClient`` with a 10 second timeout is opened for
    the request, and the ``mailto`` query parameter is sent along with it.

    Args:
        url: Address to request with HTTP GET.
        parse_func: Callable applied to the decoded JSON body.

    Returns:
        ``(True, parsed_data, None)`` when the request and the parsing both
        succeed, otherwise ``(False, error_message, url)``. Exceptions raised
        by ``parse_func`` are reported through the "unexpected error" message
        rather than propagated.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            response = await client.get(url, params={'mailto': 'rimaz.temp@gmail.com'})
            response.raise_for_status()
            return True, parse_func(response.json()), None
        except httpx.HTTPStatusError as exc:
            # Raised by raise_for_status().
            message = f"HTTP error: {exc}"
        except httpx.RequestError as exc:
            message = f"A request error occurred: {exc}"
        except Exception as exc:
            # Anything else, including JSON decoding and parse_func failures.
            message = f"An unexpected error occurred: {exc}"
        return False, message, url


In [6]:
async def get_batch_async(ids_list: list[str], parse_func, batch_size: int = 4, sleep_sec: float = 1):
    """Download ``ids_list`` in concurrent groups of ``batch_size`` requests.

    Each entry of ``ids_list`` is passed to ``get_data`` as the URL to fetch.
    The requests of one group run concurrently; the function then pauses for
    ``sleep_sec`` seconds before starting the next group (no pause after the
    last one).

    Args:
        ids_list: URLs to request.
        parse_func: Callable forwarded to ``get_data`` to parse each payload.
        batch_size: Number of requests issued concurrently per group.
        sleep_sec: Pause between two consecutive groups, in seconds.

    Returns:
        ``(results, not_valid_url)``: the parsed payloads of the successful
        requests, and a list of ``(url, error_message)`` pairs for the failed
        ones.
    """
    fetched = []
    failures = []
    total = len(ids_list)

    for start in range(0, total, batch_size):
        stop = start + batch_size
        pending = (get_data(target, parse_func) for target in ids_list[start:stop])

        for outcome in await asyncio.gather(*pending):
            # outcome is (ok, payload_or_message, url_or_None)
            if outcome[0]:
                fetched.append(outcome[1])
            else:
                failures.append((outcome[2], outcome[1]))

        # Only wait when another group is still to come.
        if stop < total:
            await asyncio.sleep(sleep_sec)

    return fetched, failures

In [7]:
# Minimum score a concept / topic must exceed to be kept in the parsed record.
CONCEPT_SIMILARITY_THRESH = 0.5
TOPIC_SIMILARITY_THRESH = 0.8


def _ids_above_threshold(items, threshold):
    """Return the ``id`` of every item whose ``score`` is strictly above ``threshold``.

    ``items`` may be ``None``; items without a numeric score are skipped.
    """
    selected = []
    for item in items or []:
        score = item.get('score', None)
        if score is not None and score > threshold:
            selected.append(item.get('id', None))
    return selected


def parse_paper_data(rec):
    """Reduce a raw work record (a dict) to the fields kept in the batch files.

    Nested objects and lists that are missing *or explicitly null* in ``rec``
    (``primary_location``, its ``source``, ``biblio``, ``authorships``,
    ``author``, ``institutions``, ``grants``, ``concepts``, ``keywords``,
    ``topics``, ``referenced_works``, ``related_works``) are treated as empty
    instead of raising, so one incomplete record does not fail the download.

    Args:
        rec: Decoded JSON object describing one work.

    Returns:
        A dict with the keys ``paper``, ``authorship``, ``source``,
        ``referencedWorks``, ``relatedWorks``, ``grants``, ``concepts``,
        ``keywords`` and ``topics``. Concepts and topics are reduced to the
        ids whose score exceeds ``CONCEPT_SIMILARITY_THRESH`` /
        ``TOPIC_SIMILARITY_THRESH`` respectively.
    """
    # dict.get(key, {}) only falls back when the key is absent; a key that is
    # present with a null value yields None, hence the explicit `or {}`.
    primary_location = rec.get('primary_location') or {}
    source = primary_location.get('source') or {}
    biblio = rec.get('biblio') or {}

    return {
        'paper': {
            'id': rec.get('id', None),
            'doi': rec.get('doi', None),
            'title': rec.get('title', None),
            'publicationYear': rec.get('publication_year', None),
            'language': rec.get('language', None),
            'type': rec.get('type', None),
            'crossrefType': rec.get('type_crossref', None),
            # Key spelling kept as-is so new files match the ones already written.
            'numberOfCiteation': rec.get('cited_by_count', None),
            'FieldWeightedCitationImpact': rec.get('fwci', None),
            'countsByYear': rec.get('counts_by_year', {}),
            'pdfUrl': primary_location.get('pdf_url', None),
        },

        'authorship': [
            {
                'authorPosition': author.get('author_position', None),
                'author': {
                    'id': (author.get('author') or {}).get('id', None),
                    'institutions': [
                        institution.get('id', None)
                        for institution in author.get('institutions') or []
                    ],
                    'isCorresponding': author.get('is_corresponding', None),
                    'affiliations': author.get('raw_affiliation_strings', []),
                }
            } for author in rec.get('authorships') or []
        ],

        'source': {
            'id': source.get('id', None),
            'hostOrganizationId': source.get('host_organization', None),
            'volume': biblio.get('volume', None),
            'issue': biblio.get('issue', None),
        },

        'referencedWorks': rec.get('referenced_works') or [],
        'relatedWorks': rec.get('related_works') or [],

        'grants': [
            {
                'funder': grant.get('funder', None),
                'awardId': grant.get('award_id', None),
            } for grant in rec.get('grants') or []
        ],

        'concepts': _ids_above_threshold(rec.get('concepts'), CONCEPT_SIMILARITY_THRESH),
        'keywords': [keyword.get('display_name', None) for keyword in rec.get('keywords') or []],

        'topics': _ids_above_threshold(rec.get('topics'), TOPIC_SIMILARITY_THRESH),

    }

In [None]:
# random.shuffle(papers_id)
# N_PAPERS = len(papers_id)
BATCH_SIZE = 160
j = 4

# not_valid_url = []

for i in range(320, N_PAPERS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in papers_id[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)
    
    with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1
    

In [10]:
# Number of (url, error message) pairs recorded for requests that failed.
len(not_valid_url)

21

In [21]:
# [rec[1] for rec in not_valid_url]

In [18]:
selected_ids = [url for url, _ in not_valid_url]
selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)

with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
    json.dump(selected_results, f, indent=4, ensure_ascii=False)

In [20]:
# temp_not_valid_url

In [22]:
# Read back every file in the papers directory and gather the id of each
# stored record (None when a record has no 'paper' / 'id' entry).
dir_paper_id = 'clean_data/papers'
ids = []
for fname in os.listdir(dir_paper_id):
    with open(os.path.join(dir_paper_id, fname), mode='r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend(r.get('paper', {}).get('id') for r in temp)

In [25]:
# Sanity check: ids read from disk, distinct ids among them, and the number
# of referenced-work ids in papers_id.
len(ids), len(set(ids)), len(papers_id)

(6671, 6650, 6570)

In [75]:
# Spot check: pretty-print the parsed 'grants' entry of one record from the
# most recent selected_results list.
index = 6
print(json.dumps(selected_results[index].get('grants', {}), indent=4))

[
    {
        "funder": "https://openalex.org/F4320320709",
        "awardId": null
    },
    {
        "funder": "https://openalex.org/F4320320766",
        "awardId": null
    }
]


In [89]:
# Fetch a single work synchronously, sending the `mailto` parameter, so the
# response can be inspected. The timeout keeps a stalled connection from
# blocking the kernel indefinitely (requests has no default timeout); 10 s
# matches the timeout used by the async client in get_data.
resp = requests.get(
    'https://api.openalex.org/W2951730755',
    params={'mailto': 'rimaz.temp@gmail.com'},
    timeout=10.0,
)
resp.status_code

200

In [90]:
# Inspect the response headers of the request sent with `mailto`
# (they include the RateLimit-* fields).
resp.headers

{'Date': 'Mon, 27 Oct 2025 16:21:26 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'CF-Ray': '995386c63b0b2ad8-LAX', 'CF-Cache-Status': 'DYNAMIC', 'Access-Control-Allow-Origin': '*', 'Vary': 'accept-encoding', 'Server': 'cloudflare', 'Via': '1.1 heroku-router', 'Access-Control-Allow-Headers': 'Accept, Accept-Language, Accept-Encoding, Authorization, Content-Type', 'Access-Control-Allow-Methods': 'GET, HEAD, POST, OPTIONS', 'access-control-expose-headers': 'Cache-Control, RateLimit-Limit, RateLimit-Remaining, RateLimit-Reset, Retry-After', 'Nel': '{"report_to":"heroku-nel","response_headers":["Via"],"max_age":3600,"success_fraction":0.01,"failure_fraction":0.1}', 'RateLimit-Limit': '5', 'RateLimit-Remaining': '9', 'RateLimit-Reset': '1', 'Report-To': '{"group":"heroku-nel","endpoints":[{"url":"https://nel.heroku.com/reports?s=qhZY9C8%2F4qcnXFWK2LIUM%2F%2FY7tQE8Fcun%2Ffxalps7zY%3D\\u0026sid=c46efe9b-d3d2-4a0c-8c76-bfafa16c5add\\u0026

In [91]:
# Same request without the `mailto` parameter, for comparison. The timeout
# keeps a stalled connection from blocking the kernel indefinitely
# (requests has no default timeout).
resp = requests.get('https://api.openalex.org/W2951730755', timeout=10.0)
resp.status_code

200

In [92]:
# Inspect the response headers of the request sent without `mailto`
# (they include the RateLimit-* fields).
resp.headers

{'Date': 'Mon, 27 Oct 2025 16:21:56 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'CF-Ray': '995387804841f9ce-LAX', 'CF-Cache-Status': 'DYNAMIC', 'Access-Control-Allow-Origin': '*', 'Vary': 'accept-encoding', 'Server': 'cloudflare', 'Via': '1.1 heroku-router', 'Access-Control-Allow-Headers': 'Accept, Accept-Language, Accept-Encoding, Authorization, Content-Type', 'Access-Control-Allow-Methods': 'GET, HEAD, POST, OPTIONS', 'access-control-expose-headers': 'Cache-Control, RateLimit-Limit, RateLimit-Remaining, RateLimit-Reset, Retry-After', 'Nel': '{"report_to":"heroku-nel","response_headers":["Via"],"max_age":3600,"success_fraction":0.01,"failure_fraction":0.1}', 'RateLimit-Limit': '5', 'RateLimit-Remaining': '9', 'RateLimit-Reset': '1', 'Report-To': '{"group":"heroku-nel","endpoints":[{"url":"https://nel.heroku.com/reports?s=ofYoILYxuSDChS5KjW7ErWy9Ji2dmcg8RZcMZN%2FWJFo%3D\\u0026sid=c46efe9b-d3d2-4a0c-8c76-bfafa16c5add\\u0026ts=176