In [1]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd
import random

In [2]:
# Load the first saved batch of parsed paper records from disk.
# Later cells read each record's 'paper' and 'referencedWorks' entries.
with open('clean_data/papers/batch_1.json', 'r', encoding='utf-8') as f:
    existing_data = json.load(f)
# Display how many records the batch holds.
len(existing_data)

122

In [3]:
# IDs of the papers we already have on disk (kept as a list; a later cell uses its length).
# `or {}` tolerates a record whose 'paper' entry is missing or null.
existing_papers_id = [(rec.get('paper') or {}).get('id') for rec in existing_data]

# Collect every work referenced by the existing papers, skipping records that
# have no 'referencedWorks' and ignoring empty entries.
referenced_ids = set()
for rec in existing_data:
    referenced_works = rec.get('referencedWorks') or []
    referenced_ids.update(work_id for work_id in referenced_works if work_id)

# Keep only the works not downloaded yet.  Sorting gives a stable order across
# kernel restarts: plain list(set(...)) of strings is ordered differently in
# every Python process, which silently breaks any loop that resumes from a
# positional offset into this list.
papers_id = sorted(referenced_ids - set(existing_papers_id))
len(papers_id)

6570

In [5]:
async def get_data(url, parse_func, client=None, mailto='rimaz.temp@gmail.com', timeout=10.0):
    """Fetch one JSON document and run it through ``parse_func``.

    Args:
        url: Full URL to GET.
        parse_func: Callable receiving the decoded JSON body and returning the
            parsed record.
        client: Optional already-open ``httpx.AsyncClient`` (or any object with
            a compatible awaitable ``get(url, params=...)``).  Passing one lets
            many calls share a single connection pool; when omitted, a
            short-lived client is created for this call only.
        mailto: Contact address sent as the ``mailto`` query parameter.  Pass a
            falsy value to send no query parameters.
        timeout: Timeout in seconds, used only when this function creates its
            own client.

    Returns:
        ``(True, parsed_data, None)`` on success, or
        ``(False, error_message, url)`` on any failure.  Exceptions raised by
        ``parse_func`` are reported as failures too, not propagated.
    """
    if client is None:
        # No shared client supplied: open one just for this request and reuse
        # the code path below so both modes behave identically.
        async with httpx.AsyncClient(timeout=timeout) as own_client:
            return await get_data(url, parse_func, client=own_client, mailto=mailto, timeout=timeout)

    params = {'mailto': mailto} if mailto else None
    try:
        response = await client.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        parsed_data = parse_func(data)
        return True, parsed_data, None
    except httpx.HTTPStatusError as e:
        return False, f"HTTP error: {e}", url
    except httpx.RequestError as e:
        return False, f"A request error occurred: {e}", url
    except Exception as e:
        # Deliberate best-effort: a bad payload or parser bug must not abort
        # the whole batch; the caller gets the message and the failing URL.
        return False, f"An unexpected error occurred: {e}", url


In [6]:
async def get_batch_async(ids_list: list[str], parse_func, batch_size: int = 4, sleep_sec: float = 1):
    """Fetch many URLs in small concurrent groups, pausing between groups.

    Args:
        ids_list: URLs to fetch; each one is passed to ``get_data``.
        parse_func: Parser forwarded to ``get_data`` for every response.
        batch_size: Number of requests awaited concurrently per group.
        sleep_sec: Seconds to sleep after each group except the last one.

    Returns:
        ``(results, not_valid_url)``: the parsed payloads of the successful
        requests, and a list of ``(url, error_message)`` pairs for the failed
        ones.
    """
    parsed_records = []
    failures = []
    total = len(ids_list)

    for start in range(0, total, batch_size):
        stop = start + batch_size
        outcomes = await asyncio.gather(
            *(get_data(target, parse_func) for target in ids_list[start:stop])
        )

        # get_data yields (ok, payload_or_message, url_or_None).
        for ok, payload, failed_url in outcomes:
            if ok:
                parsed_records.append(payload)
            else:
                failures.append((failed_url, payload))

        # No pause is needed once the final group has been processed.
        if stop < total:
            await asyncio.sleep(sleep_sec)

    return parsed_records, failures

In [7]:
# Minimum score a concept / topic must exceed to be kept for a paper.
CONCEPT_SIMILARITY_THRESH = 0.5
TOPIC_SIMILARITY_THRESH = 0.8


def _ids_scoring_above(items, threshold):
    """Return the ``id`` of every item whose ``score`` is strictly above ``threshold``.

    Items without a score (missing or ``None``) are skipped instead of raising
    on the ``None > float`` comparison; ``items`` itself may be ``None``.
    """
    return [
        item.get('id')
        for item in items or []
        if item.get('score') is not None and item.get('score') > threshold
    ]


def parse_paper_data(rec):
    """Flatten one raw work record into the structure stored in the batch files.

    Args:
        rec: Decoded JSON dict for a single work, as returned by the API.

    Returns:
        A dict with the keys ``paper``, ``authorship``, ``source``,
        ``referencedWorks``, ``relatedWorks``, ``grants``, ``concepts``,
        ``keywords`` and ``topics``.  Absent scalar fields become ``None`` and
        absent collections become empty lists.

    Nested objects that are present but ``null`` (``primary_location``, its
    ``source``, ``biblio``, an authorship's ``author`` ...) are treated like
    missing ones, so a sparse record is parsed rather than rejected.
    """
    # dict.get(key, {}) only helps when the key is absent; an explicit null in
    # the payload still yields None, hence the `or {}` / `or []` fallbacks.
    primary_location = rec.get('primary_location') or {}
    source = primary_location.get('source') or {}
    biblio = rec.get('biblio') or {}

    return {
        'paper': {
            'id': rec.get('id'),
            'doi': rec.get('doi'),
            'title': rec.get('title'),
            'publicationYear': rec.get('publication_year'),
            'language': rec.get('language'),
            'type': rec.get('type'),
            'crossrefType': rec.get('type_crossref'),
            # Key spelling kept as-is: already-written batch files use it.
            'numberOfCiteation': rec.get('cited_by_count'),
            'FieldWeightedCitationImpact': rec.get('fwci'),
            'countsByYear': rec.get('counts_by_year', {}),
            'pdfUrl': primary_location.get('pdf_url'),
        },

        'authorship': [
            {
                'authorPosition': authorship.get('author_position'),
                'author': {
                    'id': (authorship.get('author') or {}).get('id'),
                    'institutions': [
                        institution.get('id')
                        for institution in authorship.get('institutions') or []
                    ],
                    'isCorresponding': authorship.get('is_corresponding'),
                    'affiliations': authorship.get('raw_affiliation_strings', []),
                },
            }
            for authorship in rec.get('authorships') or []
        ],

        'source': {
            'id': source.get('id'),
            'hostOrganizationId': source.get('host_organization'),
            'volume': biblio.get('volume'),
            'issue': biblio.get('issue'),
        },

        'referencedWorks': rec.get('referenced_works', []),
        'relatedWorks': rec.get('related_works', []),

        'grants': [
            {
                'funder': grant.get('funder'),
                'awardId': grant.get('award_id'),
            }
            for grant in rec.get('grants') or []
        ],

        'concepts': _ids_scoring_above(rec.get('concepts'), CONCEPT_SIMILARITY_THRESH),
        'keywords': [keyword.get('display_name') for keyword in rec.get('keywords') or []],

        'topics': _ids_scoring_above(rec.get('topics'), TOPIC_SIMILARITY_THRESH),

    }

In [None]:
# random.shuffle(papers_id)
# N_PAPERS = len(papers_id)
BATCH_SIZE = 160
j = 4

# not_valid_url = []

for i in range(320, N_PAPERS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in papers_id[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)
    
    with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1
    

In [10]:
# Number of (url, error_message) pairs collected for requests that failed.
len(not_valid_url)

21

In [21]:
# [rec[1] for rec in not_valid_url]

In [18]:
selected_ids = [url for url, _ in not_valid_url]
selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)

with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
    json.dump(selected_results, f, indent=4, ensure_ascii=False)

In [22]:
# Gather the paper ID of every record stored in the batch files.
ids = []
dir_paper_id = 'clean_data/papers'
# sorted(): os.listdir returns entries in arbitrary order, and `temp` (the last
# file loaded) is inspected by a later cell, so the order must be stable.
for fname in sorted(os.listdir(dir_paper_id)):
    # Skip anything that is not a batch file (e.g. checkpoint folders or
    # editor leftovers), which would otherwise make json.load fail.
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('paper', {}).get('id') for r in temp])

In [28]:
# Sanity check: (records on disk, distinct paper IDs on disk, IDs we expected to have).
# A gap between the first two means duplicate records; a gap to the third means
# some referenced papers were never stored.
len(ids), len(set(ids)), len(papers_id)+len(existing_papers_id)

(6671, 6650, 6692)

In [33]:
# Author IDs of the first record in `temp` (the last batch file loaded above),
# with authors that have no ID dropped.
list(filter(None, (entry.get('author', {}).get('id', None) for entry in temp[0].get('authorship'))))

['https://openalex.org/A5072811055',
 'https://openalex.org/A5080039924',
 'https://openalex.org/A5091175785',
 'https://openalex.org/A5003932703',
 'https://openalex.org/A5102861496',
 'https://openalex.org/A5037751863',
 'https://openalex.org/A5086198262',
 'https://openalex.org/A5112608251']

In [40]:
# Gather the author ID of every authorship entry in every stored paper
# (duplicates included; they are removed in a later step).
authors_ids = []
dir_paper_id = 'clean_data/papers'
# sorted() keeps the traversal order, and therefore the list order, stable.
for fname in sorted(os.listdir(dir_paper_id)):
    # Only batch files are JSON; anything else in the folder is ignored.
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        # `or []` / `or {}` tolerate records whose authorship list or author
        # object is missing or null.
        for a in rec.get('authorship') or []:
            author_id = (a.get('author') or {}).get('id')
            if author_id:
                authors_ids.append(author_id)

len(authors_ids)

28832

In [41]:
# Drop duplicate author IDs. Going through a set does not preserve the original order.
authors_ids = list(set(authors_ids))
# Display the number of distinct authors.
len(authors_ids)

17901

In [42]:
# Shuffle the author IDs in place. No seed is set in this notebook, so the
# resulting order differs on every run.
random.shuffle(authors_ids)

In [43]:
# Peek at one author ID to see its format (a full https://openalex.org/... URL).
authors_ids[0]

'https://openalex.org/A5032989636'

In [44]:
# Fetch one author record from the API to explore its structure.
# An explicit timeout keeps the cell from hanging forever if the server
# never answers (requests has no default timeout).
resp = requests.get('https://api.openalex.org/A5032989636', timeout=10)
resp.status_code

200

In [46]:
# Decode the author response body and list its top-level fields.
data = resp.json()
data.keys()

dict_keys(['id', 'orcid', 'display_name', 'display_name_alternatives', 'works_count', 'cited_by_count', 'summary_stats', 'ids', 'affiliations', 'last_known_institutions', 'topics', 'topic_share', 'x_concepts', 'counts_by_year', 'works_api_url', 'updated_date', 'created_date'])

In [48]:
# Identity fields of the author: OpenAlex ID, ORCID (may be None) and display name.
tuple(data.get(field) for field in ('id', 'orcid', 'display_name'))

('https://openalex.org/A5032989636', None, 'Minzhou Luo')

In [49]:
# Alternative spellings of the author's name (a list of strings).
data.get('display_name_alternatives')

['Minzhou Luo',
 'M. Luo',
 'Mengjuan Luo',
 'Min‐Zhou Luo',
 'Luo Min‐zhou',
 'Luo Minzhou',
 'Mina Luo']

In [50]:
# Totals for the author: number of works and number of citations received.
tuple(data.get(field) for field in ('works_count', 'cited_by_count'))

(136, 1486)

In [51]:
# Aggregate citation metrics: a dict with '2yr_mean_citedness', 'h_index' and 'i10_index'.
data.get('summary_stats')

{'2yr_mean_citedness': 7.666666666666667, 'h_index': 20, 'i10_index': 36}

In [53]:
# Identifier map for the author; for this author it only holds the 'openalex' entry.
data.get('ids')

{'openalex': 'https://openalex.org/A5032989636'}

In [54]:
# Affiliation history: a list of {'institution': {...}, 'years': [...]} entries.
data.get('affiliations')

[{'institution': {'id': 'https://openalex.org/I163340411',
   'ror': 'https://ror.org/01wd4xt90',
   'display_name': 'Hohai University',
   'country_code': 'CN',
   'type': 'funder',
   'lineage': ['https://openalex.org/I163340411']},
  'years': [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016]},
 {'institution': {'id': 'https://openalex.org/I4210148486',
   'ror': 'https://ror.org/044a9d018',
   'display_name': 'Jiangsu Industry Technology Research Institute',
   'country_code': 'CN',
   'type': 'facility',
   'lineage': ['https://openalex.org/I4210148486']},
  'years': [2023, 2022, 2020, 2018, 2017, 2016]},
 {'institution': {'id': 'https://openalex.org/I4210117754',
   'ror': 'https://ror.org/023zynq23',
   'display_name': 'Chinese Academy of Civil Aviation Science and Technology',
   'country_code': 'CN',
   'type': 'nonprofit',
   'lineage': ['https://openalex.org/I4210117754']},
  'years': [2023]},
 {'institution': {'id': 'https://openalex.org/I126520041',
   'ror': 'ht

In [55]:
# Last known institution(s): a list of institution dicts (id, ror, display_name, country_code, type, lineage).
data.get('last_known_institutions')

[{'id': 'https://openalex.org/I163340411',
  'ror': 'https://ror.org/01wd4xt90',
  'display_name': 'Hohai University',
  'country_code': 'CN',
  'type': 'funder',
  'lineage': ['https://openalex.org/I163340411']}]

In [56]:
# Topics of the author's works: each entry has id, display_name, a 'count',
# and its subfield / field / domain.
data.get('topics')

[{'id': 'https://openalex.org/T10879',
  'display_name': 'Robotic Locomotion and Control',
  'count': 27,
  'subfield': {'id': 'https://openalex.org/subfields/2204',
   'display_name': 'Biomedical Engineering'},
  'field': {'id': 'https://openalex.org/fields/22',
   'display_name': 'Engineering'},
  'domain': {'id': 'https://openalex.org/domains/3',
   'display_name': 'Physical Sciences'}},
 {'id': 'https://openalex.org/T10653',
  'display_name': 'Robot Manipulation and Learning',
  'count': 22,
  'subfield': {'id': 'https://openalex.org/subfields/2207',
   'display_name': 'Control and Systems Engineering'},
  'field': {'id': 'https://openalex.org/fields/22',
   'display_name': 'Engineering'},
  'domain': {'id': 'https://openalex.org/domains/3',
   'display_name': 'Physical Sciences'}},
 {'id': 'https://openalex.org/T11023',
  'display_name': 'Prosthetics and Rehabilitation Robotics',
  'count': 22,
  'subfield': {'id': 'https://openalex.org/subfields/2204',
   'display_name': 'Biomedi

In [57]:
# Same topic structure as 'topics', but each entry carries a fractional 'value' instead of a 'count'.
data.get('topic_share')

[{'id': 'https://openalex.org/T10879',
  'display_name': 'Robotic Locomotion and Control',
  'value': 0.0003709,
  'subfield': {'id': 'https://openalex.org/subfields/2204',
   'display_name': 'Biomedical Engineering'},
  'field': {'id': 'https://openalex.org/fields/22',
   'display_name': 'Engineering'},
  'domain': {'id': 'https://openalex.org/domains/3',
   'display_name': 'Physical Sciences'}},
 {'id': 'https://openalex.org/T11023',
  'display_name': 'Prosthetics and Rehabilitation Robotics',
  'value': 0.0002933,
  'subfield': {'id': 'https://openalex.org/subfields/2204',
   'display_name': 'Biomedical Engineering'},
  'field': {'id': 'https://openalex.org/fields/22',
   'display_name': 'Engineering'},
  'domain': {'id': 'https://openalex.org/domains/3',
   'display_name': 'Physical Sciences'}},
 {'id': 'https://openalex.org/T10868',
  'display_name': 'Soft Robotics and Applications',
  'value': 0.0001978,
  'subfield': {'id': 'https://openalex.org/subfields/2204',
   'display_name

In [58]:
# Concepts associated with the author: id, wikidata link, display_name, level and score.
# NOTE(review): the scores shown here reach values such as 99.3, a far larger scale than the
# 0.5 / 0.8 thresholds used for papers - do not reuse those thresholds without checking.
data.get('x_concepts')

[{'id': 'https://openalex.org/C41008148',
  'wikidata': 'https://www.wikidata.org/wiki/Q21198',
  'display_name': 'Computer science',
  'level': 0,
  'score': 99.3},
 {'id': 'https://openalex.org/C154945302',
  'wikidata': 'https://www.wikidata.org/wiki/Q11660',
  'display_name': 'Artificial intelligence',
  'level': 1,
  'score': 86.0},
 {'id': 'https://openalex.org/C127413603',
  'wikidata': 'https://www.wikidata.org/wiki/Q11023',
  'display_name': 'Engineering',
  'level': 0,
  'score': 78.7},
 {'id': 'https://openalex.org/C121332964',
  'wikidata': 'https://www.wikidata.org/wiki/Q413',
  'display_name': 'Physics',
  'level': 0,
  'score': 75.7},
 {'id': 'https://openalex.org/C62520636',
  'wikidata': 'https://www.wikidata.org/wiki/Q944',
  'display_name': 'Quantum mechanics',
  'level': 1,
  'score': 60.3},
 {'id': 'https://openalex.org/C90509273',
  'wikidata': 'https://www.wikidata.org/wiki/Q11012',
  'display_name': 'Robot',
  'level': 2,
  'score': 50.0},
 {'id': 'https://opena

In [59]:
# Per-year activity: a list of dicts with 'year', 'works_count', 'oa_works_count' and 'cited_by_count'.
data.get('counts_by_year')

[{'year': 2025, 'works_count': 3, 'oa_works_count': 1, 'cited_by_count': 320},
 {'year': 2024, 'works_count': 3, 'oa_works_count': 1, 'cited_by_count': 458},
 {'year': 2023, 'works_count': 10, 'oa_works_count': 5, 'cited_by_count': 530},
 {'year': 2022, 'works_count': 5, 'oa_works_count': 2, 'cited_by_count': 354},
 {'year': 2021, 'works_count': 11, 'oa_works_count': 3, 'cited_by_count': 228},
 {'year': 2020, 'works_count': 3, 'oa_works_count': 3, 'cited_by_count': 199},
 {'year': 2019, 'works_count': 13, 'oa_works_count': 5, 'cited_by_count': 155},
 {'year': 2018, 'works_count': 25, 'oa_works_count': 8, 'cited_by_count': 187},
 {'year': 2017, 'works_count': 6, 'oa_works_count': 1, 'cited_by_count': 90},
 {'year': 2016, 'works_count': 20, 'oa_works_count': 3, 'cited_by_count': 64},
 {'year': 2015, 'works_count': 8, 'oa_works_count': 0, 'cited_by_count': 60},
 {'year': 2014, 'works_count': 6, 'oa_works_count': 0, 'cited_by_count': 22},
 {'year': 2013, 'works_count': 4, 'oa_works_count':