In [1]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd
import random

In [2]:
# Load the seed batch of papers; every later cell derives its ID lists from it.
with open('clean_data/papers/batch_1.json', 'r', encoding='utf-8') as f:
    existing_data = json.load(f)
# Number of records in the seed batch.
len(existing_data)

122

In [3]:
# IDs of the papers that are already stored in the seed batch.
existing_papers_id = [rec.get('paper').get('id') for rec in existing_data]

# Collect every work referenced by a seed paper.
papers_id = []
for rec in existing_data:
    # `or []` keeps the loop alive when a record stores null instead of a list.
    papers_id.extend(rec.get('referencedWorks') or [])

# Keep only references that are not seed papers themselves.
# sorted(): iteration order of a set of strings differs between interpreter
# sessions (string hash randomisation), while the download cell slices
# `papers_id` by index and is resumed from a fixed offset. A stable order
# makes that index-based resume refer to the same papers after a restart.
papers_id = sorted(set(papers_id) - set(existing_papers_id) - {None})
len(papers_id)

6570

In [2]:
async def get_data(url, parse_func):
    """Fetch one URL and run ``parse_func`` on the decoded JSON body.

    Args:
        url: Address to request with GET.
        parse_func: Callable applied to the decoded JSON payload.

    Returns:
        ``(True, parsed, None)`` when the request and the parsing both succeed,
        otherwise ``(False, message, url)`` where ``message`` describes the
        failure. Nothing is raised to the caller: any exception, including one
        thrown by ``parse_func``, is converted into the failure tuple.
    """
    async with httpx.AsyncClient(timeout=10.0) as client:
        try:
            response = await client.get(url, params={'mailto': 'rimaz.temp@gmail.com'})
            response.raise_for_status()
            return True, parse_func(response.json()), None
        except httpx.HTTPStatusError as err:
            # Raised by raise_for_status() above.
            failure = f"HTTP error: {err}"
        except httpx.RequestError as err:
            failure = f"A request error occurred: {err}"
        except Exception as err:
            # Everything else, e.g. invalid JSON or an error inside parse_func.
            failure = f"An unexpected error occurred: {err}"
        return False, failure, url


In [3]:
async def get_batch_async(ids_list: list[str], parse_func, batch_size: int = 4, sleep_sec: float = 1):
    """Fetch many URLs, ``batch_size`` at a time, pausing between groups.

    Args:
        ids_list: URLs to fetch; each one is handed to ``get_data`` unchanged.
        parse_func: Parser forwarded to ``get_data``.
        batch_size: Number of requests awaited together in one group.
        sleep_sec: Seconds to wait after every group except the last one.

    Returns:
        ``(results, not_valid_url)``: the parsed payloads of the successful
        requests, and ``(url, error_message)`` pairs for the failed ones.
    """
    results, not_valid_url = [], []
    total = len(ids_list)

    for start in range(0, total, batch_size):
        stop = start + batch_size
        outcomes = await asyncio.gather(
            *[get_data(url, parse_func) for url in ids_list[start:stop]]
        )

        # Append all successes first, then all failures, one group at a time.
        for ok, payload, _ in outcomes:
            if ok:
                results.append(payload)
        for ok, message, failed_url in outcomes:
            if not ok:
                not_valid_url.append((failed_url, message))

        # No pause is needed once the final group has been processed.
        if stop < total:
            await asyncio.sleep(sleep_sec)

    return results, not_valid_url

In [7]:
# A concept / topic is kept only when its score is strictly above the threshold.
CONCEPT_SIMILARITY_THRESH = 0.5
TOPIC_SIMILARITY_THRESH = 0.8

def parse_paper_data(rec):
    """Reduce one work record from the API to the fields stored in the paper batches.

    Args:
        rec: Decoded JSON object of a single work.

    Returns:
        dict with the keys ``paper``, ``authorship``, ``source``,
        ``referencedWorks``, ``relatedWorks``, ``grants``, ``concepts``,
        ``keywords`` and ``topics``. Missing or null inputs become ``None``
        (scalars) or empty lists instead of raising.
    """
    # dict.get(key, default) only falls back when the key is ABSENT; a key that
    # is present with a null value still yields None. `or {}` / `or []` covers
    # both cases, so records with e.g. "primary_location": null are parsed
    # instead of failing with "'NoneType' object has no attribute 'get'".
    primary_location = rec.get('primary_location') or {}
    source = primary_location.get('source') or {}
    biblio = rec.get('biblio') or {}

    return {
        'paper': {
            'id': rec.get('id'),
            'doi': rec.get('doi'),
            'title': rec.get('title'),
            'publicationYear': rec.get('publication_year'),
            'language': rec.get('language'),
            'type': rec.get('type'),
            'crossrefType': rec.get('type_crossref'),
            # Key spelling is kept as-is: it is part of the stored schema.
            'numberOfCiteation': rec.get('cited_by_count'),
            'FieldWeightedCitationImpact': rec.get('fwci'),
            # NOTE(review): the default here is a dict while the other parsers
            # in this notebook iterate counts_by_year as a list - confirm.
            'countsByYear': rec.get('counts_by_year', {}),
            'pdfUrl': primary_location.get('pdf_url'),
        },

        'authorship': [
            {
                'authorPosition': authorship.get('author_position'),
                'author': {
                    'id': (authorship.get('author') or {}).get('id'),
                    'institutions': [
                        institution.get('id')
                        for institution in (authorship.get('institutions') or [])
                    ],
                    'isCorresponding': authorship.get('is_corresponding'),
                    'affiliations': authorship.get('raw_affiliation_strings', []),
                },
            }
            for authorship in (rec.get('authorships') or [])
        ],

        'source': {
            'id': source.get('id'),
            'hostOrganizationId': source.get('host_organization'),
            'volume': biblio.get('volume'),
            'issue': biblio.get('issue'),
        },

        'referencedWorks': rec.get('referenced_works') or [],
        'relatedWorks': rec.get('related_works') or [],

        'grants': [
            {
                'funder': grant.get('funder'),
                'awardId': grant.get('award_id'),
            }
            for grant in (rec.get('grants') or [])
        ],

        # A missing / null score counts as 0 and is therefore filtered out
        # rather than raising on the None > float comparison.
        'concepts': [
            concept.get('id')
            for concept in (rec.get('concepts') or [])
            if (concept.get('score') or 0) > CONCEPT_SIMILARITY_THRESH
        ],
        'keywords': [keyword.get('display_name') for keyword in (rec.get('keywords') or [])],

        'topics': [
            topic.get('id')
            for topic in (rec.get('topics') or [])
            if (topic.get('score') or 0) > TOPIC_SIMILARITY_THRESH
        ],
    }

In [None]:
# random.shuffle(papers_id)
# N_PAPERS = len(papers_id)
BATCH_SIZE = 160
j = 4

# not_valid_url = []

for i in range(320, N_PAPERS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in papers_id[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)
    
    with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1
    

In [10]:
len(not_valid_url)

21

In [21]:
# [rec[1] for rec in not_valid_url]

In [18]:
# Second attempt for the requests that failed in the download loop.
# not_valid_url holds (url, error message) pairs; the URLs already point at
# api.openalex.org, so no host rewrite is needed here.
selected_ids = [url for url, _ in not_valid_url]
# NOTE(review): temp_not_valid_url (requests that fail again) is not inspected afterwards.
selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_paper_data)

# `j` is whatever the download loop left behind, i.e. the next unused file number.
with open(f'clean_data/papers/batch_{j}.json', 'w', encoding='utf-8') as f:
    json.dump(selected_results, f, indent=4, ensure_ascii=False)

In [22]:
# Gather the ID of every paper stored on disk to verify the download.
ids = []
dir_paper_id = 'clean_data/papers'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches (skipping e.g. checkpoint folders) and read them in a stable order.
for fname in sorted(os.listdir(dir_paper_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('paper', {}).get('id') for r in temp])

In [28]:
len(ids), len(set(ids)), len(papers_id)+len(existing_papers_id)

(6671, 6650, 6692)

In [33]:
[a.get('author', {}).get('id') for a in temp[0].get('authorship') if a.get('author', {}).get('id', None)]

['https://openalex.org/A5072811055',
 'https://openalex.org/A5080039924',
 'https://openalex.org/A5091175785',
 'https://openalex.org/A5003932703',
 'https://openalex.org/A5102861496',
 'https://openalex.org/A5037751863',
 'https://openalex.org/A5086198262',
 'https://openalex.org/A5112608251']

In [4]:
# Collect the author ID of every authorship entry in the stored papers
# (duplicates included; they are removed in the next cell).
authors_ids = []
dir_paper_id = 'clean_data/papers'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_paper_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        # Authorships without an author ID are skipped.
        authors_ids.extend([a.get('author', {}).get('id') for a in rec.get('authorship') if a.get('author', {}).get('id', None)])

len(authors_ids)

28832

In [5]:
# Drop repeated author IDs.
authors_ids = list(set(authors_ids))
# NOTE(review): no random.seed call is visible in this notebook, so the
# resulting order is presumably different on every run - confirm this is intended.
random.shuffle(authors_ids)
len(authors_ids)

17901

In [None]:
def parse_autor_data(rec):
    """Reduce one author record from the API to the fields stored in the author batches.

    Args:
        rec: Decoded JSON object of a single author.

    Returns:
        dict with identity fields, citation statistics, affiliations, last
        known institutions, topics with a count above 2, and per-year counts.
        Missing or null inputs become ``None`` / empty lists instead of raising.
    """
    # `or {}` / `or []` also covers keys that are present with a null value,
    # which dict.get(key, default) does not.
    summary_stats = rec.get('summary_stats') or {}

    return {
        'id': rec.get('id'),
        'orcid': rec.get('orcid'),
        'name': rec.get('display_name'),
        'nameAlternatives': rec.get('display_name_alternatives', []),

        'worksCount': rec.get('works_count'),
        'numberOfCitation': rec.get('cited_by_count'),

        'twoYearMeanCitedness': summary_stats.get('2yr_mean_citedness'),
        'hIndex': summary_stats.get('h_index'),
        'iTenIndex': summary_stats.get('i10_index'),

        'affiliations': [
            {
                'institution': (affiliation.get('institution') or {}).get('id'),
                'years': affiliation.get('years', []),
            }
            for affiliation in (rec.get('affiliations') or [])
        ],

        'lastKnownInstitutions': [
            institution.get('id')
            for institution in (rec.get('last_known_institutions') or [])
        ],

        # Only topics that occur more than twice are kept; a missing topic
        # list (previously a TypeError) or a null count is treated as "none".
        'topics': [
            {
                'id': topic.get('id'),
                'count': topic.get('count'),
            }
            for topic in (rec.get('topics') or [])
            if (topic.get('count') or 0) > 2
        ],

        'worksApiUrl': rec.get('works_api_url'),

        'counts_by_year': [
            {
                'year': info.get('year'),
                'citedByCount': info.get('cited_by_count'),
                'worksCount': info.get('works_count'),
            }
            for info in (rec.get('counts_by_year') or [])
        ],
    }

In [69]:
authors_ids[1166]

'https://openalex.org/A5060443387'

In [71]:
# One-off synchronous request to try the author parser.
# NOTE(review): the ID in this URL is not the one displayed by the previous
# cell (extra trailing digits) - presumably a deliberate probe of a non-existent
# ID; confirm. Nothing is printed unless the status is 200.
resp = requests.get('https://api.openalex.org/A5060443387455')
if resp.status_code == 200:
    data = resp.json()
    print(json.dumps(parse_autor_data(data), indent=4))

In [73]:
N_AUTORS = len(authors_ids)
BATCH_SIZE = 500
j = 1

not_valid_url = []

for i in range(0, N_AUTORS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in authors_ids[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_autor_data)
    
    with open(f'clean_data/authors/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1


    0:  500 -> 0 
  500: 1000 -> 0 
 1000: 1500 -> 0 
 1500: 2000 -> 0 
 2000: 2500 -> 0 
 2500: 3000 -> 0 
 3000: 3500 -> 0 
 3500: 4000 -> 0 
 4000: 4500 -> 0 
 4500: 5000 -> 0 
 5000: 5500 -> 0 
 5500: 6000 -> 0 
 6000: 6500 -> 0 
 6500: 7000 -> 0 
 7000: 7500 -> 0 
 7500: 8000 -> 0 
 8000: 8500 -> 0 
 8500: 9000 -> 0 
 9000: 9500 -> 0 
 9500:10000 -> 0 
10000:10500 -> 0 
10500:11000 -> 0 
11000:11500 -> 0 
11500:12000 -> 0 
12000:12500 -> 0 
12500:13000 -> 0 
13000:13500 -> 0 
13500:14000 -> 0 
14000:14500 -> 0 
14500:15000 -> 0 
15000:15500 -> 0 
15500:16000 -> 0 
16000:16500 -> 0 
16500:17000 -> 0 
17000:17500 -> 0 
17500:18000 -> 0 


In [74]:
# Gather the ID of every author stored on disk to verify the download.
ids = []
dir_authors_id = 'clean_data/authors'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_authors_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_authors_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('id', None) for r in temp])

In [75]:
len(ids), len(set(ids)), len(authors_ids)

(17901, 17900, 17901)

## Sources (journals)

In [15]:
# Collect the source ID of every stored paper (one entry per paper, may be None).
sources_ids = []
dir_paper_id = 'clean_data/papers'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_paper_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        sources_ids.append(rec.get('source', {}).get('id', None))

len(sources_ids)

6671

In [16]:
# Drop repeated source IDs and randomise the download order.
sources_ids = list(set(sources_ids))
random.shuffle(sources_ids)
# Papers without a source contribute None. list.remove raises ValueError when
# the value is absent, so guard it like the institution / funder cells do.
if None in sources_ids:
    sources_ids.remove(None)
len(sources_ids)

836

In [17]:
def parse_source_data(rec):
    """Reduce one source record from the API to the fields stored in the source batches.

    Args:
        rec: Decoded JSON object of a single source.

    Returns:
        dict with identity, host organisation, citation statistics, open-access
        flags, APC prices, country, societies, topics and per-year counts.
        Missing or null inputs become ``None`` / empty lists instead of raising.
    """
    # `or {}` / `or []` also covers keys that are present with a null value,
    # which dict.get(key, default) does not.
    summary_stats = rec.get('summary_stats') or {}

    return {
        'id': rec.get('id'),
        'name': rec.get('display_name'),

        'hostOrganizationId': rec.get('host_organization'),
        'hostOrganizationName': rec.get('host_organization_name'),

        'worksCount': rec.get('works_count'),
        'citedByCount': rec.get('cited_by_count'),
        'twoYearMeanCitedness': summary_stats.get('2yr_mean_citedness'),
        'hIndex': summary_stats.get('h_index'),
        'iTenIndex': summary_stats.get('i10_index'),

        'isOpenAccess': rec.get('is_oa'),
        'isInDoaj': rec.get('is_in_doaj'),
        'isIndexedInScopus': rec.get('is_indexed_in_scopus'),
        'isCore': rec.get('is_core'),

        'ArticleProcessingCharge': rec.get('apc_prices', []),

        'countryCode': rec.get('country_code'),
        'societies': rec.get('societies', []),

        # A record without a topic list used to raise TypeError here.
        'topics': [
            {
                'id': topic.get('id'),
                'count': topic.get('count'),
            }
            for topic in (rec.get('topics') or [])
        ],

        'counts_by_year': [
            {
                'year': info.get('year'),
                'citedByCount': info.get('cited_by_count'),
                'worksCount': info.get('works_count'),
            }
            for info in (rec.get('counts_by_year') or [])
        ],
    }

In [57]:
sources_ids[1]

'https://openalex.org/S4210227635'

In [59]:
# One-off synchronous request to try the source parser.
# NOTE(review): the ID in this URL is not the one displayed by the previous
# cell (extra trailing digits) - presumably a deliberate probe of a non-existent
# ID; confirm. Nothing is printed unless the status is 200.
response = requests.get('https://api.openalex.org/S421022763555')
if response.status_code == 200:
    # Redundant inside this branch: the status has already been checked to be 200.
    response.raise_for_status()
    data = response.json()
    print(json.dumps(parse_source_data(data), indent=4))

In [18]:
N_SOURCES = len(sources_ids)
BATCH_SIZE = 200
j = 1

not_valid_url = []

for i in range(0, N_SOURCES, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in sources_ids[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_source_data)
    
    with open(f'clean_data/sources/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1

    0:  200 -> 0 
  200:  400 -> 0 
  400:  600 -> 0 
  600:  800 -> 1 
  800: 1000 -> 0 


In [19]:
# Gather the ID of every source stored on disk to verify the download.
ids = []
dir_sources_id = 'clean_data/sources'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_sources_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_sources_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('id', None) for r in temp])

In [20]:
len(ids), len(set(ids)), len(sources_ids)

(835, 835, 836)

## Institutions

In [4]:
# Institution IDs come from two places: the authorships of the stored papers
# and the last known institutions of the stored authors.
institutions_ids = []
dir_paper_id = 'clean_data/papers'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_paper_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        institutions_ids.extend([i for a in rec.get('authorship', []) for i in a.get('author', {}).get('institutions', [])])

# Count after the papers pass.
print(len(institutions_ids))

dir_author_id = 'clean_data/authors'
for fname in sorted(os.listdir(dir_author_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_author_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        institutions_ids.extend(rec.get('lastKnownInstitutions', []))

# Count after adding the authors pass (duplicates still included).
print(len(institutions_ids))

29171
49967


In [5]:
# Drop repeated institution IDs.
institutions_ids = list(set(institutions_ids))
# NOTE(review): no random.seed call is visible in this notebook, so the
# resulting order is presumably different on every run - confirm this is intended.
random.shuffle(institutions_ids)
# A None entry cannot be turned into a request URL; remove it if present.
if None in institutions_ids:
    institutions_ids.remove(None)
len(institutions_ids)

5049

In [6]:
def parse_institution_data(rec):
    """Reduce one institution record from the API to the fields stored in the institution batches.

    Args:
        rec: Decoded JSON object of a single institution.

    Returns:
        dict with identity, English display name, citation statistics, a
        ``geo`` sub-dict, per-year counts and topics. Missing or null inputs
        become ``None`` / empty lists instead of raising.
    """
    # `or {}` / `or []` also covers keys that are present with a null value,
    # which dict.get(key, default) does not.
    international_names = (rec.get('international') or {}).get('display_name') or {}
    summary_stats = rec.get('summary_stats') or {}
    geo = rec.get('geo') or {}

    return {
        'id': rec.get('id'),
        'ror': rec.get('ror'),
        'name': rec.get('display_name'),
        'countryCode': rec.get('country_code'),
        'type': rec.get('type'),
        'internationalName': international_names.get('en'),

        'worksCount': rec.get('works_count'),
        'citedByCount': rec.get('cited_by_count'),

        'twoYearMeanCitedness': summary_stats.get('2yr_mean_citedness'),
        'hIndex': summary_stats.get('h_index'),
        'iTenIndex': summary_stats.get('i10_index'),

        'geo': {
            'city': geo.get('city'),
            'country': geo.get('country'),
            'latitude': geo.get('latitude'),
            'longitude': geo.get('longitude'),
        },

        'countsByYear': [
            {
                'year': info.get('year'),
                'worksCount': info.get('works_count'),
                'citedByCount': info.get('cited_by_count'),
            }
            for info in (rec.get('counts_by_year') or [])
        ],

        'topics': [
            {
                'id': topic.get('id'),
                'count': topic.get('count'),
            }
            for topic in (rec.get('topics') or [])
        ],
    }

In [54]:
institutions_ids[0]

'https://openalex.org/I32377083'

In [56]:
# One-off synchronous request to try the institution parser.
# NOTE(review): the ID in this URL is not the one displayed by the previous
# cell - presumably a deliberate probe of a non-existent ID; confirm.
# Nothing is printed unless the status is 200.
response = requests.get('https://api.openalex.org/I3237708003')
if response.status_code == 200:
    # Redundant inside this branch: the status has already been checked to be 200.
    response.raise_for_status()
    data = response.json()
    print(json.dumps(parse_institution_data(data), indent=4))

In [7]:
N_INSTITUTIONS = len(institutions_ids)
BATCH_SIZE = 500
j = 1

not_valid_url = []

for i in range(0, N_INSTITUTIONS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in institutions_ids[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_institution_data)
    
    with open(f'clean_data/institutions/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1

    0:  500 -> 0 
  500: 1000 -> 0 
 1000: 1500 -> 0 
 1500: 2000 -> 0 
 2000: 2500 -> 0 
 2500: 3000 -> 0 
 3000: 3500 -> 1 
 3500: 4000 -> 0 
 4000: 4500 -> 0 
 4500: 5000 -> 0 
 5000: 5500 -> 0 


In [9]:
# Gather the ID of every institution stored on disk to verify the download.
ids = []
dir_institutions_id = 'clean_data/institutions'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_institutions_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_institutions_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('id', None) for r in temp])

In [10]:
len(ids), len(set(ids)), len(institutions_ids)

(5048, 5048, 5049)

## Funders (from paper grants)

In [4]:
# Collect the funder ID of every grant attached to a stored paper
# (duplicates included; they are removed in the next cell).
grants_ids = []
dir_paper_id = 'clean_data/papers'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_paper_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_paper_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    for rec in temp:
        grants_ids.extend([g.get('funder', None) for g in rec.get('grants', [])])

print(len(grants_ids))

3694


In [5]:
# Drop repeated funder IDs.
grants_ids = list(set(grants_ids))
# NOTE(review): no random.seed call is visible in this notebook, so the
# resulting order is presumably different on every run - confirm this is intended.
random.shuffle(grants_ids)
# A None entry cannot be turned into a request URL; remove it if present.
if None in grants_ids:
    grants_ids.remove(None)
len(grants_ids)

605

In [8]:
def parse_grant_data(rec):
    """Reduce one funder record from the API to the fields stored in the funder batches.

    Args:
        rec: Decoded JSON object of a single funder.

    Returns:
        dict with identity, external IDs (``ror``, ``doi``), description,
        grant / work / citation counts, citation statistics and per-year
        counts. Missing or null inputs become ``None`` / empty lists instead
        of raising.
    """
    # `or {}` / `or []` also covers keys that are present with a null value,
    # which dict.get(key, default) does not.
    external_ids = rec.get('ids') or {}
    summary_stats = rec.get('summary_stats') or {}

    return {
        'id': rec.get('id'),
        'name': rec.get('display_name'),
        'ror': external_ids.get('ror'),
        'doi': external_ids.get('doi'),
        'countryCode': rec.get('country_code'),
        'description': rec.get('description'),

        'grantsCount': rec.get('grants_count'),
        'worksCount': rec.get('works_count'),
        'citedByCount': rec.get('cited_by_count'),

        'twoYearMeanCitedness': summary_stats.get('2yr_mean_citedness'),
        'hIndex': summary_stats.get('h_index'),
        'iTenIndex': summary_stats.get('i10_index'),

        'countsByYear': [
            {
                'year': info.get('year'),
                'worksCount': info.get('works_count'),
                'citedByCount': info.get('cited_by_count'),
            }
            for info in (rec.get('counts_by_year') or [])
        ],
    }

In [9]:
# One-off synchronous request to try the funder parser.
# Nothing is printed unless the status is 200.
response = requests.get('https://api.openalex.org/F43203060760')
if response.status_code == 200:
    # Redundant inside this branch: the status has already been checked to be 200.
    response.raise_for_status()
    data = response.json()
    print(json.dumps(parse_grant_data(data), indent=4))

In [10]:
N_FUNDERS = len(grants_ids)
BATCH_SIZE = 200
j = 1

not_valid_url = []

for i in range(0, N_FUNDERS, BATCH_SIZE):
    selected_ids = [url.replace('openalex.org', 'api.openalex.org', 1) for url in grants_ids[i:i+BATCH_SIZE]]
    selected_results, temp_not_valid_url = await get_batch_async(selected_ids, parse_grant_data)
    
    with open(f'clean_data/funders/batch_{j}.json', 'w', encoding='utf-8') as f:
        json.dump(selected_results, f, indent=4, ensure_ascii=False)
    
    print(f'{i:5}:{i+BATCH_SIZE:5} -> {len(temp_not_valid_url)} ')
    not_valid_url.extend(temp_not_valid_url)

    j += 1

    0:  200 -> 0 
  200:  400 -> 0 
  400:  600 -> 0 
  600:  800 -> 0 


In [13]:
# Gather the ID of every funder stored on disk to verify the download.
ids = []
dir_funders_id = 'clean_data/funders'
# os.listdir returns every directory entry in arbitrary order; restrict it to
# the JSON batches and read them in a stable order.
for fname in sorted(os.listdir(dir_funders_id)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(dir_funders_id, fname), 'r', encoding='utf-8') as f:
        temp = json.load(f)
    ids.extend([r.get('id', None) for r in temp])

In [14]:
len(ids), len(set(ids)), len(grants_ids)

(605, 605, 605)

## topic