In [31]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd

In [3]:
# Load the reference records that the rest of the notebook patches in place.
with open('improved_refrences.json', mode='r', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())
len(data)

122

In [4]:
# Positions (in `data`) of the records that carry a 'referenced_doi' entry.
doi_reference_index_list = [
    position
    for position, record in enumerate(data)
    if 'referenced_doi' in record
]
doi_reference_index_list

[10, 11, 21, 29, 38, 46, 50, 60, 62, 64, 72, 84, 89, 90, 100, 111, 116]

In [20]:
# Current 'referenced_works_count' of every record that has DOI-only references.
[rec.get('referenced_works_count') for rec in (data[i] for i in doi_reference_index_list)]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
# Current 'referenced_works' list of every record that has DOI-only references.
[rec.get('referenced_works') for rec in (data[i] for i in doi_reference_index_list)]

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]

In [7]:
def get_openalex_id_from_doi(doi: str, timeout: float = 20.0):
    """Look up the OpenAlex work ID for a single DOI (synchronous).

    The DOI may be given bare (``10.1234/abc``), with a ``doi:`` prefix, or as
    a ``doi.org`` / ``dx.doi.org`` URL with either scheme. Surrounding
    whitespace and the letter-case of the prefix are ignored. Every form is
    rewritten to ``https://doi.org/<doi>`` before it is placed in the ``doi:``
    filter of the OpenAlex ``/works`` endpoint.

    Args:
        doi: DOI in any of the forms above.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        ``(True, openalex_id)`` when a work with an ``id`` is found, otherwise
        ``(False, message)`` where ``message`` describes why the lookup
        failed. Network and HTTP errors are reported this way, not raised.
    """
    if not isinstance(doi, str):
        return False, f"Invalid DOI (expected a string): {doi!r}"

    # Reduce every accepted spelling to the bare DOI, then rebuild one
    # canonical https URL so all inputs produce the same filter value.
    bare_doi = doi.strip()
    lowered = bare_doi.lower()
    for prefix in (
        'https://doi.org/',
        'http://doi.org/',
        'https://dx.doi.org/',
        'http://dx.doi.org/',
        'doi:',
    ):
        if lowered.startswith(prefix):
            bare_doi = bare_doi[len(prefix):].strip()
            break
    if not bare_doi:
        return False, f"Empty DOI: {doi!r}"
    canonical_doi = 'https://doi.org/' + bare_doi

    base_url = 'https://api.openalex.org/works'
    params = {
        'filter': f'doi:{canonical_doi}',
        'select': 'id',
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()

        # Named `payload` so it does not shadow the notebook-level `data`.
        payload = response.json()

        results = payload.get('results') or []
        if not results:
            return False, f"No work found with DOI: {doi}"

        openalex_id = results[0].get('id')
        if openalex_id:
            return True, openalex_id
        return False, "Work found, but 'id' field was missing in the response."

    except requests.exceptions.HTTPError as e:
        return False, f"HTTP error: {e}"
    except requests.exceptions.RequestException as e:
        return False, f"A request error occurred: {e}"
    except Exception as e:
        return False, f"An unexpected error occurred: {e}"

In [12]:
# Spot-check: last DOI in the 'referenced_doi' list of the first record that has one.
data[doi_reference_index_list[0]].get('referenced_doi')[-1]

'10.3934/mbe.2021172'

In [13]:
# Manual check of the synchronous lookup on one DOI (performs a live request to api.openalex.org).
get_openalex_id_from_doi('10.3934/mbe.2021172')

(True, 'https://openalex.org/W3155736193')

In [15]:
async def _fetch_openalex_id_async(client: "httpx.AsyncClient", doi: str):
    """Fetch the OpenAlex work ID for one DOI using a shared async client.

    The DOI may be bare, ``doi:``-prefixed, or a ``doi.org`` / ``dx.doi.org``
    URL with either scheme. Surrounding whitespace and prefix letter-case are
    ignored. It is rewritten to ``https://doi.org/<doi>`` before being used in
    the ``doi:`` filter of the OpenAlex ``/works`` endpoint.

    Args:
        client: Object whose awaitable ``get(url, params=...)`` performs the
            request (an ``httpx.AsyncClient`` in this notebook).
        doi: DOI in any of the forms above.

    Returns:
        A 3-tuple ``(doi, success, value)``. ``doi`` is the argument exactly
        as received, so callers can match results to inputs. ``value`` is the
        OpenAlex ID when ``success`` is True and an error message otherwise.
        This helper never raises for bad input or request failures, so one
        bad DOI cannot abort an ``asyncio.gather`` over many lookups.
    """
    # Validate before touching string methods: an exception here would escape
    # the try-block below and take the whole gathered batch down with it.
    if not isinstance(doi, str):
        return (doi, False, f"Invalid DOI (expected a string): {doi!r}")

    # Reduce every accepted spelling to the bare DOI, then rebuild one
    # canonical https URL so all inputs produce the same filter value.
    bare_doi = doi.strip()
    lowered = bare_doi.lower()
    for prefix in (
        'https://doi.org/',
        'http://doi.org/',
        'https://dx.doi.org/',
        'http://dx.doi.org/',
        'doi:',
    ):
        if lowered.startswith(prefix):
            bare_doi = bare_doi[len(prefix):].strip()
            break
    if not bare_doi:
        return (doi, False, f"Empty DOI: {doi!r}")
    canonical_doi = 'https://doi.org/' + bare_doi

    base_url = 'https://api.openalex.org/works'
    params = {
        'filter': f'doi:{canonical_doi}',
        'select': 'id',
    }

    try:
        response = await client.get(base_url, params=params)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        payload = response.json()

        results = payload.get('results') or []
        if not results:
            return (doi, False, f"No work found with DOI: {doi}")

        openalex_id = results[0].get('id')
        if openalex_id:
            return (doi, True, openalex_id)
        return (doi, False, "Work found, but 'id' field was missing.")

    except httpx.HTTPStatusError as e:
        return (doi, False, f"HTTP error: {e}")
    except httpx.RequestError as e:
        return (doi, False, f"A request error occurred: {e}")
    except Exception as e:
        return (doi, False, f"An unexpected error occurred: {e}")

In [18]:
async def get_openalex_ids_batch_async(doi_list: list[str], batch_size: int = 25, sleep_sec: float = 0.1):
    """Fetch OpenAlex IDs for many DOIs, running the lookups in concurrent batches.

    The DOIs are split into consecutive chunks of at most ``batch_size``.
    Every chunk is looked up concurrently through one shared
    ``httpx.AsyncClient`` (20 s timeout), and the coroutine sleeps
    ``sleep_sec`` seconds between chunks (not after the last one).

    Args:
        doi_list: DOIs to resolve. Any iterable is accepted; it is copied to
            a list first.
        batch_size: Maximum number of concurrent lookups per chunk; must be
            at least 1.
        sleep_sec: Pause between chunks, in seconds.

    Returns:
        The results of ``_fetch_openalex_id_async`` for every DOI, in input
        order. Empty list for empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive size would either fail inside range() with an unrelated
    # message (0) or silently skip every DOI (negative), so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    dois = list(doi_list)
    if not dois:
        # Nothing to look up: avoid opening an HTTP client for no requests.
        return []

    results = []
    async with httpx.AsyncClient(timeout=20.0) as client:
        for start in range(0, len(dois), batch_size):
            batch_dois = dois[start:start + batch_size]

            # gather() keeps the order of its arguments, so results stay
            # aligned with the input DOIs.
            batch_results = await asyncio.gather(
                *(_fetch_openalex_id_async(client, doi) for doi in batch_dois)
            )
            results.extend(batch_results)

            # Pause only when another batch follows.
            if start + batch_size < len(dois):
                await asyncio.sleep(sleep_sec)

    return results

In [None]:
for index in doi_reference_index_list:
    doi_list = data[index].get('referenced_doi')
    N_doi = data[index].get('referenced_doi_count')
    paper_title = data[index].get('title')

    batch_results = await get_openalex_ids_batch_async(doi_list, batch_size=8, sleep_sec=1)

    data[index]['referenced_works'].extend(batch_results)
    data[index]['referenced_works_count'] = len(batch_results)

    print(f'{paper_title:140} - {len(batch_results:3)}/{N_doi:3}')


A Full-Lifecycle Calibration Method for Camera and LiDAR in Autonomous Driving                                                               - 53/53
A New Literature Review of 3D Object Detection on Autonomous Driving                                                                         - 209/209
A Survey of Trajectory Planning Methods for Autonomous Driving — Part I: Unstructured Scenarios                                              - 89/89
Accurate 3D Multi-Object Detection and Tracking on Vietnamese Street Scenes Based on Sparse Point Cloud Data                                 - 60/60
Application of semantic information module in LiDAR-based Simultaneous-Localization-and-Mapping algorithm                                    - 39/39
Deep Learning Frontiers in 3D Object Detection: A Comprehensive Review for Autonomous Driving                                                - 218/218
Differentiable Integrated Motion Prediction and Planning With Learnable Cost Function for Autonomous D

In [24]:
# Re-display the indices of the records that were processed in the loop above.
doi_reference_index_list

[10, 11, 21, 29, 38, 46, 50, 60, 62, 64, 72, 84, 89, 90, 100, 111, 116]

In [26]:
# Sanity check on record 10: the stored count should equal the length of the stored list.
data[10].get('referenced_works_count'), len(data[10].get('referenced_works'))

(53, 53)

In [27]:
# Persist the patched records. The target is the same file the notebook loaded,
# so write to a temporary sibling first and swap it in only after the dump
# succeeded; a failed or interrupted dump then leaves the original file intact.
with open('improved_refrences.json.tmp', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
os.replace('improved_refrences.json.tmp', 'improved_refrences.json')

In [28]:
# Remove the temporary DOI fields from the processed records.
# pop(..., None) instead of `del` keeps the cell re-runnable: a second run, or
# a record that lacks one of the two keys, no longer raises KeyError.
for index in doi_reference_index_list:
    data[index].pop('referenced_doi', None)
    data[index].pop('referenced_doi_count', None)

In [29]:
# Verify the clean-up: prints the key name once per record that still carries it
# (no output means every temporary field is gone).
for rec in data:
    for leftover_key in ('referenced_doi', 'referenced_doi_count'):
        if leftover_key in rec:
            print(leftover_key)

In [32]:
# One row per record: the stored reference count next to the actual list length.
ref_count_df = pd.DataFrame(
    [
        dict(
            title=rec.get('title'),
            ref_count=rec.get('referenced_works_count'),
            ref_count_from_list=len(rec.get('referenced_works')),
        )
        for rec in data
    ]
)
ref_count_df.head()

Unnamed: 0,title,ref_count,ref_count_from_list
0,<scp>3D LiDAR SLAM</scp>: A survey,259,259
1,3D Multi-Object Tracking With Adaptive Cubatur...,23,23
2,3D Object Detection and Tracking Based on Lida...,35,35
3,3D Object Detection for Autonomous Driving: A ...,312,312
4,3D Object Detection for Autonomous Driving: A ...,100,100


In [33]:
# Gap between the stored count and the real list length, sorted ascending;
# the summary below is all zeros when the two agree for every record.
ref_count_df = (
    ref_count_df
    .assign(diff=lambda frame: frame['ref_count'] - frame['ref_count_from_list'])
    .sort_values('diff')
)

ref_count_df['diff'].describe()

count    122.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: diff, dtype: float64

In [35]:
# Drop the two consistency-check columns. Rebinding with errors='ignore'
# (instead of an in-place drop) keeps the cell re-runnable: running it again
# after the columns are gone no longer raises KeyError.
ref_count_df = ref_count_df.drop(columns=['ref_count_from_list', 'diff'], errors='ignore')
ref_count_df.sort_values('ref_count')

Unnamed: 0,title,ref_count
6,6DoF-3D: Efficient and accurate 3D object dete...,20
66,Joint 3D Proposal Generation and Object Detect...,21
43,C2L3-Fusion: An Integrated 3D Object Detection...,23
1,3D Multi-Object Tracking With Adaptive Cubatur...,23
98,Robust Visual Localization System With HD Map ...,24
...,...,...
81,Path planning algorithms in the autonomous dri...,217
46,Deep Learning Frontiers in 3D Object Detection...,218
0,<scp>3D LiDAR SLAM</scp>: A survey,259
3,3D Object Detection for Autonomous Driving: A ...,312


In [36]:
# Total number of references across all records.
ref_count_df.loc[:, 'ref_count'].sum()

np.int64(10586)

In [37]:
# Persist the final records. The target is the same file the notebook loaded,
# so write to a temporary sibling first and swap it in only after the dump
# succeeded; a failed or interrupted dump then leaves the original file intact.
with open('improved_refrences.json.tmp', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
os.replace('improved_refrences.json.tmp', 'improved_refrences.json')