In [1]:
import requests
import json
import os
import asyncio
import httpx
import pandas as pd

In [2]:
# Read the reference records from disk; the trailing bare expression displays
# how many records were loaded.
with open('improved_refrences.json', mode='r', encoding='utf-8') as source_file:
    data = json.loads(source_file.read())
len(data)

122

In [3]:
# Positions (within `data`) of the records that carry a 'referenced_doi' entry.
doi_reference_index_list = [
    position
    for position, record in enumerate(data)
    if 'referenced_doi' in record
]
doi_reference_index_list

[10, 11, 21, 29, 38, 46, 50, 60, 62, 64, 72, 84, 89, 90, 100, 111, 116]

In [4]:
# Show the stored 'referenced_works_count' of each DOI-bearing record, in index order.
[
    data[position].get('referenced_works_count')
    for position in doi_reference_index_list
]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [5]:
# Show the stored 'referenced_works' list of each DOI-bearing record, in index order.
[
    data[position].get('referenced_works')
    for position in doi_reference_index_list
]

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]

In [None]:
def _canonicalize_doi(doi: str) -> str:
    """Return ``doi`` rewritten as ``https://doi.org/<identifier>``.

    Surrounding whitespace is dropped and one leading resolver URL
    (``http(s)://doi.org/``, ``http(s)://dx.doi.org/``) or ``doi:`` scheme is
    removed, matched case-insensitively, before the https prefix is added.
    """
    identifier = doi.strip()
    known_prefixes = (
        'https://doi.org/',
        'http://doi.org/',
        'https://dx.doi.org/',
        'http://dx.doi.org/',
        'doi:',
    )
    for prefix in known_prefixes:
        # Compare only the leading slice so the identifier's own casing is preserved.
        if identifier[:len(prefix)].lower() == prefix:
            identifier = identifier[len(prefix):].strip()
            break
    return 'https://doi.org/' + identifier


async def _fetch_openalex_id_async(client: httpx.AsyncClient, doi: str):
    """Async helper to fetch a single OpenAlex ID.

    Args:
        client: Shared ``httpx.AsyncClient`` used to issue the GET request.
        doi: DOI to look up. Accepted spellings: bare identifier, ``doi:``
            prefixed, or a ``doi.org`` / ``dx.doi.org`` URL (http or https).

    Returns:
        A 3-tuple ``(doi, found, value)``. ``doi`` is the argument exactly as
        passed in. On success ``found`` is ``True`` and ``value`` is the ``id``
        of the first result; otherwise ``found`` is ``False`` and ``value`` is
        a human-readable message. Failures are reported through the tuple
        rather than raised, so one bad DOI cannot abort a gathered batch.
    """
    # Reject unusable input before building a request: a non-string would raise
    # outside the try block below, and a blank DOI would only waste a request.
    if not isinstance(doi, str) or not doi.strip():
        return (doi, False, f"Invalid or empty DOI: {doi!r}")

    canonical_doi = _canonicalize_doi(doi)
    if canonical_doi == 'https://doi.org/':
        # Input consisted of a prefix only (e.g. "doi:"); nothing to look up.
        return (doi, False, f"Invalid or empty DOI: {doi!r}")

    base_url = 'https://api.openalex.org/works'
    params = {
        'filter': f'doi:{canonical_doi}',
        'select': 'id',
        'mailto': 'rimaz.temp@gmail.com'
    }

    try:
        response = await client.get(base_url, params=params)
        response.raise_for_status() # Check for HTTP errors
        payload = response.json()

        matches = payload.get('results') or []
        if not matches:
            return (doi, False, f"No work found with DOI: {doi}")

        openalex_id = matches[0].get('id')
        if openalex_id:
            return (doi, True, openalex_id)
        return (doi, False, "Work found, but 'id' field was missing.")

    except httpx.HTTPStatusError as e:
        return (doi, False, f"HTTP error: {e}")
    except httpx.RequestError as e:
        return (doi, False, f"A request error occurred: {e}")
    except Exception as e:
        # Deliberate catch-all: malformed JSON or an unexpected payload shape is
        # reported as a failed lookup instead of propagating.
        return (doi, False, f"An unexpected error occurred: {e}")

In [7]:
async def get_openalex_ids_batch_async(doi_list: list[str], batch_size: int = 25, sleep_sec: float = 0.1):
    """
    Fetches OpenAlex IDs for a list of DOIs asynchronously in batches.

    Args:
        doi_list: DOIs to resolve. ``None`` or an empty list yields ``[]``
            without opening an HTTP client.
        batch_size: Number of lookups issued concurrently per batch; must be >= 1.
        sleep_sec: Pause (seconds) between consecutive batches; skipped after
            the last batch.

    Returns:
        A list with one entry per input DOI, in input order. Each entry is the
        third element of the tuple returned by ``_fetch_openalex_id_async`` --
        the OpenAlex ID when the lookup succeeded, otherwise an error message.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate first: a negative step would make range() yield nothing and the
    # call would silently return [] for a non-empty input.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    dois = list(doi_list) if doi_list is not None else []
    if not dois:
        # Nothing to resolve: skip creating a client altogether.
        return []

    results = []
    async with httpx.AsyncClient(timeout=20.0) as client:
        for start in range(0, len(dois), batch_size):
            # Create a batch of DOI strings
            batch_dois = dois[start:start + batch_size]

            # Create a list of tasks for the current batch
            tasks = [_fetch_openalex_id_async(client, doi) for doi in batch_dois]

            # Run tasks concurrently; gather preserves the order of `tasks`
            batch_results = await asyncio.gather(*tasks)
            results.extend(rec[2] for rec in batch_results)

            # Sleep between batches if not the last batch
            if start + batch_size < len(dois):
                await asyncio.sleep(sleep_sec)

    return results

In [17]:
import copy

for index in doi_reference_index_list:
    doi_list = data[index].get('referenced_doi')
    N_doi = data[index].get('referenced_doi_count')
    paper_title = data[index].get('title')
    
    batch_results = await get_openalex_ids_batch_async(doi_list, batch_size=4, sleep_sec=1)

    data[index]['referenced_works'] = copy.deepcopy(batch_results)
    data[index]['referenced_works_count'] = len(batch_results)

    print(f'{paper_title:140} - {len(batch_results):3}/{N_doi:3}')


A Full-Lifecycle Calibration Method for Camera and LiDAR in Autonomous Driving                                                               -  53/ 53
A New Literature Review of 3D Object Detection on Autonomous Driving                                                                         - 209/209
A Survey of Trajectory Planning Methods for Autonomous Driving — Part I: Unstructured Scenarios                                              -  89/ 89
Accurate 3D Multi-Object Detection and Tracking on Vietnamese Street Scenes Based on Sparse Point Cloud Data                                 -  60/ 60
Application of semantic information module in LiDAR-based Simultaneous-Localization-and-Mapping algorithm                                    -  39/ 39
Deep Learning Frontiers in 3D Object Detection: A Comprehensive Review for Autonomous Driving                                                - 218/218
Differentiable Integrated Motion Prediction and Planning With Learnable Cost Function for Auto

In [18]:
# Per record: number of stored 'referenced_works' entries that do not start
# with "https:", printed next to the stored total.
for index in doi_reference_index_list:
    record = data[index]
    paper_title = record.get('title')
    N_works = record.get('referenced_works_count')
    non_https_count = len(
        [entry for entry in record.get('referenced_works') if not entry.startswith("https:")]
    )
    print(f'{paper_title:140} - {non_https_count:3}/{N_works:3}')

A Full-Lifecycle Calibration Method for Camera and LiDAR in Autonomous Driving                                                               -   0/ 53
A New Literature Review of 3D Object Detection on Autonomous Driving                                                                         -   0/209
A Survey of Trajectory Planning Methods for Autonomous Driving — Part I: Unstructured Scenarios                                              -   2/ 89
Accurate 3D Multi-Object Detection and Tracking on Vietnamese Street Scenes Based on Sparse Point Cloud Data                                 -   0/ 60
Application of semantic information module in LiDAR-based Simultaneous-Localization-and-Mapping algorithm                                    -   1/ 39
Deep Learning Frontiers in 3D Object Detection: A Comprehensive Review for Autonomous Driving                                                -  14/218
Differentiable Integrated Motion Prediction and Planning With Learnable Cost Function for Auto

In [21]:
# Keep only the entries that start with "https:" and refresh the stored count.
for index in doi_reference_index_list:
    record = data[index]
    referenced_works = list(
        filter(lambda entry: entry.startswith("https:"), record.get('referenced_works'))
    )
    record['referenced_works'] = copy.deepcopy(referenced_works)
    record['referenced_works_count'] = len(referenced_works)



In [23]:
# Remove the DOI helper fields from every DOI-bearing record.
# pop(key, None) is used instead of `del` so the cell can be re-run, and so a
# record that lacks 'referenced_doi_count' does not raise KeyError.
for index in doi_reference_index_list:
    for obsolete_key in ('referenced_doi', 'referenced_doi_count'):
        data[index].pop(obsolete_key, None)

In [24]:
# Verification pass: print the name of any DOI helper field still present on a
# record. No output means none of the records carries either field.
for rec in data:
    for leftover_key in ('referenced_doi', 'referenced_doi_count'):
        if leftover_key in rec:
            print(leftover_key)

In [25]:
# Write the updated records back to the file they were loaded from.
# Opening the target directly with 'w' truncates it before serialization
# starts, so a failure inside json.dump would destroy the existing data.
# Serialize to a sibling temp file first and swap it in only after the dump
# has completed.
output_path = 'improved_refrences.json'
temp_path = output_path + '.tmp'
try:
    with open(temp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    os.replace(temp_path, output_path)
finally:
    # Only present if the dump failed before the swap; clean up the partial file.
    if os.path.exists(temp_path):
        os.remove(temp_path)