In [1]:
import requests
import time
from collections import deque
from tqdm import tqdm
import json

In [2]:
# Function to reconstruct the abstract from inverted index to plain text
def reconstruct_abstract(abstract_inverted_index):
    if not abstract_inverted_index:
        return ""
    
    # Find the maximum position to determine the abstract length
    max_pos = max(pos for positions in abstract_inverted_index.values() for pos in positions)
    abstract_words = [""] * (max_pos + 1)
    
    for word, positions in abstract_inverted_index.items():
        for pos in positions:
            abstract_words[pos] = word
    
    return " ".join(abstract_words)

# Function to fetch paper metadata using DOI
def fetch_paper_metadata_by_doi(doi):
    url = f"https://api.openalex.org/works/https://doi.org/{doi}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data for DOI: {doi} with status code {response.status_code}")
        return None

# Function to fetch paper metadata using OpenAlex ID
def fetch_paper_metadata_by_openalex_id(openalex_id):
    url = f"https://api.openalex.org/works/{openalex_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch data for OpenAlex ID: {openalex_id} with status code {response.status_code}")
        return None


# BFS to fetch references up to a specified depth
def fetch_references_bfs(seed_dois, max_depth=2):
    result = []
    queue = deque([(doi, 0, "doi") for doi in seed_dois])  # Initialize queue with seed DOIs and depth 0, using "doi" for seed papers
    visited = set()  # Track visited papers using DOIs and OpenAlex IDs

    current_depth = 0
    depth_queue = []  # To store elements of the current depth level for progress tracking

    while queue and current_depth <= max_depth:
        # Separate elements for the current depth
        while queue and queue[0][1] == current_depth:
            depth_queue.append(queue.popleft())
        
        # Create a new progress bar for the current depth
        with tqdm(total=len(depth_queue), desc=f"Processing Depth {current_depth}") as pbar:

            count=0
            for current_id, depth, id_type in depth_queue:

                count += 1

                # Skip if the paper has already been visited
                if current_id in visited:
                    pbar.update(1)
                    continue

                # Fetch metadata based on ID type
                if id_type == "doi":
                    paper_data = fetch_paper_metadata_by_doi(current_id)
                else:
                    paper_data = fetch_paper_metadata_by_openalex_id(current_id)

                if not paper_data:
                    pbar.update(1)
                    continue

                # Mark the paper as visited
                visited.add(current_id)

                locations = paper_data.get("locations", [])
                location_info = [
                    {
                        "venue_name": loc.get("source", {}).get("display_name", "Not Available"),
                        "publisher": loc.get("source", {}).get("host_organization", "Not Available"),
                        # "landing_page_url": loc.get("landing_page_url", "Not Available"),
                        "pdf_url": loc.get("pdf_url", "Not Available"),
                        # "version": loc.get("version", "Not Available"),
                        # "is_open_access": loc.get("is_oa", False),
                        # "license": loc.get("license", "Not Available"),
                    }
                    for loc in locations if loc.get('source', None) is not None
                ]

                # Structuring metadata for JSON format
                paper_info = {
                    "title": paper_data.get("title", ""),
                    "doi_openalexid": paper_data.get("doi", "").replace("https://doi.org/", "") if paper_data.get("doi") else paper_data.get("id", ""),
                    "authors": [author['author']['display_name'] for author in paper_data.get("authorships", [])],
                    "publication_date": paper_data.get("publication_date", ""),
                    "publish_year": paper_data.get("publication_year", ""),
                    "keywords": [concept['display_name'] for concept in paper_data.get("concepts", [])],
                    "abstract": reconstruct_abstract(paper_data.get("abstract_inverted_index", "")),
                    "global_link_openable": paper_data.get("id", ""),
                    "citation_count": paper_data.get("cited_by_count", 0),
                    "publication": location_info,
                    "references_related_works": [] ### TODO
                }

                # Process references using OpenAlex IDs
                references = paper_data.get("referenced_works", [])
                if len(references) == 0:
                    references = paper_data.get("related_works", [])
                for ref_id in references:
                    if ref_id not in visited:
                        queue.append((ref_id, depth + 1, "openalex_id"))  # Use OpenAlex ID for subsequent references
                        paper_info["references_related_works"].append({"openalex_id": ref_id}) ### TODO

                result.append(paper_info)
                pbar.update(1)  # Update progress for each paper processed
                time.sleep(0.1)  # Delay to respect rate limits

                # Save Intermediate Results for Depth >= 2:
                if current_depth>1 and count%1000==0:
                    filename = f'OpenAlex_Intermediate_Depth_{current_depth}.json'
                    with open(filename, "w") as json_file:
                        json.dump(result, json_file, indent=2)
                    print('\n')
                    print(f'File for Depth-{current_depth}, Count-{count} : Saved')
                    print(f'Number of Unique Papers at Depth-{current_depth}, Count-{count} (cummulative of all depths) : {len(result)}')
                    print('\n')

        # Saving Intermediate Results for each depth
        filename = f'OpenAlex_Intermediate_Depth_{current_depth}.json'
        with open(filename, "w") as json_file:
            json.dump(result, json_file, indent=2)
        print('\n')
        print(f'File for Depth-{current_depth} : Saved')
        print(f'Number of Unique Papers at Depth-{current_depth} (cummulative of all depths) : {len(result)}')
        print('\n')



        # Move to the next depth level
        current_depth += 1
        depth_queue.clear()  # Clear depth_queue for the next depth

    return result


In [3]:

# Titles of the papers:
# 1. "Attention Is All You Need"
# 2. "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
# 3. "Deep Residual Learning for Image Recognition"
# 4. "Generative Adversarial Nets"
# 5. "Adam: A Method for Stochastic Optimization"
# 6. "YOLOv3: An Incremental Improvement"
# 7. "Neural Architecture Search with Reinforcement Learning"
# 8. "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks"
# 9. "Deep Learning for Natural Language Processing"  XXX
# 10. "Deep Learning for Genomics: A Concise Overview"
# 11. "Deep Learning for Healthcare: Review, Opportunities and Challenges" XXX
# 12. "Deep Learning for Time Series Forecasting: The Electric Load Case"
# 13. "Deep Learning for Image Super-Resolution: A Survey" XXX
# 14. "Deep Learning for Anomaly Detection: A Survey"
# 15. "Deep Learning for Recommender Systems: A Survey and New Perspectives"
# 16. "YOLO9000: Better, Faster, Stronger"
# 17. "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks"
# 18. "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
# 19. "RoBERTa: A Robustly Optimized BERT Pretraining Approach"
# 20. "GPT-3: Language Models are Few-Shot Learners"
# 21. "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows"
# 22. "DALL·E: Creating Images from Text"
# 23. "AlphaFold: A Solution to a 50-Year-Old Grand Challenge in Biology"
# 24. "Neural Ordinary Differential Equations"
# 25. "StyleGAN: A Style-Based Generator Architecture for Generative Adversarial Networks"
# 26. "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
# 27. "Deep Learning for Large-Scale Hierarchical Image Database" XXX
# 28. "Deep Learning for Machine Translation by Jointly Learning to Align and Translate"
# 29. "Playing Atari with Deep Reinforcement Learning" XXX
# 30. "EfficientNet: Model Scaling for Convolutional Neural Networks"

seed_dois = [
    "10.48550/arXiv.1706.03762",
    "10.18653/v1/N19-1423",
    "10.1109/CVPR.2016.90",
    "10.48550/arXiv.1406.2661",
    "10.48550/arXiv.1412.6980",
    "10.48550/arXiv.1804.02767",
    "10.48550/arXiv.1611.01578",
    "10.48550/arXiv.1803.03635",
    # "10.2200/S00762ED1V01Y201501HLT027", # XXX
    "10.1016/j.patcog.2019.107107",
    # "10.1007/s10115-018-1149-0", # XXX
    "10.1016/j.ijforecast.2016.01.001",
    # "10.1109/TNNLS.2020.2966520", # XXX
    "10.1145/3236009",
    "10.1145/3285029",
    "10.1109/CVPR.2017.690",
    "10.48550/arXiv.1905.11946",
    "10.48550/arXiv.1906.08237",
    "10.48550/arXiv.1907.11692",
    "10.48550/arXiv.2005.14165",
    "10.48550/arXiv.2103.14030",
    "10.48550/arXiv.2102.12092",
    "10.1038/s41586-021-03819-2",
    "10.48550/arXiv.1806.07366",
    "10.1109/CVPR.2019.00453",
    "10.1109/CVPR.2018.00727",
    # "10.48550/arXiv.1705.08436", # XXX
    "10.1109/CVPR.2017.424",
    # "10.1109/TPAMI.2018.2844859", # XXX
    "10.1109/CVPR.2019.00020"
]



max_depth = 2
data = fetch_references_bfs(seed_dois, max_depth=max_depth)

with open("openalex_papers.json", "w") as json_file:
    json.dump(data, json_file, indent=2)
print('Data Saved.')

print(f'Number of Unique Papers = {len(data)}')


Processing Depth 0:   0%|          | 0/25 [00:00<?, ?it/s]

Processing Depth 0: 100%|██████████| 25/25 [00:07<00:00,  3.41it/s]




File for Depth-0 : Saved
Number of Unique Papers at Depth-0 (cummulative of all depths) : 25




Processing Depth 1: 100%|██████████| 1085/1085 [06:26<00:00,  2.81it/s]




File for Depth-1 : Saved
Number of Unique Papers at Depth-1 (cummulative of all depths) : 1028




Processing Depth 2:   0%|          | 1000/297182 [04:57<26:00:27,  3.16it/s]



File for Depth-2, Count-1000 : Saved
Number of Unique Papers at Depth-2, Count-1000 (cummulative of all depths) : 1771




Processing Depth 2:   0%|          | 1046/297182 [05:13<19:33:05,  4.21it/s]

Failed to fetch data for OpenAlex ID: https://openalex.org/W2530816535 with status code 404


Processing Depth 2:   0%|          | 1180/297182 [06:06<30:50:10,  2.67it/s]