# Extract paper titles and abstracts from arXiv

In [8]:
import json
import re
import xml.etree.ElementTree as ET

import requests

In [9]:
def clean_text(text: str) -> str:
    """Normalise a text fragment taken from an arXiv record.

    Double quotes become single quotes, surrounding whitespace is dropped,
    and every run of whitespace (newlines included) collapses to one space.
    """
    # Swap double quotes for single quotes.
    single_quoted = text.replace('"', "'")
    # Newlines count as whitespace, so one regex pass both flattens line
    # breaks and squeezes repeated spaces.
    return re.sub(r"\s+", " ", single_quoted.strip())

See the arXiv API [documentation](https://info.arxiv.org/help/api/basics.html) and [user's manual](https://info.arxiv.org/help/api/user-manual.html) for details on the query interface used below.

In [10]:
# Search terms sent to arXiv, one request per term. Each term is also recorded
# as the "category" of the papers it returns.
queries = [
    "nanoporous materials",
    "many-body",
    "machine learning",
    "quantum computing",
    "biomolecular modeling",
]

In [11]:
def query_arxiv(query: str, max_results: int = 10, timeout: float = 30.0) -> str:
    """Query the arXiv API for a given search term.

    Args:
        query: Search term, matched against all fields (``all:`` prefix).
        max_results: Maximum number of entries requested from the API.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response body, or an empty string when the request fails
        or the server answers with a status other than 200.
    """
    print(f"Querying {max_results} papers from arXiv for: {query}")
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"all:{query}",
        "sortBy": "relevance",
        "sortOrder": "descending",
        "start": 0,
        "max_results": max_results,
    }
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection and the notebook cell never finishes.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as error:
        # Report transport failures the same way as HTTP errors below.
        print(f"Error querying arXiv: {error}")
        return ""
    if response.status_code != 200:
        print(f"Error querying arXiv: {response.status_code}")
        return ""
    return response.text

def parse_arxiv_response(query: str, response: str) -> list:
    """Convert an arXiv Atom feed into a list of paper records.

    Args:
        query: Search term that produced the feed; stored as each record's
            ``"category"``.
        response: Raw XML text returned by the API. May be empty.

    Returns:
        One dict per ``<entry>`` with the keys ``"id"``, ``"date"``,
        ``"title"``, ``"abstract"`` and ``"category"``. An empty list is
        returned for an empty or unparsable response. Fields missing from
        an entry are stored as empty strings.
    """
    if not response:
        return []
    try:
        root = ET.fromstring(response)
    except ET.ParseError as error:
        # Follow the print-and-return-empty convention used for failed
        # requests rather than aborting the whole harvest.
        print(f"Error parsing arXiv response for {query}: {error}")
        return []
    namespace = {"atom": "http://www.w3.org/2005/Atom"}
    entries = []
    for entry in root.findall("atom:entry", namespace):
        # findtext() returns the default instead of raising when an element
        # is absent, and "" instead of None when it is present but empty.
        paper_id = entry.findtext("atom:id", default="", namespaces=namespace)
        date = entry.findtext("atom:published", default="", namespaces=namespace)
        title = entry.findtext("atom:title", default="", namespaces=namespace)
        abstract = entry.findtext("atom:summary", default="", namespaces=namespace)
        entries.append({
            "id": paper_id,
            "date": date,
            "title": clean_text(title),
            "abstract": clean_text(abstract),
            "category": query,
        })
    return entries

In [12]:
import time

# The arXiv API user manual asks clients to wait about three seconds between
# successive requests.
REQUEST_DELAY_SECONDS = 3

# Collect the parsed records for every search term into one flat list.
entries = []
for index, query in enumerate(queries):
    if index > 0:
        # Pause between requests only; no wait is needed before the first.
        time.sleep(REQUEST_DELAY_SECONDS)
    query_result = query_arxiv(query, max_results=100)
    entries += parse_arxiv_response(query, query_result)

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open("arxiv_papers.json", "w", encoding="utf-8") as file_out:
    json.dump(entries, file_out, indent=2, ensure_ascii=False)


Querying 100 papers from arXiv for: nanoporous materials
Querying 100 papers from arXiv for: many-body
Querying 100 papers from arXiv for: machine learning
Querying 100 papers from arXiv for: quantum computing
Querying 100 papers from arXiv for: biomolecular modeling
