# ArXiv

In [15]:
import json
import re
import xml.etree.ElementTree as ET

import requests

In [16]:
def clean_text(text):
    """Normalise a free-text field (e.g. a title or abstract) to a single line.

    Double quotes are replaced by single quotes, leading and trailing
    whitespace is removed, and every run of whitespace (newlines and tabs
    included) is collapsed to one space.

    Args:
        text: The string to clean, or None.

    Returns:
        The cleaned string; an empty string when ``text`` is None.
    """
    # An XML element with no text content yields None; treat it as empty
    if text is None:
        return ""
    # Replace double quotes with single quotes
    text = text.replace('"', "'")
    # Trim the ends, then collapse whitespace runs (newlines included) to one space
    return re.sub(r"\s+", " ", text.strip())

This notebook queries the arXiv API; see its [documentation](https://info.arxiv.org/help/api/basics.html) and [user's manual](https://info.arxiv.org/help/api/user-manual.html).

In [17]:
BASE_URL = "http://export.arxiv.org/api/query"
query = "nanoporous materials"
# Search every field for the query, most relevant first, first 10 results
params = {
    "search_query": f"all:{query}",
    "sortBy": "relevance",
    "sortOrder": "descending",
    "start": 0,
    "max_results": 10
}
# requests has no default timeout; bound the wait so a stalled connection
# cannot hang the kernel indefinitely
response = requests.get(BASE_URL, params=params, timeout=30)
print(response.status_code)
# Fail here on a 4xx/5xx answer rather than passing an error page to later cells
response.raise_for_status()
print(response.text)

200
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dall%3Ananoporous%20materials%26id_list%3D%26start%3D0%26max_results%3D10" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=all:nanoporous materials&amp;id_list=&amp;start=0&amp;max_results=10</title>
  <id>http://arxiv.org/api/RxPvvP7npbd/DhQAxQbCp2gfFNk</id>
  <updated>2025-05-22T00:00:00-04:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">188343</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/2402.01321v1</id>
    <updated>2024-02-02T11:17:55Z</updated>
    <published>2024-02-02T11:17:55Z</published>
    <title>Ionic Current Rec

In [18]:
# Parse from the raw bytes so the XML parser applies the encoding declared in
# the document itself instead of the text encoding requests guessed
root = ET.fromstring(response.content)
namespace = {'atom': 'http://www.w3.org/2005/Atom'}

# Collect one record per <entry> of the Atom feed
entries = []
for entry in root.findall('atom:entry', namespace):
    # findtext() returns the default for a missing element and "" for an empty
    # one, so an incomplete entry cannot raise AttributeError
    paper_id = entry.findtext('atom:id', default="", namespaces=namespace)
    date = entry.findtext('atom:published', default="", namespaces=namespace)
    title = clean_text(entry.findtext('atom:title', default="", namespaces=namespace))
    abstract = clean_text(entry.findtext('atom:summary', default="", namespaces=namespace))
    entries.append({
        "id": paper_id,
        "date": date,
        "title": title,
        "abstract": abstract
    })

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 output file
with open("arxiv_results.json", "w", encoding="utf-8") as f:
    json.dump(entries, f, indent=2, ensure_ascii=False)
