In [1]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin

In [27]:
base_url = "https://arxiv.org"
target_url = f"{base_url}/list/cs.AI/new"

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Seconds to wait for the connection and for each chunk of the response.
# requests applies no timeout by default, so without this a stalled server
# would block the cell (and a Restart & Run All) forever.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch the listing page.
response = requests.get(target_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

# Raise requests.HTTPError on a 4xx/5xx status rather than parsing an error page.
response.raise_for_status()

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(response.text, 'lxml')

In [28]:
# Display the parsed document to inspect the page structure (the repr is the page's HTML).
soup

<!DOCTYPE html>
<html lang="en">
<head> <title>Artificial Intelligence  </title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/static/browse/0.3.4/images/icons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/static/browse/0.3.4/images/icons/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/static/browse/0.3.4/images/icons/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/static/browse/0.3.4/images/icons/site.webmanifest" rel="manifest"/>
<link color="#5bbad5" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" rel="mask-icon"/>
<meta content="#da532c" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<link href="/static/browse/0.3.4/css/arXiv.css?v=20241206" media="screen" rel="stylesheet" type="text/css"/>
<link href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" media="print" rel="stylesheet" type="text/css"/>
<link href="/

In [29]:
# Take the first (dt, dd) pair of the listing as a sample entry for the
# exploration cells; both names are None when the page has no entries.
dt, dd = next(
    zip(soup.select("dl > dt"), soup.select("dl > dd")),
    (None, None),
)
if dt is not None:
    print(dt)

<dt>
<a name="item1">[1]</a>
<a href="/abs/2506.13768" id="2506.13768" title="Abstract">
        arXiv:2506.13768
      </a>
      
        [<a aria-labelledby="pdf-2506.13768" href="/pdf/2506.13768" id="pdf-2506.13768" title="Download PDF">pdf</a>, <a aria-labelledby="html-2506.13768" href="https://arxiv.org/html/2506.13768v1" id="html-2506.13768" rel="noopener noreferrer" target="_blank" title="View HTML">html</a>, <a aria-labelledby="oth-2506.13768" href="/format/2506.13768" id="oth-2506.13768" title="Other formats">other</a>]
    </dt>


In [30]:
# Relative href of the sample entry's first element whose title attribute is "Abstract".
dt.find_all(title="Abstract")[0]['href']

'/abs/2506.13768'

In [31]:
# Resolve the relative "Abstract" href against base_url to get an absolute URL.
urljoin(base_url, dt.find_all(title="Abstract")[0]["href"])

'https://arxiv.org/abs/2506.13768'

In [32]:
# Resolve the relative "Download PDF" href against base_url to get an absolute URL.
urljoin(base_url, dt.find_all(title="Download PDF")[0]["href"])

'https://arxiv.org/pdf/2506.13768'

In [33]:
# Title of the sample entry. The div's text starts with a "Title:" label; drop
# only that leading label so a title that itself contains "Title:" stays intact
# (str.replace would delete every occurrence).
raw_title = dd.find("div", class_="list-title").get_text(strip=True)
raw_title[len("Title:"):] if raw_title.startswith("Title:") else raw_title

"'Memory States' from Almost Nothing: Representing and Computing in a Non-associative Algebra"

In [34]:
# Abstract of the sample entry: text of the first <p class="mathjax"> in its <dd>, whitespace-stripped.
dd.find("p", class_="mathjax").get_text(strip=True)

'This note presents a non-associative algebraic framework for the representation and computation of information items in high-dimensional space. This framework is consistent with the principles of spatial computing and with the empirical findings in cognitive science about memory. Computations are performed through a process of multiplication-like binding and non-associative interference-like bundling. Models that rely on associative bundling typically lose order information, which necessitates the use of auxiliary order structures, such as position markers, to represent sequential information that is important for cognitive tasks. In contrast, the non-associative bundling proposed allows the construction of sparse representations of arbitrarily long sequences that maintain their temporal structure across arbitrary lengths. In this operation, noise is a constituent element of the representation of order information, rather than a means of obscuring it. The non-associative nature of the

In [35]:
# Authors of the sample entry: text of the first <div class="list-authors"> in its <dd>, whitespace-stripped.
dd.find("div", class_="list-authors").get_text(strip=True)

'Stefan Reimann'

In [36]:
# Collect every paper entry. The listing pairs each <dt> (links) with a <dd>
# (title, authors, abstract), so the two selections are walked in lockstep.
TITLE_PREFIX = "Title:"


def _link_url(dt_tag, link_title):
    """Return the absolute URL of the first link in ``dt_tag`` whose title
    attribute equals ``link_title``, or None when there is no such link."""
    anchor = dt_tag.find(title=link_title)
    if anchor is None or not anchor.get("href"):
        return None
    return urljoin(base_url, anchor["href"])


def _tag_text(dd_tag, tag_name, css_class):
    """Return the whitespace-stripped text of the first ``tag_name`` element
    with class ``css_class`` inside ``dd_tag``, or None when it is absent."""
    node = dd_tag.find(tag_name, class_=css_class)
    if node is None:
        return None
    return node.get_text(strip=True)


papers = []
for dt, dd in zip(soup.select("dl > dt"), soup.select("dl > dd")):
    # Links live in the <dt>. A missing link yields None instead of raising,
    # so one incomplete entry cannot abort the whole scrape.
    html_link = _link_url(dt, "Abstract")
    pdf_link = _link_url(dt, "Download PDF")

    # Title: strip only the leading "Title:" label; a plain str.replace would
    # also delete the same text if it appeared inside the title itself.
    title = _tag_text(dd, "div", "list-title")
    if title is not None and title.startswith(TITLE_PREFIX):
        title = title[len(TITLE_PREFIX):]

    # Authors.
    authors = _tag_text(dd, "div", "list-authors")

    # Abstract. NOTE(review): some entries (replacement submissions, it seems)
    # are listed without an abstract paragraph - confirm against the live page.
    # Such entries are stored with a null abstract.
    abstract = _tag_text(dd, "p", "mathjax")

    papers.append({
        "pdf_url": pdf_link,
        "html_url": html_link,
        "authors": authors,
        "title": title,
        "abstract": abstract
    })

# Save as JSON Lines: one JSON object per line, non-ASCII characters kept as-is.
with open("arxiv_papers.jsonl", "w", encoding="utf-8") as jsonl_file:
    for paper in papers:
        json.dump(paper, jsonl_file, ensure_ascii=False)
        jsonl_file.write("\n")