In [2]:
from typing import Optional, Dict

import requests
from pathlib import Path
from bs4 import BeautifulSoup
from urllib import request


# Directory that receives the downloaded PDFs (relative to the working directory).
PDF_PATH = "../data/pdf"
pdf_path = Path(PDF_PATH)

# parents=True also creates a missing "../data"; exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of exists() + mkdir().
pdf_path.mkdir(parents=True, exist_ok=True)

In [3]:
def _labeled_text(soup, selector: str, label: str) -> str:
    """Return the text of the first element matching `selector`, without a leading `label`."""
    element = soup.select_one(selector)
    if element is None:
        # Fail with a descriptive error instead of AttributeError on None.text.
        raise ValueError(f"no element matching {selector!r} in the fetched page")
    text = element.text.strip()
    # Strip the label by value rather than by a fixed character offset, so
    # whitespace surrounding the label cannot shift the cut point.
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def get_pdf_info(id: str) -> Dict:
    """Fetch the title, authors, and abstract shown on an arXiv abstract page.

    Args:
        id: arXiv identifier, interpolated into ``https://arxiv.org/abs/{id}``.
            (The name shadows the builtin but is kept for caller compatibility.)

    Returns:
        A dict with the string values ``"title"``, ``"authors"`` (author names
        joined with ``", "``), and ``"abstract"``.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page has no title or abstract element.
    """
    url = f"https://arxiv.org/abs/{id}"

    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): the labels below match the 6- and 10-character prefixes the
    # previous fixed-offset slicing removed; confirm against the live markup.
    title = _labeled_text(soup, "h1.title", "Title:")
    abstract = _labeled_text(soup, "blockquote.abstract", "Abstract:")

    author_elements = soup.select("div.authors > a")
    authors = ", ".join(e.text for e in author_elements)

    return {
        "title": title,
        "authors": authors,
        "abstract": abstract
    }

def _safe_filename(name: str) -> str:
    """Turn `name` into a single path component with normalized whitespace."""
    # Path separators (and NUL) would otherwise make the title point into a
    # subdirectory that does not exist, or be rejected by the OS.
    for forbidden in ("/", "\\", "\x00"):
        name = name.replace(forbidden, "_")
    # Collapse newlines and runs of spaces that can appear in scraped text.
    return " ".join(name.split())


def download_arxiv_pdf(id: str, pdf_path: Path) -> Optional[Path]:
    """Download the PDF of an arXiv paper into `pdf_path`, named after its title.

    Args:
        id: arXiv identifier, used for both the metadata lookup and the PDF URL.
        pdf_path: Existing directory in which the PDF is stored.

    Returns:
        The path of the saved file, or ``None`` if the metadata lookup or the
        download failed (the error is printed rather than raised).
    """
    url = f"https://arxiv.org/pdf/{id}"

    try:
        # The metadata lookup is inside the try block so that one bad ID yields
        # None instead of aborting a whole batch of downloads.
        info = get_pdf_info(id)
        # Fall back to the (sanitized) id if the title is empty.
        stem = _safe_filename(info["title"]) or _safe_filename(id)
        filepath = pdf_path / f"{stem}.pdf"
        request.urlretrieve(url, filepath)
    except Exception as e:
        print(e)
        return None

    print(f"다운로드 완료 - {filepath}")
    return filepath

In [4]:
# Fetch one paper; the result is the saved file's path, or None on failure.
arxiv_id = "2410.10315"
filepath = download_arxiv_pdf(id=arxiv_id, pdf_path=pdf_path)

다운로드 완료 - ../data/pdf/EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf


In [6]:
# Fetch several papers in one go; each entry of `files` is the saved path,
# or None for an ID whose download failed.
ids = ["2410.10315", "2410.05779", "2410.01782", "2409.12941"]

files = [download_arxiv_pdf(paper_id, pdf_path) for paper_id in ids]
files

다운로드 완료 - ../data/pdf/EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf
다운로드 완료 - ../data/pdf/LightRAG: Simple and Fast Retrieval-Augmented Generation.pdf
다운로드 완료 - ../data/pdf/Open-RAG: Enhanced Retrieval-Augmented Reasoning with Open-Source Large Language Models.pdf
다운로드 완료 - ../data/pdf/Fact, Fetch, and Reason: A Unified Evaluation of Retrieval-Augmented Generation.pdf


[PosixPath('../data/pdf/EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf'),
 PosixPath('../data/pdf/LightRAG: Simple and Fast Retrieval-Augmented Generation.pdf'),
 PosixPath('../data/pdf/Open-RAG: Enhanced Retrieval-Augmented Reasoning with Open-Source Large Language Models.pdf'),
 PosixPath('../data/pdf/Fact, Fetch, and Reason: A Unified Evaluation of Retrieval-Augmented Generation.pdf')]