In [None]:
import sys
import arxiv
import time
import os
import requests
from tqdm import tqdm

# --- Configuration ---
# arXiv category tags that are OR-ed together to build the search query.
CATEGORIES = [
    "cs.AI",
    "cs.LG",
    "cs.CL",
    "cs.CV",
    "cs.RO",
    "cs.MA",
    "stat.ML",
    "cs.NE",
]

# Papers published in YEAR, in month MONTH or later, are kept.
YEAR = 2025
MONTH = 11

# Directory that receives the downloaded PDF files.
DOWNLOAD_DIR = "/media/mascit/datasets/arxiv/2025_1112"

# Pause, in seconds, taken after each successful PDF download.
SLEEP_TIME = 4



def download_pdf(paper, save_dir, timeout=60):
    """Download one paper's PDF into ``save_dir`` under a filesystem-safe name.

    The file is named ``<short id>_<first 50 chars of the cleaned title>.pdf``.
    The body is streamed into a temporary ``.part`` file and only renamed to
    the final name once the transfer has finished, so an interrupted download
    never leaves a truncated PDF that a later run would mistake for a
    completed one.

    Args:
        paper: Object exposing ``title``, ``pdf_url`` and ``get_short_id()``.
        save_dir: Existing directory in which the PDF is stored.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        True if a new file was downloaded; False if the file already existed,
        the paper has no PDF URL, or the download failed.

    Raises:
        SystemExit: If the server answers 403 (the client is likely blocked).
    """
    # Keep only letters, digits, spaces, dashes and underscores from the title.
    safe_title = "".join(ch for ch in paper.title if ch.isalnum() or ch in " -_").strip()
    # A "/" in the id (old-style arXiv ids may contain one) would otherwise be
    # interpreted as a directory separator.
    short_id = paper.get_short_id().replace("/", "_")
    filename = f"{short_id}_{safe_title[:50]}.pdf"
    file_path = os.path.join(save_dir, filename)

    if os.path.exists(file_path):
        return False  # Skip existing

    pdf_url = paper.pdf_url
    if not pdf_url:
        print(f"\n[!] No PDF URL for {short_id}. Skipping.")
        return False

    # Use export.arxiv.org for scripts; leave URLs that already point there alone.
    if "export.arxiv.org" not in pdf_url:
        pdf_url = pdf_url.replace("arxiv.org", "export.arxiv.org", 1)

    partial_path = file_path + ".part"
    try:
        response = requests.get(
            pdf_url,
            stream=True,
            headers={"User-Agent": "AI_Researcher/1.0"},
            timeout=timeout,
        )
        try:
            if response.status_code == 200:
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                # Publish the file only after the whole body was written.
                os.replace(partial_path, file_path)
                return True
            if response.status_code == 403:
                print("\n[!] 403 Forbidden. You might be blocked. Pause the script.")
                sys.exit(-1)
            print(f"\n[!] HTTP {response.status_code} for {pdf_url}. Skipping.")
        finally:
            # Streamed responses hold their connection until closed.
            response.close()
    except Exception as e:
        print(f"\nError: {e}")
    finally:
        # Drop any half-written temp file (failure or interruption).
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

    return False

def main():
    """Collect arXiv metadata for the configured categories and download the PDFs.

    Results are requested newest-first and filtered client-side: papers
    published in ``YEAR`` with a month of ``MONTH`` or later are kept, and
    scanning stops at the first paper published before ``YEAR``/``MONTH``.
    The kept papers are de-duplicated by ``entry_id`` and handed to
    ``download_pdf`` one at a time, pausing ``SLEEP_TIME`` seconds after each
    successful download.

    If the metadata request fails part-way, the error is printed and the
    download proceeds with whatever was collected up to that point.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    print(f"Fetching metadata for AI categories: {CATEGORIES}")

    # Construct query: cat:cs.AI OR cat:cs.LG OR ...
    query = " OR ".join(f"cat:{cat}" for cat in CATEGORIES)

    client = arxiv.Client(
        page_size=1000,
        # arXiv's API terms of use ask for at most one request every 3 seconds.
        delay_seconds=3,
        num_retries=5
    )

    search = arxiv.Search(
        query=query,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    target_papers = []

    # --- Step 1: Filter Metadata ---
    # We fetch results newest-first and filter by date client-side because
    # the API's date filtering is often unreliable.
    print("Scanning metadata (this may take a few minutes)...")
    try:
        for result in tqdm(client.results(search), desc="Fetching Metadata"):
            published = result.published
            # Compare (year, month) as a pair: testing the month on its own
            # would stop the scan at, e.g., a January paper of a later year.
            if (published.year, published.month) < (YEAR, MONTH):
                break  # Everything after this point is older than the target window

            # Papers from a later year are skipped, not collected.
            if published.year == YEAR:
                target_papers.append(result)
    except Exception as e:
        print(f"Metadata fetch interrupted: {e}")
        print(f"[!] Continuing with the {len(target_papers)} results collected so far; the list may be incomplete.")

    # Remove duplicates (papers often listed in multiple categories)
    # We use a dictionary keyed by entry_id to ensure uniqueness
    unique_papers = list({p.entry_id: p for p in target_papers}.values())

    print(f"\nFound {len(unique_papers)} unique AI papers from {YEAR}.")

    # --- Step 2: Download ---
    print(f"Starting downloads to {DOWNLOAD_DIR}...")

    for paper in tqdm(unique_papers, desc="Downloading PDFs"):
        success = download_pdf(paper, DOWNLOAD_DIR)
        # Only real downloads are throttled; skipped files cost no request.
        if success:
            time.sleep(SLEEP_TIME)

In [3]:
# Run the pipeline: fetch metadata, then download the PDFs.
main()


Fetching metadata for AI categories: ['cs.AI', 'cs.LG', 'cs.CL', 'cs.CV', 'cs.RO', 'cs.MA', 'stat.ML', 'cs.NE']
Scanning metadata (this may take a few minutes)...


Fetching Metadata: 10000it [01:15, 132.42it/s]


Metadata fetch interrupted: Page request resulted in HTTP 500 (https://export.arxiv.org/api/query?search_query=cat%3Acs.AI+OR+cat%3Acs.LG+OR+cat%3Acs.CL+OR+cat%3Acs.CV+OR+cat%3Acs.RO+OR+cat%3Acs.MA+OR+cat%3Astat.ML+OR+cat%3Acs.NE&id_list=&sortBy=submittedDate&sortOrder=descending&start=10000&max_results=1000)

Found 10000 unique AI papers from 2025.
Starting downloads to /media/mascit/datasets/arxiv/2025_1112...


Downloading PDFs:   0%|          | 35/10000 [03:33<16:53:24,  6.10s/it]


KeyboardInterrupt: 