In [1]:
import os
import re
import sys
import time
import json
import math
import glob
import queue
import random
import string
import hashlib
import pymysql
import logging
import pathlib
import requests
import textwrap
import feedparser

import mysql.connector as mysql

from tqdm import tqdm
from lxml import etree
from dotenv import load_dotenv
from typing import List, Dict, Optional, Tuple
from pathlib import Path
from datetime import datetime, timezone
from dateutil import parser as dtparse
from itertools import islice
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

In [None]:
OUTPUT_DIR = pathlib.Path("D:/paper-crawl")

TOPICS = [
    "large language model",
    "transformer",
    "reinforcement learning",
    "computer vision",
    "natural language processing",
    "multimodal",
    "diffusion model",
    "graph neural network",
    "federated learning",
    "speech",
]

CATEGORIES = ["cs.AI", "cs.CL", "cs.LG", "cs.CV"]

MIN_YEAR: Optional[int] = None

PAGE_SIZE = 100

REQUEST_SLEEP_SECONDS = 3.0
DOWNLOAD_RETRIES = 3
DOWNLOAD_TIMEOUT = 60

ARXIV_API = "http://export.arxiv.org/api/query"
HEADERS = {
    "User-Agent": "arxiv-ai-crawler/1.0 (+https://arxiv.org; personal academic use)"
}

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO
)

def ensure_dir(path: str) -> None:
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

def sanitize_filename(name: str, max_len: int = 160) -> str:
    name = re.sub(r"[\\/*?\"<>|:]", "_", name)
    name = re.sub(r"\s+", " ", name).strip()
    if len(name) > max_len:
        name = name[:max_len].rstrip()
    return name

def build_query(term: Optional[str]) -> str:
    if CATEGORIES:
        cats = " OR ".join([f'cat:{c}' for c in CATEGORIES])
    else:
        cats = ""

    if term is None or term == "*" or term.strip() == "":
        q = f"({cats})" if cats else "all:*"
    else:
        term_part = f'all:"{term}"'
        q = f"({term_part}) AND ({cats})" if cats else term_part
    return q

def parse_arxiv_id(entry) -> str:
    raw = entry.get("id", "")
    m = re.search(r'arxiv\.org\/abs\/([^\s]+)$', raw)
    if m:
        return m.group(1)
    for link in entry.get("links", []):
        href = link.get("href", "")
        m2 = re.search(r'arxiv\.org\/abs\/([^\s]+)$', href)
        if m2:
            return m2.group(1)
    return "unknown-" + "".join(random.choices(string.ascii_lowercase + string.digits, k=8))

def get_pdf_link(entry) -> Optional[str]:
    for link in entry.get("links", []):
        if link.get("type") == "application/pdf":
            return link.get("href")
        if link.get("title", "").lower() == "pdf":
            return link.get("href")
    return None

def is_recent_enough(_: str, __: Optional[int]) -> bool:
    return True

def fetch_entries_for_topic(term: Optional[str], page_size: int, max_per_topic: Optional[int]) -> List[dict]:
    start = 0
    collected: List[dict] = []
    while True:
        params = {
            "search_query": build_query(term),
            "start": start,
            "max_results": page_size,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        logging.info(f"[{term}] Querying arXiv start={start}, size={page_size}")
        resp = requests.get(ARXIV_API, params=params, headers=HEADERS, timeout=60)
        time.sleep(REQUEST_SLEEP_SECONDS)  # tuân thủ rate-limit
        resp.raise_for_status()

        feed = feedparser.parse(resp.text)
        entries = feed.get("entries", [])
        if not entries:
            logging.info(f"[{term}] Hết kết quả.")
            break

        collected.extend(entries)
        logging.info(f"[{term}] Got {len(entries)} entries; total so far {len(collected)}")

        start += page_size

        if max_per_topic is not None and len(collected) >= max_per_topic:
            collected = collected[:max_per_topic]
            logging.info(f"[{term}] Reached max_per_topic={max_per_topic}.")
            break

    return collected

def download_pdf(url: str, dest_path: str) -> bool:
    for attempt in range(1, DOWNLOAD_RETRIES + 1):
        try:
            with requests.get(url, stream=True, headers=HEADERS, timeout=DOWNLOAD_TIMEOUT) as r:
                r.raise_for_status()
                total = int(r.headers.get("Content-Length", 0)) or None
                tmp_path = dest_path + ".part"
                with open(tmp_path, "wb") as f, tqdm(
                    total=total,
                    unit="B",
                    unit_scale=True,
                    desc=os.path.basename(dest_path),
                    leave=False
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1024 * 64):
                        if chunk:
                            f.write(chunk)
                            if total:
                                pbar.update(len(chunk))
                os.replace(tmp_path, dest_path)
            return True
        except Exception as e:
            logging.warning(f"Lỗi tải ({attempt}/{DOWNLOAD_RETRIES}) {url}: {e}")
            time.sleep(2 * attempt)
    return False

def main():
    ensure_dir(OUTPUT_DIR)
    meta_log = []

    for topic in TOPICS:
        entries = fetch_entries_for_topic(
            term=topic,
            page_size=PAGE_SIZE,
            max_per_topic=None
        )

        logging.info(f"[{topic}] Total kept entries: {len(entries)}")

        for e in entries:
            arxiv_id = parse_arxiv_id(e)
            title = e.get("title", "").replace("\n", " ").strip()
            published = e.get("published", "")
            pdf_url = get_pdf_link(e)

            safe_title = sanitize_filename(title)
            filename = f"{sanitize_filename(arxiv_id)} - {safe_title}.pdf"
            save_path = os.path.join(OUTPUT_DIR, filename)

            if not pdf_url:
                logging.info(f"Bỏ qua (không có PDF): {arxiv_id} | {title}")
                continue

            if os.path.exists(save_path):
                logging.info(f"Đã tồn tại: {filename}")
                continue

            logging.info(f"Tải: {arxiv_id} | {title} | {published}")
            ok = download_pdf(pdf_url, save_path)
            meta_log.append({
                "topic": "cs.CV_all",
                "arxiv_id": arxiv_id,
                "title": title,
                "published": published,
                "pdf_url": pdf_url,
                "saved": save_path if ok else None,
                "status": "downloaded" if ok else "failed"
            })

    # Nếu muốn lưu metadata:
    # meta_path = os.path.join(OUTPUT_DIR, f"_arxiv_download_meta_{int(time.time())}.json")
    # with open(meta_path, "w", encoding="utf-8") as f:
    #     json.dump(meta_log, f, ensure_ascii=False, indent=2)
    # logging.info(f"Đã lưu metadata: {meta_path}")

if __name__ == "__main__":
    main()

2025-09-27 21:48:42,958 [INFO] [large language model] Querying arXiv start=0, size=100
2025-09-27 21:48:47,280 [INFO] [large language model] Got 100 entries; total so far 100
2025-09-27 21:48:47,281 [INFO] [large language model] Querying arXiv start=100, size=100
2025-09-27 21:48:51,420 [INFO] [large language model] Hết kết quả.
2025-09-27 21:48:51,421 [INFO] [large language model] Total kept entries: 100
2025-09-27 21:48:51,422 [INFO] Tải: 2509.21310v1 | SAGE: A Realistic Benchmark for Semantic Understanding | 2025-09-25T15:27:15Z
2025-09-27 21:48:52,185 [INFO] Tải: 2509.21305v1 | Sycophancy Is Not One Thing: Causal Separation of Sycophantic Behaviors   in LLMs | 2025-09-25T15:19:39Z
2025-09-27 21:48:53,699 [INFO] Tải: 2509.21291v1 | VC-Agent: An Interactive Agent for Customized Video Dataset Collection | 2025-09-25T15:08:28Z        
2025-09-27 21:49:10,756 [INFO] Tải: 2509.21282v1 | It's Not You, It's Clipping: A Soft Trust-Region via Probability   Smoothing for LLM RL | 2025-09-25T1

In [None]:
docker run --rm -p 18070:8070 -p 18071:8071 lfoppiano/grobid:latest-crf

In [None]:
$inDir  = "E:\paper-low"
$outDir = "E:\data-low"
New-Item -ItemType Directory -Force -Path $outDir | Out-Null

Get-ChildItem -Path $inDir -Filter *.pdf | ForEach-Object {
    $pdfPath = $_.FullName
    $outPath = Join-Path $outDir ($_.BaseName + ".tei.xml")

    & curl.exe -s -S -X POST `
        -F "input=@$pdfPath" `
        -F "consolidateHeader=1" `
        -F "consolidateCitations=0" `
        "http://localhost:18070/api/processFulltextDocument" `
        -o "$outPath"
}