In [None]:
import os
import re
import time
import json
import math
import queue
import random
import string
import logging
import pathlib
import requests
import feedparser
import hashlib
import pymysql

from tqdm import tqdm
from datetime import datetime, timezone
from dateutil import parser as dtparse
from typing import List, Dict, Optional
from lxml import etree
from dotenv import load_dotenv

In [None]:
# Directory that downloaded PDFs are written to (placeholder path).
OUTPUT_DIR = pathlib.Path("...")

# Search phrases; main() runs one paginated arXiv query per entry.
TOPICS = [
    "large language model",
    "transformer",
    "reinforcement learning",
    "computer vision",
    "natural language processing",
    "multimodal",
    "diffusion model",
    "graph neural network",
    "federated learning",
    "speech",
]

# Intended cap on kept entries per topic.
# NOTE(review): main() passes max_per_topic=None, so this value is currently unused — confirm intent.
MAX_PER_TOPIC: Optional[int] = 100
# arXiv categories that build_query() ORs together and ANDs with the search phrase.
CATEGORIES = ["cs.AI", "cs.CL", "cs.LG", "cs.CV", "stat.ML"]
# Entries whose published year is below this are dropped.
MIN_YEAR = 2023
# Number of results requested per API page.
PAGE_SIZE = 100
# Pause (seconds) after every API request.
REQUEST_SLEEP_SECONDS = 3.0
# Number of attempts per PDF download.
DOWNLOAD_RETRIES = 3
# Timeout (seconds) passed to requests for each PDF download.
DOWNLOAD_TIMEOUT = 60

# arXiv query endpoint and the headers sent with every request.
ARXIV_API = "http://export.arxiv.org/api/query"
HEADERS = {
    "User-Agent": "arxiv-ai-crawler/1.0 (+https://arxiv.org; personal academic use)"
}

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO
)

def ensure_dir(path: str) -> None:
    """Create directory *path* along with any missing parent directories.

    Nothing happens when the directory already exists.
    """
    directory = pathlib.Path(path)
    directory.mkdir(parents=True, exist_ok=True)

def sanitize_filename(name: str, max_len: int = 160) -> str:
    """Return *name* rewritten so it can be used as a file name.

    Backslash, slash, asterisk, question mark, double quote, angle
    brackets, pipe and colon each become an underscore. Runs of whitespace
    collapse to a single space and the ends are trimmed. A result longer
    than *max_len* is cut to that length and right-stripped again.
    """
    replaced = re.sub(r"[\\/*?\"<>|:]", "_", name)
    collapsed = re.sub(r"\s+", " ", replaced).strip()
    if len(collapsed) <= max_len:
        return collapsed
    return collapsed[:max_len].rstrip()

# Build the query string for the arXiv API
def build_query(term: str) -> str:
    """Return the arXiv ``search_query`` value for the phrase *term*.

    The phrase is searched in all fields. When CATEGORIES is non-empty the
    phrase is ANDed with an OR-list of ``cat:`` filters built from it.
    """
    phrase = f'all:"{term}"'
    if not CATEGORIES:
        return phrase
    category_filter = " OR ".join(f'cat:{c}' for c in CATEGORIES)
    return f"({phrase}) AND ({category_filter})"

# Extract the arXiv ID from entry.id
def parse_arxiv_id(entry) -> str:
    """Return the arXiv identifier found in a feed *entry*.

    The entry's ``id`` is examined first, then the ``href`` of each item in
    ``links``; the first value ending in ``arxiv.org/abs/<id>`` wins. When
    none matches, a placeholder of the form ``unknown-`` plus eight random
    lowercase letters/digits is returned.
    """
    abs_pattern = re.compile(r'arxiv\.org\/abs\/([^\s]+)$')

    def candidate_urls():
        # Lazily yield the URLs in the order they should be tried.
        yield entry.get("id", "")
        for link in entry.get("links", []):
            yield link.get("href", "")

    for url in candidate_urls():
        found = abs_pattern.search(url)
        if found:
            return found.group(1)

    alphabet = string.ascii_lowercase + string.digits
    suffix = "".join(random.choices(alphabet, k=8))
    return "unknown-" + suffix

# Get the PDF link
def get_pdf_link(entry) -> Optional[str]:
    """Return the ``href`` of the first PDF link in *entry*, or None.

    A link counts as a PDF link when its ``type`` is ``application/pdf`` or
    its ``title`` equals ``pdf`` ignoring case.
    """
    for link in entry.get("links", []):
        is_pdf = (
            link.get("type") == "application/pdf"
            or link.get("title", "").lower() == "pdf"
        )
        if is_pdf:
            return link.get("href")
    return None

def is_recent_enough(published_str: str, min_year: int) -> bool:
    """Tell whether the date in *published_str* falls in *min_year* or later.

    Any failure while parsing or comparing yields False.
    """
    try:
        published_year = dtparse.parse(published_str).year
        verdict = published_year >= min_year
    except Exception:
        verdict = False
    return verdict

def fetch_entries_for_topic(term: str, min_year: int, page_size: int, max_per_topic: Optional[int]) -> List[dict]:
    """Collect arXiv feed entries for *term*, newest first.

    Pages through the API (sorted by submission date, descending) and keeps
    entries whose published year is at least *min_year*. Paging stops when
    a page is empty, when a page contains an entry older than *min_year*,
    or when *max_per_topic* entries have been collected.

    Args:
        term: Search phrase handed to build_query().
        min_year: Minimum published year an entry must have to be kept.
        page_size: Number of results requested per API call.
        max_per_topic: Upper bound on returned entries; None means no bound.

    Returns:
        The kept entries, at most *max_per_topic* of them when a bound is set.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    query = build_query(term)  # loop-invariant, so build it once
    start = 0
    collected = []
    while True:
        params = {
            "search_query": query,
            "start": start,
            "max_results": page_size,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        logging.info(f"[{term}] Querying arXiv start={start}, size={page_size}")
        resp = requests.get(ARXIV_API, params=params, headers=HEADERS, timeout=60)
        time.sleep(REQUEST_SLEEP_SECONDS)  # respect the rate limit
        resp.raise_for_status()

        feed = feedparser.parse(resp.text)
        entries = feed.get("entries", [])
        if not entries:
            break

        filtered = [e for e in entries if is_recent_enough(e.get("published", ""), min_year)]
        collected.extend(filtered)

        logging.info(f"[{term}] Got {len(entries)} entries, kept {len(filtered)} (>= {min_year})")

        # Results are sorted newest-first, so once a page contains an entry
        # that is too old, later pages cannot contain newer ones.
        if len(filtered) < len(entries):
            logging.info(f"[{term}] Encountered entries older than {min_year}; stopping pagination.")
            break

        start += page_size

        if max_per_topic is not None and len(collected) >= max_per_topic:
            break

    # Apply the cap on every exit path: previously the "older entries" break
    # skipped the truncation and could return more than max_per_topic items.
    if max_per_topic is not None:
        collected = collected[:max_per_topic]

    return collected

# Download a PDF
def download_pdf(url: str, dest_path: str) -> bool:
    """Stream *url* to *dest_path*, retrying up to DOWNLOAD_RETRIES times.

    Data is first written to ``dest_path + ".part"`` and moved into place
    only after the whole body has been received, so *dest_path* never holds
    a partial file. After a failed attempt the partial file is deleted.

    Args:
        url: Address of the PDF.
        dest_path: Final location of the downloaded file.

    Returns:
        True when the file was downloaded, False when every attempt failed.
    """
    tmp_path = dest_path + ".part"
    for attempt in range(1, DOWNLOAD_RETRIES + 1):
        try:
            with requests.get(url, stream=True, headers=HEADERS, timeout=DOWNLOAD_TIMEOUT) as r:
                r.raise_for_status()
                # A missing or zero Content-Length means the size is unknown.
                total = int(r.headers.get("Content-Length", 0)) or None
                with open(tmp_path, "wb") as f, tqdm(
                    total=total,
                    unit="B",
                    unit_scale=True,
                    desc=os.path.basename(dest_path),
                    leave=False
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1024 * 64):
                        if chunk:
                            f.write(chunk)
                            if total:
                                pbar.update(len(chunk))
                os.replace(tmp_path, dest_path)
            return True
        except Exception as e:
            logging.warning(f"Lỗi tải ({attempt}/{DOWNLOAD_RETRIES}) {url}: {e}")
            # Do not leave a truncated ".part" file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            # Back off before the next attempt; waiting after the last one
            # would only delay the caller.
            if attempt < DOWNLOAD_RETRIES:
                time.sleep(2 * attempt)
    return False

def main():
    """Fetch entries for every topic in TOPICS and download their PDFs.

    Each PDF is saved in OUTPUT_DIR as ``<arxiv id> - <title>.pdf``. Entries
    without a PDF link and files that already exist are skipped. One record
    per attempted download is appended to an in-memory list.
    """
    ensure_dir(OUTPUT_DIR)
    meta_log = []

    for topic in TOPICS:
        # NOTE(review): the cap is passed as None here rather than
        # MAX_PER_TOPIC — confirm that is intended.
        entries = fetch_entries_for_topic(
            term=topic,
            min_year=MIN_YEAR,
            page_size=PAGE_SIZE,
            max_per_topic=None,
        )
        logging.info(f"[{topic}] Total kept entries: {len(entries)}")

        for entry in entries:
            arxiv_id = parse_arxiv_id(entry)
            title = entry.get("title", "").replace("\n", " ").strip()
            pdf_url = get_pdf_link(entry)

            if not pdf_url:
                logging.info(f"Bỏ qua (không có PDF): {arxiv_id} | {title}")
                continue

            filename = f"{sanitize_filename(arxiv_id)} - {sanitize_filename(title)}.pdf"
            save_path = os.path.join(OUTPUT_DIR, filename)

            if os.path.exists(save_path):
                logging.info(f"Đã tồn tại: {filename}")
                continue

            logging.info(f"Tải: {arxiv_id} | {title}")
            succeeded = download_pdf(pdf_url, save_path)

            record = {
                "topic": topic,
                "arxiv_id": arxiv_id,
                "title": title,
                "published": entry.get("published", ""),
                "pdf_url": pdf_url,
            }
            if succeeded:
                record["saved"] = save_path
                record["status"] = "downloaded"
            else:
                record["saved"] = None
                record["status"] = "failed"
            meta_log.append(record)

    # Save the metadata for later lookup (currently disabled)
    # meta_path = os.path.join(OUTPUT_DIR, f"_arxiv_download_meta_{int(time.time())}.json")
    # with open(meta_path, "w", encoding="utf-8") as f:
    #     json.dump(meta_log, f, ensure_ascii=False, indent=2)
    # logging.info(f"Đã lưu metadata: {meta_path}")


# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()

# Start a GROBID container (removed again when it exits); container ports
# 8070 and 8071 are published on host ports 18070 and 18071.
docker run --rm -p 18070:8070 -p 18071:8071 lfoppiano/grobid:latest-crf

# Send every PDF in $inDir to the GROBID service on localhost:18070 and store
# each response in $outDir as "<pdf base name>.tei.xml".
$inDir  = "C:\Users\NCPC\OneDrive\Python\Search-Engine\paper-crawl"
$outDir = "C:\Users\NCPC\OneDrive\Python\Search-Engine\data-crawl"
# Create the output directory; -Force keeps this from failing when it exists.
New-Item -ItemType Directory -Force -Path $outDir | Out-Null

Get-ChildItem -Path $inDir -Filter *.pdf | ForEach-Object {
    $pdfPath = $_.FullName
    # Output file: same base name as the PDF, with a ".tei.xml" extension.
    $outPath = Join-Path $outDir ($_.BaseName + ".tei.xml")

    # POST the PDF as multipart form data; the response body goes to $outPath.
    # NOTE(review): curl's -F gives ',' and ';' in a file name special meaning —
    # confirm PDFs whose names contain them are uploaded correctly.
    & curl.exe -s -S -X POST `
        -F "input=@$pdfPath" `
        -F "consolidateHeader=1" `
        -F "consolidateCitations=0" `
        "http://localhost:18070/api/processFulltextDocument" `
        -o "$outPath"
}

In [None]:
# Namespace prefix map passed to every TEI XPath query.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}
# Directory scanned for *.tei.xml files (placeholder path).
tei_dir = pathlib.Path(r"...")

load_dotenv()  # automatically reads the .env file
# Database credentials come from the environment; each is None when unset.
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_NAME = os.getenv("DB_NAME")

# Autocommit is off, so writes only persist once conn.commit() is called.
conn = pymysql.connect(
    host=DB_HOST, user=DB_USER, password=DB_PASS,
    database=DB_NAME, charset="utf8mb4", autocommit=False
)
# Module-level cursor shared by upsert_doc() and upsert_chunk().
cur = conn.cursor()

def h(s: str) -> str:
    """Return the hexadecimal MD5 digest of *s* encoded as UTF-8."""
    encoded = s.encode("utf-8")
    digest = hashlib.md5(encoded)
    return digest.hexdigest()

def load_xml_lenient(path):
    """Parse the XML file at *path* with a forgiving parser.

    Returns the parsed tree, or None (after printing a warning) when lxml
    raises XMLSyntaxError.
    """
    parser_options = {
        "ns_clean": True,
        "remove_blank_text": True,
        # recover: tolerate duplicate-ID errors and non-standard XML
        "recover": True,
        "huge_tree": True,
        "resolve_entities": False,
    }
    lenient_parser = etree.XMLParser(**parser_options)
    try:
        with open(path, "rb") as handle:
            return etree.parse(handle, parser=lenient_parser)
    except etree.XMLSyntaxError as err:
        print(f"[WARN] Không parse được {path}: {err}")
        return None

def parse_tei(tei_path: pathlib.Path):
    """Extract metadata and body paragraphs from one TEI file.

    Returns a 5-tuple ``(title, authors, year, abstract, paras)``:
    *authors* is the author names joined with ``"; "``; *year* is the
    ``when`` attribute of the source date, or the date's text when that
    attribute is empty; *paras* is a list of ``(section, None, text)``
    tuples for body paragraphs of at least 40 characters. When the file
    cannot be loaded, every string is empty and *paras* is ``[]``.
    """
    tree = load_xml_lenient(str(tei_path))
    if tree is None:
        return "", "", "", "", []   # unreadable file: hand back empty fields

    def query(node, expression):
        # Evaluate a TEI-namespaced XPath expression against *node*.
        return node.xpath(expression, namespaces=TEI_NS)

    title = query(tree, "string(//tei:titleStmt/tei:title)").strip()

    author_names = []
    for pers_name in query(tree, "//tei:author/tei:persName"):
        author_names.append(" ".join(pers_name.itertext()).strip())

    date_value = query(tree, "string(//tei:sourceDesc//tei:date/@when)")
    if not date_value:
        date_value = query(tree, "string(//tei:sourceDesc//tei:date)") or ""
    year = date_value.strip()

    abstract = query(tree, "string(//tei:profileDesc/tei:abstract)").strip()

    paras = []
    for paragraph in query(tree, "//tei:text//tei:body//tei:p"):
        text = " ".join(paragraph.itertext()).strip()
        if len(text) >= 40:
            section = query(paragraph, "string(ancestor::tei:div[1]/@type)") or ""
            paras.append((section, None, text))

    return title, "; ".join(author_names), year, abstract, paras

def upsert_doc(doc_id, pdf_path, tei_path, title, authors, year, abstract):
    """Insert a row into ``docs``, or update it when the key already exists.

    *pdf_path* and *tei_path* are stored as strings. The statement runs on
    the module-level cursor and is not committed here.
    """
    statement = """
        INSERT INTO docs (doc_id, pdf_path, tei_path, title, authors, year, abstract)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
            pdf_path=VALUES(pdf_path), tei_path=VALUES(tei_path),
            title=VALUES(title), authors=VALUES(authors),
            year=VALUES(year), abstract=VALUES(abstract)
    """
    row = (doc_id, str(pdf_path), str(tei_path), title, authors, year, abstract)
    cur.execute(statement, row)

def upsert_chunk(chunk_id, doc_id, section, page, text):
    """Insert a row into ``chunks``, or update it when the key already exists.

    On a duplicate key only ``section``, ``page`` and ``text`` are
    overwritten. The statement runs on the module-level cursor and is not
    committed here.
    """
    statement = """
        INSERT INTO chunks (chunk_id, doc_id, section, page, text)
        VALUES (%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
            section=VALUES(section), page=VALUES(page), text=VALUES(text)
    """
    row = (chunk_id, doc_id, section, page, text)
    cur.execute(statement, row)

# Map each TEI file to its PDF by file name (change this if you have a different mapping)
pdf_dir = pathlib.Path(r"...")

# Extension carried by the TEI files that are globbed below.
TEI_SUFFIX = ".tei.xml"

# Load every TEI file into MySQL as one transaction: commit when all files
# succeed, roll back on any error, and always release the cursor/connection.
try:
    for tei_path in tqdm(list(tei_dir.glob("*" + TEI_SUFFIX))):
        # Path.stem removes only the final ".xml", leaving "<name>.tei" and
        # therefore a wrong "<name>.tei.pdf"; cut the whole suffix instead.
        # NOTE: doc_id is derived from this path, so rows written with the
        # old mapping get different ids than rows written from now on.
        stem = tei_path.name[:-len(TEI_SUFFIX)].replace(".fulltext", "")
        pdf_path = pdf_dir / (stem + ".pdf")
        doc_id = h(str(pdf_path))

        title, authors, year, abstract, paras = parse_tei(tei_path)
        # parse_tei() returns only empty values for a file it could not load;
        # skip it so an empty row does not overwrite an existing good one.
        if not (title or authors or year or abstract or paras):
            print(f"[WARN] Skipping {tei_path}: nothing could be extracted")
            continue

        upsert_doc(doc_id, pdf_path, tei_path, title, authors, year, abstract)

        for i, (sec, page, txt) in enumerate(paras):
            chunk_id = f"{doc_id}_{i}"
            upsert_chunk(chunk_id, doc_id, sec, page, txt)

    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    cur.close()
    conn.close()
print("Done TEI→MySQL.")