In [None]:
import re
import time
from typing import List, Dict, Tuple, Optional
import collections

import requests
import feedparser
import numpy as np

import fitz  # PDF

import chromadb
from chromadb.utils import embedding_functions

In [None]:
ARXIV_API = "http://export.arxiv.org/api/query"

def fetch_arxiv(search_query: str, max_results: int = 50) -> List[Dict]:
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }
    r = requests.get(ARXIV_API, params=params, timeout=30)
    r.raise_for_status()

    feed = feedparser.parse(r.text)
    docs = []
    for e in feed.entries:
        doc = {
            "id": e.get("id"),
            "title": (e.get("title") or "").replace("\n", " ").strip(),
            "abstract": (e.get("summary") or "").replace("\n", " ").strip(),
            "published": e.get("published"),
            "authors": ", ".join(a.name for a in e.get("authors", [])),
            "link": e.get("link"),
        }
        docs.append(doc)

   
    time.sleep(2)
    return docs

# Example: ML Papers
docs = fetch_arxiv("cat:cs.LG AND (all:retrieval OR all:RAG)", max_results=50)
docs[0]

{'id': 'http://arxiv.org/abs/2602.17654v1',
 'title': 'Mine and Refine: Optimizing Graded Relevance in E-commerce Search Retrieval',
 'abstract': 'We propose a two-stage "Mine and Refine" contrastive training framework for semantic text embeddings to enhance multi-category e-commerce search retrieval. Large scale e-commerce search demands embeddings that generalize to long tail, noisy queries while adhering to scalable supervision compatible with product and policy constraints. A practical challenge is that relevance is often graded: users accept substitutes or complements beyond exact matches, and production systems benefit from clear separation of similarity scores across these relevance strata for stable hybrid blending and thresholding. To obtain scalable policy consistent supervision, we fine-tune a lightweight LLM on human annotations under a three-level relevance guideline and further reduce residual noise via engagement driven auditing. In Stage 1, we train a multilingual Siame

In [None]:

def abs_to_pdf_url(arxiv_abs_url: str) -> str:
    # abs -> pdf
    return arxiv_abs_url.replace("/abs/", "/pdf/") + ".pdf"

def download_pdf_bytes(pdf_url: str) -> bytes:
    r = requests.get(pdf_url, timeout=60)
    r.raise_for_status()
    return r.content

def pdf_to_text(pdf_bytes: bytes, max_pages: int = 12) -> str:
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    texts = []
    n_pages = min(len(doc), max_pages) #cap pages for now
    for i in range(n_pages):
        page = doc.load_page(i)
        texts.append(page.get_text("text"))
    return "\n".join(texts)

def basic_clean(text: str) -> str:
    # remove excessive whitespace + hyphenated line breaks
    text = re.sub(r"-\n", "", text)          
    text = re.sub(r"\n{3,}", "\n\n", text)   
    text = re.sub(r"[ \t]{2,}", " ", text)   
    return text.strip()

In [None]:
def strip_references(text: str) -> str:
    # Common section headers
    m = re.search(r"\n(References|REFERENCES|Bibliography)\n", text)
    if m:
        return text[:m.start()].strip()
    return text

def build_fulltext_for_paper(d: Dict, max_pages: int = 12) -> Tuple[str, Dict]:
    """
    Returns (full_text, extra_meta). If PDF fails, returns abstract-only text.
    """
    base = (
        f"TITLE: {d['title']}\n"
        f"AUTHORS: {d['authors']}\n"
        f"PUBLISHED: {d['published']}\n"
        f"LINK: {d['link']}\n\n"
        f"ABSTRACT: {d['abstract']}\n"
    )

    pdf_url = abs_to_pdf_url(d["id"])
    extra_meta = {"pdf_url": pdf_url, "pdf_pages_indexed": 0, "pdf_ok": False}

    try:
        pdf_bytes = download_pdf_bytes(pdf_url)
        raw = pdf_to_text(pdf_bytes, max_pages=max_pages)
        body = strip_references(basic_clean(raw))
        extra_meta.update({"pdf_pages_indexed": max_pages, "pdf_ok": True})
        return base + "\n\nFULL_TEXT:\n" + body, extra_meta
    except Exception as e:
         #index abstract-only
        extra_meta.update({"pdf_error": str(e)})
        return base + "\n\nFULL_TEXT: (PDF fetch/parse failed, indexed abstract only)", extra_meta

In [None]:
#Chunking
def chunk_text(text: str, chunk_size: int = 1400, overlap: int = 250) -> List[str]:
    text = " ".join(text.split())  # normalize whitespace
    chunks = []
    i = 0
    while i < len(text):
        chunks.append(text[i:i + chunk_size])
        i += max(1, chunk_size - overlap)
    return chunks

In [None]:
#set up Chroma persistent collection with local embeddings
# Use Local embedding model for speed
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(
    name="arxiv_fullpdf_rag",
    embedding_function=emb_fn
)

collection.count()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

0

In [19]:
# Cell 6 — indexing: fetch PDF -> extract -> clean -> chunk -> upsert into Chroma
def index_papers_as_chunks(
    collection,
    papers: List[Dict],
    max_pages: int = 12,
    chunk_size: int = 1400,
    overlap: int = 250,
    throttle_seconds: float = 1.0,
) -> Dict:
    """
    Indexes papers into Chroma.
    Returns stats dict.
    """
    ids, texts, metas = [], [], []
    pdf_ok = 0
    pdf_fail = 0
    total_chunks = 0

    for p in papers:
        full_text, extra = build_fulltext_for_paper(p, max_pages=max_pages)
        if extra.get("pdf_ok"):
            pdf_ok += 1
        else:
            pdf_fail += 1

        chunks = chunk_text(full_text, chunk_size=chunk_size, overlap=overlap)

        for j, ch in enumerate(chunks):
            ids.append(f"{p['id']}::chunk{j}")
            texts.append(ch)
            metas.append({
                "paper_id": p["id"],
                "title": p["title"],
                "authors": p["authors"],
                "published": p["published"],
                "link": p["link"],
                "chunk": j,
                **extra
            })

        total_chunks += len(chunks)

        time.sleep(throttle_seconds)

    collection.upsert(ids=ids, documents=texts, metadatas=metas)

    return {
        "papers": len(papers),
        "pdf_ok": pdf_ok,
        "pdf_fail": pdf_fail,
        "chunks_indexed": total_chunks,
        "collection_count": collection.count(),
    }

stats = index_papers_as_chunks(collection, docs, max_pages=12)
stats

{'papers': 50,
 'pdf_ok': 50,
 'pdf_fail': 0,
 'chunks_indexed': 1652,
 'collection_count': 1652}

In [None]:

def retrieve(
    collection,
    query: str,
    k: int = 5
) -> List[Tuple[str, Dict, float]]:
    res = collection.query(
        query_texts=[query],
        n_results=k,
        include=["documents", "metadatas", "distances"]
    )
    docs_ = res["documents"][0]
    metas_ = res["metadatas"][0]
    dists_ = res["distances"][0]
    return list(zip(docs_, metas_, dists_))

def show_hits(hits: List[Tuple[str, Dict, float]], max_chars: int = 900):
    for i, (txt, meta, dist) in enumerate(hits, start=1):
        print(f"\n[{i}] {meta['title']}")
        print(f"    Authors: {meta['authors']}")
        print(f"    Published: {meta['published']}")
        print(f"    Link: {meta['link']}")
        if meta.get("pdf_url"):
            print(f"    PDF: {meta['pdf_url']}")
        print(f"    Distance: {dist:.4f}")
        print("    Evidence:")
        snippet = txt[:max_chars] + ("..." if len(txt) > max_chars else "")
        print("    " + snippet)

q = "What retrieval strategies are commonly used in RAG systems, and why?"
hits = retrieve(collection, q, k=5)
show_hits(hits)


[1] Retrieval-Augmented Foundation Models for Matched Molecular Pair Transformations to Recapitulate Medicinal Chemistry Intuition
    Authors: Bo Pan, Peter Zhiping Zhang, Hao-Wei Pang, Alex Zhu, Xiang Yu, Liying Zhang, Liang Zhao
    Published: 2026-02-18T18:27:21Z
    Link: https://arxiv.org/abs/2602.16684v1
    PDF: http://arxiv.org/pdf/2602.16684v1.pdf
    Distance: 0.5376
    Evidence:
    /i Recall/o Database Retrieval 28.57% 57.49% 0.00% REINVENT 4 7.36% 12.21% 1.87% MMPT-FM (Ours) 43.77% 76.45% 11.48% MMPT-RAG (Ours) 46.81% 81.35% 12.99% Table 2: Effect of beam size on average validity of MMPT-FM, averaged on the ChEMBL MMPT held-out test set. Beam 400 600 800 1000 1200 Avg Validity 0.9992 0.9991 0.9989 0.9988 0.9986 (Recall-o = 13.15%). MMPT-RAG further improves performance across all metrics, achieving the highest overall recall (49.21%), the strongest in-training-set recovery (62.08%), and the best outof-training-set recall (15.24%). The gains in Recall indicate that MMPT-

In [None]:
import os
os.environ["OPENAI_API_KEY"] = "Insert Key Here"

In [None]:
def build_context_for_llm(hits, max_chars_per_chunk=1200):

    blocks = []
    sources_meta = []
    for i, (txt, meta, dist) in enumerate(hits, start=1):
        snippet = txt[:max_chars_per_chunk]
        blocks.append(
            f"[{i}] TITLE: {meta.get('title','')}\n"
            f"AUTHORS: {meta.get('authors','')}\n"
            f"PUBLISHED: {meta.get('published','')}\n"
            f"LINK: {meta.get('link','')}\n"
            f"PDF: {meta.get('pdf_url','')}\n"
            f"EXCERPT:\n{snippet}\n"
        )
        sources_meta.append(meta)
    return "\n\n".join(blocks), sources_meta

In [None]:
from openai import OpenAI

client = OpenAI()

SYSTEM_INSTRUCTIONS = """You are a research assistant.
Answer the user's question using ONLY the provided sources.
If the sources do not contain the answer, say: "I don't know based on the provided documents."
When you use information from a source, cite it inline like [1] or [2].
Do not cite sources that you did not use.
Keep the answer concise and structured (bullets are fine)."""

def generate_rag_answer(question: str, hits, model: str = "gpt-4.1-mini"):
    context_text, sources_meta = build_context_for_llm(hits)

    user_message = f"""QUESTION:
{question}

SOURCES:
{context_text}
"""

    resp = client.responses.create(
        model=model,
        instructions=SYSTEM_INSTRUCTIONS,
        input=user_message,
        temperature=0.2,
    )

    answer_text = resp.output_text
    return answer_text, sources_meta

In [33]:
question = "What retrieval strategies are commonly used in RAG systems, and why?"
hits = retrieve(collection, question, k=5) 

answer, sources_meta = generate_rag_answer(question, hits)
print(answer)

Common retrieval strategies used in Retrieval-Augmented Generation (RAG) systems include:

- **Keyword-based (sparse) retrieval:** This method retrieves documents or evidence based on matching keywords from the query. It is straightforward but may miss semantically relevant information if exact keywords are not present.

- **Embedding-based (dense) retrieval:** This approach uses vector embeddings to represent queries and documents, retrieving items based on semantic similarity rather than exact keyword matches. It is more flexible and effective for capturing nuanced meanings [2].

- **Retrieval during generation:** Some architectures, such as RETRO, retrieve relevant information dynamically during the generation process to keep the model's context fresh and reduce hallucinations [2].

- **Self-RAG variants:** These allow the model to request or verify evidence mid-answer, improving the grounding and accuracy of generated responses [2].

- **Structured geometric retrieval:** In specifi

In [34]:
def print_citations(sources_meta):
    print("\nCITATIONS:")
    for i, m in enumerate(sources_meta, start=1):
        print(f"[{i}] {m.get('title','')}")
        print(f"    {m.get('link','')}")
        if m.get("pdf_url"):
            print(f"    PDF: {m.get('pdf_url')}")
        print()

print_citations(sources_meta)


CITATIONS:
[1] Retrieval-Augmented Foundation Models for Matched Molecular Pair Transformations to Recapitulate Medicinal Chemistry Intuition
    https://arxiv.org/abs/2602.16684v1
    PDF: http://arxiv.org/pdf/2602.16684v1.pdf

[2] CacheMind: From Miss Rates to Why -- Natural-Language, Trace-Grounded Reasoning for Cache Replacement
    https://arxiv.org/abs/2602.12422v1
    PDF: http://arxiv.org/pdf/2602.12422v1.pdf

[3] LongAudio-RAG: Event-Grounded Question Answering over Multi-Hour Long Audio
    https://arxiv.org/abs/2602.14612v1
    PDF: http://arxiv.org/pdf/2602.14612v1.pdf

[4] GRAIL: Geometry-Aware Retrieval-Augmented Inference with LLMs over Hyperbolic Representations of Patient Trajectories
    https://arxiv.org/abs/2602.12828v1
    PDF: http://arxiv.org/pdf/2602.12828v1.pdf

[5] Evolutionary Context Search for Automated Skill Acquisition
    https://arxiv.org/abs/2602.16113v1
    PDF: http://arxiv.org/pdf/2602.16113v1.pdf

