# Install and Import Libraries

In [None]:
!pip install pypdf sentence-transformers faiss-cpu transformers faiss-cpu nltk tdqm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1322 sha256=821831e3b19a25504191d9d541cebd5a339d14f5dd7b6bfd5779302935c1da35
  Stored in directory: /root/.cache/pip/wheels/af/02/71/aae0f7ee738abf19498353918ddae0f90a0d6ceb337b0bbc91
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


In [None]:
import os
import re
from pypdf import PdfReader
from google.colab import files
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# PDFLoader

In [None]:
class PDFLoader:
    """Upload PDFs through Colab and turn each one into a cleaned text string.

    State filled in by `upload_pdfs()` and `load_documents()`:
      * file_names - sorted names of the uploaded files
      * documents  - cleaned text per file, in `file_names` order
      * doc_map    - file name -> cleaned text
    """

    # Regex clean-up passes, applied top to bottom by clean_text().
    _CLEANUP_PASSES = (
        (r'-\s*\n\s*', ''),            # re-join words hyphenated across a line break
        (r'\n+', ' '),                 # remaining newlines become spaces
        (r'\[\d+\]', ' '),             # drop numeric citation markers such as [12]
        (r'[^\x00-\x7F]+', ' '),       # replace runs of non-ASCII characters
        (r'(?<!\.)\.\.(?!\.)', '.'),   # exactly two dots -> one; "..." is left alone
        (r'\s+', ' '),                 # collapse whitespace runs
    )

    def __init__(self):
        self.doc_map = {}
        self.documents = []
        self.file_names = []

    def upload_pdfs(self):
        """Open the Colab upload dialog and remember the uploaded names, sorted.

        Returns:
            The sorted list of uploaded file names.
        """
        self.file_names = sorted(files.upload().keys())

        print("\nUploaded files (sorted):")
        for name in self.file_names:
            print(f" - {name}")
        return self.file_names

    def clean_text(self, text):
        """Normalise raw page text into a single-line, ASCII-only string.

        Non-breaking spaces are turned into plain spaces first, then every
        pass in `_CLEANUP_PASSES` is applied in order and the result is
        stripped of surrounding whitespace.
        """
        cleaned = text.replace("\u00a0", " ")
        for pattern, replacement in self._CLEANUP_PASSES:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()

    def extract_text(self, pdf_path):
        """Return the cleaned text of every page of `pdf_path`, space-joined.

        A page whose extraction or cleaning raises is skipped with a printed
        warning instead of aborting the whole document.
        """
        reader = PdfReader(pdf_path)
        page_texts = []

        for page_no, page in enumerate(reader.pages):
            try:
                raw = page.extract_text() or ""
                page_texts.append(self.clean_text(raw))
            except Exception as e:
                print(f"[Warning] Page {page_no} failed: {e}")

        return " ".join(page_texts).strip()

    def load_documents(self):
        """Extract and clean every uploaded PDF.

        Returns:
            List of cleaned document texts, in `file_names` order.

        Raises:
            Exception: if no file has been uploaded yet.
        """
        if not self.file_names:
            raise Exception("No PDF uploaded. Run upload_pdfs() first.")

        print("\nExtracting & deep-cleaning documents...\n")

        # Start from scratch so repeated calls do not accumulate duplicates.
        self.documents, self.doc_map = [], {}

        for name in self.file_names:
            print(f"Processing: {name}")
            cleaned = self.extract_text(name)
            self.documents.append(cleaned)
            self.doc_map[name] = cleaned
            print(f" → Cleaned length: {len(cleaned)} characters.\n")

        print("All documents loaded + cleaned successfully.")
        return self.documents

    def get_documents(self):
        """Return the list of cleaned document texts."""
        return self.documents

    def get_doc_map(self):
        """Return the mapping of file name to cleaned document text."""
        return self.doc_map

In [None]:
# Initialise the loader.
loader = PDFLoader()

# 1. Upload the PDF files (any number of files is accepted).
loader.upload_pdfs()

# 2. Extract and clean the text of every uploaded PDF.
docs = loader.load_documents()

# 3. Preview the first 1000 characters of each document. Iterating instead of
#    indexing docs[0], docs[1], docs[2] avoids an IndexError when fewer than
#    three PDFs were uploaded.
for doc_text in docs:
    print(doc_text[:1000])

Saving DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf to DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf
Saving RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf to RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf
Saving RLVE Scaling Up Reinforcement Learning for Language Models.pdf to RLVE Scaling Up Reinforcement Learning for Language Models.pdf

Uploaded files (sorted):
 - DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf
 - RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf
 - RLVE Scaling Up Reinforcement Learning for Language Models.pdf

Extracting & deep-cleaning documents...

Processing: DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf
 → Cleaned length: 39948 characters.

Processing: RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf
 → Cleaned length: 69112 c

# TextChunker

In [None]:
class SectionAwareChunker:
    """Split documents into overlapping sentence-window chunks, per section.

    Sections are found by searching the text for the configured heading words.
    The match is case-insensitive and whole-word, and it fires anywhere in the
    text, so a heading word used in running prose (e.g. "model") also starts a
    new section.

    After `process_documents()`:
      * chunks     - list of chunk strings
      * chunk_meta - one dict per chunk, aligned with `chunks`, with keys
                     doc_id, chunk_id, source, section, sentence_start,
                     sentence_end
    """

    DEFAULT_SECTION_HEADINGS = [
        "abstract", "introduction", "related work", "background",
        "method", "methods", "methodology", "approach", "model",
        "experiments", "experimental setup", "setup", "evaluation",
        "results", "analysis", "discussion", "conclusion", "conclusions",
        "future work", "limitations", "references", "acknowledgements",
        "acknowledgments", "appendix"
    ]

    def __init__(self, sentences_per_chunk=4, overlap_sentences=1, headings=None, verbose=True):
        """Configure the chunker.

        Args:
            sentences_per_chunk: window size in sentences (must be >= 1).
            overlap_sentences: sentences shared by consecutive windows; must
                satisfy 0 <= overlap_sentences < sentences_per_chunk.
            headings: heading words to split on; defaults to
                DEFAULT_SECTION_HEADINGS when None or empty.
            verbose: print progress while processing.

        Raises:
            ValueError: if the window/overlap combination cannot advance.
        """
        if sentences_per_chunk < 1:
            raise ValueError("sentences_per_chunk must be >= 1")
        if not 0 <= overlap_sentences < sentences_per_chunk:
            # With overlap >= window size the window would never move forward.
            raise ValueError("overlap_sentences must satisfy 0 <= overlap_sentences < sentences_per_chunk")

        self.sentences_per_chunk = sentences_per_chunk
        self.overlap_sentences = overlap_sentences
        self.headings = headings or self.DEFAULT_SECTION_HEADINGS
        self.verbose = verbose

        self.chunks = []
        self.chunk_meta = []

    def _find_headings(self, text):
        """Return (start, end, lowercase_label) for every heading-word match, by position."""
        matches = []
        pattern = r'(?i)\b(' + "|".join(re.escape(h) for h in self.headings) + r')\b'
        for m in re.finditer(pattern, text):
            label = m.group(1).strip()
            matches.append((m.start(), m.end(), label.lower()))
        matches.sort(key=lambda x: x[0])
        return matches

    def _section_boundaries(self, text):
        """Return (section_name, start, end) spans that cover `text`.

        Text without any heading match is one "body" span. Text before the
        first match becomes a "preface" span.
        """
        matches = self._find_headings(text)

        if not matches:
            return [("body", 0, len(text))]

        spans = []
        for i, (s, e, label) in enumerate(matches):
            # A section runs from its heading to the next heading (or the end).
            end = matches[i + 1][0] if i + 1 < len(matches) else len(text)
            spans.append((label.lower(), s, end))
        first_start = spans[0][1]
        if first_start > 0:
            spans.insert(0, ("preface", 0, first_start))
        return spans

    def _extract_section_text(self, text, start, end):
        """Return the stripped slice text[start:end]."""
        return text[start:end].strip()

    def _split_sentences(self, text):
        """Tokenise `text` into sentences, collapsing whitespace and dropping empties."""
        sents = sent_tokenize(text)
        sents = [re.sub(r'\s+', ' ', s).strip() for s in sents if s.strip()]
        return sents

    def _chunk_sentences_in_section(self, sentences, doc_name, doc_id, section_name, base_chunk_id):
        """Slide a sentence window over one section.

        Args:
            sentences: sentences of the section, in order.
            doc_name, doc_id, section_name: copied into each chunk's metadata.
            base_chunk_id: id assigned to the first chunk produced here.

        Returns:
            (chunks, meta, next_chunk_id) where next_chunk_id is the id the
            following section should start from.
        """
        chunks = []
        meta = []
        chunk_id = base_chunk_id
        total = len(sentences)

        # Always advance by at least one sentence so the loop terminates even
        # if the attributes were changed to an invalid combination after init.
        step = max(1, self.sentences_per_chunk - self.overlap_sentences)

        start = 0
        while start < total:
            end = start + self.sentences_per_chunk
            chunk_text = " ".join(sentences[start:end]).strip()
            if chunk_text:
                chunks.append(chunk_text)
                meta.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_id,
                    "source": doc_name,
                    "section": section_name,
                    "sentence_start": start,
                    "sentence_end": min(end, total),
                })
                chunk_id += 1

            if end >= total:
                # The window already reached the last sentence; another window
                # would contain only overlap sentences that were just emitted.
                break
            start += step

        return chunks, meta, chunk_id

    def process_documents(self, doc_map):
        """Chunk every document in `doc_map` (file name -> full text).

        Chunk ids restart at 0 for each document. Results replace any
        previously stored chunks.

        Returns:
            The list of all chunk strings.
        """
        self.chunks = []
        self.chunk_meta = []

        for doc_id, (doc_name, full_text) in enumerate(doc_map.items()):
            if self.verbose:
                print(f"[Doc {doc_id}] Section-aware chunking: {doc_name}")

            text = full_text if full_text else ""
            spans = self._section_boundaries(text)

            if self.verbose:
                print(f" → Detected {len(spans)} sections")

            chunks_before = len(self.chunks)
            next_chunk_id = 0
            for sec_name, s_idx, e_idx in spans:
                sec_text = self._extract_section_text(text, s_idx, e_idx)
                sents = self._split_sentences(sec_text)
                if not sents:
                    continue

                sec_chunks, sec_meta, next_chunk_id = self._chunk_sentences_in_section(
                    sents, doc_name, doc_id, sec_name, next_chunk_id
                )

                self.chunks.extend(sec_chunks)
                self.chunk_meta.extend(sec_meta)

                if self.verbose:
                    print(f"   - Section '{sec_name}': {len(sec_chunks)} chunks")

            if self.verbose:
                print(f" → Total chunks for doc: {len(self.chunks) - chunks_before}\n")

        if self.verbose:
            print(f"All documents processed. Total chunks: {len(self.chunks)}")
        return self.chunks

    def get_chunks(self):
        """Return the chunk strings produced by the last `process_documents()` call."""
        return self.chunks

    def get_chunk_meta(self):
        """Return the per-chunk metadata dicts, aligned with `get_chunks()`."""
        return self.chunk_meta


In [None]:
# Initialise the chunker.
sac = SectionAwareChunker(sentences_per_chunk=4, overlap_sentences=1, verbose=True)

# Chunk every document held by the loader.
chunks = sac.process_documents(loader.get_doc_map())

# Inspect one example. Guarded so that documents yielding no text (and
# therefore no chunks) do not raise an IndexError here.
print("Total chunks:", len(chunks))
if chunks:
    print(chunks[0][:500])
    print(sac.get_chunk_meta()[0])


[Doc 0] Section-aware chunking: DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf
 → Detected 88 sections
   - Section 'preface': 1 chunks
   - Section 'abstract': 1 chunks
   - Section 'results': 1 chunks
   - Section 'model': 2 chunks
   - Section 'model': 1 chunks
   - Section 'introduction': 2 chunks
   - Section 'results': 1 chunks
   - Section 'model': 1 chunks
   - Section 'analysis': 1 chunks
   - Section 'results': 1 chunks
   - Section 'model': 1 chunks
   - Section 'results': 6 chunks
   - Section 'model': 1 chunks
   - Section 'model': 4 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - Section 'approach': 1 chunks
   - Section 'model': 3 chunks
   - Section 'experiments': 8 chunks
   - Section 'results': 1 chunks
   - Section 'model': 2 chunks
   - Section 'model': 1 chunks
   - Section 'model': 1 chunks
   - S

# EmbeddingGenerator

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import time

class EmbeddingGenerator:
    """Encode text chunks into float32 embeddings with a SentenceTransformer model.

    After `encode_chunks()` the instance keeps the embeddings together with
    the chunk texts and metadata they were computed from.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32, use_auth_token=None):
        """Load the embedding model.

        Args:
            model_name: model identifier passed to SentenceTransformer.
            batch_size: number of chunks encoded per batch.
            use_auth_token: forwarded unchanged to SentenceTransformer.
        """
        print(f"Loading embedding model: {model_name} ...")
        self.model = SentenceTransformer(model_name, use_auth_token=use_auth_token)
        print("Model loaded successfully!\n")

        self.batch_size = batch_size

        self.embeddings = None
        self.chunk_texts = None
        self.chunk_meta = None

    def encode_chunks(self, chunk_texts, chunk_meta):
        """Encode `chunk_texts` batch by batch and store the result.

        Args:
            chunk_texts: sequence of chunk strings.
            chunk_meta: metadata aligned with `chunk_texts`; stored as given.

        Returns:
            float32 array with one row per chunk.

        Raises:
            ValueError: if `chunk_texts` is empty.
        """
        num_chunks = len(chunk_texts)
        if num_chunks == 0:
            # np.vstack([]) would fail later with a far less helpful message.
            raise ValueError("No chunks to encode: chunk_texts is empty.")

        print("Encoding chunks into embeddings...")

        all_embeddings = []
        t0 = time.time()

        for start_idx in range(0, num_chunks, self.batch_size):
            end_idx = min(start_idx + self.batch_size, num_chunks)
            batch = chunk_texts[start_idx:end_idx]

            # Forward the configured batch size; otherwise encode() re-splits
            # the batch using its own default and the setting has no effect.
            batch_emb = self.model.encode(
                batch,
                batch_size=self.batch_size,
                convert_to_numpy=True,
            )
            all_embeddings.append(batch_emb)

            print(f"  Encoded batch {start_idx} → {end_idx}")

        self.embeddings = np.vstack(all_embeddings).astype("float32")
        self.chunk_texts = chunk_texts
        self.chunk_meta = chunk_meta

        print("\nEmbedding generation complete!")
        print(f"Shape: {self.embeddings.shape}")
        print(f"Time taken: {time.time() - t0:.2f} seconds\n")

        return self.embeddings

    def get_embeddings(self):
        """Return the stored embeddings.

        Raises:
            Exception: if `encode_chunks()` has not been called yet.
        """
        if self.embeddings is None:
            raise Exception("No embeddings generated yet.")
        return self.embeddings

    def get_chunk_texts(self):
        """Return the chunk texts passed to the last `encode_chunks()` call (or None)."""
        return self.chunk_texts

    def get_chunk_meta(self):
        """Return the chunk metadata passed to the last `encode_chunks()` call (or None)."""
        return self.chunk_meta


In [None]:
# 1. Collect the chunker's output: texts plus their aligned metadata.
chunk_texts, chunk_meta = sac.get_chunks(), sac.get_chunk_meta()

# 2. Load the embedding model and encode every chunk.
generator = EmbeddingGenerator(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)
embeddings = generator.encode_chunks(chunk_texts, chunk_meta)

# 3. Sanity-check the result: matrix shape, first metadata record, first text.
print(embeddings.shape)
first_meta = generator.get_chunk_meta()[0]
print(first_meta)
first_text = generator.get_chunk_texts()[0]
print(first_text[:300])


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!

Encoding chunks into embeddings...
  Encoded batch 0 → 32
  Encoded batch 32 → 64
  Encoded batch 64 → 96
  Encoded batch 96 → 128
  Encoded batch 128 → 160
  Encoded batch 160 → 192
  Encoded batch 192 → 224
  Encoded batch 224 → 256
  Encoded batch 256 → 288
  Encoded batch 288 → 320
  Encoded batch 320 → 352
  Encoded batch 352 → 384
  Encoded batch 384 → 416
  Encoded batch 416 → 448
  Encoded batch 448 → 480
  Encoded batch 480 → 512
  Encoded batch 512 → 544
  Encoded batch 544 → 576
  Encoded batch 576 → 608
  Encoded batch 608 → 640
  Encoded batch 640 → 672
  Encoded batch 672 → 696

Embedding generation complete!
Shape: (696, 384)
Time taken: 57.91 seconds

(696, 384)
{'doc_id': 0, 'chunk_id': 0, 'source': 'DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf', 'section': 'preface', 'sentence_start': 0, 'sentence_end': 1}
DAPO: An Open-Source LLM Reinforcement Learning System at Scale 1ByteDance Seed 2Institute for AI Industry Researc

# VectorIndex

In [None]:
import faiss
import numpy as np
import json
import os

class VectorIndexFAISS:
    """Flat FAISS index bundled with the chunk texts and metadata it indexes.

    `metric='l2'` uses IndexFlatL2 (scores are distances, lower is closer).
    `metric='cosine'` uses IndexFlatIP over L2-normalised vectors (scores are
    inner products, higher is closer).
    """

    def __init__(self, metric='l2', use_gpu=False, gpu_device=0):
        """Store the configuration; no index exists until build() or load()."""
        assert metric in ('l2', 'cosine'), "metric must be 'l2' or 'cosine'"
        self.metric = metric
        self.use_gpu = use_gpu
        self.gpu_device = gpu_device

        self.index = None
        self.dimension = None
        self.embeddings = None
        self.chunk_texts = None
        self.chunk_meta = None

    def _make_index(self, dim):
        """Create an empty flat index of dimension `dim` matching `self.metric`."""
        if self.metric == 'l2':
            index = faiss.IndexFlatL2(dim)
        else:
            index = faiss.IndexFlatIP(dim)
        return index

    def _maybe_move_to_gpu(self, index):
        """Move index to GPU if requested and available."""
        if self.use_gpu:
            try:
                res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(res, self.gpu_device, index)
                print(f"[FAISS] Index moved to GPU:{self.gpu_device}")
            except Exception as e:
                print(f"[FAISS] GPU move failed, falling back to CPU. Error: {e}")
                # Keep the flag truthful so save() does not attempt a
                # GPU-to-CPU conversion on an index that never left the CPU.
                self.use_gpu = False
        return index

    def build(self, embeddings, chunk_texts, chunk_meta, normalize=False):
        """Create the index from scratch.

        Args:
            embeddings: 2D numpy array of shape (n, dim).
            chunk_texts: n chunk strings aligned with the rows.
            chunk_meta: n metadata records aligned with the rows.
            normalize: L2-normalise the stored vectors (always done for cosine).

        Returns:
            The FAISS index.
        """
        assert isinstance(embeddings, np.ndarray), "embeddings must be numpy array"
        assert embeddings.ndim == 2, "embeddings must be 2D array (n, dim)"
        n, dim = embeddings.shape
        assert len(chunk_texts) == n and len(chunk_meta) == n, "chunk_texts and chunk_meta must align with embeddings"

        self.dimension = dim
        # Private, C-contiguous float32 copy: normalize_L2 works in place and
        # must never modify the caller's array.
        self.embeddings = np.array(embeddings, dtype='float32', order='C')
        self.chunk_texts = list(chunk_texts)
        self.chunk_meta = list(chunk_meta)

        if normalize or self.metric == 'cosine':
            faiss.normalize_L2(self.embeddings)

        index = self._make_index(self.dimension)
        index = self._maybe_move_to_gpu(index)

        index.add(self.embeddings)
        self.index = index

        print(f"[FAISS] Built index: vectors={self.index.ntotal}, dim={self.dimension}")
        return self.index

    def add(self, new_embeddings, new_texts, new_meta, normalize=False):
        """Append vectors (with their texts and metadata) to an existing index.

        Returns:
            Total number of vectors in the index after the addition.
        """
        assert self.index is not None, "Index not built. Call build() first."

        emb = np.array(new_embeddings, dtype='float32', order='C')
        assert emb.ndim == 2 and emb.shape[1] == self.dimension, "Dimension mismatch."
        # Same alignment rule as build(): one text and one metadata per vector,
        # otherwise search() would return the wrong chunk for a FAISS id.
        assert len(new_texts) == emb.shape[0] and len(new_meta) == emb.shape[0], \
            "new_texts and new_meta must align with new_embeddings"

        if normalize or self.metric == 'cosine':
            faiss.normalize_L2(emb)

        self.index.add(emb)

        if self.embeddings is None:
            self.embeddings = emb.copy()
        else:
            self.embeddings = np.vstack([self.embeddings, emb])

        self.chunk_texts.extend(new_texts)
        self.chunk_meta.extend(new_meta)

        print(f"[FAISS] Added {len(new_texts)} vectors. Total now: {self.index.ntotal}")
        return self.index.ntotal

    def search(self, query_embeddings, top_k=5, return_distance=True):
        """Return the `top_k` nearest chunks for each query vector.

        Args:
            query_embeddings: one vector (dim,) or a batch (q, dim).
            top_k: neighbours requested per query.
            return_distance: include the raw FAISS score under "score".

        Returns:
            One list per query; each hit is a dict with "faiss_index", "chunk",
            "metadata" and, when `return_distance` is true, "score".
        """
        assert self.index is not None, "Index not built."
        # Copy so that in-place normalisation never touches the caller's array.
        q_emb = np.array(query_embeddings, dtype='float32', order='C')
        if q_emb.ndim == 1:
            q_emb = q_emb.reshape(1, -1)
        assert q_emb.ndim == 2 and q_emb.shape[1] == self.dimension, "Query embedding dimension mismatch."

        if self.metric == 'cosine':
            faiss.normalize_L2(q_emb)

        distances, indices = self.index.search(q_emb, top_k)

        results_all = []
        for qi in range(distances.shape[0]):
            row = []
            for pos, idx in enumerate(indices[qi]):
                # Out-of-range ids (e.g. negative placeholders) are skipped.
                if idx < 0 or idx >= len(self.chunk_texts):
                    continue
                hit = {"faiss_index": int(idx)}
                if return_distance:
                    hit["score"] = float(distances[qi, pos])
                hit["chunk"] = self.chunk_texts[idx]
                hit["metadata"] = self.chunk_meta[idx]
                row.append(hit)
            results_all.append(row)
        return results_all

    def save(self, dirpath):
        """Write index.faiss, embeddings.npy, meta.json and chunks.txt into `dirpath`."""
        assert self.index is not None, "Index not built."
        os.makedirs(dirpath, exist_ok=True)
        idx_path = os.path.join(dirpath, "index.faiss")

        # Best effort: convert a GPU index back to CPU before writing it.
        try:
            cpu_index_for_write = faiss.index_gpu_to_cpu(self.index) if self.use_gpu else self.index
        except Exception:
            cpu_index_for_write = self.index

        faiss.write_index(cpu_index_for_write, idx_path)
        if self.embeddings is not None:
            np.save(os.path.join(dirpath, "embeddings.npy"), self.embeddings)
        if self.chunk_meta is not None:
            with open(os.path.join(dirpath, "meta.json"), "w", encoding="utf-8") as f:
                json.dump(self.chunk_meta, f, ensure_ascii=False, indent=2)
        if self.chunk_texts is not None:
            with open(os.path.join(dirpath, "chunks.txt"), "w", encoding="utf-8") as f:
                for c in self.chunk_texts:
                    # One chunk per line: flatten every kind of line break
                    # (not only "\n") so load() reads back one line per chunk.
                    f.write(" ".join(c.splitlines()) + "\n")

        print(f"[FAISS] Saved index+metadata to {dirpath}")


    def load(self, dirpath):
        """Restore an index (plus any saved embeddings/metadata/chunks) from `dirpath`.

        Raises:
            FileNotFoundError: if `dirpath` has no index.faiss.
        """
        idx_path = os.path.join(dirpath, "index.faiss")
        if not os.path.exists(idx_path):
            raise FileNotFoundError(f"No index.faiss found at {idx_path}")

        index = faiss.read_index(idx_path)
        self.index = index
        self.dimension = index.d
        # Follow the stored index type so that search() normalises queries
        # exactly when the index was built for cosine similarity.
        self.metric = 'cosine' if index.metric_type == faiss.METRIC_INNER_PRODUCT else 'l2'

        emb_path = os.path.join(dirpath, "embeddings.npy")
        if os.path.exists(emb_path):
            self.embeddings = np.load(emb_path)

        meta_path = os.path.join(dirpath, "meta.json")
        if os.path.exists(meta_path):
            with open(meta_path, "r", encoding="utf-8") as f:
                self.chunk_meta = json.load(f)

        chunks_path = os.path.join(dirpath, "chunks.txt")
        if os.path.exists(chunks_path):
            with open(chunks_path, "r", encoding="utf-8") as f:
                self.chunk_texts = [line.strip() for line in f]

        print(f"[FAISS] Loaded index from {dirpath}. Vectors={self.index.ntotal}, dim={self.dimension}")


    def move_to_gpu(self, gpu_device=0):
        """Move the current index to `gpu_device`.

        Raises:
            Exception: if there is no index yet.
            RuntimeError: if FAISS cannot move the index to the GPU.
        """
        if self.index is None:
            raise Exception("No index to move. Build or load index first.")
        try:
            res = faiss.StandardGpuResources()
            self.index = faiss.index_cpu_to_gpu(res, gpu_device, self.index)
            self.use_gpu = True
            self.gpu_device = gpu_device
            print(f"[FAISS] Index moved to GPU:{gpu_device}")
        except Exception as e:
            raise RuntimeError(f"Failed to move FAISS index to GPU: {e}") from e

In [None]:
# Assumes `generator` is an EmbeddingGenerator that has already encoded the chunks.
embeddings = generator.get_embeddings()         # (n, d) numpy float32
chunk_texts = generator.get_chunk_texts()       # list of length n
chunk_meta = generator.get_chunk_meta()         # list of length n

# 1) Build the index.
vindex = VectorIndexFAISS(metric='l2', use_gpu=False)
vindex.build(embeddings, chunk_texts, chunk_meta)

# 2) Search with a single query.
q = "What is the core contribution of DAPO?"
q_emb = generator.model.encode([q], convert_to_numpy=True).astype('float32')
results = vindex.search(q_emb, top_k=5)

for rank, hit in enumerate(results[0], start=1):
    hit_meta = hit['metadata']
    print(f"#{rank} score={hit['score']:.4f} src={hit_meta['source']} chunk_id={hit_meta['chunk_id']}")
    print(hit['chunk'][:300], "...\n")

# 3) Incremental add (optional).
# new_embs, new_texts, new_meta = ...
# vindex.add(new_embs, new_texts, new_meta)

# 4) Persist the index to disk.
vindex.save("./faiss_store")

# 5) Reload it later into a fresh instance.
vindex2 = VectorIndexFAISS(metric='l2', use_gpu=False)
vindex2.load("./faiss_store")


[FAISS] Built index: vectors=696, dim=384
#1 score=0.9831 src=DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf chunk_id=4
These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL. Date: March 17, 2025 Correspondence: zhouhao@air.tsinghua.edu.cn, wangmingxuan.89@bytedance.com Project Page: https://dapo-sia.github.io/ ahttps://github.com/volcengine/verl 0 2000 4 ...

#2 score=1.2157 src=RLVE Scaling Up Reinforcement Learning for Language Models.pdf chunk_id=241
C. RL Training Details We run our RL training using the slime framework4 and adopt the DAPO algorithm (Yu et al., 2025), a variant of GRPO (Shao et al., 2024). Unless otherwise specified, the ...

#3 score=1.3212 src=DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf chunk_id=85
Model AIME24 avg@32 DeepSeek-R1-Zero-Qwen-32B 47 Naive GRPO 30 + Overlong Filtering 36 + Clip-Higher 38 + Soft Overlong Punishment 41 + Token-level Loss 42 + Dyna

# Retriever

In [None]:
class Retriever:
    """Embed a query, fetch candidates from the vector index and re-rank them.

    Re-ranking multiplies a similarity in (0, 1] / [0, 1] by a per-section
    weight and drops chunks from sections listed in `skip_sections`.
    """

    def __init__(
        self,
        embedding_model,
        vector_index,
        top_k=5,
        rewrite_query=False
    ):
        """
        Args:
            embedding_model: object with an `encode(list_of_str, convert_to_numpy=True)` method.
            vector_index: object with `search(q_emb, k)`; its optional `metric`
                attribute ('l2' or 'cosine') tells how scores are interpreted.
            top_k: default number of results returned by search().
            rewrite_query: when true, pass the query through rewrite().
        """
        self.model = embedding_model
        self.index = vector_index
        self.top_k = top_k
        self.rewrite_query = rewrite_query

        # Multiplicative boost per section; sections not listed get 1.0.
        self.section_priority = {
            "abstract": 3.0,
            "introduction": 2.5,
            "method": 2.2,
            "methods": 2.2,
            "approach": 2.0,
            "model": 1.8,
            "analysis": 1.8,
            "results": 1.6,
            "experiments": 1.4,
            "evaluation": 1.3,
            "conclusion": 1.3,
            "discussion": 1.2,
        }

        # Both spellings are listed because the chunker can emit either one.
        self.skip_sections = {"references", "acknowledgments", "acknowledgements", "appendix"}

    def normalize_query(self, query):
        """Trim, collapse internal whitespace and lowercase the query."""
        query = query.strip()
        query = re.sub(r"\s+", " ", query)
        return query.lower()

    def rewrite(self, query):
        """Query-rewriting hook; currently returns the query unchanged."""
        return query

    def search(self, query, top_k=None):
        """Return up to `top_k` re-ranked results for `query`.

        Three times `top_k` candidates are requested from the index so that
        enough remain after skipped sections are filtered out.
        """
        if top_k is None:
            top_k = self.top_k

        clean_q = self.normalize_query(query)
        if self.rewrite_query:
            clean_q = self.rewrite(clean_q)
        q_emb = self.model.encode(
            [clean_q],
            convert_to_numpy=True
        ).astype("float32")
        raw_results = self.index.search(q_emb, top_k * 3)
        # The index returns one result list per query; unwrap the single query.
        if isinstance(raw_results, list) and len(raw_results) == 1 and isinstance(raw_results[0], list):
            raw_results = raw_results[0]
        reranked = self._rerank(raw_results)

        return reranked[:top_k]

    def _similarity(self, raw_score):
        """Map a raw index score to a similarity where higher is always better."""
        if getattr(self.index, "metric", "l2") == "cosine":
            # Inner product of unit vectors lies in [-1, 1] and is already
            # "higher is better"; rescale to [0, 1]. Applying 1 / (1 + score)
            # here would invert the ranking and divide by zero at score == -1.
            return (1.0 + raw_score) / 2.0
        # L2 distance: 0 is a perfect match, larger is worse.
        return 1 / (1 + raw_score)

    def _rerank(self, results):
        """Score, filter and sort candidates.

        Accepts hit dicts (keys "metadata", "chunk", "score") or
        (score, index_position) pairs. Returns dicts with "chunk", "metadata",
        "distance" (the raw score), "similarity" and "final_score", sorted by
        "final_score" in descending order.
        """
        enhanced = []

        for item in results:

            if isinstance(item, dict):
                meta = item["metadata"]
                chunk = item["chunk"]
                faiss_distance = item["score"]
            elif isinstance(item, (list, tuple)) and len(item) == 2:
                faiss_distance, idx = item
                meta = self.index.chunk_meta[idx]
                chunk = self.index.chunk_texts[idx]
            else:
                print("Unknown result format:", item)
                continue

            section = meta["section"].lower()
            if section in self.skip_sections:
                continue
            sim = self._similarity(faiss_distance)
            sec_bonus = self.section_priority.get(section, 1.0)
            final_score = sim * sec_bonus

            enhanced.append({
                "chunk": chunk,
                "metadata": meta,
                "distance": faiss_distance,
                "similarity": sim,
                "final_score": final_score
            })

        enhanced = sorted(enhanced, key=lambda x: x["final_score"], reverse=True)
        return enhanced

In [None]:
# Wire the retriever to the already-loaded embedding model and the built index.
retriever = Retriever(embedding_model=generator.model, vector_index=vindex, top_k=5, rewrite_query=False)

# Run one sample query and show the re-ranked hits.
query = "Explain reinforcement learning algorithms for large language models."
results = retriever.search(query, top_k=5)

for hit in results:
    hit_meta = hit["metadata"]
    print("=" * 90)
    print("SCORE:", hit["final_score"])
    print("SECTION:", hit_meta["section"])
    print("SOURCE:", hit_meta["source"])
    print("CHUNK ID:", hit_meta["chunk_id"])
    print(hit["chunk"][:400], "...")



SCORE: 1.495981081669297
SECTION: introduction
SOURCE: RLVE Scaling Up Reinforcement Learning for Language Models.pdf
CHUNK ID: 6
Introduction Scaling up reinforcement learning (RL) has shown strong potential to improve language models (LMs) (Ouyang et al., 2022; OpenAI, 2024; DeepSeek-AI, 2025; Google DeepMind, 2025), but models improvement increasingly satu*Equal contribution 1University of Washington 2Allen Institute for Artificial Intelligence 3University of Illinois UrbanaChampaign 4Princeton University 5LMSYS Org. Corr ...
SCORE: 1.3456299217191456
SECTION: methods
SOURCE: RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf
CHUNK ID: 11
methods for large language models (LLMs) have progressed dramatically over the past few years, from largely manual supervised fine-tuning (SFT) techniques that rely on a combination of manual data curation (Radford et al., 2018; Brown et al., 2020; Shengyu et al., 2023) to reinforcement learning (RL) ...
SCORE: 1

# AnswerFormatter

In [None]:
class AnswerFormatter:
    """Render retriever results either as a dict or as a plain-text report.

    Each result is expected to carry "chunk", "final_score" and a "metadata"
    dict with "source", "section" and "chunk_id".
    """

    def __init__(self):
        pass

    def format_json(self, query, results):
        """Return a JSON-serialisable dict with the query and one entry per result."""
        formatted = {
            "query": query,
            "top_k": len(results),
            "answers": []
        }

        for item in results:
            meta = item["metadata"]

            formatted["answers"].append({
                "score": item["final_score"],
                "source": meta["source"],
                "section": meta["section"],
                "chunk_id": meta["chunk_id"],
                "text": item["chunk"]
            })

        return formatted

    def format_pretty(self, query, results):
        """Return a human-readable report: ranked excerpts, then the cited sources.

        Excerpts are cut at 600 characters. Cited sources are de-duplicated
        and listed in the order in which they first appear in `results`.
        """
        lines = []
        lines.append("=" * 100)
        lines.append(f"QUERY: {query}")
        lines.append("=" * 100)

        for i, item in enumerate(results, 1):
            meta = item["metadata"]

            lines.append(f"\n[{i}]  Score: {item['final_score']:.4f}")
            lines.append(f"     Source : {meta['source']}")
            lines.append(f"     Section: {meta['section']}")
            lines.append(f"     ChunkID: {meta['chunk_id']}")
            lines.append("-" * 100)
            lines.append(item["chunk"][:600] + " ...")

        lines.append("\n" + "=" * 100)
        lines.append("CITED SOURCES:")
        # dict.fromkeys de-duplicates while keeping first-seen order; iterating
        # a set of strings would reorder the citations from one run to the next.
        cited = dict.fromkeys(
            f"{item['metadata']['source']} "
            f"(section={item['metadata']['section']}, chunk={item['metadata']['chunk_id']})"
            for item in results
        )

        for c in cited:
            lines.append(" - " + c)

        lines.append("=" * 100)

        return "\n".join(lines)


In [None]:
# Build the formatter and fetch the top-5 hits for a sample question.
formatter = AnswerFormatter()
query = "Explain RL algorithms for LLMs."
results = retriever.search(query, top_k=5)

# Structured (dict) view first, then the human-readable report.
json_output = formatter.format_json(query, results)
pretty_output = formatter.format_pretty(query, results)
print(json_output)
print(pretty_output)

{'query': 'Explain RL algorithms for LLMs.', 'top_k': 5, 'answers': [{'score': 1.1026830370629692, 'source': 'RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS (4).pdf', 'section': 'approach', 'chunk_id': 51, 'text': 'approach that we can use to train LLMs. As shown in Figure 1, we parameterize three task-agnostic components that interact with each other during RL training. Each component is instantiated differently based on the domain (as detailed in Section 4). Generator.The generator g, is an LLM that is fine-tuned to produce an output a A for an instruction s S. RLAC samples multiple response generations from g for each instruction s. We train g to maximize the probability of producing outputs that satisfy all task-specific rubrics.'}, {'score': 0.9497698586544167, 'source': 'DAPO An Open-Source LLM Reinforcement Learning System at Scale (4).pdf', 'section': 'model', 'chunk_id': 3, 'text': 'model. Unlike previous works that withhold training details

# RAGPipeline

In [None]:
class RAGPipeline:
    """Run the retrieval stages in order: load -> chunk -> embed -> index -> ask.

    Every stage refuses to run until the previous one has completed on this
    pipeline instance. The components are injected and used as given.
    """

    def __init__(self,
                 loader,
                 chunker,
                 embedder,
                 vector_index,
                 retriever,
                 formatter,
                 verbose=True):
        # Injected components.
        self.loader, self.chunker, self.embedder = loader, chunker, embedder
        self.index, self.retriever, self.formatter = vector_index, retriever, formatter
        self.verbose = verbose

        # Stage-completion flags, flipped by the corresponding method.
        self.docs_loaded = self.chunks_ready = False
        self.embeddings_ready = self.index_ready = False

    def _say(self, message):
        """Print `message` only when the pipeline is verbose."""
        if self.verbose:
            print(message)

    def load_pdfs(self):
        """Stage 1: upload the PDFs and extract their text; returns the doc map."""
        self._say("\n[1] LOADING PDFs...")
        self.loader.upload_pdfs()
        self.loader.load_documents()
        self.docs_loaded = True
        self._say("[✓] PDFs loaded.\n")
        return self.loader.get_doc_map()

    def make_chunks(self):
        """Stage 2: chunk the loaded documents; returns the chunk metadata."""
        if not self.docs_loaded:
            raise Exception("Run load_pdfs() first.")
        self._say("\n[2] CHUNKING...")
        self.chunker.process_documents(self.loader.get_doc_map())
        self.chunks_ready = True
        self._say("[✓] Chunks created.\n")
        return self.chunker.get_chunk_meta()

    def embed(self):
        """Stage 3: encode the chunks; returns the embedder's embeddings."""
        if not self.chunks_ready:
            raise Exception("Run make_chunks() first.")
        self._say("\n[3] ENCODING EMBEDDINGS...")
        texts = self.chunker.get_chunks()
        records = self.chunker.get_chunk_meta()
        self.embedder.encode_chunks(texts, records)
        self.embeddings_ready = True
        self._say("[✓] Embeddings done.\n")
        return self.embedder.embeddings

    def build_faiss(self):
        """Stage 4: build the vector index from what the embedder stored."""
        if not self.embeddings_ready:
            raise Exception("Run embed() first.")
        self._say("\n[4] BUILDING FAISS INDEX...")
        source = self.embedder
        self.index.build(
            embeddings=source.embeddings,
            chunk_texts=source.chunk_texts,
            chunk_meta=source.chunk_meta,
        )
        self.index_ready = True
        self._say("[✓] FAISS index ready.\n")

    def ask(self, query, top_k=5, pretty=False):
        """Stage 5: retrieve for `query` and format the hits.

        Returns the formatter's pretty text when `pretty` is true, otherwise
        its JSON-style dict.
        """
        if not self.index_ready:
            raise Exception("Run build_faiss() first.")

        self._say(f"\n[5] QUERY → {query}")

        hits = self.retriever.search(query, top_k=top_k)
        render = self.formatter.format_pretty if pretty else self.formatter.format_json
        return render(query, hits)

    def full_build(self):
        """Run stages 1-4 back to back."""
        for stage in (self.load_pdfs, self.make_chunks, self.embed, self.build_faiss):
            stage()
        # Printed regardless of `verbose`.
        print("\n[✓] PIPELINE READY.\n")

In [None]:
# Assemble the pipeline from the components created in the cells above.
formatter = AnswerFormatter()
pipeline = RAGPipeline(loader=loader, chunker=sac, embedder=generator, vector_index=vindex,
                       retriever=retriever, formatter=formatter, verbose=True)

# Run every build stage: upload, chunk, embed, index.
pipeline.full_build()

# Ask a question; with pretty=False (the default) the answer is a dict.
query = "Explain reinforcement learning algorithms for large language models."
results = pipeline.ask(query, top_k=5)

# The hits live under the 'answers' key of that dict.
for answer in results['answers']:
    print("=" * 90)
    for label, key in (("SCORE:", "score"), ("SECTION:", "section"),
                       ("SOURCE:", "source"), ("CHUNK ID:", "chunk_id")):
        print(label, answer[key])
    print(answer["text"][:300], "...\n")


[1] LOADING PDFs...


Saving DAPO An Open-Source LLM Reinforcement Learning System at Scale.pdf to DAPO An Open-Source LLM Reinforcement Learning System at Scale (6).pdf
Saving RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS.pdf to RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS (6).pdf
Saving RLVE Scaling Up Reinforcement Learning for Language Models.pdf to RLVE Scaling Up Reinforcement Learning for Language Models (6).pdf

Uploaded files (sorted):
 - DAPO An Open-Source LLM Reinforcement Learning System at Scale (6).pdf
 - RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS (6).pdf
 - RLVE Scaling Up Reinforcement Learning for Language Models (6).pdf

Extracting & deep-cleaning documents...

Processing: DAPO An Open-Source LLM Reinforcement Learning System at Scale (6).pdf
 → Cleaned length: 39948 characters.

Processing: RLAC REINFORCEMENT LEARNING WITH ADVERSARIAL CRITIC FOR FREE-FORM GENERATION TASKS (6