# PDF → Chunks → Embeddings → Vector DB → Retrieve + Generate

This notebook builds a simple retrieval-augmented generation (RAG) pipeline over the PDFs in the `data non traite` folder:

1. Load PDF text (page by page)
2. Split into chunks
3. Embed chunks with a sentence-transformers model
4. Store embeddings in a FAISS index on disk
5. Retrieve relevant chunks for a question and generate an answer


In [1]:
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from transformers import pipeline
import torch
import requests

# --- Locations (all relative to the directory the notebook is run from) ---
PROJECT_DIR = Path.cwd()
DATA_DIR = PROJECT_DIR / "data non traite"
STORE_DIR = PROJECT_DIR / "vector_store"

# The vector store folder is created eagerly so later cells can write into it.
STORE_DIR.mkdir(parents=True, exist_ok=True)

# --- PDF discovery options ---
PDF_RECURSIVE = True  # True: walk sub-folders of DATA_DIR; False: top level only
MAX_PDFS: int | None = None  # keep only the first N PDFs (sorted order); None keeps all

if not DATA_DIR.exists():
    raise FileNotFoundError(f"Missing folder: {DATA_DIR}")

if PDF_RECURSIVE:
    candidates = DATA_DIR.rglob("*")
else:
    candidates = DATA_DIR.glob("*")

# Match the extension case-insensitively so "X.PDF" is picked up as well.
pdf_paths = sorted(
    entry for entry in candidates if entry.is_file() and entry.suffix.lower() == ".pdf"
)
pdf_paths = pdf_paths if MAX_PDFS is None else pdf_paths[:MAX_PDFS]

# --- Compute device ---
# DEVICE is the torch-style device string; GEN_DEVICE is the integer form (0 / -1).
if torch.cuda.is_available():
    DEVICE, GEN_DEVICE = "cuda", 0
else:
    DEVICE, GEN_DEVICE = "cpu", -1

# Cell output: how many PDFs were found, plus the first few paths.
(len(pdf_paths), pdf_paths[:5])

(18,
 [WindowsPath('c:/Users/m2l3k/Desktop/LLM/data non traite/50-Coding-Interview-Questions.pdf'),
  WindowsPath('c:/Users/m2l3k/Desktop/LLM/data non traite/College_Success_-_WEB_zQGCJTr.pdf'),
  WindowsPath('c:/Users/m2l3k/Desktop/LLM/data non traite/College_Success_Concise.pdf'),
  WindowsPath('c:/Users/m2l3k/Desktop/LLM/data non traite/Competitive Programmer_s Handbook.pdf'),
  WindowsPath('c:/Users/m2l3k/Desktop/LLM/data non traite/Cracking the coding interview 6th edition-1.pdf')])

In [None]:
def read_pdf_pages(pdf_path: Path) -> list[dict[str, Any]]:
    """Extract the text of every non-empty page of one PDF.

    Reading is best-effort: a file that cannot be opened, or a page whose text
    extraction raises, is skipped with a warning instead of aborting the run.

    Args:
        pdf_path: Path of the PDF to read. It must be located under
            ``DATA_DIR`` because the recorded source is made relative to it.

    Returns:
        One dict per page that produced text, with keys ``"source"`` (path
        relative to ``DATA_DIR`` using ``/`` separators), ``"page"`` (1-based
        page number) and ``"text"`` (stripped text, non-breaking spaces replaced
        by regular spaces). An empty list when the file cannot be opened.
    """
    import warnings

    try:
        reader = PdfReader(str(pdf_path))
    except Exception as exc:
        # Keep going with the other PDFs, but make the failure visible.
        warnings.warn(f"Skipping unreadable PDF {pdf_path}: {exc!r}", stacklevel=2)
        return []

    pages: list[dict[str, Any]] = []
    for page_index, page in enumerate(reader.pages):
        try:
            text = page.extract_text() or ""
        except Exception as exc:
            # One broken page should not discard the rest of the document.
            warnings.warn(
                f"Skipping page {page_index + 1} of {pdf_path}: {exc!r}",
                stacklevel=2,
            )
            continue
        text = text.replace("\u00a0", " ").strip()
        if not text:
            # Pages without extractable text are dropped.
            continue
        pages.append(
            {
                "source": pdf_path.relative_to(DATA_DIR).as_posix(),
                "page": page_index + 1,
                "text": text,
            }
        )
    return pages


# Flatten the per-PDF page lists into one list, in the order of `pdf_paths`.
all_pages: list[dict[str, Any]] = [
    page_record
    for pdf_path in tqdm(pdf_paths, desc="Reading PDFs")
    for page_record in read_pdf_pages(pdf_path)
]

# Cell output: page count, then source / page number / first 200 characters of
# the first page (three None values when nothing was extracted).
(
    len(all_pages),
    *(
        (all_pages[0]["source"], all_pages[0]["page"], all_pages[0]["text"][:200])
        if all_pages
        else (None, None, None)
    ),
)

Reading PDFs:   0%|          | 0/18 [00:00<?, ?it/s]

In [13]:
# Default chunking parameters, measured in characters.
CHUNK_SIZE = 900
CHUNK_OVERLAP = 150


def split_text(text: str, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split text into overlapping, whitespace-normalized character chunks.

    All runs of whitespace are collapsed to single spaces first. Each chunk is
    at most ``chunk_size`` characters; a chunk is cut at the last space inside
    the window when that space lies far enough into the window, otherwise it is
    cut at the hard ``chunk_size`` limit. Consecutive chunks share
    ``chunk_overlap`` characters.

    Args:
        text: Raw text to split.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters repeated between neighbouring
            chunks. Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in reading order. Empty or whitespace-only text yields an
        empty list.

    Raises:
        ValueError: If ``chunk_overlap`` is negative or not smaller than
            ``chunk_size``.
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be < chunk_size")
    if chunk_overlap < 0:
        # A negative overlap would silently skip characters between chunks.
        raise ValueError("chunk_overlap must be >= 0")

    normalized = " ".join(text.split())
    if not normalized:
        # Nothing to embed; do not emit an empty chunk.
        return []
    if len(normalized) <= chunk_size:
        return [normalized]

    total = len(normalized)
    # A word boundary is only accepted when it is past 60% of the window AND
    # past the overlap. The second condition guarantees that the next start
    # (end - chunk_overlap) is strictly greater than the current start, so the
    # loop always makes progress even when the overlap is large.
    min_advance = max(int(chunk_size * 0.6), chunk_overlap)

    chunks: list[str] = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)

        if end < total:
            boundary = normalized.rfind(" ", start, end)
            if boundary > start + min_advance:
                end = boundary

        chunk = normalized[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= total:
            break
        start = end - chunk_overlap

    return chunks


def chunk_pages(
    pages: list[dict[str, Any]],
    chunk_size: int = 900,
    chunk_overlap: int = 150,
) -> list[dict[str, Any]]:
    """Turn page records into chunk records.

    Every page's ``"text"`` is split with ``split_text``; each resulting piece
    becomes one record that keeps the page's ``"source"`` and ``"page"`` and
    adds the piece as ``"chunk"`` together with its position within the page
    as ``"chunk_index"`` (0-based).

    Args:
        pages: Page records with ``"source"``, ``"page"`` and ``"text"`` keys.
        chunk_size: Maximum chunk length, forwarded to ``split_text``.
        chunk_overlap: Overlap between chunks, forwarded to ``split_text``.

    Returns:
        Chunk records in page order, then chunk order within each page.
    """
    return [
        {
            "source": page["source"],
            "page": page["page"],
            "chunk": piece,
            "chunk_index": position,
        }
        for page in pages
        for position, piece in enumerate(
            split_text(page["text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        )
    ]


# Chunk every extracted page with the notebook-wide chunking parameters.
chunks = chunk_pages(all_pages, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Cell output: chunk count, then source / page / first 200 characters of the
# first chunk (three None values when there are no chunks).
(
    len(chunks),
    *(
        (chunks[0]["source"], chunks[0]["page"], chunks[0]["chunk"][:200])
        if chunks
        else (None, None, None)
    ),
)

(12603,
 'College_Success_-_WEB_zQGCJTr.pdf',
 3,
 'College Success SENIOR CONTRIBUTING AUTHORS AMY BALDWIN, UNIVERSITY OF CENTRAL ARKANSAS')

In [14]:
@dataclass
class EmbeddingConfig:
    """Settings that control how text chunks are embedded."""

    # Model identifier handed to SentenceTransformer(...) when embedding.
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    # When True, embed_texts divides every vector by its L2 norm.
    normalize: bool = True


def embed_texts(texts: list[str], cfg: EmbeddingConfig) -> np.ndarray:
    """Embed a list of texts with the configured sentence-transformers model.

    Args:
        texts: Texts to embed; the output rows follow the same order.
        cfg: Embedding settings (model name and whether to L2-normalize).

    Returns:
        A float32 array with one row per text. With ``cfg.normalize`` each row
        is divided by its L2 norm (all-zero rows are left as zeros). An empty
        ``texts`` list yields an array of shape ``(0, dim)``.
    """
    # NOTE: the model is instantiated on every call; call this once per batch
    # of texts rather than once per text.
    model = SentenceTransformer(cfg.model_name, device=DEVICE)

    if not texts:
        # An empty batch has no rows to infer the width from, so build the
        # (0, dim) result explicitly; the axis=1 norm below needs a 2-D array.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    vectors = model.encode(texts, batch_size=64, show_progress_bar=True)
    vectors = np.asarray(vectors, dtype=np.float32)
    if cfg.normalize:
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid dividing zero vectors by zero
        vectors = vectors / norms
    return vectors


# Use the default embedding settings (see EmbeddingConfig).
emb_cfg = EmbeddingConfig()
# Only the chunk text is embedded; the list follows the order of `chunks`,
# which retrieval later relies on when it looks chunks up by row position.
chunk_texts = [c["chunk"] for c in chunks]
embeddings = embed_texts(chunk_texts, emb_cfg)
# Cell output: shape and dtype of the embedding matrix.
embeddings.shape, embeddings.dtype

Batches: 100%|██████████| 197/197 [00:58<00:00,  3.37it/s]


((12603, 384), dtype('float32'))

In [15]:
def build_faiss_index(vectors: np.ndarray) -> faiss.Index:
    """Create a flat inner-product FAISS index holding ``vectors``.

    Args:
        vectors: 2-D array with one embedding per row.

    Returns:
        A ``faiss.IndexFlatIP`` whose dimensionality is the number of columns
        of ``vectors`` and to which all rows have been added.

    Raises:
        ValueError: If ``vectors`` is not two-dimensional.
    """
    shape = vectors.shape
    if len(shape) != 2:
        raise ValueError("vectors must be 2D")

    flat_index = faiss.IndexFlatIP(shape[1])
    flat_index.add(vectors)
    return flat_index


# Build the in-memory index over all chunk embeddings.
index = build_faiss_index(embeddings)
# Cell output: number of vectors stored in the index.
index.ntotal

12603

In [16]:
# File names inside STORE_DIR; load_store() reads these same three names.
INDEX_PATH = STORE_DIR / "chunks.faiss"
META_PATH = STORE_DIR / "chunks.json"
CFG_PATH = STORE_DIR / "store_config.json"

# 1) The vector index itself.
faiss.write_index(index, str(INDEX_PATH))

# 2) Chunk metadata, stored as a JSON list; ensure_ascii=False keeps
#    non-ASCII characters readable in the file.
META_PATH.write_text(json.dumps(chunks, ensure_ascii=False), encoding="utf-8")

# 3) The settings used to build the store.
CFG_PATH.write_text(
    json.dumps(
        {
            "embedding_model": emb_cfg.model_name,
            "normalize": emb_cfg.normalize,
            "chunk_size": CHUNK_SIZE,
            "chunk_overlap": CHUNK_OVERLAP,
        }
    ),
    encoding="utf-8",
)

# Cell output: the three paths that were written.
(INDEX_PATH, META_PATH, CFG_PATH)

(WindowsPath('c:/Users/m2l3k/Desktop/LLM/vector_store/chunks.faiss'),
 WindowsPath('c:/Users/m2l3k/Desktop/LLM/vector_store/chunks.json'),
 WindowsPath('c:/Users/m2l3k/Desktop/LLM/vector_store/store_config.json'))

In [17]:
def load_store(store_dir: Path) -> tuple[faiss.Index, list[dict[str, Any]], dict[str, Any]]:
    """Load a persisted vector store from ``store_dir``.

    Args:
        store_dir: Folder containing ``chunks.faiss``, ``chunks.json`` and
            ``store_config.json``.

    Returns:
        A ``(index, metadata, config)`` tuple: the FAISS index, the parsed
        content of ``chunks.json`` and the parsed content of
        ``store_config.json``.
    """
    loaded_index = faiss.read_index(str(store_dir / "chunks.faiss"))

    with (store_dir / "chunks.json").open(encoding="utf-8") as meta_file:
        chunk_meta = json.load(meta_file)

    with (store_dir / "store_config.json").open(encoding="utf-8") as cfg_file:
        store_cfg = json.load(cfg_file)

    return loaded_index, chunk_meta, store_cfg


def retrieve(
    question: str,
    idx: faiss.Index,
    meta: list[dict[str, Any]],
    embed_model: SentenceTransformer,
    top_k: int = 5,
) -> list[dict[str, Any]]:
    """Return the chunks most similar to ``question``.

    The question is embedded with ``embed_model``, L2-normalized, and searched
    against ``idx``.

    Args:
        question: Natural-language query.
        idx: Index to search; row ``i`` must correspond to ``meta[i]``.
        meta: Chunk records, indexed by the ids returned from ``idx``.
        embed_model: Model used to embed the question.
        top_k: Maximum number of hits to return.

    Returns:
        Copies of the matching ``meta`` records in the order returned by the
        search, each extended with ``"score"`` (float) and ``"rank"``
        (0-based position in the search result). Empty when ``top_k`` is not
        positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index holds, and skip the search
    # entirely when there is nothing to return (a non-positive k is not a
    # valid search argument).
    k = min(top_k, idx.ntotal)
    if k <= 0:
        return []

    q_vec = np.asarray(embed_model.encode([question]), dtype=np.float32)
    norms = np.linalg.norm(q_vec, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing a zero vector by zero
    q_vec = q_vec / norms

    scores, ids = idx.search(q_vec, k)

    out: list[dict[str, Any]] = []
    for rank, (score, i) in enumerate(zip(scores[0].tolist(), ids[0].tolist())):
        if i < 0:
            # Negative ids mark unfilled result slots.
            continue
        # Copy the record so the stored metadata is never mutated.
        out.append({**meta[i], "score": float(score), "rank": rank})
    return out


# Reload the index, chunk metadata and settings that were written to STORE_DIR.
idx, meta, cfg = load_store(STORE_DIR)
# Use the model name recorded in the store config so that questions are
# embedded with the model that was saved alongside the index.
embed_model = SentenceTransformer(cfg["embedding_model"])

# Smoke test: run one query and show (source, page, rounded score) per hit.
hits = retrieve("What is computer science?", idx, meta, embed_model, top_k=5)
[(h["source"], h["page"], round(h["score"], 3)) for h in hits]

[('Introduction_To_Computer_Science_-_WEB.pdf', 43, 0.756),
 ('Introduction_To_Computer_Science_-_WEB.pdf', 20, 0.723),
 ('Introduction_To_Computer_Science_-_WEB.pdf', 32, 0.709),
 ('Introduction_To_Computer_Science_-_WEB.pdf', 34, 0.707),
 ('Introduction_To_Computer_Science_-_WEB.pdf', 44, 0.703)]

In [18]:
def _format_contexts(contexts: list[dict[str, Any]], max_context_chars: int) -> str:
    """Render retrieved chunks as one context string within a character budget.

    Chunks are added in the given order until the next one would push the
    rendered text (including the blank-line separators) past
    ``max_context_chars``; the remaining chunks are dropped.
    """
    separator = "\n\n"
    parts: list[str] = []
    used = 0
    for c in contexts:
        part = f"Source: {c['source']} (page {c['page']})\n{c['chunk']}"
        # Count the separator too, so the joined string never exceeds the cap.
        cost = len(part) + (len(separator) if parts else 0)
        if used + cost > max_context_chars:
            break
        parts.append(part)
        used += cost
    return separator.join(parts)


def build_prompt(
    question: str,
    contexts: list[dict[str, Any]],
    max_context_chars: int = 3500,
) -> str:
    """Build a single-string prompt for a completion-style model.

    Args:
        question: The user's question.
        contexts: Retrieved chunk records with ``"source"``, ``"page"`` and
            ``"chunk"`` keys, best match first.
        max_context_chars: Upper bound on the length of the rendered context.

    Returns:
        Instructions, the rendered context, the question and a trailing
        ``"Answer:"`` cue, as one string.
    """
    joined = _format_contexts(contexts, max_context_chars)
    return (
        "You are a helpful tutor. Use only the provided context. "
        "If the context does not contain the answer, say you do not know.\n\n"
        f"Context:\n{joined}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )


USE_GITHUB_MODELS = True
ANSWER_LANGUAGE = "English"

GITHUB_ENDPOINT = "https://models.github.ai/inference"
GITHUB_MODELS_URL = f"{GITHUB_ENDPOINT}/chat/completions"
GITHUB_MODEL = "meta/Llama-3.3-70B-Instruct"


def build_messages(
    question: str,
    contexts: list[dict[str, Any]],
    max_context_chars: int = 3500,
) -> list[dict[str, str]]:
    """Build chat messages (system + user) for a chat-completions model.

    Args:
        question: The user's question.
        contexts: Retrieved chunk records with ``"source"``, ``"page"`` and
            ``"chunk"`` keys, best match first.
        max_context_chars: Upper bound on the length of the rendered context.

    Returns:
        Two messages: a system message with the tutoring instructions and the
        answer language (``ANSWER_LANGUAGE``, read at call time), and a user
        message holding the rendered context followed by the question.
    """
    joined = _format_contexts(contexts, max_context_chars)
    system = (
        "You are a helpful tutor. Use only the provided context. "
        "If the context does not contain the answer, say you do not know. "
        f"Answer in {ANSWER_LANGUAGE}."
    )
    user = f"Context:\n{joined}\n\nQuestion: {question}"
    return [{"role": "system", "content": system}, {"role": "user", "content": user}]


def generate_with_github(question: str, contexts: list[dict[str, Any]]) -> str:
    """Ask the configured GitHub Models chat endpoint to answer ``question``.

    The messages come from ``build_messages`` and are posted to
    ``GITHUB_MODELS_URL`` for the model ``GITHUB_MODEL``, authenticated with
    the ``GITHUB_TOKEN`` environment variable.

    Args:
        question: The user's question.
        contexts: Retrieved chunk records passed on to ``build_messages``.

    Returns:
        The stripped text content of the first choice in the response.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is unset or blank.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token == "":
        raise RuntimeError("Missing GITHUB_TOKEN environment variable")

    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GITHUB_MODEL,
        "messages": build_messages(question, contexts),
        "temperature": 0.2,
        "max_tokens": 512,
    }

    response = requests.post(
        GITHUB_MODELS_URL,
        headers=request_headers,
        json=payload,
        timeout=300,
    )
    response.raise_for_status()

    first_choice = response.json()["choices"][0]
    return str(first_choice["message"]["content"]).strip()


def answer_question(question: str, top_k: int = 5) -> dict[str, Any]:
    """Answer ``question`` from the loaded store: retrieve, then generate.

    Uses the notebook-level ``idx``, ``meta`` and ``embed_model`` for
    retrieval and ``generate_with_github`` for the answer.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        A dict with ``"answer"`` (the generated text) and ``"sources"`` (one
        dict per retrieved chunk with its ``"source"``, ``"page"`` and
        ``"score"``).
    """
    retrieved = retrieve(question, idx, meta, embed_model, top_k=top_k)
    answer_text = generate_with_github(question, retrieved)

    cited: list[dict[str, Any]] = []
    for hit in retrieved:
        cited.append({"source": hit["source"], "page": hit["page"], "score": hit["score"]})

    return {"answer": answer_text, "sources": cited}


# Set to True to run one end-to-end question when this cell executes
# (requires GITHUB_TOKEN and network access).
RUN_DEMO = False

result = answer_question("give me the story of the computer ?", top_k=6) if RUN_DEMO else None

# Jupyter only displays the value of a cell's last *top-level* expression; an
# expression nested inside an `if` body is evaluated but never shown. Keeping
# the preview at top level makes the demo output visible (None shows nothing).
(result["answer"], result["sources"][:3]) if result is not None else None

Device set to use cuda:0


('Based on the provided context, here\'s the story of the computer:\n\nThe story of the computer begins with early personal computers like the Programma and the Alto. These early computers set the stage for the rapid expansion of computing in the workplace. By 1980, there were several microcomputers on the market that made computing more accessible to small businesses and even individuals.\n\nComputing capabilities had expanded to include color graphics, spreadsheets, and word processing programs. The market competition between Microsoft, HP, IBM, Apple, and others shaped the industry and our society.\n\nIn 1983, Time magazine recognized the computer as "Machine of the Year," replacing its traditional "Man of the Year." These early computers have evolved into today\'s laptops, cell phones, tablets, and wearables.\n\nA computer is simply a programmable machine that can execute predefined lists of instructions and respond to new instructions. It has core features that remain the same des

In [None]:
def live_qa(top_k: int = 6) -> None:
    """Run an interactive question/answer loop on standard input.

    Each entered question is answered with ``answer_question`` and printed
    together with its sources. The loop ends on an empty line, on ``exit`` or
    ``quit`` (case-insensitive), on end of input, or when the front end cannot
    prompt for input (an exception whose class is named
    ``StdinNotImplementedError``).

    Args:
        top_k: Number of chunks retrieved per question.
    """
    stop_words = {"exit", "quit"}
    print("Type a question and press Enter. Type 'exit' to stop.")

    while True:
        try:
            question = input("Question: ").strip()
        except EOFError:
            return
        except Exception as err:
            # Matched by class name so that IPython does not have to be imported.
            if type(err).__name__ != "StdinNotImplementedError":
                raise
            return

        if question == "" or question.lower() in stop_words:
            return

        reply = answer_question(question, top_k=top_k)
        print("\nAnswer:\n" + reply["answer"])
        print("\nSources:")
        for src in reply["sources"]:
            print(f"- {src['source']} (page {src['page']}, score {src['score']:.3f})")
        print("\n")


# When True, running this cell starts the interactive loop, which keeps
# prompting via input() until an empty line, 'exit' or 'quit' is entered.
# Set to False to run the notebook without the interactive session.
RUN_LIVE = True
if RUN_LIVE:
    live_qa(top_k=6)