
# 🔧 1. Setup

In [None]:
# Install all required Python packages for this workshop.
# The leading "!" is IPython/Jupyter syntax: it runs the rest of the line as a
# shell command, so this cell only works inside a notebook environment.

!pip install langchain langchain-community faiss-cpu pymupdf pypdf sentence_transformers rich wget python-dotenv cryptography langchain_ollama langchain-docling pymupdf4llm

In [None]:
import os, time
from pathlib import Path

import langchain
import wget
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_ollama.llms import OllamaLLM
from langchain_docling import DoclingLoader
from langchain.document_loaders import PyPDFLoader
from langchain_docling.loader import ExportType
import pymupdf4llm
from langchain_core.documents.base import Document
from rich.console import Console
from rich.markdown import Markdown

# Shared Rich console; later cells call console.print(...) for styled status messages
console = Console()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 3. Extract Text from a Single PDF

In this step, we’ll load one PDF file and convert its pages into plain text (or Markdown) using three different methods:

- **PyPDFLoader** (LangChain): A straightforward loader that splits the PDF into page-level `Document` objects.  
- **PyMuPDF4LLM**: A fast, native extractor that generates Markdown-formatted text with optional page-wise chunking.  
- **Docling**: A robust parser that preserves layout and exports content as Markdown, either as one LangChain document per chunk (`ExportType.DOC_CHUNKS`, the default) or as one per input document (`ExportType.MARKDOWN`).

You will see how to:

1. Read the PDF from disk.  
2. Extract every page’s text into a structured format.  
3. Time each method to compare performance.  
4. Preview a specific page for verification.

### 📁 Setup Paths & Choose only 1 PDF for testing

In [None]:
# Create the "data/sample_pdf" folder if it doesn't exist
SAMPLE_PDF_DIR = Path("data/sample_pdf")
os.makedirs(SAMPLE_PDF_DIR, exist_ok=True)

# URLs of the PDFs to test.
# NOTE: these use GitHub's "/raw/" path. The "/blob/" path serves the HTML
# file-viewer page instead of the PDF bytes, which the PDF loaders cannot parse.
# Each entry needs its own trailing comma: without it Python silently
# concatenates adjacent string literals into a single (invalid) URL.
urls = [
    "https://github.com/ovaccarelli/LLM-RAG/raw/main/data/sample_pdf/2312.10997.pdf",
    "https://github.com/ovaccarelli/LLM-RAG/raw/main/data/sample_pdf/2312.10997_page13.pdf",
]

# Download each PDF into SAMPLE_PDF_DIR, skipping files that already exist.
# The target is built from SAMPLE_PDF_DIR so the existence check, the download
# location and the later loaders all refer to the same path.
for url in urls:
    name = url.split("/")[-1]
    target = SAMPLE_PDF_DIR / name
    if not target.is_file():
        wget.download(url, str(target))
console.print("Pdf file downloaded successfully.", style="bold green")

### PyPDFLoader

In [None]:
pdf_path = SAMPLE_PDF_DIR / "2312.10997.pdf"  # The full sample PDF (all pages)

# Load the PDF with PyPDFLoader and time the extraction.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock changes.
start = time.perf_counter()
loader = PyPDFLoader(str(pdf_path))
docs_pypdf = loader.load()  # returns a list of Document objects, one per page
end = time.perf_counter()

print(f"Using file: {pdf_path.name}")
print(f"🕒 PyPDFLoader loaded {len(docs_pypdf)} pages in {end - start:.2f} seconds")

In [None]:
# --- Preview the PDF contents ---
# Pages are indexed starting from 0

# TODO: replace both `...` placeholders with integers before running this cell;
# leaving them as Ellipsis makes the range check below raise a TypeError.
page_to_print = ...  # Change this to the page index you want
max_num_characters = ...  # Change the max num of characters you want to print

# Now preview the chosen page:

num_pages = len(docs_pypdf)
if 0 <= page_to_print < num_pages:
    content = docs_pypdf[page_to_print].page_content
    # The header shows a 1-based page number for readability
    print(f"--- 📄 Page {page_to_print + 1} / {num_pages} ---\n")
    print(content[:max_num_characters])
else:
    # Indices are 0-based, so the largest valid index is num_pages - 1
    print(f"Page index {page_to_print} is out of range (valid: 0 to {num_pages - 1})")

### PyMuPDF4LLM

In [None]:
# Load the PDF with PyMuPDF4LLM and time the extraction.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock changes.
start = time.perf_counter()
# page_chunks=True yields a list with one dict per page; the Markdown for a
# page is stored under its "text" key.
docs_pymupdf = pymupdf4llm.to_markdown(str(pdf_path), page_chunks=True)
end = time.perf_counter()

print(f"Using file: {pdf_path.name}")
print(f"🕒 PyMuPDF4LLM extracted {len(docs_pymupdf)} pages in {end - start:.2f} seconds\n")

In [None]:
# --- Preview the PDF contents ---
# Pages are indexed starting from 0

# TODO: replace both `...` placeholders with integers before running this cell;
# leaving them as Ellipsis makes the range check below raise a TypeError.
page_to_print = ...  # Change this to the page index you want
max_num_characters = ...  # Change the max num of characters you want to print

# Now preview the chosen page:

num_pages = len(docs_pymupdf)
if 0 <= page_to_print < num_pages:
    md = docs_pymupdf[page_to_print]["text"]  # Markdown text of the selected page
    # The header shows a 1-based page number for readability
    print(f"--- 📄 Page {page_to_print + 1} / {num_pages} ---\n")
    print(md[:max_num_characters])
else:
    # Indices are 0-based, so the largest valid index is num_pages - 1
    print(f"Page index {page_to_print} is out of range (valid: 0 to {num_pages - 1})")

### Docling

In [None]:
pdf_path_docling = SAMPLE_PDF_DIR / "2312.10997_page13.pdf"  # Just pick one page for testing

# Load the PDF with Docling and time the extraction.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock changes.
start = time.perf_counter()
# ExportType.MARKDOWN asks Docling to export Markdown per input document
# rather than per chunk.
loader_docling = DoclingLoader(str(pdf_path_docling), export_type=ExportType.MARKDOWN)
docs_docling = loader_docling.load()
end = time.perf_counter()

print(f"Using file: {pdf_path_docling.name}")
print(f"🕒 Docling loaded {len(docs_docling)} document(s) in {end - start:.2f} seconds")

In [None]:
# --- Preview the PDF contents ---

# Print the full extracted text of every Document returned by Docling.
# The loop index was never used, so iterate over the documents directly.
for doc in docs_docling:
    print(f"\n--- 📄 PDF Document: {pdf_path_docling.name} ---\n")
    print(doc.page_content)

## 4. Construct the vectorstore

In this step, we take the PDF documents and transform them into a searchable vector database.


In [None]:
# Create the "data/PDFs" folder if it doesn't exist
PDF_FOLDER = Path("data/PDFs")
os.makedirs(PDF_FOLDER, exist_ok=True)

# NOTE: this uses GitHub's "/raw/" path. The "/blob/" path serves the HTML
# file-viewer page instead of the PDF bytes, which the PDF loader cannot parse.
urls = [
    "https://github.com/ovaccarelli/LLM-RAG/raw/main/data/PDFs/Open_Source_AI_workshop.pdf",
]

# Download each PDF into PDF_FOLDER, skipping files that already exist.
# The target is built from PDF_FOLDER so the existence check and the download
# location cannot drift apart.
for url in urls:
    name = url.split("/")[-1]
    target = PDF_FOLDER / name
    if not target.is_file():
        wget.download(url, str(target))
console.print("Pdf file downloaded successfully.", style="bold green")

In [None]:
# Folder that will hold the persisted vector index (created on first run)
VECTORSTORES_DIR = Path("data/vectorstores")
VECTORSTORES_DIR.mkdir(parents=True, exist_ok=True)

# Folder containing the source PDFs to index
PDF_FOLDER = Path("data/PDFs")

# Read every PDF found in that folder; each page becomes a Document
loader = PyPDFDirectoryLoader(PDF_FOLDER)
documents = loader.load()

# Sanity check: report how many pages were loaded
print(f"Loaded {len(documents)} PDF pages")

### ✂️ Split Documents into Chunks

We break documents into smaller overlapping chunks using `RecursiveCharacterTextSplitter`.

- `chunk_size`: The maximum number of characters per chunk.

- `chunk_overlap`: The number of characters shared between consecutive chunks, so that context is maintained across chunk boundaries.

This is crucial for preserving semantic meaning across sentences and paragraphs.

In [None]:
# Chunking parameters (exercise placeholders: replace each `...` with an integer).
# CHUNK_SIZE is how many characters go into a chunk; CHUNK_OVERLAP is the overlap.
CHUNK_SIZE = ...
CHUNK_OVERLAP = ...

# Build the splitter from the two settings above
splitter_settings = dict(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Cut the loaded PDF pages into smaller, overlapping chunks
all_splits = text_splitter.split_documents(documents)

print(f"✅ Split into {len(all_splits)} chunks")

### 🔍 Convert Text Chunks to Embeddings

We now convert each text chunk into a high-dimensional vector using the BGE model (`BAAI/bge-large-en-v1.5`). These vectors capture the semantic meaning of the text.

- We use `HuggingFaceBgeEmbeddings` from LangChain.

- Normalizing embeddings helps improve similarity search accuracy.

- We set the device to "cpu" for compatibility with Colab. (If you're running this on a local machine with GPU, you can switch "cpu" to "cuda" for better performance.)

In [None]:
# Embedding model: BGE, a strong open-source embedding model for English
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# Where the model runs: switch "cpu" to "cuda" if you run locally with a GPU
model_options = {"device": "cpu"}
# Normalize the embedding vectors produced at encode time
encode_options = {"normalize_embeddings": True}

embedding_model = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=model_options,
    encode_kwargs=encode_options,
)

### 🏗️ Create and Save the Vectorstore

Using the text chunks and embeddings, we build our vectorstore:

- FAISS (Facebook AI Similarity Search) is a fast library for vector similarity search.

- This index will let us retrieve the most relevant chunks given a user question.

We also save the vectorstore locally so that it can be reused later without recomputing everything.

In [None]:
# Embed every chunk and build a FAISS index over the resulting vectors
vectorstore = FAISS.from_documents(all_splits, embedding_model)

# Persist the index to disk so it can be reloaded without re-embedding
vectorstore.save_local(VECTORSTORES_DIR)

print("✅ Vectorstore created and saved successfully.")

### 💾 Reload the Vectorstore (Optional)

In [None]:
# Restore the previously saved index from disk instead of rebuilding it.
# The same embedding model must be supplied so queries are embedded consistently.
# NOTE(security): allow_dangerous_deserialization is an explicit opt-in because
# loading unpickles data stored with the index. Only enable it for index files
# you created yourself or otherwise trust.
vectorstore = FAISS.load_local(
    folder_path=VECTORSTORES_DIR,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

print("✅ Vectorstore reloaded successfully.")

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------