<a href="https://colab.research.google.com/github/nickanely/PDF_RAG_Explorer/blob/main/PDF_RAG_Explorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [45]:
import fitz
import re
import pandas as pd
from spacy.lang.en import English


nlp = English()
nlp.add_pipe("sentencizer")

def search_for_page_number(page) -> int | None:
    blocks = page.get_text("blocks")
    h = page.rect.height
    footer_blocks = [b for b in blocks if b[1] > h * 0.8]
    if not footer_blocks:
        return None
    rightmost = max(footer_blocks, key=lambda b: b[2])
    m = re.search(r'\b\d{1,4}\b', rightmost[4])
    return int(m.group()) if m else None

def find_first_page_one(pdf_path: str) -> int | None:
    """Scan pages until you find the first that shows page number 1."""
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            num = search_for_page_number(page)
            if num == 1:
                return i
    return None

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Efficiently extract text + assign page numbers after finding the first '1'."""
    first_index = find_first_page_one(pdf_path)

    with fitz.open(pdf_path) as doc:
        content = []
        for i, page in enumerate(doc):
            text = page.get_text("text")
            text_clean = text.replace("\n", " ").strip()

            doc_spacy = nlp(text)
            sentences = [sent.text.strip() for sent in doc_spacy.sents if sent.text.strip()]

            logical_page = None
            if first_index is not None and i >= first_index:
                logical_page = 1 + (i - first_index)


            content.append({
                "file_name": pdf_path,
                "pdf_page_index": i,
                "page_number": logical_page,
                "page_char_count": len(text_clean),
                "page_word_count": len(text_clean.split()),
                "page_sentence_count_raw": len(text_clean.split(". ")),
                "page_token_count": len(text_clean) // 4,
                "sentences": sentences,
                "sentence_count": len(sentences),
                "text": text_clean
            })
        return content


content = open_and_read_pdf("human-nutrition-text.pdf")
# content = open_and_read_pdf("np.pdf")
df = pd.DataFrame(content)
display(df)

Unnamed: 0,file_name,pdf_page_index,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences,sentence_count,text
0,human-nutrition-text.pdf,0,,29,4,1,7,[Human Nutrition: 2020 Edition],1,Human Nutrition: 2020 Edition
1,human-nutrition-text.pdf,1,,0,0,1,0,[],0,
2,human-nutrition-text.pdf,2,,320,42,1,80,[Human Nutrition: 2020 \nEdition \nUNIVERSITY ...,1,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,human-nutrition-text.pdf,3,,212,30,1,53,[Human Nutrition: 2020 Edition by University o...,1,Human Nutrition: 2020 Edition by University of...
4,human-nutrition-text.pdf,4,,797,116,2,199,[Contents \nPreface \nUniversity of Hawai‘i at...,2,Contents Preface University of Hawai‘i at Mā...
...,...,...,...,...,...,...,...,...,...,...
1203,human-nutrition-text.pdf,1203,1162.0,1676,241,18,419,"[39., Exercise 10.2 & 11.3 reused “Egg Oval Fo...",18,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,human-nutrition-text.pdf,1204,1163.0,1617,230,10,404,[Images / Pixabay License; “Pumpkin Cartoon Or...,10,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,human-nutrition-text.pdf,1205,1164.0,1715,237,13,428,[Flashcard Images \nNote: Most images in the f...,13,Flashcard Images Note: Most images in the fla...
1206,human-nutrition-text.pdf,1206,1165.0,1733,238,13,433,"[ShareAlike \n11., Organs reused “Pancreas Org...",13,ShareAlike 11. Organs reused “Pancreas Organ ...


In [46]:
def efficient_chunking(sentences, max_tokens=200, overlap=1):
    """
    Chunk sentences with:
    - max_tokens: approximate token limit per chunk (1 token ~ 4 chars)
    - overlap: number of sentences to repeat between consecutive chunks
    Splits sentences that individually exceed max_tokens.
    """
    chunks = []
    i = 0
    n = len(sentences)

    while i < n:
        token_count = 0
        current_chunk = []
        j = i

        while j < n:
            sent = sentences[j]
            sent_tokens = len(sent) // 4

            # If a single sentence is larger than max_tokens, split it into sub-chunks
            if sent_tokens > max_tokens:
                approx_chars = max_tokens * 4
                sub_sentences = [sent[k:k+approx_chars].strip() for k in range(0, len(sent), approx_chars)]
                # Flush current chunk first
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = []
                # Add all sub-sentences as separate chunks
                for sub_sent in sub_sentences:
                    chunks.append([sub_sent])
                j += 1
                break  # go to next i after overlap handling

            # Normal sentence fits
            if token_count + sent_tokens > max_tokens:
                break  # current chunk is full

            current_chunk.append(sent)
            token_count += sent_tokens
            j += 1

        if current_chunk:
            chunks.append(current_chunk)

        # Advance i with overlap
        if j > i:
            i = max(j - overlap, i + 1)
        else:
            i += 1  # fallback to avoid infinite loop

    return chunks


In [27]:
chunked_data_filtered = []
min_tokens = 30

for _, row in df.iterrows():
    sentences = row["sentences"]
    chunks = efficient_chunking(sentences, max_tokens=200, overlap=3)
    for chunk_sentences in chunks:
        chunk_text = " ".join(chunk_sentences)
        chunk_token_count = len(chunk_text)//4

        if chunk_token_count < min_tokens:
            continue
        chunked_data_filtered.append({
            "file_name": row["file_name"],
            "pdf_page_index": row["pdf_page_index"],
            "page_number": row["page_number"],
            "chunk_text": chunk_text,
            "chunk_sentences": chunk_sentences,
            "chunk_char_count": len(chunk_text),
            "chunk_word_count": len(chunk_text.split()),
            "chunk_token_count": chunk_token_count,
        })

df_chunks = pd.DataFrame(chunked_data_filtered)
display(df_chunks)

Unnamed: 0,file_name,pdf_page_index,page_number,chunk_text,chunk_sentences,chunk_char_count,chunk_word_count,chunk_token_count
0,human-nutrition-text.pdf,2,,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...,[Human Nutrition: 2020 \nEdition \nUNIVERSITY ...,320,42,80
1,human-nutrition-text.pdf,3,,Human Nutrition: 2020 Edition by University of...,[Human Nutrition: 2020 Edition by University o...,212,30,53
2,human-nutrition-text.pdf,4,,Contents \nPreface \nUniversity of Hawai‘i at ...,[Contents \nPreface \nUniversity of Hawai‘i at...,797,116,199
3,human-nutrition-text.pdf,5,,Lifestyles and Nutrition \nUniversity of Hawai...,[Lifestyles and Nutrition \nUniversity of Hawa...,564,84,141
4,human-nutrition-text.pdf,5,,The Human Body \nIntroduction \nUniversity of ...,[The Human Body \nIntroduction \nUniversity of...,411,60,102
...,...,...,...,...,...,...,...,...
2240,human-nutrition-text.pdf,1205,1164.0,Hyperlipidemia reused “Osmotic pressure on blo...,[Hyperlipidemia reused “Osmotic pressure on bl...,244,32,61
2241,human-nutrition-text.pdf,1206,1165.0,ShareAlike \n11. Organs reused “Pancreas Organ...,"[ShareAlike \n11., Organs reused “Pancreas Org...",702,94,175
2242,human-nutrition-text.pdf,1206,1165.0,Protein reused “The Macronutrients: Carbohydra...,[Protein reused “The Macronutrients: Carbohydr...,723,100,180
2243,human-nutrition-text.pdf,1206,1165.0,Vitamin A reused “Carrot Vegetable Orange Food...,[Vitamin A reused “Carrot Vegetable Orange Foo...,306,44,76


In [47]:
chunked_data_filtered = []
min_tokens = 30
max_tokens = 300
overlap_sentences = 1

for _, row in df.iterrows():
    sentences = row["sentences"]
    chunks = efficient_chunking(sentences, max_tokens=max_tokens, overlap=overlap_sentences)

    for chunk_id, chunk_sentences in enumerate(chunks):
        chunk_text = " ".join(chunk_sentences).strip()
        chunk_token_count = len(chunk_text)//4

        if chunk_token_count < min_tokens:
            continue


        chunked_data_filtered.append({
            "file_name": row["file_name"],
            "pdf_page_index": row["pdf_page_index"],
            "page_number": row["page_number"],
            "chunk_id": chunk_id,
            "sentence_chunk": chunk_text,
            "chunk_sentence_count": len(chunk_sentences),
            "chunk_char_count": len(chunk_text),
            "chunk_word_count": len(chunk_text.split()),
            "chunk_token_count": chunk_token_count,
        })

df_chunks = pd.DataFrame(chunked_data_filtered)

# Display example
display(df_chunks)

Unnamed: 0,file_name,pdf_page_index,page_number,chunk_id,sentence_chunk,chunk_sentence_count,chunk_char_count,chunk_word_count,chunk_token_count
0,human-nutrition-text.pdf,2,,0,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...,1,320,42,80
1,human-nutrition-text.pdf,3,,0,Human Nutrition: 2020 Edition by University of...,1,212,30,53
2,human-nutrition-text.pdf,4,,0,Contents \nPreface \nUniversity of Hawai‘i at ...,2,797,116,199
3,human-nutrition-text.pdf,4,,1,Basic Concepts in Nutrition \nIntroduction \nU...,1,393,58,98
4,human-nutrition-text.pdf,5,,0,Lifestyles and Nutrition \nUniversity of Hawai...,3,976,144,244
...,...,...,...,...,...,...,...,...,...
1952,human-nutrition-text.pdf,1206,1165.0,0,ShareAlike \n11. Organs reused “Pancreas Organ...,9,1169,155,292
1953,human-nutrition-text.pdf,1206,1165.0,1,Soluble Fiber/Unsoluble Fiber reused “List of ...,5,702,104,175
1954,human-nutrition-text.pdf,1206,1165.0,2,Vitamin C reused “L-Ascorbic acid” by Yikrazuu...,1,151,21,37
1955,human-nutrition-text.pdf,1207,1166.0,0,23. Vitamin D reused “The Functions of Vitamin...,3,257,39,64


In [48]:


text_chunks = df_chunks["sentence_chunk"].tolist()

print(text_chunks)



In [49]:
%%time
from sentence_transformers import SentenceTransformer
import pickle

save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

text_chunks = df_chunks["sentence_chunk"].tolist()


embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,          # adjust depending on your GPU memory
    show_progress_bar=True,
    convert_to_tensor=True, # keeps it as a torch tensor (faster)
    normalize_embeddings=True  # optional, helps with cosine similarity
)

df_chunks["embedding"] = [e.cpu().numpy() for e in embeddings]


with open(save_path, "wb") as f:
    pickle.dump(df_chunks, f)

print("Saved df_chunks with embeddings!")



Batches:   0%|          | 0/62 [00:00<?, ?it/s]

CPU times: user 25.7 s, sys: 70.1 ms, total: 25.7 s
Wall time: 28.4 s


In [12]:
!pip install faiss-gpu

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [5]:
import pickle
save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"
with open(save_path, "rb") as f:
    df_chunks = pickle.load(f)

In [13]:
import numpy as np
import faiss

embeddings = np.vstack(df_chunks['embedding'].to_numpy())
dim = embeddings.shape[1]

# Build the index
M = 32   # number of neighbors
index = faiss.IndexHNSWFlat(dim, M)
index.hnsw.efConstruction = 100  # tradeoff between speed and recall
index.hnsw.efSearch = 50         # affects search quality

# Add embeddings to the index
index.add(embeddings)
print("Index size:", index.ntotal)

faiss.write_index(index, "/content/drive/MyDrive/LLMs/faiss_hnsw.index")
print("FAISS index saved successfully!")

Index size: 1957


In [23]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)


def print_faiss_results(query: str, faiss_index, chunks, embed_func, top_k: int = 5):
    """
    Prints top-k FAISS results for a query with readable output.

    Parameters:
    - query: str, the query text
    - faiss_index: FAISS index object
    - chunks: list of dicts with at least "text" and "page_number"
    - embed_func: function to convert text to embedding
    - top_k: int, number of results to show
    """

    query_embedding = embed_func([query])[0].astype('float32')


    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)

    distances, indices = faiss_index.search(query_embedding, top_k)

    print(f"\nQuery: '{query}'\n")
    print("Results:")

    for score, idx in zip(distances[0], indices[0]):
        print(f"Score: {score:.4f}")
        print("Text:")
        print_wrapped(chunks[idx]["sentence_chunk"])
        if "page_number" in chunks[idx]:
            print(f"Page number: {chunks[idx]['page_number']}")
        print("\n")



In [27]:
model = SentenceTransformer("all-mpnet-base-v2", device="cuda")

print_faiss_results(
    query = "macronutrients functions",
    faiss_index=index,
    chunks=df_chunks.to_dict('records'),
    embed_func=model.encode,
    top_k = 10
    )


Query: 'macronutrients functions'

Results:
Score: 0.6461
Text:
Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called
macronutrients. There are three classes of macronutrients:  carbohydrates,
lipids, and proteins. These can be metabolically  processed into cellular
energy. The energy from macronutrients  comes from their chemical bonds. This
chemical energy is  converted into cellular energy that is then utilized to
perform work,  allowing our bodies to conduct their basic functions. A unit of
measurement of food energy is the calorie. On nutrition food labels  the amount
given for “calories” is actually equivalent to each calorie  multiplied by one
thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is
synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels.
Water is also a macronutrient in  the sense that you require a large amount of
it, but unlike the other  macronutrients, it does not yield calories.
Carbohydrat