<a href="https://colab.research.google.com/github/nickanely/PDF_RAG_Explorer/blob/main/PDF_RAG_Explorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

In [25]:
import os
import re
import pickle
import textwrap
import requests

import numpy as np
import pandas as pd
import torch
import faiss
import fitz
import matplotlib.pyplot as plt

from spacy.lang.en import English
from sentence_transformers import SentenceTransformer

In [3]:
# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [4]:
import fitz
import re
import pandas as pd
from spacy.lang.en import English


nlp = English()
nlp.add_pipe("sentencizer")

def search_for_page_number(page) -> int | None:
    blocks = page.get_text("blocks")
    h = page.rect.height
    footer_blocks = [b for b in blocks if b[1] > h * 0.8]
    if not footer_blocks:
        return None
    rightmost = max(footer_blocks, key=lambda b: b[2])
    m = re.search(r'\b\d{1,4}\b', rightmost[4])
    return int(m.group()) if m else None

def find_first_page_one(pdf_path: str) -> int | None:
    """Scan pages until you find the first that shows page number 1."""
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            num = search_for_page_number(page)
            if num == 1:
                return i
    return None

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Efficiently extract text + assign page numbers after finding the first '1'."""
    first_index = find_first_page_one(pdf_path)

    with fitz.open(pdf_path) as doc:
        content = []
        for i, page in enumerate(doc):
            text = page.get_text("text")
            text_clean = text.replace("\n", " ").strip()

            doc_spacy = nlp(text)
            sentences = [sent.text.strip() for sent in doc_spacy.sents if sent.text.strip()]

            logical_page = None
            if first_index is not None and i >= first_index:
                logical_page = 1 + (i - first_index)


            content.append({
                "file_name": pdf_path,
                "pdf_page_index": i,
                "page_number": logical_page,
                "page_char_count": len(text_clean),
                "page_word_count": len(text_clean.split()),
                "page_sentence_count_raw": len(text_clean.split(". ")),
                "page_token_count": len(text_clean) // 4,
                "sentences": sentences,
                "sentence_count": len(sentences),
                "text": text_clean
            })
        return content


content = open_and_read_pdf("human-nutrition-text.pdf")
# content = open_and_read_pdf("np.pdf")
df = pd.DataFrame(content)
display(df)

Unnamed: 0,file_name,pdf_page_index,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences,sentence_count,text
0,human-nutrition-text.pdf,0,,29,4,1,7,[Human Nutrition: 2020 Edition],1,Human Nutrition: 2020 Edition
1,human-nutrition-text.pdf,1,,0,0,1,0,[],0,
2,human-nutrition-text.pdf,2,,320,42,1,80,[Human Nutrition: 2020 \nEdition \nUNIVERSITY ...,1,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,human-nutrition-text.pdf,3,,212,30,1,53,[Human Nutrition: 2020 Edition by University o...,1,Human Nutrition: 2020 Edition by University of...
4,human-nutrition-text.pdf,4,,797,116,2,199,[Contents \nPreface \nUniversity of Hawai‘i at...,2,Contents Preface University of Hawai‘i at Mā...
...,...,...,...,...,...,...,...,...,...,...
1203,human-nutrition-text.pdf,1203,1162.0,1676,241,18,419,"[39., Exercise 10.2 & 11.3 reused “Egg Oval Fo...",18,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,human-nutrition-text.pdf,1204,1163.0,1617,230,10,404,[Images / Pixabay License; “Pumpkin Cartoon Or...,10,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,human-nutrition-text.pdf,1205,1164.0,1715,237,13,428,[Flashcard Images \nNote: Most images in the f...,13,Flashcard Images Note: Most images in the fla...
1206,human-nutrition-text.pdf,1206,1165.0,1733,238,13,433,"[ShareAlike \n11., Organs reused “Pancreas Org...",13,ShareAlike 11. Organs reused “Pancreas Organ ...


In [16]:
def sentence_chunking(sentences, max_tokens=200, overlap=1):
    """
    Chunk sentences with:
    - max_tokens: approximate token limit per chunk (1 token ~ 4 chars)
    - overlap: number of sentences to repeat between consecutive chunks
    Splits sentences that individually exceed max_tokens.
    """
    chunks = []
    i = 0
    n = len(sentences)

    while i < n:
        token_count = 0
        j = i
        current_chunk = []

        while j < n:
            sent = sentences[j]
            sent_tokens = len(sent)//4


            if sent_tokens > max_tokens:
                approx_chars = max_tokens * 4
                sub_sentences = [sent[k:k+approx_chars].strip() for k in range(0, len(sent), approx_chars)]

                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = []
                    token_count = 0

                for sub_sent in sub_sentences:
                    chunks.append([sub_sent])
                j += 1
                break

            if token_count + sent_tokens > max_tokens:
                break
            current_chunk.append(sent)
            token_count += sent_tokens
            j += 1

        if current_chunk:
            chunks.append(current_chunk)

        i = max(j - overlap, j)

    return chunks

In [17]:
chunked_data_filtered = []
min_tokens = 30
max_tokens = 300
overlap_sentences = 1

for _, row in df.iterrows():
    sentences = row["sentences"]
    chunks = sentence_chunking(sentences, max_tokens=max_tokens, overlap=overlap_sentences)

    for chunk_id, chunk_sentences in enumerate(chunks):
        chunk_text = " ".join(chunk_sentences).strip()
        chunk_token_count = len(chunk_text)//4

        if chunk_token_count < min_tokens:
            continue


        chunked_data_filtered.append({
            "file_name": row["file_name"],
            "pdf_page_index": row["pdf_page_index"],
            "page_number": row["page_number"],
            "chunk_id": chunk_id,
            "sentence_chunk": chunk_text,
            "chunk_sentence_count": len(chunk_sentences),
            "chunk_char_count": len(chunk_text),
            "chunk_word_count": len(chunk_text.split()),
            "chunk_token_count": chunk_token_count,
        })

df_chunks = pd.DataFrame(chunked_data_filtered)

display(df_chunks)

Unnamed: 0,file_name,pdf_page_index,page_number,chunk_id,sentence_chunk,chunk_sentence_count,chunk_char_count,chunk_word_count,chunk_token_count
0,human-nutrition-text.pdf,2,,0,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...,1,320,42,80
1,human-nutrition-text.pdf,3,,0,Human Nutrition: 2020 Edition by University of...,1,212,30,53
2,human-nutrition-text.pdf,4,,0,Contents \nPreface \nUniversity of Hawai‘i at ...,2,797,116,199
3,human-nutrition-text.pdf,5,,0,Lifestyles and Nutrition \nUniversity of Hawai...,3,976,144,244
4,human-nutrition-text.pdf,6,,0,The Cardiovascular System \nUniversity of Hawa...,1,1037,152,259
...,...,...,...,...,...,...,...,...,...
1705,human-nutrition-text.pdf,1205,1164.0,0,Flashcard Images \nNote: Most images in the fl...,9,1042,147,260
1706,human-nutrition-text.pdf,1205,1164.0,1,Glucagon reused “Weightlifter Gym Tool Athlete...,4,669,90,167
1707,human-nutrition-text.pdf,1206,1165.0,0,ShareAlike \n11. Organs reused “Pancreas Organ...,9,1169,155,292
1708,human-nutrition-text.pdf,1206,1165.0,1,Solutes reused “Syringe Doctor Needle” by janj...,4,563,83,140


In [20]:
%%time
from sentence_transformers import SentenceTransformer
import pickle
import torch

save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")

text_chunks = df_chunks["sentence_chunk"].tolist()


embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True
).half()


df_chunks["embedding"] = [e.cpu().numpy() for e in embeddings]


with open(save_path, "wb") as f:
    pickle.dump(df_chunks, f)

print("Saved df_chunks with embeddings!")

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl'

In [85]:
save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"
with open(save_path, "rb") as f:
    df_chunks = pickle.load(f)

In [28]:
embeddings = np.vstack(df_chunks['embedding'].to_numpy()).astype('float16')
dim = embeddings.shape[1]


M = 32
index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
index.add(embeddings)

print(f"Index built with {index.ntotal} vectors")


faiss.write_index(index, "/content/drive/MyDrive/LLMs/faiss_cosine.index")


Index built with 1710 vectors


In [30]:
def retrieve_chunks(query: str, faiss_index, chunks_df, model, top_k: int = 5):

    query_emb = model.encode([query], normalize_embeddings=True).astype('float32')


    scores, indices = faiss_index.search(query_emb, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        chunk = chunks_df.iloc[idx].to_dict()
        chunk['cosine_similarity'] = float(score)
        results.append(chunk)

    return results

query = "muscle building"
results = retrieve_chunks(query, index, df_chunks, embedding_model, top_k=5)

for i, chunk in enumerate(results, 1):
    print(f"[{i}] Cosine Similarity: {chunk['cosine_similarity']:.4f} | Page: {chunk['page_number']}")
    print(chunk['sentence_chunk'][:200], "\n")

[1] Cosine Similarity: 0.5912 | Page: 938.0
Image by 
Cosmed / 
CC BY-SA 
3.0 
Muscle Strength 
Muscle strength is developed and maintained by weight or 
resistance training that often is called anaerobic exercise. Anaerobic 
exercise consists  

[2] Cosine Similarity: 0.5365 | Page: 425.0
Moreover, the author of this review claims that high-quality 
protein foods are a better and cheaper source for branched-chain 
amino acids and says that a chicken breast (100 grams) contains 
the equ 

[3] Cosine Similarity: 0.5148 | Page: 960.0
are also able to burn more calories than women for the same activity 
because they have more muscle mass which requires more energy 
to support and move around.1 
Body weight and composition can have  

[4] Cosine Similarity: 0.4709 | Page: 969.0
https://journals.lww.com/
acsm-msse/Fulltext/2016/03000/
Nutrition_and_Athletic_Performance.25.aspx. Accessed 
March 17, 2018. Sports Nutrition  |  969 

[5] Cosine Similarity: 0.4666 | Page: 118.0
“Muscle 
Types” b

In [31]:
!pip install groq

Collecting groq
  Downloading groq-0.33.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.33.0-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.33.0


In [33]:
from groq import Groq
from google.colab import userdata

groq_client = Groq(api_key=userdata.get('groq'))


def generate_answer(query: str, faiss_index, chunks_df, model, groq_client, top_k: int = 5):
    """RAG: Retrieve + Generate"""

    # Retrieve
    results = retrieve_chunks(query, faiss_index, chunks_df, model, top_k)

    # Build context
    context = ""
    for i, chunk in enumerate(results, 1):
        context += f"\n[Source {i}, Page {chunk['page_number']}]\n{chunk['sentence_chunk']}\n"

    # Generate answer
    prompt = f"""Answer the question using ONLY the context below. Cite sources like [Source 1].

Context:
{context}

Question: {query}

Answer:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=500
    )

    return {
        'answer': response.choices[0].message.content,
        'sources': results
    }


result = generate_answer("What builds muscle?", index, df_chunks, embedding_model, groq_client)
print("ANSWER:")
print(result['answer'])
print("\nSOURCES:")
for s in result['sources']:
    print(f"  - Page {s['page_number']} (similarity: {s['cosine_similarity']:.3f})")

ANSWER:
According to [Source 1, Page 938.0], muscle strength is developed and maintained by weight or resistance training, which is often called anaerobic exercise. This type of high-intensity training is used to build muscle strength by short, high-intensity activities.

Additionally, [Source 3, Page 425.0] suggests that consuming high-quality protein foods, such as chicken breast, dairy proteins (casein and whey), and soy proteins, can positively influence muscle recovery in response to hard training.

[Source 4, Page 407.0] also emphasizes the importance of a balanced diet that includes protein-rich foods, such as soybeans, tofu, tempeh, lentils, and beans, as well as other essential nutrients like carbohydrates, fats, vitamins B12 and D, and calcium.

Therefore, a combination of regular exercise, such as weight or resistance training, and a balanced diet that includes protein-rich foods, can help build muscle.

SOURCES:
  - Page 938.0 (similarity: 0.587)
  - Page 118.0 (similarity:

In [34]:

def hyde_retrieve(query: str, faiss_index, chunks_df, model, groq_client, top_k: int = 5):
    """
    HyDE: Generate fake answer → embed it → retrieve with it
    Often better than embedding the question directly
    """

    hyde_prompt = f"""Write a detailed passage that would answer this question.
Write like a textbook or encyclopedia.

Question: {query}

Passage:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": hyde_prompt}],
        temperature=0.7,
        max_tokens=200
    )

    hypothetical_doc = response.choices[0].message.content
    print("Generated hypothetical document:")
    print(hypothetical_doc[:250], "...\n")


    hyde_emb = model.encode([hypothetical_doc], normalize_embeddings=True).astype('float16')

    scores, indices = faiss_index.search(hyde_emb, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        chunk = chunks_df.iloc[idx].to_dict()
        chunk['cosine_similarity'] = float(score)
        results.append(chunk)

    return results






In [37]:
def rag_pipeline(query: str, faiss_index, chunks_df, model, groq_client,
                 use_hyde: bool = False, top_k: int = 5):
    """Complete RAG with optional HyDE"""

    # Retrieve (with or without HyDE)
    if use_hyde:
        print("Using HyDE for retrieval...\n")
        results = hyde_retrieve(query, faiss_index, chunks_df, model, groq_client, top_k)
    else:
        results = retrieve_chunks(query, faiss_index, chunks_df, model, top_k)

    # Build context
    context = ""
    for i, chunk in enumerate(results, 1):
        sim = chunk['cosine_similarity']
        page = chunk['page_number']
        context += f"\n[Source {i}, Page {page}, Similarity: {sim:.3f}]\n{chunk['sentence_chunk']}\n"

    # Generate answer
    prompt = f"""Answer the question using ONLY the provided context.
Cite sources like [Source 1]. Be specific and concise.

Context:
{context}

Question: {query}

Answer:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=500
    )

    return {
        'answer': response.choices[0].message.content,
        'sources': results,
        'used_hyde': use_hyde
    }


# Test with and without HyDE
query = "What are the main benefits of exercise?"

print("\n" + "=" * 80)
print("RAG WITHOUT HyDE:")
print("=" * 80)
result1 = rag_pipeline(query, index, df_chunks, embedding_model, groq_client, use_hyde=False)
print(result1['answer'])

print("\n\n" + "=" * 80)
print("RAG WITH HyDE:")
print("=" * 80)
result2 = rag_pipeline(query, index, df_chunks, embedding_model, groq_client, use_hyde=True)
print(result2['answer'])


RAG WITHOUT HyDE:
According to the provided sources, the main benefits of exercise include:

1. Longer life and reduced risk of dying early from heart disease, certain cancers, and other leading causes of death [Source 1].
2. Healthier weight, weight loss, and prevention of excessive weight gain [Source 1].
3. Cardiovascular disease prevention, including boosted HDL cholesterol and decreased unhealthy triglycerides [Source 1].
4. Management of chronic conditions, such as metabolic syndrome, type 2 diabetes, depression, arthritis, and certain types of cancer [Source 1].
5. Energy boosts [Source 1].
6. Improved muscle tone and strength [Source 2].
7. Stronger bones, which can slow the loss of bone density that typically accompanies aging [Source 2].
8. Mental and emotional benefits, including mood improvement, reduced risk of depression, cognitive skills retention, and better sleep [Source 3].
9. Reduced risk of dying early by 40 percent compared to those who are active for less than th