<a href="https://colab.research.google.com/github/nickanely/PDF_RAG_Explorer/blob/main/PDF_RAG_Explorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [130]:
import nbformat

notebook_path = "/content/drive/MyDrive/Colab Notebooks/PDF_RAG_Explorer.ipynb" # Corrected path
nb = nbformat.read(notebook_path, as_version=nbformat.NO_CONVERT)

# Remove widgets metadata
for cell in nb.cells:
    if "metadata" in cell and "widgets" in cell.metadata:
        del cell.metadata["widgets"]

nbformat.write(nb, notebook_path)
print("Removed widgets metadata successfully!")

Removed widgets metadata successfully!


In [1]:
import os
import requests

# Get PDF document
pdf_path = "human-nutrition-text.pdf"

# Download PDF if it doesn't already exist
if not os.path.exists(pdf_path):
  print("File doesn't exist, downloading...")

  # The URL of the PDF you want to download
  url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

  # The local filename to save the downloaded file
  filename = pdf_path

  # Send a GET request to the URL
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Open a file in binary write mode and save the content to it
      with open(filename, "wb") as file:
          file.write(response.content)
      print(f"The file has been downloaded and saved as {filename}")
  else:
      print(f"Failed to download the file. Status code: {response.status_code}")
else:
  print(f"File {pdf_path} exists.")

File doesn't exist, downloading...
The file has been downloaded and saved as human-nutrition-text.pdf


In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [120]:
import fitz
import re
import pandas as pd
from spacy.lang.en import English


nlp = English()
nlp.add_pipe("sentencizer")

def search_for_page_number(page) -> int | None:
    blocks = page.get_text("blocks")
    h = page.rect.height
    footer_blocks = [b for b in blocks if b[1] > h * 0.8]
    if not footer_blocks:
        return None
    rightmost = max(footer_blocks, key=lambda b: b[2])
    m = re.search(r'\b\d{1,4}\b', rightmost[4])
    return int(m.group()) if m else None

def find_first_page_one(pdf_path: str) -> int | None:
    """Scan pages until you find the first that shows page number 1."""
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            num = search_for_page_number(page)
            if num == 1:
                return i
    return None

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Efficiently extract text + assign page numbers after finding the first '1'."""
    first_index = find_first_page_one(pdf_path)

    with fitz.open(pdf_path) as doc:
        content = []
        for i, page in enumerate(doc):
            text = page.get_text("text")
            text_clean = text.replace("\n", " ").strip()

            doc_spacy = nlp(text)
            sentences = [sent.text.strip() for sent in doc_spacy.sents if sent.text.strip()]

            logical_page = None
            if first_index is not None and i >= first_index:
                logical_page = 1 + (i - first_index)


            content.append({
                "file_name": pdf_path,
                "pdf_page_index": i,
                "page_number": logical_page,
                "page_char_count": len(text_clean),
                "page_word_count": len(text_clean.split()),
                "page_sentence_count_raw": len(text_clean.split(". ")),
                "page_token_count": len(text_clean) // 4,
                "sentences": sentences,
                "sentence_count": len(sentences),
                "text": text_clean
            })
        return content


# content = open_and_read_pdf("human-nutrition-text.pdf")
content = open_and_read_pdf("np.pdf")
df = pd.DataFrame(content)
display(df)

Unnamed: 0,file_name,pdf_page_index,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,sentences,sentence_count,text
0,np.pdf,0,,0,0,1,0,[],0,
1,np.pdf,1,,40,4,1,10,[INTRODUCTION TO \nNUMERICAL \nPROGRAMMING],1,INTRODUCTION TO NUMERICAL PROGRAMMING
2,np.pdf,2,,490,67,4,122,[Steven A. Gottlieb and Rubin H. Landau\nSerie...,2,Steven A. Gottlieb and Rubin H. Landau Series ...
3,np.pdf,3,,281,38,3,70,[SERIES IN COMPUTATIONAL PHYSICS\nSteven A. Go...,1,SERIES IN COMPUTATIONAL PHYSICS Steven A. Gott...
4,np.pdf,4,,2109,317,13,527,[CRC Press\nTaylor & Francis Group\n6000 Broke...,10,CRC Press Taylor & Francis Group 6000 Broken S...
...,...,...,...,...,...,...,...,...,...,...
658,np.pdf,658,638.0,1757,147,1,439,[638\nAppendix G\nListing\n/INP/Ch06/Python\nL...,1,638 Appendix G Listing /INP/Ch06/Python Listin...
659,np.pdf,659,639.0,1903,149,1,475,[Appendix G\n639\nListing\n/INP/Ch10/Python\nL...,1,Appendix G 639 Listing /INP/Ch10/Python Listin...
660,np.pdf,660,640.0,776,63,1,194,[640\nAppendix G\nListing\n/INP/Ch13/Python\nL...,1,640 Appendix G Listing /INP/Ch13/Python Listin...
661,np.pdf,661,641.0,0,0,1,0,[],0,


In [121]:
def efficient_chunking(sentences, max_tokens=200, overlap=1):
    """
    Chunk sentences with:
    - max_tokens: approximate token limit per chunk (1 token ~ 4 chars)
    - overlap: number of sentences to repeat between consecutive chunks
    Splits sentences that individually exceed max_tokens.
    """
    chunks = []
    i = 0
    n = len(sentences)

    while i < n:
        token_count = 0
        current_chunk = []
        j = i

        while j < n:
            sent = sentences[j]
            sent_tokens = len(sent) // 4


            if sent_tokens > max_tokens:
                approx_chars = max_tokens * 4
                sub_sentences = [sent[k:k+approx_chars].strip() for k in range(0, len(sent), approx_chars)]

                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = []

                for sub_sent in sub_sentences:
                    chunks.append([sub_sent])
                j += 1
                break

            if token_count + sent_tokens > max_tokens:
                break

            current_chunk.append(sent)
            token_count += sent_tokens
            j += 1

        if current_chunk:
            chunks.append(current_chunk)

        if j > i:
            i = max(j - overlap, i + 1)
        else:
            i += 1

    return chunks


In [122]:
chunked_data_filtered = []
min_tokens = 30
max_tokens = 300
overlap_sentences = 1

for _, row in df.iterrows():
    sentences = row["sentences"]
    chunks = efficient_chunking(sentences, max_tokens=max_tokens, overlap=overlap_sentences)

    for chunk_id, chunk_sentences in enumerate(chunks):
        chunk_text = " ".join(chunk_sentences).strip()
        chunk_token_count = len(chunk_text)//4

        if chunk_token_count < min_tokens:
            continue


        chunked_data_filtered.append({
            "file_name": row["file_name"],
            "pdf_page_index": row["pdf_page_index"],
            "page_number": row["page_number"],
            "chunk_id": chunk_id,
            "sentence_chunk": chunk_text,
            "chunk_sentence_count": len(chunk_sentences),
            "chunk_char_count": len(chunk_text),
            "chunk_word_count": len(chunk_text.split()),
            "chunk_token_count": chunk_token_count,
        })

df_chunks = pd.DataFrame(chunked_data_filtered)

display(df_chunks)

Unnamed: 0,file_name,pdf_page_index,page_number,chunk_id,sentence_chunk,chunk_sentence_count,chunk_char_count,chunk_word_count,chunk_token_count
0,np.pdf,2,,0,Steven A. Gottlieb and Rubin H. Landau\nSeries...,2,490,67,122
1,np.pdf,2,,1,Introduction to Numerical Programming: A Pract...,1,330,44,82
2,np.pdf,3,,0,SERIES IN COMPUTATIONAL PHYSICS\nSteven A. Got...,1,281,38,70
3,np.pdf,4,,0,CRC Press\nTaylor & Francis Group\n6000 Broken...,4,974,155,243
4,np.pdf,4,,1,If any copyright material has not been acknowl...,6,1132,167,283
...,...,...,...,...,...,...,...,...,...
2017,np.pdf,659,639.0,1,2-PendulumEC.cpp\n12.12\nP12-ThrowV.py\nP12-Th...,1,703,50,175
2018,np.pdf,660,640.0,0,640\nAppendix G\nListing\n/INP/Ch13/Python\nLi...,1,776,63,194
2019,np.pdf,662,642.0,0,K16451\nSERIES IN COMPUTATIONAL PHYSICS\nSteve...,6,1214,171,303
2020,np.pdf,662,642.0,1,"—Professor Mike Wheatland, The University of S...",4,1035,137,258


In [123]:
%%time
from sentence_transformers import SentenceTransformer
import pickle
import torch

save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

text_chunks = df_chunks["sentence_chunk"].tolist()


embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True
).half()


df_chunks["embedding"] = [e.cpu().numpy() for e in embeddings]


with open(save_path, "wb") as f:
    pickle.dump(df_chunks, f)

print("Saved df_chunks with embeddings!")

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Saved df_chunks with embeddings!
CPU times: user 43.1 s, sys: 84.1 ms, total: 43.2 s
Wall time: 44.2 s


In [85]:
import pickle
save_path = "/content/drive/MyDrive/LLMs/df_chunks_with_embeddings.pkl"
with open(save_path, "rb") as f:
    df_chunks = pickle.load(f)

In [124]:
import numpy as np
import faiss


embeddings = np.vstack(df_chunks['embedding'].to_numpy()).astype('float16')
dim = embeddings.shape[1]


M = 32
index = faiss.IndexHNSWFlat(dim, M, faiss.METRIC_INNER_PRODUCT)
index.add(embeddings)

print(f"Index built with {index.ntotal} vectors")


faiss.write_index(index, "/content/drive/MyDrive/LLMs/faiss_cosine.index")


Index built with 2022 vectors


In [125]:
import textwrap
import numpy as np
import fitz
import matplotlib.pyplot as plt

In [126]:

def retrieve_chunks(query: str, faiss_index, chunks_df, model, top_k: int = 5):

    query_emb = model.encode([query], normalize_embeddings=True).astype('float32')


    scores, indices = faiss_index.search(query_emb, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        chunk = chunks_df.iloc[idx].to_dict()
        chunk['cosine_similarity'] = float(score)
        results.append(chunk)

    return results


query = "muscle building"
results = retrieve_chunks(query, index, df_chunks, model, top_k=5)

for i, chunk in enumerate(results, 1):
    print(f"[{i}] Cosine Similarity: {chunk['cosine_similarity']:.4f} | Page: {chunk['page_number']}")
    print(chunk['sentence_chunk'][:200], "\n")

[1] Cosine Similarity: 0.1390 | Page: 618.0
#
#
Returns: fmin, fmax, icol, ncol, nintv
#----------------------------------------------------------------------------
global w, nxw, nyw
# canvas object and size
nfont = min(int((iymin-iymax)/20.), 

[2] Cosine Similarity: 0.1385 | Page: 622.0
# font size
font = ("Helvetica",nfont)
# title font
w.create_text((ixmin+ixmax)/2,iymax-3*nfont,text=title,font=font)
# title
f = (xmax-xmin)/(ymax-ymin)
# scale viewport proportionally
if (f < float( 

[3] Cosine Similarity: 0.1385 | Page: 622.0
# font size
font = ("Helvetica",nfont)
# title font
w.create_text((ixmin+ixmax)/2,iymax-3*nfont,text=title,font=font)
# title
f = (xmax-xmin)/(ymax-ymin)
# scale viewport proportionally
if (f < float( 

[4] Cosine Similarity: 0.1349 | Page: 617.0
Appendix E
617
iy = Nint(iymin + i*htic)
w.create_line(ixmin,iy,ixmin+tic,iy)
# tics
w.create_line(ixmax,iy,ixmax-tic,iy)
if (ytext ! = "None"):
(mant,expn) = FormStr(ymin+i*h,scale,nsigd)
w.create_te 

[5] Cosine S

In [103]:
!pip install groq

Collecting groq
  Downloading groq-0.33.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.33.0-py3-none-any.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.33.0


In [127]:
from groq import Groq
from google.colab import userdata

groq_client = Groq(api_key=userdata.get('groq'))


def generate_answer(query: str, faiss_index, chunks_df, model, groq_client, top_k: int = 5):
    """RAG: Retrieve + Generate"""

    # Retrieve
    results = retrieve_chunks(query, faiss_index, chunks_df, model, top_k)

    # Build context
    context = ""
    for i, chunk in enumerate(results, 1):
        context += f"\n[Source {i}, Page {chunk['page_number']}]\n{chunk['sentence_chunk']}\n"

    # Generate answer
    prompt = f"""Answer the question using ONLY the context below. Cite sources like [Source 1].

Context:
{context}

Question: {query}

Answer:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=500
    )

    return {
        'answer': response.choices[0].message.content,
        'sources': results
    }


result = generate_answer("What builds muscle?", index, df_chunks, model, groq_client)
print("ANSWER:")
print(result['answer'])
print("\nSOURCES:")
for s in result['sources']:
    print(f"  - Page {s['page_number']} (similarity: {s['cosine_similarity']:.3f})")

ANSWER:
Unfortunately, the provided context does not mention anything about building muscle. The context appears to be related to programming and mathematics, specifically discussing matrix operations and linear congruential generators.

However, if we look at the provided sources, we can see that [Source 4, Page 408.0] discusses linear congruential generators, which are used to generate random numbers. While random number generation is not directly related to building muscle, it's worth noting that resistance training and progressive overload, which can be achieved through weightlifting or other forms of exercise, are commonly used to build muscle.

But, based on the provided context, it's not possible to give a direct answer to the question.

SOURCES:
  - Page 105.0 (similarity: 0.144)
  - Page nan (similarity: 0.124)
  - Page 592.0 (similarity: 0.115)
  - Page 408.0 (similarity: 0.108)
  - Page 593.0 (similarity: 0.090)


In [112]:

def hyde_retrieve(query: str, faiss_index, chunks_df, model, groq_client, top_k: int = 5):
    """
    HyDE: Generate fake answer → embed it → retrieve with it
    Often better than embedding the question directly
    """

    hyde_prompt = f"""Write a detailed passage that would answer this question.
Write like a textbook or encyclopedia.

Question: {query}

Passage:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": hyde_prompt}],
        temperature=0.7,
        max_tokens=200
    )

    hypothetical_doc = response.choices[0].message.content
    print("Generated hypothetical document:")
    print(hypothetical_doc[:250], "...\n")


    hyde_emb = model.encode([hypothetical_doc], normalize_embeddings=True).astype('float16')

    scores, indices = faiss_index.search(hyde_emb, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        chunk = chunks_df.iloc[idx].to_dict()
        chunk['cosine_similarity'] = float(score)
        results.append(chunk)

    return results






In [128]:
def rag_pipeline(query: str, faiss_index, chunks_df, model, groq_client,
                 use_hyde: bool = False, top_k: int = 5):
    """Complete RAG with optional HyDE"""

    # Retrieve (with or without HyDE)
    if use_hyde:
        print("Using HyDE for retrieval...\n")
        results = hyde_retrieve(query, faiss_index, chunks_df, model, groq_client, top_k)
    else:
        results = retrieve_chunks(query, faiss_index, chunks_df, model, top_k)

    # Build context
    context = ""
    for i, chunk in enumerate(results, 1):
        sim = chunk['cosine_similarity']
        page = chunk['page_number']
        context += f"\n[Source {i}, Page {page}, Similarity: {sim:.3f}]\n{chunk['sentence_chunk']}\n"

    # Generate answer
    prompt = f"""Answer the question using ONLY the provided context.
Cite sources like [Source 1]. Be specific and concise.

Context:
{context}

Question: {query}

Answer:"""

    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=500
    )

    return {
        'answer': response.choices[0].message.content,
        'sources': results,
        'used_hyde': use_hyde
    }


# Test with and without HyDE
query = ""

print("\n" + "=" * 80)
print("RAG WITHOUT HyDE:")
print("=" * 80)
result1 = rag_pipeline(query, index, df_chunks, model, groq_client, use_hyde=False)
print(result1['answer'])

print("\n\n" + "=" * 80)
print("RAG WITH HyDE:")
print("=" * 80)
result2 = rag_pipeline(query, index, df_chunks, model, groq_client, use_hyde=True)
print(result2['answer'])


RAG WITHOUT HyDE:
Numerical programming is not explicitly defined in the provided context. However, it can be inferred as a field that deals with the implementation of numerical methods and algorithms, often accompanied by graphical output, to solve mathematical problems. This is supported by [Source 1] which mentions "numerical programming" in the title of the book's website, and [Source 2] and [Source 4] which provide implementations and graphical output for numerical methods.

A more detailed explanation can be found in [Source 3] which states that the book aims to provide a well-balanced combination of basic theoretical knowledge, code design, and efficiency analysis skills for numerical programming. Additionally, [Source 5] mentions that the book provides a rigorous and accessible presentation of fundamental numerical methods, which are complemented with implementations and applications.

Therefore, based on the provided context, numerical programming can be defined as the implem