In [1]:
import os
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

In [7]:
#load the file from local directory
pdf_path = "HumanNutrition.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

print(f"Total pages loaded: {len(docs)}")
print(docs[0].page_content[:500])
print( docs[2].page_content[:500])
print(docs[3].page_content[:500])

Total pages loaded: 426
Human Nutrition 
Human Nutrition 
LANGARA COLLEGE, NUTRITION AND FOOD SERVICE MANA GEMENT 
PROGRAM 
JANICE SORENSEN 
Human Nutrition by Langara College, Nutrition and Food Service Management Program is licensed under a Creative Commons Attribution 4.0 
International License, except where otherwise noted. 
This Human Nutrition OER textbook includes content from a number of OER sources. All new content added to this book is licensed 
under a Creative Commons CC BY 4.0 license, while select chapters have been used and are shared under a CC BY-NC-SA license. All 
other content not under a CC is used fairly and is 


In [8]:
# Clean text 
def clean_text(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
    text = re.sub(
        r'(\w)\s+(\w)',
        lambda m: m.group(1) + m.group(2)
        if m.group(1).islower() and m.group(2).islower()
        else m.group(1) + ' ' + m.group(2),
        text
    )
    return text.strip()

for d in docs:
    d.page_content = clean_text(d.page_content)

print(" Text cleaning done.")

 Text cleaning done.


In [9]:
# Split text
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150
)
chunks = splitter.split_documents(docs)
print(f"✅ Total chunks created: {len(chunks)}")

✅ Total chunks created: 1192


In [10]:
# Create Vector Store 
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(chunks, embeddings)

# Create Retriever
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Load LLM 
llm = Ollama(model="tinyllama")

In [None]:
# Build RAG Chain 
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# Ask a question 
query = "What are the main sources of Vitamin C?"
result = qa_chain.invoke({"query": query})

print("\ Question:", query)
print( "Answer:", result["result"])