In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import CTransformers
import os

# === Load and split PDF ===
loader = PyPDFLoader("Chapter_17_Chemical_carcinogens.pdf")
data = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)
docs = splitter.split_documents(data)

# === Embed and persist ===
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(docs, embedding=embedding_function, persist_directory=os.getcwd())

# === Define retriever ===
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# === Prompt template ===
template = """Answer the question based on the following context.

Context:
{context}

Question:
{question}

Answer:"""
prompt_template = PromptTemplate.from_template(template)

# === Load GGUF model locally via ctransformers ===
llm = CTransformers(
    model="C:\Endava\EndevLocal\AI_speech_text_summarize\mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral"
)

# === Build RAG chain ===
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)

# === Run it ===
question = "What compounds generate Breast cancer?"
answer = rag_chain.invoke(question)

print("Answer:", answer)

Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 50 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 84 0 (offset 0)
Ignoring wrong pointing object 86 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 93 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Ignoring wrong pointing object 101 0 (offset 0)
Number of tokens (1445) exceeded maximum context length (512).
Number of tokens (1446) exceeded maximum context length (512).
Number of tokens (1447) exce

Answer: 
Compoundsounder: chromium: chromium: chromium:
Answer based on this question 09
