In [2]:
import json
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_pdf_and_create_vector_store(pdf_path):
    """Loads a PDF file, creates embeddings, and stores them in FAISS."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # ✅ Debugging: Check if PDF loaded correctly
    print(f"📄 Loaded {len(docs)} documents from PDF.")
    for i, doc in enumerate(docs[:3]):  # Print first 3 documents for verification
        print(f"🔹 Document {i+1}:\n{doc.page_content[:500]}\n---")

    if not docs:
        raise ValueError("🚨 No documents were loaded from the PDF! Check if the file is valid.")

    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return docs, vectorstore  # Return both docs and vectorstore

def query_agent(question: str, docs, vectorstore, llm):
    """Queries the vectorstore using the LLM and retrieves the answer."""
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # ✅ Fetch more documents
        retrieved_docs = retriever.invoke(question)  # ✅ Use `invoke()` instead of deprecated method

        # ✅ Debugging: Print retrieved documents
        print(f"🔎 Retrieved {len(retrieved_docs)} documents for query: {question}")
        for i, doc in enumerate(retrieved_docs[:3]):  # Print first 3 for verification
            print(f"📜 Doc {i+1}:\n{doc.page_content[:500]}\n---")

        # If no relevant documents retrieved, fallback to full PDF
        if not retrieved_docs:
            print("⚠️ No relevant documents found in vector search. Using entire PDF content as fallback.")

            # Use entire PDF as context
            context = "\n".join([doc.page_content for doc in docs if doc.page_content.strip()])
        else:
            context = "\n".join([doc.page_content for doc in retrieved_docs if doc.page_content.strip()])


        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. You will be given a query from the user. Your task is to find the similarity between this question and the context, then answer in the following format:

            Context: {context}  
            Question: {query}  

            Provide the answer in JSON format:  
            {{  
                "answer": "Provide the chapter number from which the query question is chosen. It may be exact or similar.",  
                "explanation": "Provide the solution to that specific question. Follow the example section from the context and solve it within this paradigm."  
            }}
            """,
            input_variables=["context", "query"],
        )

        formatted_prompt = QA_PROMPT.format(context=context, query=question)

        # ✅ Send to LLM and ensure JSON parsing
        response = llm.invoke(formatted_prompt).strip()

        try:
            parsed_response = json.loads(response)
        except json.JSONDecodeError:
            parsed_response = {
                "answer": response,
                "explanation": "The model did not return a structured JSON response."
            }

        return {
            "answer": parsed_response.get("answer", "No answer found"),
            "explanation": parsed_response.get("explanation", "No explanation provided"),
            "source_documents": [doc.page_content for doc in retrieved_docs if doc.page_content.strip()]
        }

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }

if __name__ == "__main__":
    pdf_path = "Math-9th-Class.pdf" # Make sure this path is correct
    docs, vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "Find the transpose of the matrix: B=[5 1-6]"
    result = query_agent(query, docs, vectorstore, llm)
    print(json.dumps(result, indent=2))


📄 Loaded 308 documents from PDF.
🔹 Document 1:

---
🔹 Document 2:
"Education is a matter of life and death for 
Pakistan. The world is progressing so rapidly 
that without requisite advance in education, not 
only shall we be left behind others but may be 
wiped out altogether." 
(September 26, 1947, Karachi) 
J~Ji; ~!;( 
l:)~~ 01 
J~ Ji; 
, 
(I/ ~?1 ~j 
Publishers Note 
Quaid-e-Azam 
Muhammad Ali Jinnah 
Founder of Pakistan 
J~ Ji; ~;; J~ 
l:)(; J~ (f~l:JJ 
~ 1/ 
(lll! ( ~Jj J~ 
~,._W '(j 
J~ J v~ f~ 
Jl&>l:J (; ,J"L l:) Lz/ , , 
This book is being published 
---
🔹 Document 3:

---


KeyboardInterrupt: 

In [None]:
import json
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_text_from_image(image):
    """Extracts text from an image using Tesseract OCR."""
    return pytesseract.image_to_string(image, lang="eng")

def load_pdf_and_create_vector_store(pdf_path):
    """Loads a PDF, extracts text (including OCR for images), and creates embeddings."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    
    if not docs:
        print("🔍 No text found in PDF. Using OCR for extraction...")
        images = convert_from_path(pdf_path)
        ocr_texts = [extract_text_from_image(img) for img in images]
        text_content = "\n".join(ocr_texts)
        docs = [text_content]
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    split_docs = text_splitter.create_documents([doc.page_content if hasattr(doc, 'page_content') else doc for doc in docs])

    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(split_docs, embeddings)
    
    return split_docs, vectorstore

def query_agent(question: str, docs, vectorstore, llm):
    """Queries the vector store using LLM and retrieves an answer."""
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    retrieved_docs = retriever.invoke(question)
    
    context = "\n".join([doc.page_content if hasattr(doc, 'page_content') else doc for doc in retrieved_docs if doc.strip()])
    
    QA_PROMPT = PromptTemplate(
        template="""
        Use the following context to answer the question:
        Context: {context}
        Question: {query}
        
        Provide the answer in JSON format:
        {{
            "answer": "Provide the chapter number if possible.",
            "explanation": "Solve the query question based on the context."
        }}
        """,
        input_variables=["context", "query"],
    )
    
    formatted_prompt = QA_PROMPT.format(context=context, query=question)
    response = llm.invoke(formatted_prompt).strip()
    
    try:
        parsed_response = json.loads(response)
    except json.JSONDecodeError:
        parsed_response = {"answer": response, "explanation": "Response not in JSON format."}
    
    return {
        "answer": parsed_response.get("answer", "No answer found"),
        "explanation": parsed_response.get("explanation", "No explanation provided"),
        "source_documents": [doc.page_content if hasattr(doc, 'page_content') else doc for doc in retrieved_docs if doc.strip()]
    }

if __name__ == "__main__":
    pdf_path = "Math-9th-Class.pdf"
    docs, vectorstore = load_pdf_and_create_vector_store(pdf_path)
    llm = OllamaLLM(model="deepseek-llm:7b-chat")
    query = "Find the transpose of the matrix: B=[5 1 -6]"
    result = query_agent(query, docs, vectorstore, llm)
    print(json.dumps(result, indent=2))

AttributeError: 'Document' object has no attribute 'strip'