In [7]:
import json
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_pdf_and_create_vector_store(pdf_path):
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()
    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def query_agent(question: str, vectorstore, llm):
    try:
        retriever = vectorstore.as_retriever()

        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. If the context doesn't contain the answer, say 'I don't know'.

            Context: {context}
            Question: {query}

            Answer in JSON format:
            {{
                "answer": "your concise answer here",
                "explanation": "your detailed explanation here",
                "source_documents": "{source_documents}"
            }}
            """,
            input_variables=["context", "query", "source_documents"],
        )

        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,  # Keep this True
            chain_type_kwargs={"prompt": QA_PROMPT},  # Set the prompt here
        )
        
        response = qa({"query": question})

        try:
            # Access and parse the result correctly
            result = json.loads(response['result'])
            return result
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            print(f"Raw response: {response['result']}") # Print raw response for debugging
            return {
                "answer": "Could not parse JSON response",
                "explanation": response['result'],
                "source_documents": response.get('source_documents', [])
            }

    except Exception as e:
        print(f"Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }


if __name__ == "__main__":
    pdf_path = "English.pdf"
    vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "What is the role of counselling in preventing drug addiction?"
    result = query_agent(query, vectorstore, llm)
    print(json.dumps(result, indent=2))

Error: Missing some input keys: {'query', 'source_documents'}
{
  "answer": "Error occurred while processing the query",
  "explanation": "Missing some input keys: {'query', 'source_documents'}",
  "source_documents": []
}


In [13]:
import json
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_pdf_and_create_vector_store(pdf_path):
    """Loads a PDF file, creates embeddings, and stores them in FAISS."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # ✅ Debugging: Check if PDF loaded correctly
    print(f"📄 Loaded {len(docs)} documents from PDF.")
    for i, doc in enumerate(docs[:3]):  # Print first 3 documents for verification
        print(f"🔹 Document {i+1}:\n{doc.page_content[:500]}\n---")

    if not docs:
        raise ValueError("🚨 No documents were loaded from the PDF! Check if the file is valid.")

    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def query_agent(question: str, vectorstore, llm):
    """Queries the vectorstore using the LLM and retrieves the answer."""
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # ✅ Fetch more documents
        retrieved_docs = retriever.invoke(question)  # ✅ Use `invoke()` instead of deprecated method

        # ✅ Debugging: Print retrieved documents
        print(f"🔎 Retrieved {len(retrieved_docs)} documents for query: {question}")
        for i, doc in enumerate(retrieved_docs[:3]):  # Print first 3 for verification
            print(f"📜 Doc {i+1}:\n{doc.page_content[:500]}\n---")

        if not retrieved_docs:
            return {
                "answer": "I don't know.",
                "explanation": "No relevant information found in the documents.",
                "source_documents": []
            }

        context = "\n".join([doc.page_content for doc in retrieved_docs if doc.page_content.strip()])

        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. If the context doesn't contain the answer, say 'I don't know'.

            Context: {context}
            Question: {query}

            Provide the answer in JSON format:
            {{
                "answer": "your concise answer here",
                "explanation": "your detailed explanation here"
            }}
            """,
            input_variables=["context", "query"],
        )

        formatted_prompt = QA_PROMPT.format(context=context, query=question)

        # ✅ Send to LLM and ensure JSON parsing
        response = llm.invoke(formatted_prompt).strip()

        try:
            parsed_response = json.loads(response)
        except json.JSONDecodeError:
            parsed_response = {
                "answer": response,
                "explanation": "The model did not return a structured JSON response."
            }

        return {
            "answer": parsed_response.get("answer", "No answer found"),
            "explanation": parsed_response.get("explanation", "No explanation provided"),
            "source_documents": [doc.page_content for doc in retrieved_docs if doc.page_content.strip()]
        }

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }

if __name__ == "__main__":
    pdf_path = "English.pdf"  # Make sure this path is correct
    vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "What is the role of counselling in preventing drug addiction?"
    result = query_agent(query, vectorstore, llm)
    print(json.dumps(result, indent=2))


📄 Loaded 148 documents from PDF.
🔹 Document 1:

---
🔹 Document 2:

---
🔹 Document 3:

---
🔎 Retrieved 5 documents for query: What is the role of counselling in preventing drug addiction?
📜 Doc 1:

---
📜 Doc 2:

---
📜 Doc 3:

---
{
  "answer": "Counselling plays a role in preventing drug addiction by helping individuals identify and change negative thought patterns, cope with stressors, develop problem-solving skills, and improve relationships.",
  "explanation": "Counseling provides support, guidance, and education to help people overcome substance abuse disorders. It helps them understand the causes of their addiction and equips them with coping mechanisms to prevent relapse. Additionally, it facilitates personal growth by improving communication skills, building resilience, and promoting healthy behaviors.",
  "source_documents": []
}


In [17]:
import json
import pytesseract
from PIL import Image
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyMuPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def extract_text_with_ocr(pdf_path):
    """Uses OCR to extract text from a scanned PDF."""
    import fitz  # PyMuPDF
    
    # Open the PDF
    doc = fitz.open(pdf_path)
    text = ""
    
    # Extract text from each page
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        text += pytesseract.image_to_string(img)  # OCR text extraction
        
    return text.strip()

def load_pdf_and_create_vector_store(pdf_path):
    """Load the PDF file, apply OCR if needed, and create a vector store."""
    
    # First, try normal PDF extraction with PyMuPDFLoader
    loader = PyMuPDFLoader(pdf_path)
    docs = loader.load()
    
    # If no valid documents were extracted, try OCR on the PDF
    if not any(doc.page_content.strip() for doc in docs):
        print("⚠️ No readable text found. Attempting OCR on the PDF...")
        text = extract_text_with_ocr(pdf_path)
        
        # Create a single document with OCR text
        docs = [{"page_content": text}]
        
        if not text.strip():
            raise ValueError("🚨 No valid text extracted even with OCR!")
    
    print(f"📄 Loaded {len(docs)} valid documents.")
    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def query_agent(question: str, vectorstore, llm):
    """Queries the vectorstore using the LLM and retrieves the answer."""
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        retrieved_docs = retriever.invoke(question)

        # Filter out empty documents again
        retrieved_docs = [doc for doc in retrieved_docs if doc.page_content.strip()]

        print(f"🔎 Retrieved {len(retrieved_docs)} valid documents for query: {question}")

        if not retrieved_docs:
            return {
                "answer": "I don't know.",
                "explanation": "No relevant information found in the documents.",
                "source_documents": []
            }

        context = "\n".join([doc.page_content for doc in retrieved_docs])

        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. If the context doesn't contain the answer, say 'I don't know'.

            Context: {context}
            Question: {query}

            Provide the answer in JSON format:
            {{
                "answer": "your concise answer here",
                "explanation": "your detailed explanation here"
            }}
            """,
            input_variables=["context", "query"],
        )

        formatted_prompt = QA_PROMPT.format(context=context, query=question)
        response = llm.invoke(formatted_prompt).strip()

        try:
            parsed_response = json.loads(response)
        except json.JSONDecodeError:
            parsed_response = {
                "answer": response,
                "explanation": "The model did not return a structured JSON response."
            }

        return {
            "answer": parsed_response.get("answer", "No answer found"),
            "explanation": parsed_response.get("explanation", "No explanation provided"),
            "source_documents": [doc.page_content for doc in retrieved_docs]
        }

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }

if __name__ == "__main__":
    pdf_path = "English.pdf"  # Ensure correct path
    vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "What is the role of counselling in preventing drug addiction?"
    result = query_agent(query, vectorstore, llm)
    print(json.dumps(result, indent=2))


⚠️ No readable text found. Attempting OCR on the PDF...


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.