In [5]:
import json
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_pdf_and_create_vector_store(pdf_path):
    """Loads a PDF file, creates embeddings, and stores them in FAISS."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # ✅ Debugging: Check if PDF loaded correctly
    print(f"📄 Loaded {len(docs)} documents from PDF.")
    for i, doc in enumerate(docs[:3]):  # Print first 3 documents for verification
        print(f"🔹 Document {i+1}:\n{doc.page_content[:500]}\n---")

    if not docs:
        raise ValueError("🚨 No documents were loaded from the PDF! Check if the file is valid.")

    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore

def query_agent(question: str, vectorstore, llm):
    """Queries the vectorstore using the LLM and retrieves the answer."""
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # ✅ Fetch more documents
        retrieved_docs = retriever.invoke(question)  # ✅ Use `invoke()` instead of deprecated method

        # ✅ Debugging: Print retrieved documents
        print(f"🔎 Retrieved {len(retrieved_docs)} documents for query: {question}")
        for i, doc in enumerate(retrieved_docs[:3]):  # Print first 3 for verification
            print(f"📜 Doc {i+1}:\n{doc.page_content[:500]}\n---")

        if not retrieved_docs:
            return {
                "answer": "I don't know.",
                "explanation": "No relevant information found in the documents.",
                "source_documents": []
            }

        context = "\n".join([doc.page_content for doc in retrieved_docs if doc.page_content.strip()])

        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. If the context doesn't contain the answer, say 'I don't know'.

            Context: {context}
            Question: {query}

            Provide the answer in JSON format:
            {{
                "answer": "your concise answer here",
                "explanation": "your detailed explanation here"
            }}
            """,
            input_variables=["context", "query"],
        )

        formatted_prompt = QA_PROMPT.format(context=context, query=question)

        # ✅ Send to LLM and ensure JSON parsing
        response = llm.invoke(formatted_prompt).strip()

        try:
            parsed_response = json.loads(response)
        except json.JSONDecodeError:
            parsed_response = {
                "answer": response,
                "explanation": "The model did not return a structured JSON response."
            }

        return {
            "answer": parsed_response.get("answer", "No answer found"),
            "explanation": parsed_response.get("explanation", "No explanation provided"),
            "source_documents": [doc.page_content for doc in retrieved_docs if doc.page_content.strip()]
        }

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }

if __name__ == "__main__":
    pdf_path = "English.pdf"  # Make sure this path is correct
    vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "How can we become a strong nation? Please do not add any new line character symbols"
    result = query_agent(query, vectorstore, llm)
    print(json.dumps(result, indent=2))


📄 Loaded 148 documents from PDF.
🔹 Document 1:

---
🔹 Document 2:

---
🔹 Document 3:

---
🔎 Retrieved 5 documents for query: How can we become a strong nation? Please do not add any new line character symbols
📜 Doc 1:

---
📜 Doc 2:

---
📜 Doc 3:

---
{
  "answer": "Focus on building a strong economy, investing in education and infrastructure, promoting unity among citizens, and engaging with international partnerships.",
  "explanation": "A strong nation is built upon several key pillars. Firstly, economic growth is crucial to ensure the stability and well-being of its citizens. This can be achieved by fostering innovation, attracting investment, and creating job opportunities. Secondly, investing in education allows a nation to develop its human capital, promoting critical thinking and problem-solving skills.",
  "source_documents": []
}


In [6]:
import json
from langchain_ollama import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_pdf_and_create_vector_store(pdf_path):
    """Loads a PDF file, creates embeddings, and stores them in FAISS."""
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # ✅ Debugging: Check if PDF loaded correctly
    print(f"📄 Loaded {len(docs)} documents from PDF.")
    for i, doc in enumerate(docs[:3]):  # Print first 3 documents for verification
        print(f"🔹 Document {i+1}:\n{doc.page_content[:500]}\n---")

    if not docs:
        raise ValueError("🚨 No documents were loaded from the PDF! Check if the file is valid.")

    embeddings = OllamaEmbeddings(model="deepseek-llm:7b-chat")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return docs, vectorstore  # Return both docs and vectorstore

def query_agent(question: str, docs, vectorstore, llm):
    """Queries the vectorstore using the LLM and retrieves the answer."""
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # ✅ Fetch more documents
        retrieved_docs = retriever.invoke(question)  # ✅ Use `invoke()` instead of deprecated method

        # ✅ Debugging: Print retrieved documents
        print(f"🔎 Retrieved {len(retrieved_docs)} documents for query: {question}")
        for i, doc in enumerate(retrieved_docs[:3]):  # Print first 3 for verification
            print(f"📜 Doc {i+1}:\n{doc.page_content[:500]}\n---")

        # If no relevant documents retrieved, fallback to full PDF
        if not retrieved_docs:
            print("⚠️ No relevant documents found in vector search. Using entire PDF content as fallback.")

            # Use entire PDF as context
            context = "\n".join([doc.page_content for doc in docs if doc.page_content.strip()])
        else:
            context = "\n".join([doc.page_content for doc in retrieved_docs if doc.page_content.strip()])

        QA_PROMPT = PromptTemplate(
            template="""Use the following context to answer the question. If the context doesn't contain the answer, say 'I don't know'.

            Context: {context}
            Question: {query}

            Provide the answer in JSON format:
            {{
                "answer": "your concise answer here",
                "explanation": "your detailed explanation here"
            }}
            """,
            input_variables=["context", "query"],
        )
    

        formatted_prompt = QA_PROMPT.format(context=context, query=question)

        # ✅ Send to LLM and ensure JSON parsing
        response = llm.invoke(formatted_prompt).strip()

        try:
            parsed_response = json.loads(response)
        except json.JSONDecodeError:
            parsed_response = {
                "answer": response,
                "explanation": "The model did not return a structured JSON response."
            }

        return {
            "answer": parsed_response.get("answer", "No answer found"),
            "explanation": parsed_response.get("explanation", "No explanation provided"),
            "source_documents": [doc.page_content for doc in retrieved_docs if doc.page_content.strip()]
        }

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return {
            "answer": "Error occurred while processing the query",
            "explanation": str(e),
            "source_documents": []
        }

if __name__ == "__main__":
    pdf_path = "English.pdf"  # Make sure this path is correct
    docs, vectorstore = load_pdf_and_create_vector_store(pdf_path)

    llm = OllamaLLM(model="deepseek-llm:7b-chat")

    query = "Write down the summary of the poem *Stopping by the Woods on a Snowy Evening* by Robert Frost"
    result = query_agent(query, docs, vectorstore, llm)
    print(json.dumps(result, indent=2))


📄 Loaded 148 documents from PDF.
🔹 Document 1:

---
🔹 Document 2:

---
🔹 Document 3:

---
🔎 Retrieved 5 documents for query: Write down the summary of the poem *Stopping by the Woods on a Snowy Evening* by Robert Frost
📜 Doc 1:

---
📜 Doc 2:

---
📜 Doc 3:

---
{
  "answer": "I don't know.",
  "explanation": "The model did not return a structured JSON response.",
  "source_documents": []
}
