In [1]:
# Install dependent libraries
!pip install --quiet langchain_google_genai langchain_community langchain docarray pymupdf google.generativeai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m146.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.8/302.8 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import re
import getpass
import fitz
import google.generativeai as genai
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.docstore.document import Document


class ChatBot:
    """
    RAG chatbot using native Google Generative AI API.

    On construction the PDF is read page by page, split into chunks, embedded
    and stored in an in-memory vector store. ``chat()`` then runs a console
    loop that either summarizes a single page (when the question mentions
    "page <n>") or answers the question from the retrieved chunks.
    """

    # Inputs that end the interactive session (compared in lower case).
    _EXIT_COMMANDS = ('quit', 'exit', 'bye', 'q')

    # "page <n>" as a whole word, so that e.g. "homepage 3" is not mistaken
    # for a request about page 3. Applied to the lower-cased question.
    _PAGE_PATTERN = re.compile(r'\bpage\s+(\d+)')

    # Number of history entries (alternating "Q: ..." / "A: ..." lines) that
    # are replayed into each question prompt, i.e. the last 2 Q&A pairs.
    _HISTORY_WINDOW = 4

    def __init__(self, pdf_path, api_key=None):
        """Configure the Google AI client and index the given PDF.

        Args:
            pdf_path: Path to the PDF file to index.
            api_key: Optional API key. When given it is also exported as
                ``GOOGLE_API_KEY``; otherwise the key is read from that
                environment variable.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if api_key:
            # The embeddings object is created without an explicit key, so it
            # presumably picks the key up from the environment.
            os.environ["GOOGLE_API_KEY"] = api_key
            genai.configure(api_key=api_key)
        else:
            genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

        # Load and process PDF
        self.vector_db, self.split_docs = self._load_and_process_pdf(pdf_path)

        # Initialize native Google AI model
        self.model = genai.GenerativeModel('gemini-1.5-flash')

        # Alternating "Q: ..." / "A: ..." strings, oldest first.
        self.chat_history = []

        print(f"Chatbot initialized successfully with {pdf_path}")

    def _load_and_process_pdf(self, pdf_path):
        """Load the PDF, split it into chunks and build the vector store.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            A ``(vector_db, split_docs)`` tuple: the in-memory vector store and
            the list of chunk documents, each tagged with its 1-based source
            page in ``metadata["page"]``.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # The context manager closes the PDF even if text extraction fails.
        with fitz.open(pdf_path) as pdf:
            documents = [
                Document(page_content=page.get_text(), metadata={"page": page_no})
                for page_no, page in enumerate(pdf, start=1)
            ]

        print(f"Loaded PDF with {len(documents)} pages")

        text_splitter = CharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            separator="\n",
        )

        # Split page by page so every chunk keeps the page number it came from.
        split_docs = [
            Document(page_content=chunk, metadata=page_doc.metadata.copy())
            for page_doc in documents
            for chunk in text_splitter.split_text(page_doc.page_content)
        ]

        # Create embeddings and vector store
        embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
        vector_db = DocArrayInMemorySearch.from_documents(split_docs, embeddings)

        return vector_db, split_docs

    @classmethod
    def _extract_page_number(cls, question):
        """Return the page number mentioned as "page <n>" in the question, or None."""
        match = cls._PAGE_PATTERN.search(question.lower())
        return int(match.group(1)) if match else None

    def _history_block(self):
        """Return the recent conversation formatted for the prompt ('' if none)."""
        if not self.chat_history:
            return ""
        recent_history = self.chat_history[-self._HISTORY_WINDOW:]
        return "\n\nPrevious conversation:\n" + "\n".join(recent_history)

    def _remember(self, question, reply):
        """Append one question/answer exchange to the chat history."""
        self.chat_history.append(f"Q: {question}")
        self.chat_history.append(f"A: {reply}")

    def _summarize_page(self, question, page_number):
        """Print a model-generated summary of one page and record it in history."""
        page_docs = [
            chunk_doc for chunk_doc in self.split_docs
            if chunk_doc.metadata.get("page") == page_number
        ]
        if not page_docs:
            print(f"No content found for page {page_number}")
            return

        # Combine chunks from that page
        page_text = "\n\n".join(chunk_doc.page_content for chunk_doc in page_docs)

        # NOTE: the leading spaces on the continuation lines are part of the
        # prompt text and are intentionally left unchanged.
        prompt = f"""You are a helpful assistant. Please summarize the content of page {page_number} of the document.
                        PAGE {page_number} CONTENT:{page_text}
                        SUMMARY:"""

        summary = self.model.generate_content(prompt).text

        print("-" * 50)
        print(f"\nSummary of Page {page_number}:\n{summary}")
        self._remember(question, summary)

    def _answer_question(self, question):
        """Answer a question from the retrieved chunks, print it and record it."""
        # Retrieve relevant documents for a question
        retriever = self.vector_db.as_retriever(search_kwargs={"k": 4})
        relevant_docs = retriever.invoke(question)

        # Combine relevant document content, labelling each chunk with its page
        context = "\n\n---\n\n".join(
            f"(Page {chunk_doc.metadata.get('page', 'unknown')}):\n{chunk_doc.page_content}"
            for chunk_doc in relevant_docs
        )

        history_text = self._history_block()

        # NOTE: the leading spaces on the continuation lines are part of the
        # prompt text and are intentionally left unchanged.
        prompt = f"""You are a helpful AI assistant. Answer the question based on the provided context from the document. If the answer cannot be found in the context, please say so clearly.
                      CONTEXT FROM DOCUMENT:
                      {context}
                      {history_text}
                      QUESTION: {question}
                      ANSWER:"""

        # Get response using native Google API
        answer = self.model.generate_content(prompt).text

        self._remember(question, answer)

        print("-" * 50)
        print(f"\n Chat Bot: {answer}")

    def chat(self):
        """Run the interactive console session until the user quits.

        Empty input is ignored. A question mentioning "page <n>" produces a
        summary of that page; anything else is answered from the retrieved
        chunks. A failed model or retrieval call is reported and the session
        continues; the failed exchange is not added to the history.
        """
        print("\nChatbot ready! Type 'quit' to exit.\n")

        while True:
            try:
                question = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupt: leave cleanly, no traceback.
                print("\nGoodbye!")
                break

            if not question:
                continue

            if question.lower() in self._EXIT_COMMANDS:
                print("Goodbye!")
                break

            page_number = self._extract_page_number(question)
            try:
                # `is not None` so that "page 0" is reported as having no
                # content rather than silently treated as a normal question.
                if page_number is not None:
                    self._summarize_page(question, page_number)
                else:
                    self._answer_question(question)
            except Exception as exc:
                # Boundary of the interactive loop: one failed request (network
                # error, response without text, ...) must not end the session.
                print(f"Error: {exc}")


if __name__ == "__main__":
    # Script entry point: collect the API key and PDF path, then start chatting.
    print("RAG Chatbot with Native Google AI API\n")

    # Read the key without echoing it and publish it through the environment;
    # ChatBot is constructed below without an explicit key.
    GEMINI_API_KEY = getpass.getpass(prompt="Enter Gemini API Key: ")
    os.environ.update({"GOOGLE_API_KEY": GEMINI_API_KEY})

    # The PDF location is typed in by the user; surrounding blanks are dropped.
    raw_path = input("Enter path to your PDF file: ")
    pdf_path = raw_path.strip()

    for banner_line in (
        "\nStarting interactive chat session...",
        "   Ask questions about your PDF!",
    ):
        print(banner_line)

    # Build the index for the chosen PDF and hand control to the chat loop.
    chatbot = ChatBot(pdf_path)
    chatbot.chat()



RAG Chatbot with Native Google AI API

Enter Gemini API Key: ··········
Enter path to your PDF file: /content/Book.pdf

Starting interactive chat session...
   Ask questions about your PDF!
Loaded PDF with 218 pages




Chatbot initialized successfully with /content/Book.pdf

Chatbot ready! Type 'quit' to exit.

You: what is the context of pdf
--------------------------------------------------

 Chat Bot: Based on the provided text snippets, the context of the PDF appears to be a book about customer experience, focusing on understanding and improving customer requirements and implementing changes within organizations.  The excerpts touch upon topics like organizational orientation (customer-centric vs. other priorities), customer desires (conscious, subconscious, and deceptive), and the challenges of implementing customer experience improvements within companies.

You: summarize page 50
--------------------------------------------------

Summary of Page 50:
Page 50 argues that business research often overemphasizes rational, conscious decision-making (like that measured by surveys), neglecting the much larger, subconscious influences on consumer behavior.  It uses the metaphor of an iceberg:  the easi