In [1]:
from pdfminer.high_level import extract_text
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_extracter(path):
    """Return the text that pdfminer's ``extract_text`` produces for ``path``.

    Args:
        path: Location of the PDF; forwarded unchanged to ``extract_text``.

    Returns:
        The value returned by ``extract_text(path)``.
    """
    return extract_text(path)

def text_chunk(text):
    """Split ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000``, ``chunk_overlap=200``
    and the separator list ``["\\n\\n", "\\n", ".", " ", ""]``.

    Args:
        text: The string to split.

    Returns:
        The result of the splitter's ``split_text(text)``.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "separators": ["\n\n", "\n", ".", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

def text_cleaning(chunks):
    """Remove stop words and punctuation tokens from each chunk.

    Every chunk is run through the module-level ``nlp`` pipeline; tokens
    flagged ``is_stop`` or ``is_punct`` are dropped and the remaining token
    texts are joined with single spaces.

    Args:
        chunks: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input chunk, in the same order.
    """
    def _keep(token):
        # Equivalent to: not token.is_stop and not token.is_punct
        return not (token.is_stop or token.is_punct)

    return [
        " ".join(token.text for token in nlp(chunk) if _keep(token))
        for chunk in chunks
    ]

def lemmatize_text(cleaned_chunks):
    """Replace every token in each chunk with its lemma.

    Each chunk is parsed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of every token is joined back together with single spaces.

    Args:
        cleaned_chunks: Iterable of strings to lemmatize.

    Returns:
        A list with one lemmatized string per input chunk, in the same order.
    """
    return [
        " ".join(tok.lemma_ for tok in nlp(piece))
        for piece in cleaned_chunks
    ]

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import pickle
import os

def embed_store(lemma_chunks, storage_dir="/Users/pmanthan/Desktop/tomo.ai/faiss_storage"):
    """Embed the chunks, build a FAISS index from them and persist it.

    Args:
        lemma_chunks: Sequence of strings; each becomes one ``Document``.
        storage_dir: Directory that receives the saved index and the pickled
            embeddings object. Defaults to the location this notebook has
            always used, so existing calls behave as before.

    Returns:
        The in-memory FAISS store built from ``lemma_chunks``.

    Raises:
        ValueError: If ``lemma_chunks`` is empty; there is nothing to index.
    """
    # Fail early with a clear message instead of an obscure error from FAISS.
    if not lemma_chunks:
        raise ValueError("embed_store needs at least one chunk to build a FAISS index")

    docs = [Document(page_content=chunk) for chunk in lemma_chunks]
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)  # FAISS = Facebook AI Similarity Search

    os.makedirs(storage_dir, exist_ok=True)
    db.save_local(storage_dir)

    # load_store() unpickles this file to get the embeddings object back, so
    # the file name must stay in sync with the one used there.
    with open(os.path.join(storage_dir, "faiss_storage.pkl"), "wb") as f:
        pickle.dump(embeddings, f)

    return db


def load_store(storage_dir="/Users/pmanthan/Desktop/tomo.ai/faiss_storage"):
    """Load the FAISS store previously written to ``storage_dir``.

    Args:
        storage_dir: Directory holding the saved index and the
            ``faiss_storage.pkl`` embeddings pickle. Defaults to the location
            this notebook has always used.

    Returns:
        The store returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If the embeddings pickle is not in ``storage_dir``.
    """
    embeddings_path = os.path.join(storage_dir, "faiss_storage.pkl")

    # SECURITY: pickle.load, and FAISS.load_local with
    # allow_dangerous_deserialization=True, can execute arbitrary code from the
    # files they read. Only point this at a directory this notebook wrote.
    with open(embeddings_path, "rb") as f:
        embeddings = pickle.load(f)

    return FAISS.load_local(storage_dir, embeddings, allow_dangerous_deserialization=True)

In [4]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq

def rag_pipeline(pdf_path="/Users/pmanthan/Desktop/attentionisalluneed.pdf"):
    """Run the full pipeline: extract, chunk, clean, lemmatize, embed, store.

    Args:
        pdf_path: PDF to ingest. Defaults to the document this notebook was
            written against, so ``rag_pipeline()`` keeps working.

    Returns:
        The FAISS store built by ``embed_store``.
    """
    extracted_text = text_extracter(pdf_path)
    chunked_text = text_chunk(extracted_text)
    cleaned_text = text_cleaning(chunked_text)
    lemma_text = lemmatize_text(cleaned_text)
    # embed_store() both persists the index and returns it, so hand that store
    # back directly instead of discarding it and reloading it from disk
    # through pickle.
    return embed_store(lemma_text)


In [5]:
# Read the Groq key from the environment instead of committing it to the
# notebook. The key that used to be hard-coded here has been exposed and must
# be revoked. A missing variable fails here with a KeyError naming it.
llm = ChatGroq(model="llama3-70b-8192", api_key=os.environ["GROQ_API_KEY"])

In [6]:
# Build the vector store, expose it as a retriever (k=5), and wire the
# retriever, chat memory and LLM into a conversational retrieval chain.
vector_store = rag_pipeline()
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True,output_key="answer")


In [7]:
def hybrid_answer(question):
    """Answer from the retrieval chain, falling back to the bare LLM.

    Args:
        question: The user's question as a string.

    Returns:
        The chain's ``"answer"`` when it reports source documents; otherwise
        the text content of a direct ``llm.invoke(question)`` reply. Both
        paths return the answer text rather than a message object.
    """
    response = qa_chain.invoke({"question": question})

    if response.get("source_documents"):
        return response["answer"]

    # No supporting documents: ask the model directly. invoke() on the chat
    # model returns a message object, so unwrap its text to match the type
    # returned by the retrieval path.
    return llm.invoke(question).content

def input_output(question):
    """Print the answer ``hybrid_answer`` produces for ``question``.

    Args:
        question: The user's question; forwarded to ``hybrid_answer``.

    Returns:
        None. The answer is written to standard output.
    """
    print(hybrid_answer(question))


# Smoke test: send one question through input_output() and print the answer.
input_output("can u summarize the pdf for me")


AuthenticationError: Error code: 401 - {'error': {'message': 'Invalid API Key', 'type': 'invalid_request_error', 'code': 'invalid_api_key'}}

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tomo AI Chatbot\n",
    "\n",
    "This notebook implements a conversational AI assistant with specialized agents for different types of queries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import os\n",
    "import spacy\n",
    "from typing import TypedDict, Annotated, List\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from langchain_core.tools import tool\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "from langchain_core.runnables import Runnable, RunnableLambda\n",
    "from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage\n",
    "from langchain_core.documents import Document\n",
    "from langgraph.graph.message import AnyMessage, add_messages\n",
    "from langgraph.graph.state import StateGraph, END, START\n",
    "from langgraph.prebuilt import ToolNode, tools_condition\n",
    "\n",
    "from langchain_groq import ChatGroq\n",
    "from langchain_community.tools.tavily_search import TavilySearchResults\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain.vectorstores import FAISS\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.tools.retriever import create_retriever_tool\n",
    "from langchain.agents import create_react_agent, AgentExecutor\n",
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "# Initialize NLP model\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Initialize LLM\n",
    "llm = ChatGroq(\n",
    "    model=\"llama3-70b-8192\",\n",
    "    api_key=os.getenv(\"GROQ_API_KEY\"),  # Store API key in environment variables\n",
    "    temperature=0.1,\n",
    "    max_tokens=2048\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## State Definition\n",
    "\n",
    "Define the state structure for the conversation graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GraphState(TypedDict):\n",
    "    \"\"\"State definition for the conversation graph.\"\"\"\n",
    "    messages: Annotated[List[AnyMessage], add_messages]\n",
    "    documents: List[Document]\n",
    "    pdf_path: str\n",
    "    question: str\n",
    "    question_type: str\n",
    "    pdf_context: str\n",
    "    calendar_context: str\n",
    "    intermediate_steps: list\n",
    "    parent_run_id: str"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RAG Tool Implementation\n",
    "\n",
    "Tool for extracting and processing PDF documents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@tool(description=\"Extracts text from a PDF, cleans it, lemmatizes it, and stores it in a vector store for retrieval.\")\n",
    "def rag_tool(state: GraphState) -> GraphState:\n",
    "    \"\"\"Process PDF document and create vector store for retrieval.\"\"\"\n",
    "    from pdfminer.high_level import extract_text\n",
    "    \n",
    "    path = state[\"pdf_path\"]\n",
    "    \n",
    "    def text_extractor(path: str) -> str:\n",
    "        \"\"\"Extract text from PDF.\"\"\"\n",
    "        try:\n",
    "            return extract_text(path)\n",
    "        except Exception as e:\n",
    "            raise ValueError(f\"Failed to extract text from PDF: {str(e)}\")\n",
    "\n",
    "    def text_chunk(text: str) -> List[str]:\n",
    "        \"\"\"Split text into chunks.\"\"\"\n",
    "        splitter = RecursiveCharacterTextSplitter(\n",
    "            chunk_size=1000,\n",
    "            chunk_overlap=200,\n",
    "            separators=[\"\\n\\n\", \"\\n\", \".\", \" \", \"\"]\n",
    "        )\n",
    "        return splitter.split_text(text)\n",
    "    \n",
    "    def text_cleaning(chunks: List[str]) -> List[str]:\n",
    "        \"\"\"Clean text by removing stopwords and punctuation.\"\"\"\n",
    "        cleaned_chunks = []\n",
    "        for chunk in chunks:\n",
    "            doc = nlp(chunk)\n",
    "            cleaned = \" \".join([token.text for token in doc if not token.is_stop and not token.is_punct])\n",
    "            cleaned_chunks.append(cleaned)\n",
    "        return cleaned_chunks\n",
    "    \n",
    "    def lemmatize_text(cleaned_chunks: List[str]) -> List[str]:\n",
    "        \"\"\"Lemmatize text chunks.\"\"\"\n",
    "        lemma_chunks = []\n",
    "        for chunk in cleaned_chunks:\n",
    "            doc = nlp(chunk)\n",
    "            lemma = \" \".join([token.lemma_ for token in doc])\n",
    "            lemma_chunks.append(lemma)\n",
    "        return lemma_chunks\n",
    "    \n",
    "    def embed_store(lemma_chunks: List[str]) -> FAISS:\n",
    "        \"\"\"Create and save FAISS vector store.\"\"\"\n",
    "        docs = [Document(page_content=chunk) for chunk in lemma_chunks]\n",
    "        embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
    "        db = FAISS.from_documents(docs, embeddings)\n",
    "        \n",
    "        storage_dir = \"faiss_storage\"\n",
    "        os.makedirs(storage_dir, exist_ok=True)\n",
    "        db.save_local(storage_dir)\n",
    "        \n",
    "        return db\n",
    "    \n",
    "    def rag_pipeline(path: str) -> FAISS:\n",
    "        \"\"\"Complete RAG pipeline from PDF to vector store.\"\"\"\n",
    "        extracted_text = text_extractor(path)\n",
    "        chunked_text = text_chunk(extracted_text)\n",
    "        cleaned_text = text_cleaning(chunked_text)\n",
    "        lemma_text = lemmatize_text(cleaned_text)\n",
    "        return embed_store(lemma_text)\n",
    "    \n",
    "    # Execute RAG pipeline and update state\n",
    "    vector_store = rag_pipeline(path)\n",
    "    state[\"documents\"] = vector_store.as_retriever(search_kwargs={\"k\": 5})\n",
    "    return state"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Node Definitions\n",
    "\n",
    "Define the nodes for the conversation graph."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize conversation memory\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n",
    "\n",
    "# Define prompts\n",
    "router_prompt = \"\"\"You are an AI question router. Classify incoming user questions to route them to the correct specialized agent.\n",
    "\n",
    "Question types:\n",
    "- PDF: Questions about PDF document content\n",
    "- CALENDAR: Questions related to scheduling or calendar\n",
    "- GENERAL: Other questions\n",
    "\n",
    "Only respond with one word: PDF, CALENDAR, or GENERAL.\"\"\"\n",
    "\n",
    "pdf_prompt = \"\"\"You are an expert in answering questions based on PDF content.\n",
    "\n",
    "Your tasks:\n",
    "- Answer questions using the PDF content\n",
    "- Summarize key points when no specific question is asked\n",
    "- Keep responses concise and document-relevant\n",
    "\"\"\"\n",
    "\n",
    "calendar_prompt = \"\"\"You are a scheduling assistant for Google Calendar.\n",
    "\n",
    "Your tasks:\n",
    "- Schedule meetings/events\n",
    "- Check availability\n",
    "- Cancel/reschedule events\n",
    "- Ask for missing details (time, date, etc.)\n",
    "\"\"\"\n",
    "\n",
    "react_prompt = PromptTemplate.from_template(\n",
    "    \"\"\"Answer questions using available tools. Follow this format:\n",
    "\n",
    "Question: the input question\n",
    "Thought: think about what to do\n",
    "Action: the action to take\n",
    "Action Input: the input to the action\n",
    "Observation: the result\n",
    "... (repeat as needed)\n",
    "Thought: I now know the final answer\n",
    "Final Answer: the final answer\n",
    "\n",
    "Available tools:\n",
    "{tools}\n",
    "\n",
    "Question: {input}\n",
    "{agent_scratchpad}\n",
    "\"\"\"\n",
    ")\n",
    "\n",
    "def router_node(state: GraphState) -> dict:\n",
    "    \"\"\"Route questions to appropriate agent.\"\"\"\n",
    "    messages = [SystemMessage(content=router_prompt), HumanMessage(content=state[\"question\"])]\n",
    "    response = llm.invoke(messages)\n",
    "    return {\"question_type\": response.content}\n",
    "\n",
    "def rag_node(state: GraphState) -> dict:\n",
    "    \"\"\"Process PDF-related questions.\"\"\"\n",
    "    # Create retriever tool\n",
    "    retriever_tool = create_retriever_tool(\n",
    "        state[\"documents\"], \n",
    "        name=\"rag_tool\", \n",
    "        description=\"Retrieves relevant information from PDF documents.\"\n",
    "    )\n",
    "    \n",
    "    # Create and execute agent\n",
    "    rag_agent = create_react_agent(llm=llm, tools=[retriever_tool], prompt=react_prompt)\n",
    "    agent_executor = AgentExecutor.from_agent_and_tools(\n",
    "        agent=rag_agent, \n",
    "        tools=[retriever_tool],\n",
    "        memory=memory,\n",
    "        verbose=False,\n",
    "        handle_parsing_errors=True\n",
    "    )\n",
    "    \n",
    "    input_message = {\"input\": state[\"question\"], \"chat_history\": state[\"messages\"]}\n",
    "    response = agent_executor.invoke(input_message)\n",
    "    \n",
    "    return {\"pdf_context\": response[\"output\"], \"messages\": state[\"messages\"] + [response[\"output\"]]}\n",
    "\n",
    "def general_assistant_node(state: GraphState) -> dict:\n",
    "    \"\"\"Handle general questions.\"\"\"\n",
    "    general_agent = create_react_agent(llm=llm, tools=[], prompt=react_prompt)\n",
    "    agent_executor = AgentExecutor.from_agent_and_tools(\n",
    "        agent=general_agent,\n",
    "        tools=[],\n",
    "        memory=memory,\n",
    "        verbose=False,\n",
    "        handle_parsing_errors=True\n",
    "    )\n",
    "    \n",
    "    input_message = {\"input\": state[\"question\"]}\n",
    "    response = agent_executor.invoke(input_message)\n",
    "    \n",
    "    return {\"messages\": state[\"messages\"] + [response[\"output\"]]}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Graph Construction\n",
    "\n",
    "Build the conversation graph with nodes and edges."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def route_question(state: GraphState) -> str:\n",
    "    \"\"\"Determine the next node based on question type.\"\"\"\n",
    "    question_type = state.get(\"question_type\", \"GENERAL\")\n",
    "    return question_type if question_type in [\"PDF\", \"CALENDAR\", \"GENERAL\"] else \"GENERAL\"\n",
    "\n",
    "# Initialize graph\n",
    "tomo_ai = StateGraph(GraphState)\n",
    "\n",
    "# Add nodes\n",
    "tomo_ai.add_node(\"router_node\", router_node)\n",
    "tomo_ai.add_node(\"rag_node\", rag_node)\n",
    "tomo_ai.add_node(\"general_assistant_node\", general_assistant_node)\n",
    "\n",
    "# Set entry point\n",
    "tomo_ai.add_edge(START, \"router_node\")\n",
    "\n",
    "# Conditional edges based on question type\n",
    "tomo_ai.add_conditional_edges(\n",
    "    \"router_node\",\n",
    "    route_question,\n",
    "    {\n",
    "        \"PDF\": \"rag_node\",\n",
    "        \"CALENDAR\": \"general_assistant_node\",\n",
    "        \"GENERAL\": \"general_assistant_node\"\n",
    "    }\n",
    ")\n",
    "\n",
    "# Connect nodes to END\n",
    "tomo_ai.add_edge(\"rag_node\", END)\n",
    "tomo_ai.add_edge(\"general_assistant_node\", END)\n",
    "\n",
    "# Compile the graph\n",
    "graph = tomo_ai.compile()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chat Interface\n",
    "\n",
    "Interactive interface for the chatbot."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_tomo_ai(user_input: str, pdf_path: str = None) -> None:\n",
    "    \"\"\"Process user input through the conversation graph.\"\"\"\n",
    "    initial_state = {\n",
    "        \"messages\": [],\n",
    "        \"documents\": [],\n",
    "        \"pdf_path\": pdf_path or \"\",\n",
    "        \"question\": user_input,\n",
    "        \"question_type\": \"\",\n",
    "        \"pdf_context\": \"\",\n",
    "        \"calendar_context\": \"\",\n",
    "        \"parent_run_id__\": \"\",\n",
    "        \"intermediate_steps\": []\n",
    "    }\n",
    "    \n",
    "    for event in graph.stream(initial_state):\n",
    "        for key, value in event.items():\n",
    "            if key == \"router_node\":\n",
    "                print(f\"Question classified as: {value['question_type']}\")\n",
    "            elif key == \"rag_node\" and \"pdf_context\" in value:\n",
    "                print(f\"PDF Answer: {value['pdf_context']}\")\n",
    "            elif key == \"general_assistant_node\" and \"messages\" in value:\n",
    "                print(f\"Assistant: {value['messages'][-1]}\")\n",
    "\n",
    "# Interactive chat loop\n",
    "if __name__ == \"__main__\":\n",
    "    print(\"Tomo AI Chatbot - Type 'exit' to quit\")\n",
    "    \n",
    "    # Example PDF path - replace with your actual PDF path\n",
    "    pdf_path = \"path/to/your/document.pdf\"\n",
    "    \n",
    "    while True:\n",
    "        user_input = input(\"You: \")\n",
    "        if user_input.lower() == \"exit\":\n",
    "            break\n",
    "        run_tomo_ai(user_input, pdf_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Graph Visualization\n",
    "\n",
    "Visual representation of the conversation flow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image, display\n",
    "\n",
    "try:\n",
    "    display(Image(graph.get_graph().draw_mermaid_png()))\n",
    "except Exception as e:\n",
    "    print(f\"Could not display graph: {str(e)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}