# LabbyBot

#### Document Loading

In [1]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# file path for raw documents which are not yet embedded
raw_files_path = "E:/AI_Engineer/portfolio/RAG/LabbyBot/data/raw/class_9"
loader = DirectoryLoader(path= raw_files_path, glob="**/*.pdf", loader_cls= PyMuPDFLoader)





In [2]:
docs = loader.load()

#### text splitting

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ".", " "], chunk_size = 750, chunk_overlap = 200)

In [6]:
# use this method for splitting documents
chunks = text_splitter.split_documents(docs)
print(chunks)

[Document(metadata={'producer': 'GPL Ghostscript 8.15', 'creator': 'PageMaker 7.0', 'creationdate': '2019-05-28T12:43:00+00:00', 'source': 'E:\\AI_Engineer\\portfolio\\RAG\\LabbyBot\\data\\raw\\class_9\\mathematics\\lelm401.pdf', 'file_path': 'E:\\AI_Engineer\\portfolio\\RAG\\LabbyBot\\data\\raw\\class_9\\mathematics\\lelm401.pdf', 'total_pages': 37, 'format': 'PDF 1.6', 'title': 'Lab manual IX (setting on 21-05-09) 1_10.pmd', 'author': 'dtpcell11', 'subject': '', 'keywords': '', 'moddate': '2022-07-05T12:48:12+05:30', 'trapped': '', 'modDate': "D:20220705124812+05'30'", 'creationDate': 'D:20190528124300Z', 'page': 0}, page_content='The Purpose of the Mathematics Laboratory\nA mathematics laboratory can foster mathematical awareness, skill building,\npositive attitudes and learning by doing experiences in different branches of\nmathematics such as Algebra, Geometry, Mensuration, Trigonometry, Coordinate\nGeometry, Statistics and Probability etc. It is the place where students can learn

#### embedding

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv

load_dotenv()


True

#### vector store

In [9]:
# embedding model
embeddings = GoogleGenerativeAIEmbeddings(model= "gemini-embedding-001")

# file path for processed data which are embedded, eg. for chromadb storage
processed_files_path = "E:/AI_Engineer/portfolio/RAG/LabbyBot/data/processed/chromadb"

# creation of a chromadb
vector_store = Chroma(collection_name= "class_9_lab_manuals", persist_directory= processed_files_path, embedding_function= embeddings)

# adding document into the chroma db
vector_store.add_documents(chunks)

['5abce8b1-2f36-46cd-ba16-11582c247ad5',
 '9eb21ee7-2a28-41fe-9dac-433fcb860655',
 '43cf7a29-bf95-4545-99d5-583f7adf8f09',
 'b288a2f9-f212-4069-872b-d52c1d6ace77',
 '0d0a1532-7a32-4da2-8c53-683fe23b3795',
 'bbe32002-ec4e-46d4-9059-0e0cf06944d6',
 'a24b043d-f227-4f51-b87b-da9e25b153f7',
 '8ef765e0-f438-4129-8cda-597875a5a3a1',
 '824c262e-a563-40ab-bb4e-1de8c9f8bc25',
 'a3fe15ee-5425-4182-976f-2fd226250efe',
 '1584739c-e51f-4c98-9d12-e0691be1535e',
 'afc027ae-0554-4c21-8854-8d021ea7bd9e',
 '49ad8d5f-3bff-4fc2-adb4-8b52219f28f4',
 'e8e93720-5dab-4a4f-b972-1603b717a1c6',
 '1b6490b4-4544-4e8d-8f95-47f096c35c19',
 'f0bbbee2-6f2d-43e7-ade8-5d49626e2572',
 'f15dcb06-1116-47e2-af4d-9473ba429e81',
 'f25171fa-5fe7-495f-b122-852417df176b',
 'f6404185-fb00-49d8-b6c4-d233651675e7',
 '6a71c124-91d9-4543-a163-375525620ef6',
 'df57ad20-85ea-4f2f-9b39-bee7739107fe',
 '78a3bb69-e7e9-4e27-88a6-5e90c3d9bcfc',
 '0f0418ac-a77a-49b9-9e3e-9802748c81f7',
 '2d7090e7-bc4d-4cd2-9536-221f455d94c0',
 'ff78a2c6-a5e5-

In [10]:
vector_store.get(include=["embeddings", "metadatas"])

{'ids': ['5abce8b1-2f36-46cd-ba16-11582c247ad5',
  '9eb21ee7-2a28-41fe-9dac-433fcb860655',
  '43cf7a29-bf95-4545-99d5-583f7adf8f09',
  'b288a2f9-f212-4069-872b-d52c1d6ace77',
  '0d0a1532-7a32-4da2-8c53-683fe23b3795',
  'bbe32002-ec4e-46d4-9059-0e0cf06944d6',
  'a24b043d-f227-4f51-b87b-da9e25b153f7',
  '8ef765e0-f438-4129-8cda-597875a5a3a1',
  '824c262e-a563-40ab-bb4e-1de8c9f8bc25',
  'a3fe15ee-5425-4182-976f-2fd226250efe',
  '1584739c-e51f-4c98-9d12-e0691be1535e',
  'afc027ae-0554-4c21-8854-8d021ea7bd9e',
  '49ad8d5f-3bff-4fc2-adb4-8b52219f28f4',
  'e8e93720-5dab-4a4f-b972-1603b717a1c6',
  '1b6490b4-4544-4e8d-8f95-47f096c35c19',
  'f0bbbee2-6f2d-43e7-ade8-5d49626e2572',
  'f15dcb06-1116-47e2-af4d-9473ba429e81',
  'f25171fa-5fe7-495f-b122-852417df176b',
  'f6404185-fb00-49d8-b6c4-d233651675e7',
  '6a71c124-91d9-4543-a163-375525620ef6',
  'df57ad20-85ea-4f2f-9b39-bee7739107fe',
  '78a3bb69-e7e9-4e27-88a6-5e90c3d9bcfc',
  '0f0418ac-a77a-49b9-9e3e-9802748c81f7',
  '2d7090e7-bc4d-4cd2-9536-

#### retrieval

In [11]:
retriever = vector_store.as_retriever(search_type = "mmr", search_kwargs = {"lambda_mult": 0, "k": 10})

In [12]:
# format the retrieved documents into text
def format_docs(retriever_docs):
    return "".join(doc.page_content for doc in retriever_docs)

### Building RAG pipeline

In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate, ChatMessagePromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [14]:
# llm model
chat_model = ChatGoogleGenerativeAI(model= "gemini-2.5-flash-lite")
query_model = ChatGoogleGenerativeAI(model= "gemini-2.5-flash-lite", temperature= 0)

In [15]:
# parser
parser = StrOutputParser()

#### prompt engineering


In [33]:
RAG_SYSTEM_TEMPLATE = """
    You are an expert AI assistant specialized in providing comprehensive and accurate answers based on the context provided in the 'CONTEXT' section below and in format of the manuals.

    Instructions:
    1. Grounding: Your response must be based on the provided context. Only use external
       knowledge or pre-trained data if needed for the answer.

    CONTEXT:
    --------------------
    {context}
    --------------------

    QUESTION:
    {question}
    """
system_prompt = PromptTemplate(template= RAG_SYSTEM_TEMPLATE)

In [34]:
query_prompt = PromptTemplate(template="""
You are an expert Query Optimizer for a Retrieval Augmented Generation (RAG) system.
Your sole purpose is to rewrite an initial, potentially vague student question into a single search query that will maximize the relevance of retrieved documents.

Follow these rules:
1. Output: Your final output MUST be only the refined search query, with no explanations,
   introductory phrases, or punctuation other than what belongs in the query itself.

class --> {student_class}, subject --> {subject}, question --> {query}
"""
)

#### structured output

In [35]:
from pydantic import BaseModel, Field
from typing import Literal

class Query(BaseModel):
    student_class: Literal["9th", "10th", "11th", "12th"] = Field(description= "This is the class in which the student is studying")
    subject: Literal["Science", "Physics", "Mathematics", "Chemistry"] = Field(description= "This provides the subject the student is reffering to note that science is only present for 9th and 10th graders, physics and chemistry for 11th and 12th graders.")
    
structured_model = query_model.with_structured_output(Query)

def query_modifier(query: str):
    
    structured_query = structured_model.invoke(query)
    return {"student_class": structured_query.student_class, "subject": structured_query.subject, "query": query}    

#### building chains

In [36]:
# query chain
query_chain = RunnableLambda(query_modifier) | query_prompt | chat_model | parser

# parallel chain
parallel_chain = RunnableParallel({"context": query_chain | retriever | RunnableLambda(format_docs), "question": RunnablePassthrough()})

# final chain to execute
main_chain = parallel_chain | system_prompt | chat_model | parser

print(main_chain.get_graph().draw_ascii())

                +---------------------------------+           
                | Parallel<context,question>Input |           
                +---------------------------------+           
                      ***                 ***                 
                  ****                       ***              
                **                              ****          
+------------------------+                          **        
| ChatGoogleGenerativeAI |                           *        
+------------------------+                           *        
             *                                       *        
             *                                       *        
             *                                       *        
  +---------------------+                            *        
  | PydanticToolsParser |                            *        
  +---------------------+                            *        
             *                                       * 

In [41]:
query = "teach me about angle"
result = main_chain.invoke(query)
print(result)

The provided context describes several activities and experiments related to angles, primarily within a mathematics and science curriculum. Here's a breakdown of what it teaches about angles:

**1. Relationship Between Sides and Angles in a Triangle:**

*   **Concept:** The angle opposite a longer side in a triangle is greater than the angle opposite a shorter side.
*   **Demonstration:** A cut-out angle is compared with two other angles. If ∠A is opposite the longest side, it will be greater than ∠B and ∠C, which are opposite shorter sides.
*   **Observation:** Students measure the lengths of the sides of a triangle and the measures of the angles opposite them to verify this relationship.
*   **Application:** This result can be used in solving geometry problems.

**2. Vertically Opposite Angles:**

*   **Objective:** To experimentally verify that when two lines intersect, the vertically opposite angles are equal.
*   **Method:** A protractor and two transparent strips with intersectin