In [44]:
from langchain_core.documents import Document

In [45]:
doc = Document(
    page_content = "Indiana University Bloomington was founded in the year 1818. It is a public research based university.",
    metadata = {
        "source": "Indiana.txt",
        "pages": 5,
        "timestamp": "2025-09-12",
    }
)

In [46]:
doc

Document(metadata={'source': 'Indiana.txt', 'pages': 5, 'timestamp': '2025-09-12'}, page_content='Indiana University Bloomington was founded in the year 1818. It is a public research based university.')

In [47]:
import os

os.makedirs("../data/text_files", exist_ok=True)

In [48]:
sample_texts={
    "../data/text_files/python_intro.txt": """Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems"""
}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)

print("sample text files created!")

sample text files created!


In [49]:
from langchain.document_loaders import TextLoader

loader=TextLoader("/Users/User/Documents/RAG AI agent/data/text_files/machine_learning.txt", encoding="utf-8")

document=loader.load()

print(document)

[Document(metadata={'source': '/Users/User/Documents/RAG AI agent/data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems')]


In [50]:
from langchain_community.document_loaders import DirectoryLoader
import tqdm

dir_loader=DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt", # Pattern to match files
    loader_cls=TextLoader, # loader class to use
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)

documents=dir_loader.load()
print(documents)

100%|██████████| 2/2 [00:00<00:00, 1236.16it/s]

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.'), Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised




In [51]:
from langchain_community.document_loaders import PyMuPDFLoader

dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    # loader_kwargs={"encoding":"utf-8"},
    show_progress=True
)

pdf_documents=dir_loader.load()

print(pdf_documents)

100%|██████████| 12/12 [00:00<00:00, 42.75it/s]






In [52]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
# Text splitting get into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller size for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        separators = ["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents in {len(split_docs)} chunks")

    if split_docs:
        print(f"\nExample Chunk: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [53]:
chunks=split_documents(pdf_documents)
chunks

Split 85 documents in 248 chunks

Example Chunk: 
Content: MoF Policy 007: Officer Recusal Rules 
Effective: 2024-07-01​
 Owner: Ministry of Flowgramming (MoF)​
 Category: Ordinance IV — Conflicts of Interest & Human-in-the-Loop 
1. Mandatory Recusal Categori...
Metadata: {'producer': 'Skia/PDF m141 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/MoF-policy-07.pdf', 'file_path': '../data/pdf/MoF-policy-07.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': 'MoF-policy-07', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}


[Document(metadata={'producer': 'Skia/PDF m141 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/MoF-policy-07.pdf', 'file_path': '../data/pdf/MoF-policy-07.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': 'MoF-policy-07', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="MoF Policy 007: Officer Recusal Rules \nEffective: 2024-07-01\u200b\n Owner: Ministry of Flowgramming (MoF)\u200b\n Category: Ordinance IV — Conflicts of Interest & Human-in-the-Loop \n1. Mandatory Recusal Categories \n1.1 Family Relationship Requirements \nOfficers must recuse themselves from any application review involving first-degree relatives \nincluding spouses, parents, children, or siblings. First-degree relationships are determined \nby blood, marriage, or legal adoption regardless of current relationship status. Recusal \napplies to the reviewing officer's relationships as well as relati

# Embedding and VectoreStoreDB

In [54]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity



In [55]:
class EmbeddingManager:
    """Handles documents embedding generation with the help of SentenceTransformer"""

    def __init__(self, model_name: str="all-MiniLM-L6-v2"):
        """
        Initiliaze the Embedding Manager
        
        Args:
            model_name: HuggingFace model name for embeddings
        """
        self.model_name= model_name
        self.model=None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""

        try:
            print(f"Load embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts:List[str]) -> np.ndarray:
        """
        Generate embedings for a list of texts

        Args:
            List of text strings to embed

        Returns:
            numpy array of embeddings with shape len(texts), embedding_dim)
        """

        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generate embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print("Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    

embedding_manager = EmbeddingManager()
embedding_manager

Load embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension 384


<__main__.EmbeddingManager at 0x301b21a90>

# VectoreStore

In [56]:
class VectorStore:
    """ Manages document embeddings in a ChromaDB vector store"""
    def __init__(self, collection_name: str="pdf_documents", persist_directory: str = "../data/vectore_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.initialize_store()

    def initialize_store(self):
        """Initialize ChromaDB client and collection"""

        try:
            # Create chromaDB client and collection
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory) # creates client that has a reference to the chromadb VectorStore using chromadb.PersistentClient fn

            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"} # metadata contains what is the collection information
            )
            print(f"Collection store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    # Now that collection is created we need to insert documents into the collection
    # Fn to add list of documents to the collection
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vectorstore

        Args:
            documents: List of Langchain documents
            embeddings: Corresponding embeddings to the documents
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings")
        
        print(f"Add {len(documents)} documents to the vectorstore...")

        # Prepare data for ChromaDB(vectorstore)
        ids = []
        metadatas = []
        document_texts = []
        embeddings_list = []

        for i, (doc, embeddings) in enumerate(zip(documents, embeddings)): # documents zipped and created tuples with embeddings
            # Generate unique id
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # document content
            document_texts.append(doc.page_content)

            # embeddings 
            embeddings_list.append(embeddings.tolist())

            try:
                self.collection.add(
                    ids=ids,
                    embeddings=embeddings_list,
                    metadatas=metadatas,
                    documents=document_texts
                )
                print(f"Successfully added {len(documents)} documents to vectorestore")
                print(f"Total documents in collection: {self.collection.count()}")

            except Exception as e:
                print(f"Error loading documents in the vector store: {e}")
                raise 

vectorstore=VectorStore()
vectorstore
        

Collection store initialized. Collection: pdf_documents
Existing documents in collection: 248


<__main__.VectorStore at 0x175ad3380>

In [57]:
chunks

[Document(metadata={'producer': 'Skia/PDF m141 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/MoF-policy-07.pdf', 'file_path': '../data/pdf/MoF-policy-07.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': 'MoF-policy-07', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0}, page_content="MoF Policy 007: Officer Recusal Rules \nEffective: 2024-07-01\u200b\n Owner: Ministry of Flowgramming (MoF)\u200b\n Category: Ordinance IV — Conflicts of Interest & Human-in-the-Loop \n1. Mandatory Recusal Categories \n1.1 Family Relationship Requirements \nOfficers must recuse themselves from any application review involving first-degree relatives \nincluding spouses, parents, children, or siblings. First-degree relationships are determined \nby blood, marriage, or legal adoption regardless of current relationship status. Recusal \napplies to the reviewing officer's relationships as well as relati

In [58]:
### Convert the texts to embeddings

texts=[doc.page_content for doc in chunks]
texts

["MoF Policy 007: Officer Recusal Rules \nEffective: 2024-07-01\u200b\n Owner: Ministry of Flowgramming (MoF)\u200b\n Category: Ordinance IV — Conflicts of Interest & Human-in-the-Loop \n1. Mandatory Recusal Categories \n1.1 Family Relationship Requirements \nOfficers must recuse themselves from any application review involving first-degree relatives \nincluding spouses, parents, children, or siblings. First-degree relationships are determined \nby blood, marriage, or legal adoption regardless of current relationship status. Recusal \napplies to the reviewing officer's relationships as well as relationships of their immediate \nhousehold members. Failure to identify and disclose family relationships constitutes a \nserious ethics violation. Noise: Officers sometimes provide detailed family genealogy charts \nor discuss complex relationship dynamics, which may be personally interesting but do not \nalter the bright-line first-degree relationship standard. \n1.2 Employment History Confli

In [59]:
embeddings=embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(chunks, embeddings)

Generate embeddings for 248 texts...


Batches: 100%|██████████| 8/8 [00:00<00:00, 10.79it/s]


Generated embeddings with shape: {embeddings.shape}
Add 248 documents to the vectorstore...
Successfully added 248 documents to vectorestore
Total documents in collection: 249
Successfully added 248 documents to vectorestore
Total documents in collection: 250
Successfully added 248 documents to vectorestore
Total documents in collection: 251
Successfully added 248 documents to vectorestore
Total documents in collection: 252
Successfully added 248 documents to vectorestore
Total documents in collection: 253
Successfully added 248 documents to vectorestore
Total documents in collection: 254
Successfully added 248 documents to vectorestore
Total documents in collection: 255
Successfully added 248 documents to vectorestore
Total documents in collection: 256
Successfully added 248 documents to vectorestore
Total documents in collection: 257
Successfully added 248 documents to vectorestore
Total documents in collection: 258
Successfully added 248 documents to vectorestore
Total documents in 

# Retriever pipeline from VectoreStore


In [None]:
class RAGretriever:
    """Handles query-based retrieval from the vectorestore"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vector_store: Contains embeddings of the documents
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return 
            score_threshold: Minimum similarity score threshold

        Returns:
            List of dictionaries containing retrieved documents and metadata
        """

        print(f"Retrieving following documents for the query: {query}")
        print(f"Top k: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding

        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store  
        try:
            results = self.vector_store.collection.query(    # We pass the query embedding(in the form of a list) to the vector_store and the top_k results
                query_embeddings=[query_embedding.tolist()],
                n_results = top_k
            )
        
            # Processes docs
            retrieved_docs = []

            # Once we get the results there will be an internal key called documents
            # Then we can get the documents info, metadata, distances etc.
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i+1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("Documents not found")

            return retrieved_docs
    
        except Exception as e:
            print(f"Error during retrieval {e}")
            return []

rag_retriever = RAGretriever(vectorstore, embedding_manager)



In [61]:
rag_retriever

<__main__.RAGretriever at 0x301b21550>

In [67]:
rag_retriever.retrieve("""MINISTRY OF FLOWGRAMMING
EMERGENCY PROCESSING DIRECTIVE 011
Expedited Approval Protocols for Critical Workflows
CLASSIFICATION: Administrative Urgent
EFFECTIVE: July 1, 2025
SUPERSEDES: All previous emergency protocols
DISTRIBUTION: All reviewing officers, supervisory staff""")

Retrieving following documents for the query: MINISTRY OF FLOWGRAMMING
EMERGENCY PROCESSING DIRECTIVE 011
Expedited Approval Protocols for Critical Workflows
CLASSIFICATION: Administrative Urgent
EFFECTIVE: July 1, 2025
SUPERSEDES: All previous emergency protocols
DISTRIBUTION: All reviewing officers, supervisory staff
Top k: 5, Score threshold: 0.0
Generate embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.89it/s]

Generated embeddings with shape: {embeddings.shape}
Retrieved 5 documents (after filtering)





[{'id': 'doc_c63ab0e5_105',
  'content': 'MINISTRY OF FLOWGRAMMING \nEMERGENCY PROCESSING DIRECTIVE 011 \nExpedited Approval Protocols for Critical Workflows \nCLASSIFICATION: Administrative Urgent\u200b\n EFFECTIVE: July 1, 2025\u200b\n SUPERSEDES: All previous emergency protocols\u200b\n DISTRIBUTION: All reviewing officers, supervisory staff \n \nSECTION I: EMERGENCY CRITERIA MATRIX \nEmergency processing shall be granted ONLY when applications meet one or more of the \nfollowing critical thresholds: \nTIER 1 - IMMEDIATE THREATS \n●\u200b Active security breach mitigation \n●\u200b Critical infrastructure failure response \n●\u200b Public safety emergency workflows \n●\u200b Regulatory compliance under legal deadline \n●\u200b System failure requiring immediate restoration \nTIER 2 - TIME-CRITICAL OPERATIONS \n●\u200b Vendor system outage affecting business continuity \n●\u200b Financial reporting deadline compliance \n●\u200b Third-party integration failure resolution \n●\u200b Dat

# Integration VectorDB context pipeline with LLM output

In [None]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

llm=ChatGroq(groq_api_key=groq_api_key, model_name="gemma-9b-it", temperature=0.1, max_tokens=1024)

# Simple RAG function: retrieve context + answer generation
def simple_rag(query, retriever, llm, top_k=3):
    # retrieve the context by passing the query to the embeddingmanager (generating vectors) passing the vector to the ChromaDB 
    # and asking it to return top_k contexts wrt to our query embedding
    results = retriever.retrieve(query, top_k=top_k)
    

_IncompleteInputError: incomplete input (3026923275.py, line 12)