Data Ingestion

Document Data structure

In [83]:
from langchain_core.documents import Document

doc = Document(
    page_content="demo Contect used to explain or take a hands on with the document data structure",
    metadata={
        "source":"example.txt",
        "pages":1,
        "author":"Mr. White",
        "date_created":"22-10-2025"}
    # }helps for filtering 
)
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Mr. White', 'date_created': '22-10-2025'}, page_content='demo Contect used to explain or take a hands on with the document data structure')

In [84]:
import os
os.makedirs("../data/text_files", exist_ok=True)

In [3]:
sample_texts={
    "../data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "../data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)


In [4]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")

doc = loader.load()
print(doc)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


In [85]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':"utf-8"}
)

doc=dir_loader.load()
print(doc)

[Document(metadata={'source': '..\\data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': '..\\data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popul

In [86]:
from langchain_community.document_loaders import PyMuPDFLoader

pdf_doc_loader = DirectoryLoader(
    "../data/pdfs",
    glob='**/*.pdf',
    loader_cls=PyMuPDFLoader
)

doc_pdf = pdf_doc_loader.load()
print(doc_pdf)

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdfs\\NIPS-2017-attention-is-all-you-need-Paper.pdf', 'file_path': '..\\data\\pdfs\\NIPS-2017-attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu

Embeddind and Vector Store DB

In [87]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [88]:
class EmbeddingManager:
    def __init__(self, model_name : str='all-MiniLM-L6-v2'):
        self.model_name=model_name
        self.model=None
        self._load_model()
    
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(self.model.get_sentence_embedding_dimension())
        except Exception as e:
            print(e)
            raise
        
    def generate_embeddings(self, texts:List[str])->np.ndarray:
        if not self.model:
            raise ValueError("Model not found")
        
        print("generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(embeddings.shape)
        return embeddings
    
    
embedding_manager = EmbeddingManager()
embedding_manager

384


<__main__.EmbeddingManager at 0x1db9053bce0>

Vecotr DB- Store

In [89]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore
    

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 86


<__main__.VectorStore at 0x1db90539d60>

In [90]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
import os

def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data/pdfs")

Found 3 PDF files to process

Processing: NIPS-2017-attention-is-all-you-need-Paper.pdf
  ✓ Loaded 11 pages

Processing: sample.pdf
  ✓ Loaded 1 pages

Processing: somatosensory.pdf
  ✓ Loaded 4 pages

Total documents loaded: 16


In [91]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

chunks=split_documents(all_pdf_documents)
chunks

Split 16 documents into 56 chunks

Example chunk:
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz...
Metadata: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [92]:
texts = [doc.page_content for doc in chunks]

embeddings = embedding_manager.generate_embeddings(texts)

vectorstore.add_documents(chunks, embeddings)

generating embeddings for {len(texts)} texts


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.45s/it]

(56, 384)
Adding 56 documents to vector store...
Successfully added 56 documents to vector store
Total documents in collection: 142





Rag Retriever

In [93]:
class RAGRetriever():
    def __init__(self, vectorstore:VectorStore, embedding_manager:EmbeddingManager):
        self.vector_store=vectorstore
        self.embedding_manager=embedding_manager
        
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [97]:
rag_retriever.retrieve("What is somatosensory ?")

Retrieving documents for query: 'What is somatosensory ?'
Top K: 5, Score threshold: 0.0
generating embeddings for {len(texts)} texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.77it/s]

(1, 384)
Retrieved 1 documents (after filtering)





[{'id': 'doc_1896671e_47',
  'content': 'This is a sample document to\nshowcase page-based formatting. It\ncontains a chapter from a Wikibook\ncalled Sensory Systems. None of the\ncontent has been changed in this\narticle, but some content has been\nremoved.\nAnatomy of the Somatosensory System\nFROM WIKIBOOKS1\nOur somatosensory system consists of sensors in the skin\nand sensors in our muscles, tendons, and joints. The re-\nceptors in the skin, the so called cutaneous rec eptors, tell\nus about temperature (thermoreceptors), pressure and sur-\nface te xture ( mechano rec eptors), and pain ( nociceptors).\nThe receptors in muscles and joints pro vide information\nabout muscle length, muscle tension, and joint angles.\nCutaneous receptors\nSensory information from Meissner corpuscles and rapidly\nadapting afferents leads to adjustment of grip f orce when\nobjects are lif ted. These aff erents respond with a brief\nburst of action potentials when objects move a small dis-\ntance during 

Integrate VectorDb context pipeling with LLM output

In [99]:
from dotenv import load_dotenv
load_dotenv()

from langchain_groq import ChatGroq

groq_api_key=os.getenv('GROQ_API_KEY')
llm = ChatGroq(groq_api_key=groq_api_key, model='llama-3.3-70b-versatile', temperature=0.1, max_tokens=1024)

In [112]:
def rag_simple(query, rag_retriever, llm, top_k=3):
    result=rag_retriever.retrieve(query, top_k=top_k)
    context="\n\n".join([doc['content'] for doc in result]) if result else ""
    
    if not context:
        return "No relevant contect found to answer the question. "
    
    prompt = f"""Use the following context to anwr the given question, Now if the context is No relevant contect found to answer the question. then mention the same and also try finding the result from your memory yourself instead of context.
    context:{context},
    Question:{query},
    Answer:
    """
    
    res = llm.invoke([prompt.format(context, query)])
    return res.content

In [117]:
res = rag_simple("What is somatosensory? ", rag_retriever, llm)
print(res)

Retrieving documents for query: 'What is somatosensory? '
Top K: 3, Score threshold: 0.0
generating embeddings for {len(texts)} texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 63.82it/s]

(1, 384)
Retrieved 1 documents (after filtering)





The somatosensory system refers to the network of sensors and receptors in the human body that are responsible for detecting and processing sensory information related to touch, temperature, pain, and movement. This system includes receptors in the skin (cutaneous receptors) that detect temperature, pressure, surface texture, and pain, as well as receptors in muscles, tendons, and joints that provide information about muscle length, muscle tension, and joint angles. 

In simpler terms, the somatosensory system is what allows us to feel and perceive the world around us through our sense of touch and movement. It plays a crucial role in our ability to interact with and navigate our environment.


In [122]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Whats is somatosensory ?", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Whats is somatosensory ?'
Top K: 3, Score threshold: 0.1
generating embeddings for {len(texts)} texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 66.16it/s]

(1, 384)
Retrieved 1 documents (after filtering)





Answer: The somatosensory system refers to the sensors in the skin and muscles, tendons, and joints that provide information about temperature, pressure, texture, pain, muscle length, tension, and joint angles.
Sources: [{'source': 'somatosensory.pdf', 'page': 0, 'score': 0.10452806949615479, 'preview': 'This is a sample document to\nshowcase page-based formatting. It\ncontains a chapter from a Wikibook\ncalled Sensory Systems. None of the\ncontent has been changed in this\narticle, but some content has been\nremoved.\nAnatomy of the Somatosensory System\nFROM WIKIBOOKS1\nOur somatosensory system consists of...'}]
Confidence: 0.10452806949615479
Context Preview: This is a sample document to
showcase page-based formatting. It
contains a chapter from a Wikibook
called Sensory Systems. None of the
content has been changed in this
article, but some content has been
removed.
Anatomy of the Somatosensory System
FROM WIKIBOOKS1
Our somatosensory system consists of
