##### <code><b><i>RAG Pipeline - Data Ingetion to VectorDB Pipeline </i></b></code>

In [1]:
import os 
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load all PDF files from the data directory
def load_pdfs_from_directory(directory_path):
  pdf_files = Path(directory_path).glob("*.pdf")
  all_documents = []
  
  for pdf_file in pdf_files:
    print(f"Loading document: {pdf_file.name}")
    try:
        loader = PyPDFLoader(str(pdf_file))
        documents = loader.load()
        
        for doc in documents:
          doc.metadata["source"] = pdf_file.name
          doc.metadata["file_type"] = 'pdf'
          
        all_documents.extend(documents)
        print(f"-> Loaded {len(documents)} pages from {pdf_file.name}")
    except Exception:
        print(f"Failed to load {pdf_file.name}")
  
  print(f"Total documents loaded: {len(all_documents)}")
  return all_documents

all_documents = load_pdfs_from_directory("../data/pdf")

Loading document: Aptitude Book.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 88 0 (offset 0)
Ignoring wrong pointing object 90 0 (offset 0)
Ignoring wrong pointing object 92 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 128 0 (offset 0)
Ignoring wrong pointing object 139 0 (offset 0)
Ignoring wrong pointing object 158 0 (offset 0)
Ignoring wrong pointing object 195 0 (offset 0)
Ignoring wrong pointing object 309 0 (offset 0)
Ignoring wrong pointing object 314 0 (offset 0)
Ignoring wrong pointing object 560 0 (offset 0)
Ignoring wrong pointing object 767 0 (offset 0)


-> Loaded 212 pages from Aptitude Book.pdf
Loading document: Resume_DataScience.pdf
-> Loaded 1 pages from Resume_DataScience.pdf
Total documents loaded: 213


In [3]:
# Text Splitting
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
      separators=["\n\n", "\n", " ", ""]
  )
  
  split_docs = text_splitter.split_documents(documents)
  print(f"Total documents after splitting: {len(split_docs)}")  
  return split_docs

chunks = split_documents(all_documents)
chunks[:2]  # Display first two split documents

Total documents after splitting: 832


[Document(metadata={'producer': 'macOS Version 13.2.1 (Build 22D68) Quartz PDFContext', 'creator': 'Word', 'creationdate': "D:20230415170124Z00'00'", 'title': 'Microsoft Word - Workbook.docx', 'moddate': "D:20230415170124Z00'00'", 'source': 'Aptitude Book.pdf', 'total_pages': 212, 'page': 0, 'page_label': '1', 'file_type': 'pdf'}, page_content='1 \n             THE APTITUDE TRIAD Mastering Quantitative, Logical, and Verbal Skills'),
 Document(metadata={'producer': 'macOS Version 13.2.1 (Build 22D68) Quartz PDFContext', 'creator': 'Word', 'creationdate': "D:20230415170124Z00'00'", 'title': 'Microsoft Word - Workbook.docx', 'moddate': "D:20230415170124Z00'00'", 'source': 'Aptitude Book.pdf', 'total_pages': 212, 'page': 1, 'page_label': '2', 'file_type': 'pdf'}, page_content='2 \nTABLE OF CONTENTS  SECTION A – QUANTITATIVE APTITUDE Module Topic    Page No. Module 1 Number System 5 Module 2 HCF, LCM and Decimal Fractions 11 Module 3 Simplification 15 Module 4 Percentages 18 Module 5 Profit

##### <code><b><i>Embedding and VectorDB</i></b></code>

In [4]:
import numpy as np
import uuid
import chromadb 
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
class EmbeddingManager:
  def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
      self.model_name = model_name
      self.model = None
      self._load_model()
      
  def _load_model(self):
    try:
      print(f"Loading embedding model: {self.model_name}...")
      self.model = SentenceTransformer(self.model_name)
      print("Model loaded successfully.",self.model.get_sentence_embedding_dimension())
    except Exception as e:
      print(f"Error loading model: {self.model_name}. Exception: {e}")
      raise
    
  def generate_embeddings(self, texts: List[str]) -> np.ndarray:
    if not self.model:
      raise ValueError("Embedding model is not loaded.")
    
    print(f"Generating embeddings for {len(texts)} texts...")
    embeddings = self.model.encode(texts, convert_to_numpy=True) 
    print("Gentrated embeddings for shape:",embeddings.shape)
    return embeddings
  
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2...
Model loaded successfully. 384


<__main__.EmbeddingManager at 0x228c005ede0>

##### <code><b><i>VectorStore</i></b></code>

In [6]:
class VectorStore:
  def __init__(self, collection_name: str = "pdf_documents",presistence_dir: str = "../data/vector_db"):
    self.collection_name = collection_name
    self.persistance_dir = presistence_dir
    self.client = None
    self.collection = None
    self._initialize_vector_store()
    
  def _initialize_vector_store(self):
    try:
      os.makedirs(self.persistance_dir, exist_ok=True)
      self.client = chromadb.PersistentClient(path=self.persistance_dir)
      
      self.collection = self.client.get_or_create_collection(
        name=self.collection_name,
        metadata={"discription": "Collection of PDF document embeddings"}
        )
      
      print(f"Vector store initialized at {self.persistance_dir} with collection '{self.collection_name}'.")
      print(f"Current number of vectors in collection: {self.collection.count()}")
    except Exception as e:
      print(f"Error initializing vector store: {e}")
      raise    
    
  def add_documents(self, documents: List[Any], embeddings: np.ndarray):
    if len(documents) != len(embeddings):
      raise ValueError("Number of documents and embeddings must match.")
    print(f"Adding {len(documents)} documents to vector store...")
    
    #Prepare data for ChromaDB
    ids = []
    metadatas = []
    document_text = []
    embedding_list = []
    
    for i,(doc,embadding) in enumerate(zip(documents,embeddings)):
      # Generate a unique ID for each document
      doc_id = f'doc_{ uuid.uuid4().hex[:8]}_{i}'
      ids.append(doc_id)
      
      #Prepare metadata
      metadata = dict(doc.metadata)
      metadata["doc_index"] = i
      metadata["conttent_length"] = len(doc.page_content)
      metadatas.append(metadata)
      
      #Document content
      document_text.append(doc.page_content)
      
      # Embedding
      embedding_list.append(embadding.tolist())
      
    # Add to Collection
    try:
      self.collection.add(
        ids=ids,
        metadatas=metadatas,
        documents=document_text,
        embeddings=embedding_list
      )
      print(f"Successfully added {len(documents)} documents to vector store.")
      print(f"Total documents in collection now: {self.collection.count()}")
    except Exception as e:
      print(f"Error adding documents to vector store: {e}")
      raise
    
vectorStore = VectorStore()
vectorStore

Vector store initialized at ../data/vector_db with collection 'pdf_documents'.
Current number of vectors in collection: 0


<__main__.VectorStore at 0x228bff280e0>

In [7]:
#Convert the text to embeddings
texts = [doc.page_content for doc in chunks]

#Generate embeddings
embeddings = embedding_manager.generate_embeddings(texts)

#Store in vector DB
vectorStore.add_documents(chunks, embeddings)

Generating embeddings for 832 texts...
Gentrated embeddings for shape: (832, 384)
Adding 832 documents to vector store...
Successfully added 832 documents to vector store.
Total documents in collection now: 832


##### <code><b><i>Retriever Pipeline from Vectorstore</i></b></code>

In [18]:
class RAGRetriever:
  def __init__(self,vector_store: VectorStore, embedding_manager: EmbeddingManager):
    self.vector_store = vector_store
    self.embedding_manager = embedding_manager
    
  def retrieve(self, query: str, top_k: int = 5,score_threshold: float = 0.0) -> List[Dict[str, Any]]:
    print(f"Retrieving top {top_k} documents for query: '{query}' and score threshold: {score_threshold}")
    #Generate embedding for the query
    query_embedding = self.embedding_manager.generate_embeddings([query])[0]
    
    #Search in vector store
    try:
      results = self.vector_store.collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k
      )
      
      retrieved_docs = []
      
      if results["documents"] and results["documents"][0]:
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]
        ids = results["ids"][0]
        
        for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
          
          score = 1 - distance  # Convert distance to similarity score
          if score >= score_threshold:
            retrieved_docs.append({
              "id": doc_id,
              "content": document ,
              "metadata": metadata,
              "similarity_score": score,
              "rank": i + 1
            })
        print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")
      else:
        print("No documents retrieved from vector store.")
      return retrieved_docs 
    
    except Exception as e:
      print(f"Error during retrieval: {e}")
      return []
  
rag_retriever = RAGRetriever(vectorStore, embedding_manager)

In [22]:
retrieved_docs = rag_retriever.retrieve("What is odd number?", top_k=5, score_threshold=0.0)

Retrieving top 5 documents for query: 'What is odd number?' and score threshold: 0.0
Generating embeddings for 1 texts...
Gentrated embeddings for shape: (1, 384)
Retrieved 4 documents after applying score threshold.
