### Data ingestion to vector db
#### Data parsing, chuncking, embedding, and vector storage

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [2]:
### Read all the pdfs inside directory

def process_all_pdfs(pdf_directory):
    """Process all the pdf files in a directory"""
    all_documents = []
    pdf_directory = Path(pdf_directory)

    pdf_files = list(pdf_directory.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing {pdf_file.name}...")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            ## add source info to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)

            print(f"Loaded {len(documents)} documents from {pdf_file}")

        except Exception as e:
            print(f"error: {e}")
    
    print(f"\ntotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data/pdf")

Found 4 PDF files to process

Processing xgboost.pdf...
Loaded 13 documents from ../data/pdf/xgboost.pdf

Processing embeddings.pdf...
Loaded 16 documents from ../data/pdf/embeddings.pdf

Processing attention.pdf...
Loaded 11 documents from ../data/pdf/attention.pdf

Processing object_detection.pdf...
Loaded 21 documents from ../data/pdf/object_detection.pdf

total documents loaded: 61


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2016-06-14T01:29:40+00:00', 'author': '', 'keywords': '', 'moddate': '2016-06-14T01:29:40+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/xgboost.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'xgboost.pdf', 'file_type': 'pdf'}, page_content='XGBoost: A Scalable Tree Boosting System\nTianqi Chen\nUniversity of Washington\ntqchen@cs.washington.edu\nCarlos Guestrin\nUniversity of Washington\nguestrin@cs.washington.edu\nABSTRACT\nTree boosting is a highly eﬀective and widely used machine\nlearning method. In this paper, we describe a scalable end-\nto-end tree boosting system called XGBoost, which is used\nwidely by data scientists to achieve state-of-the-art results\non many machine learning challenges. We propose a novel

In [4]:
### split texts and get into chuncks

def split_documents(documents, chunk_size=1000, overlap_size=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = overlap_size,
        length_function = len,
        separators= ["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(all_pdf_documents)
    print(f"Split {len(all_pdf_documents)} documents into {len(split_docs)} chunks")

    ## Example of chunck
    if split_docs:
        print("\nExample chunck: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs


In [5]:
chunks = split_documents(all_pdf_documents)

Split 61 documents into 401 chunks

Example chunck: 
Content: XGBoost: A Scalable Tree Boosting System
Tianqi Chen
University of Washington
tqchen@cs.washington.edu
Carlos Guestrin
University of Washington
guestrin@cs.washington.edu
ABSTRACT
Tree boosting is a h...
Metadata: {'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2016-06-14T01:29:40+00:00', 'author': '', 'keywords': '', 'moddate': '2016-06-14T01:29:40+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/xgboost.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'xgboost.pdf', 'file_type': 'pdf'}


### Embedding and vectorstore

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingManager:
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """
            Initialize the embedding manager with a pre-trained model
            Model Used: Huggingface model name for sentence transformer
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """ Load the sentence transformer model """
        try:
            print(f"Loading model: {self.model_name}...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully whose embedding dimension is {self.model.get_sentence_embedding_dimension()}")

        except Exception as e:
            print(f"Error loading model {self.model_name} : {e}")
            raise
    
    def generate_embedding(self, texts:List[str]) -> np.ndarray:
        """ Generate embeddings for a list of texts using the sentence transformer model """

        if not self.model:
            raise ValueError("Model not loaded. Please call _load_model() first.")
        
        print(f"Generating embeddings for {len(texts)} texts...")

        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Embeddings generated successfully with shape {embeddings.shape}")

        return embeddings            

In [8]:
### initialise Embedding

embedding_manager = EmbeddingManager()

Loading model: sentence-transformers/all-MiniLM-L6-v2...
Model loaded successfully whose embedding dimension is 384


### VectorStore

In [9]:
class VectorStore:
    """ Manages document embeddings in vector database"""
    def __init__(self, collection_name:str = "pdf-documents", persistent_directory = "../data/vector_store"):
        """ Initialize the vector store with a collection name and persistent directory """
        self.collection_name = collection_name
        self.persistent_directory = persistent_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """ Initialize the chroma client """
        try:
            os.makedirs(self.persistent_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persistent_directory)
            self.collection = self.client.get_or_create_collection(name = self.collection_name,
                metadata = {"Description": "This is a collection of documents for RAG",
                "hnsw:space": "cosine"
                }
            )
            print(f"Vector store initialized successfully with collection {self.collection_name}")
            print(f"count of existing document in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents:List[Any], embeddings:np.ndarray):
        """ Add documents to the vector store 
        Arguments: 
            documents: List of langchain documents
            embeddings: numpy array of their embeddings
        """
        if len(documents)!=len(embeddings):
            raise ValueError("Number of documents should match with number of embeddings")
        
        print(f"Adding {len(documents)} documents to the vector store...")

        ## Prepare data to ingest in chroma db

        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            ## Generate unique id
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            ## Prepare metadata

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['contect_length'] = len(doc.page_content)
            metadatas.append(metadata)

            ## Document content
            documents_text.append(doc.page_content)

            ## Embedding

            embedding_list.append(embedding.tolist())

        ## Add to collection

        try:
            self.collection.add(
                ids = ids,
                documents = documents_text,
                metadatas = metadatas,
                embeddings = embedding_list
            )
            print(f"{len(documents)} documents added successfully to the vector store")
            print(f"total documents in the collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to the vector store: {e}")
            raise
        

In [10]:
vectorstore = VectorStore()

Vector store initialized successfully with collection pdf-documents
count of existing document in collection: 0


In [11]:
vectorstore

<__main__.VectorStore at 0x132eacc20>

In [13]:
print(chunks[:2])

[Document(metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2016-06-14T01:29:40+00:00', 'author': '', 'keywords': '', 'moddate': '2016-06-14T01:29:40+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf/xgboost.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'source_file': 'xgboost.pdf', 'file_type': 'pdf'}, page_content='XGBoost: A Scalable Tree Boosting System\nTianqi Chen\nUniversity of Washington\ntqchen@cs.washington.edu\nCarlos Guestrin\nUniversity of Washington\nguestrin@cs.washington.edu\nABSTRACT\nTree boosting is a highly eﬀective and widely used machine\nlearning method. In this paper, we describe a scalable end-\nto-end tree boosting system called XGBoost, which is used\nwidely by data scientists to achieve state-of-the-art results\non many machine learning challenges. We propose a novel

In [14]:
### Convert text to embeddings
texts = [doc.page_content for doc in chunks]

In [15]:
## Generate Embeddings
embeddings = embedding_manager.generate_embedding(texts)

## Add embeddings to vector store

vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 401 texts...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Embeddings generated successfully with shape (401, 384)
Adding 401 documents to the vector store...
401 documents added successfully to the vector store
total documents in the collection: 401


In [16]:
class RAGRetriever:
    """Handles query based retrival from vector store"""
    def __init__(self, vectorstore:VectorStore, embedding_manager: EmbeddingManager):

        self.vectorstore = vectorstore
        self.embedding_manager = embedding_manager

    def retrieve(self, query:str, top_k:int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        
        print(f"Retrieving top {top_k} documents for query: {query}")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")

        query_embedding = self.embedding_manager.generate_embedding([query])[0]

        ## Perform similarity search
        try:
            results = self.vectorstore.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results = top_k
            )
            # print(results)

            ## Process results

            retrived_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                scores = results['distances'][0]
                ids = results['ids'][0]

                # print(f"Documents: {documents}")
                # print(f"Metadatas: {metadatas}")
                # print(f"Scores: {scores}")
                # print(f"IDs: {ids}")

                for i, (doc, metadata, score, id) in enumerate(zip(documents, metadatas, scores, ids)):
                    similarity_score = 1 - score

                    if similarity_score >= score_threshold:
                        retrived_docs.append({
                            "id": id,
                            "content": doc,
                            "metadata": metadata,
                            "similarity_score": similarity_score,
                            "distance": score,
                            "rank": i+1
                        })
                
                print(f"Retrived {len(retrived_docs)} documents with score greater than {score_threshold}")
            else:
                print("No documents found in the vector store")
            
            return retrived_docs
                    
        except Exception as e:
            print(f"Error during Retrival: {e}")
            return []


In [17]:
rag_retriver = RAGRetriever(vectorstore, embedding_manager)

In [18]:
rag_retriver

<__main__.RAGRetriever at 0x1330716a0>

In [19]:
rag_retriver.retrieve("what is role of Deep learning in Pedestrian Detection")

Retrieving top 5 documents for query: what is role of Deep learning in Pedestrian Detection
Top K: 5, Score Threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings generated successfully with shape (1, 384)
Retrived 5 documents with score greater than 0.0


[{'id': 'doc_117aab48_200',
  'content': 'ditions. It should be noticed that the covered domains are\ndiversiﬁed. Pedestrian and face images have regular structures,\nwhile general objects and scene images have more complex\nvariations in geometric structures and layouts. Therefore,\ndifferent deep models are required by various images.\nThere has been a relevant pioneer effort [34] which mainly\nfocuses on relevant software tools to implement deep learning\ntechniques for image classiﬁcation and object detection, but\npays little attention on detailing speciﬁc algorithms. Different\nfrom it, our work not only reviews deep learning based object\ndetection models and algorithms covering different applica-\ntion domains in detail, but also provides their corresponding\nexperimental comparisons and meaningful analyses.\nThe rest of this paper is organized as follows. In Section\n2, a brief introduction on the history of deep learning and the\nbasic architecture of CNN is provided. Generic

#### Integration of context to LLM

In [20]:
### Simple RAG pipeline using GROQ API

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")

llm = ChatGroq(api_key=groq_api_key, model_name="llama-3.1-8b-instant", temperature=0, max_tokens = 1024)

def rag_simple(query:str, retriever, llm, top_k:int = 5):
    """ Simple RAG pipeline """
    # print(f"Query: {query}")
    # print(f"Top K: {top_k}")

    ## Retrieve
    results = retriever.retrieve(query, top_k=top_k)

    context = "\n\n".join([result['content'] for result in results]) if results else ""

    if not context:
        return "No relevant context found in the document"

    ## Generate response using Groq LLM
    prompt = f""" Use the following question to answer the question concisely.

        context: {context}

        question: {query}

        Answer:
        """

    response = llm.invoke(prompt)
    

    return response.content


In [21]:
answer = rag_simple("why is xgboost so popular", rag_retriver, llm)
print(answer)

Retrieving top 5 documents for query: why is xgboost so popular
Top K: 5, Score Threshold: 0.0
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings generated successfully with shape (1, 384)
Retrived 5 documents with score greater than 0.0
XGBoost is so popular due to its scalability in all scenarios, running more than ten times faster than existing popular solutions on a single machine and scaling to billions of examples in distributed or memory-limited settings. Its success can be attributed to several key innovations, including:

1. A novel tree learning algorithm for handling sparse data.
2. A theoretically justified weighted quantile sketch procedure for handling instance weights.
3. Parallel and distributed computing, making learning faster and enabling quicker model exploration.
4. The ability to exploit out-of-core data sets with given resources.

These innovations, combined with its ability to handle large datasets and provide state-of-the-art results on a wide range of problems, have made XGBoost the consensus choice of learner in many competitions and real-world applications.


In [22]:
### Enhanced RAG pipeline

def enhanced_rag(query:str, retriever, llm, top_k:int = 5, min_score:float = 0.5, summarize:bool = True):
    """ Enhanced RAG pipeline """

    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)

    print(results)

    if not results:
        answer = "No relevant context found in the document"
        sources = []
        context = ""
        summary = ""
    else:
        context = "\n\n".join([result['content'] for result in results])
        sources =[{
            "source": doc['metadata']['source_file'],
            "page": doc['metadata']['page_label'],
            "score": doc['similarity_score']

        } for doc in results]

        prompt = f""" Use the following question to answer the question concisely.

        context: {context}

        question: {query}

        Answer:
        """

        response = llm.invoke(prompt)
        answer = response.content

        ## add citation to the answer

        citations = [f"[{i+1}] {src['source']} - Page {src['page']}" for i, src in enumerate(sources)]

        answer_with_citation = f"{answer}\n\nSources:\n{'\n'.join(citations)}"

        if summarize:
            summarizer_prompt = f""" Summarize the following answer in 1 paragraph:

            answer: {answer}

            Summary:
            """

            summary_response = llm.invoke(summarizer_prompt)
            summary = summary_response.content
            
        else:
            summary = ""

        return {
            'question': query,
            'answer': answer_with_citation,
            'summary': summary,
            'sources': sources
        }

    return {
        'question': query,
        'answer': answer,
        'summary': summary,
        'sources': sources
    }

In [23]:
result = enhanced_rag("how is distance between two planets calculates", rag_retriver, llm)

Retrieving top 5 documents for query: how is distance between two planets calculates
Top K: 5, Score Threshold: 0.5
Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings generated successfully with shape (1, 384)
Retrived 0 documents with score greater than 0.5
[]


In [24]:
result

{'question': 'how is distance between two planets calculates',
 'answer': 'No relevant context found in the document',
 'summary': '',
 'sources': []}

In [25]:
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])


Final Answer: No relevant context found in the document
Summary: 


In [26]:
from rank_bm25 import BM25Okapi
import re

In [57]:
# data = vectorstore.get_all_documents()

In [49]:
class BM25Retriever:
    """Handles query based retrival from vector store using BM25"""
    def __init__(self, documents:List[Any], chunks: List[Any] = None):

        self.documents = documents
        self.chunks = chunks

        ##Tokenize documents for BM25
        print("Building BM25 index...")
        tokenized_docs = [doc.split() for doc in documents]

        self.bm25 = BM25Okapi(tokenized_docs)
        print(f"BM25 index built successfully for {len(documents)} documents")

    def retrieve(self, query:str, top_k:int = 5) -> List[Dict[str, Any]]:
        """Retrieve documents from BM25 index"""
        print(f"Retrieving top {top_k} documents for query: {query}")

        ## Tokenize query
        query_tokens = re.findall(r'\w+', query.lower())

        if not query_tokens:
            print("Empty Query")
            return []
        
        ## Get BM25 scores
        scores = self.bm25.get_scores(query_tokens)

        ## Get top k documents
        top_docs_indices = np.argsort(scores)[::-1][:top_k]
        
        retrieved_docs = []
        for i, idx in enumerate(top_docs_indices):
            result = {
                "doc_index": int(idx),  # Index in the original documents list
                "content": self.documents[idx],
                "bm25_score": float(scores[idx]),
                "rank": i + 1
            }
            
            # Add metadata if chunks are available
            if self.chunks and idx < len(self.chunks):
                result["metadata"] = self.chunks[idx].metadata
                result["source_file"] = self.chunks[idx].metadata.get('source_file', 'unknown')
                result["page"] = self.chunks[idx].metadata.get('page_label', 'unknown')
            else:
                result["metadata"] = {}
                result["source_file"] = "unknown"
                result["page"] = "unknown"
            
            retrieved_docs.append(result)
        
        print(f"Retrieved {len(retrieved_docs)} documents")
        print(f"BM25 scores range: {min(scores):.4f} to {max(scores):.4f}")
        return retrieved_docs
        

In [50]:
bm25_documents = [chunk.page_content for chunk in chunks]
print(f"Prepared {len(bm25_documents)} documents for BM25 indexing")
print(f"\nExample document (first 200 chars):")
print(bm25_documents[0][:200])

Prepared 401 documents for BM25 indexing

Example document (first 200 chars):
XGBoost: A Scalable Tree Boosting System
Tianqi Chen
University of Washington
tqchen@cs.washington.edu
Carlos Guestrin
University of Washington
guestrin@cs.washington.edu
ABSTRACT
Tree boosting is a h


In [51]:
# Step 3: Initialize BM25 Retriever
bm25_retriever = BM25Retriever(
    documents=bm25_documents,  # List of text strings
    chunks=chunks              # Original chunks for metadata
)

Building BM25 index...
BM25 index built successfully for 401 documents


In [52]:
query = "what is role of Deep learning in Pedestrian Detection"
results = bm25_retriever.retrieve(query, top_k=5)

Retrieving top 5 documents for query: what is role of Deep learning in Pedestrian Detection
Retrieved 5 documents
BM25 scores range: 0.0000 to 14.6651


In [61]:
results

[{'doc_index': 201,
  'content': 'The rest of this paper is organized as follows. In Section\n2, a brief introduction on the history of deep learning and the\nbasic architecture of CNN is provided. Generic object detec-\ntion architectures are presented in Section 3. Then reviews\nof CNN applied in several speciﬁc tasks, including salient\nobject detection, face detection and pedestrian detection, are\nexhibited in Section 4-6, respectively. Several promising future\ndirections are proposed in Section 7. At last, some concluding\nremarks are presented in Section 8.\nII. A B RIEF OVERVIEW OF DEEP LEARNING\nPrior to overview on deep learning based object detection\napproaches, we provide a review on the history of deep\nlearning along with an introduction on the basic architecture\nand advantages of CNN.\nA. The History: Birth, Decline and Prosperity\nDeep models can be referred to as neural networks with\ndeep structures. The history of neural networks can date back\nto 1940s [35], and 