### PIPELINE : Data ingestion to vecor DB

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### read all pdfs inside directory
def process_pdfs_in_directory(directory_path):
    all_docs = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory_path, filename)
            loader = PyMuPDFLoader(file_path)
            documents = loader.load()
            # add source metadata
            for doc in documents:
                doc.metadata["source_file"] = file_path
                doc.metadata["file_type"] = "pdf"
            all_docs.extend(documents)
    return all_docs

all_documents = process_pdfs_in_directory("data/pdf")
all_documents

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data/pdf\\attention.pdf', 'file_path': 'data/pdf\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'data/pdf\\attention.pdf', 'file_type': 'pdf'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaise

In [3]:
# text splitting into chunks
def split_documents(documents, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = []
    for doc in documents:
        splits = text_splitter.split_text(doc.page_content)
        for i, split in enumerate(splits):
            new_doc = doc.copy()
            new_doc.page_content = split
            new_doc.metadata["chunk_index"] = i
            split_docs.append(new_doc)
    
    # info about total chunks created out of documents
    print(f"Total chunks created: {len(split_docs)} from {len(documents)} documents.")

    # show example of a chunk
    if split_docs:
        print("Example chunk:")
        print(split_docs[0].page_content[:200])  # print first 200 characters
        print("Metadata:", split_docs[0].metadata)

    return split_docs

In [4]:
chunks = split_documents(all_documents)
chunks

Total chunks created: 722 from 122 documents.
Example chunk:
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz
Metadata: {'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data/pdf\\attention.pdf', 'file_path': 'data/pdf\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'data/pdf\\attention.pdf', 'file_type': 'pdf', 'chunk_index': 6}


C:\Users\Student\AppData\Local\Temp\ipykernel_28312\341794563.py:12: PydanticDeprecatedSince20: The `copy` method is deprecated; use `model_copy` instead. See the docstring of `BaseModel.copy` for details about how to handle `include` and `exclude`. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  new_doc = doc.copy()


[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'data/pdf\\attention.pdf', 'file_path': 'data/pdf\\attention.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0, 'source_file': 'data/pdf\\attention.pdf', 'file_type': 'pdf', 'chunk_index': 6}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto

### Embedding and vector store DB 

In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer
# import chromadb
# from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
class EmbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        if self.model is None:
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded embedding model: {self.model_name}")
            print(f"Model details: {self.model.get_sentence_embedding_dimension()} dimensions")
    
    def embed_texts(self, texts: List[str]) -> np.ndarray:
        return self.model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    

In [7]:
## initialize embedding manager
embedding_manager = EmbeddingManager()

Loaded embedding model: all-MiniLM-L6-v2
Model details: 384 dimensions


In [8]:
# embeddings = embedding_manager.embed_texts([doc.page_content for doc in chunks])

### Vector store

In [9]:
from chromadb import PersistentClient


class VectorStore:
    def __init__(
        self,
        collection_name: str = "document_chunks",
        persist_directory: str = "data/vector_store"
    ):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_client()

    def _initialize_client(self):
        os.makedirs(self.persist_directory, exist_ok=True)

        if self.client is None:
            self.client = PersistentClient(path=self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Collection of document chunks"}
            )
            print(f"Initialized persistent ChromaDB collection: {self.collection_name}")

    def add_documents(self, documents: List[Dict[str, Any]], embeddings: np.ndarray):
        if len(documents) != embeddings.shape[0]:
            raise ValueError("Number of documents and embeddings must match.")

        print(f"Adding {len(documents)} documents to the vector store...")

        ids = [str(uuid.uuid4()) for _ in documents]
        metadatas = [doc.metadata for doc in documents]
        page_contents = [doc.page_content for doc in documents]

        self.collection.add(
            ids=ids,
            embeddings=embeddings.tolist(),
            metadatas=metadatas,
            documents=page_contents
        )

        # flush to disk
        # self.client.persist()
        print(f"Persisted {len(documents)} documents to disk.")

In [10]:
vector_store = VectorStore()

Initialized persistent ChromaDB collection: document_chunks


In [11]:
# convert chunks text to embeddings
texts = [doc.page_content for doc in chunks]
# generate embeddings
embeddings = embedding_manager.embed_texts(texts)

Batches: 100%|██████████| 23/23 [00:14<00:00,  1.61it/s]


In [12]:
# store documents and embeddings in vector store
vector_store.add_documents(
    # documents=[{"page_content": doc.page_content, "metadata": doc.metadata} for doc in chunks],
    documents=chunks,
    embeddings=embeddings
)

Adding 722 documents to the vector store...
Persisted 722 documents to disk.


### Retriever PIPELINE

In [13]:
class RagRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.7) -> List[Dict[str, Any]]:
        query_embedding = self.embedding_manager.embed_texts([query])[0]
        
        # ✅ Ajouter include=['documents', 'distances', 'metadatas'] pour récupérer les métadonnées
        results = self.vector_store.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k,
            include=['documents', 'distances', 'metadatas']  # ← Important !
        )
        
        retrieved_docs = []
        
        # ✅ Ajouter les métadonnées dans le zip
        for doc, score, metadata in zip(
            results['documents'][0], 
            results['distances'][0],
            results['metadatas'][0]  # ← Métadonnées récupérées ici
        ):
            similarity = 1 - score  # assuming distances are cosine distances
            if similarity >= score_threshold:
                retrieved_docs.append({
                    "document": doc,
                    "similarity": similarity,
                    "metadata": metadata  # ✅ Inclure les métadonnées
                })
        
        return retrieved_docs

# class RagRetriver:
#     def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
#         self.vector_store = vector_store
#         self.embedding_manager = embedding_manager

#     def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.7) -> List[Dict[str, Any]]:
#         query_embedding = self.embedding_manager.embed_texts([query])[0]
#         results = self.vector_store.collection.query(
#             query_embeddings=[query_embedding.tolist()],
#             n_results=top_k
#         )
        
#         retrieved_docs = []
#         for doc, score in zip(results['documents'][0], results['distances'][0]):
#             similarity = 1 - score  # assuming distances are cosine distances
#             if similarity >= score_threshold:
#                 retrieved_docs.append({
#                     "document": doc,
#                     "similarity": similarity
#                 })
        
#         return retrieved_docs

In [15]:
rage_retriever = RagRetriever(vector_store=vector_store, embedding_manager=embedding_manager)
rage_retriever

<__main__.RagRetriever at 0x206b9ad5350>

In [16]:
rage_retriever.retrieve("The usefulness of autonomous driving technology", top_k=3, score_threshold=0.0)

Batches: 100%|██████████| 1/1 [00:00<00:00, 13.52it/s]


[{'document': 'Of course, in addition to all the advantages of this technology, there are a number of weaknesses at \nmoment: \n\uf0b7 \nthe difficulty of moving in areas under construction that have not been implemented on the \ndigital map; \n\uf0b7 \nthe difficulty of moving in off road areas or in chaotic traffic; \n\uf0b7 \nCyber security, because there is a risk of hacking. \n6.  The usefulness of autonomous driving technology - possible ways of use',
  'similarity': 0.36064738035202026,
  'metadata': {'subject': '',
   'total_pages': 9,
   'file_path': 'data/pdf\\Autonomous_vehicles.pdf',
   'author': 'George Evans',
   'page': 6,
   'source': 'data/pdf\\Autonomous_vehicles.pdf',
   'source_file': 'data/pdf\\Autonomous_vehicles.pdf',
   'moddate': '2021-08-13T04:12:55+01:00',
   'file_type': 'pdf',
   'title': 'Open Access proceedings Journal of Physics: Conference series',
   'creator': 'Appligent StampPDF Batch 4.5.1',
   'keywords': 'open access, proceedings, template, fast, 

### VectorDB context and LLM output

In [None]:
# simple RAG PIPELINE with Groq LLM
### VectorDB context and LLM output
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("API_KEY")

llm = ChatGroq(api_key=api_key, model="llama-3.1-8b-instant", temperature=0.1, max_tokens=1024)

# function to retrieve context and generate answer
def rag_pipeline(query: str, retriever: RagRetriever, llm: ChatGroq) -> str:
    # retrieve relevant documents
    retrieved_docs = retriever.retrieve(query, top_k=5, score_threshold=0.0)
    
    # prepare context
    context = "\n\n".join([doc["document"] for doc in retrieved_docs])

    if not context:
        return "No relevant context found to answer the query."
        
    
    # create prompt
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    
    # generate answer using LLM
    response = llm.invoke(prompt)
    
    return 

In [19]:
reply = rag_pipeline("What is malaria", rage_retriever, llm)
print("RAG Pipeline Response:")
print(reply)

Batches: 100%|██████████| 1/1 [00:00<00:00, 25.47it/s]


RAG Pipeline Response:
Based on the context provided, it seems like the question is asking for a definition or explanation of the term "malaria." However, the context itself appears to be a pattern of repeating the word "Malaria" with a number, which doesn't provide a clear answer.

If I had to provide an answer based on general knowledge, I would say:

Malaria is a serious and sometimes life-threatening disease that is caused by a parasite, typically transmitted through the bite of an infected mosquito. It is characterized by symptoms such as fever, chills, and flu-like symptoms, and can lead to severe complications if left untreated.

However, if I had to provide an answer based on the pattern in the context, it seems like the number "36" might be the answer, but that doesn't make sense in the context of the question.


In [21]:
## advanced RAG Pipeline with source references
def advanced_rag_pipeline(query: str, retriever: RagRetriever, llm: ChatGroq) -> str:
    # retrieve relevant documents
    retrieved_docs = retriever.retrieve(query, top_k=5, score_threshold=0.0)
    
    # prepare context with source references
    context_parts = []
    for doc in retrieved_docs:
        context_parts.append(f"{doc['document']}\n(Source: {doc.get('metadata', {}).get('source_file', 'unknown')})")
    context = "\n\n".join(context_parts)

    if not context:
        return "No relevant context found to answer the query."
    
    source_list = "\n".join(
        [f"- {doc.get('metadata', {}).get('source_file', 'unknown')}" for doc in retrieved_docs]
    )
    
    # create prompt
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    
    # generate answer using LLM
    response = llm.invoke(prompt)
    
    return source_list + "\n\n" + response.content

In [26]:
reply = advanced_rag_pipeline("what is attention mechanism", rage_retriever, llm)
print(reply)

Batches: 100%|██████████| 1/1 [00:00<00:00, 18.16it/s]


- data/pdf\attention.pdf
- data/pdf\attention.pdf
- data/pdf\attention.pdf
- data/pdf\attention.pdf
- data/pdf\attention.pdf

An attention mechanism is a function that maps a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weights are determined by the similarity between the query and the keys. 

In the context of deep learning models, attention mechanisms are often used to focus on specific parts of the input data that are relevant to the task at hand. This is particularly useful in natural language processing tasks, such as machine translation, text summarization, and question answering, where the model needs to weigh the importance of different words or phrases in the input sentence.

In the provided text, it is mentioned that the attention mechanism is used in a self-attention approach, where the model attends to different parts of the input sequence and com