## data ingestion

### step1: loading pdfs

In [1]:
# directory loader
from langchain.document_loaders import DirectoryLoader
# text loader
from langchain.document_loaders import TextLoader
#pdf loader
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader

ModuleNotFoundError: No module named 'langchain.document_loaders'

In [None]:
# Load every PDF matched by "**/*.pdf" under ../data/pdfs, using PyMuPDFLoader
# for each file and showing progress while loading.
dir_loader=DirectoryLoader(
    path="../data/pdfs",
    glob="**/*.pdf",
    loader_cls= PyMuPDFLoader,
    show_progress=True
)
# Keep the loaded documents in pdfDocuments; later cells read this name.
pdfDocuments= dir_loader.load()
pdfDocuments

In [None]:
type(pdfDocuments[0])

#### So `pdfDocuments` is a list of `Document` objects (metadata + page_content), and each document contains one page of a PDF

### Re-splitting documents into chunks to reduce the number of tokens given to the embedding model and to make the embeddings more specific

In [None]:
from langchain_core.documents import Document
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documents(documents: List[Document], chunk_size: int=1000, chunk_overlap: int=200):
    """Split documents into smaller, overlapping chunks and print a short summary.

    Args:
        documents: documents to split.
        chunk_size: maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: overlap between consecutive chunks, in characters.

    Returns:
        The list of chunks returned by the splitter's ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"{len(documents)} splitted into {len(split_docs)} chunks")
    if split_docs:
        # Show the second chunk when there is one, otherwise fall back to the
        # first: indexing [1] unconditionally raised IndexError whenever the
        # splitter produced exactly one chunk.
        sample = split_docs[min(1, len(split_docs) - 1)]
        print("exemple chunk:")
        print(f"content:{sample.page_content[:200]}")
        print(f"metadata: {sample.metadata}")
    return split_docs

In [None]:
chunks=split_documents(pdfDocuments)

### step2: embedding

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity


### Creating the EmbeddingManager class that:
* loads the embedding model
* contains a method for generating embeddings (it takes a list of texts -> returns a numpy array of the corresponding embeddings)

In [None]:
class EmbeddingManager:
    """Load a SentenceTransformer model and turn texts into embeddings."""

    def __init__(self, model_name: str="all-MiniLM-L6-v2"):
        """Store the model name and load the model right away.

        Attributes:
            model_name: name passed to ``SentenceTransformer``.
            model: the loaded model, set by the private method ``_load_model``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model; print and re-raise any loading error."""
        try:
            print(f"loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            # The method must be called: without the parentheses the message
            # showed the bound-method repr instead of the dimension.
            print(f"model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"error in loading the embedding model: {self.model_name}: {e}")
            raise

    def generate_embedding(self, texts:list[str])->np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: the strings to embed.

        Returns:
            Whatever ``model.encode`` returns for ``texts``; its ``shape`` is
            printed before returning.

        Raises:
            ValueError: if no model has been loaded.
        """
        # Compare against None explicitly so the check does not depend on the
        # truthiness of the model object itself.
        if self.model is None:
            raise ValueError("model not loaded")
        print(f"generate embeddings for {len(texts)} texts:")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"embeddings shape: {embeddings.shape}")
        return embeddings



### initialize the embeddings:

In [None]:
embedding_manager = EmbeddingManager()


In [None]:
embedding_manager

### step3: vector store with chromaDB

Prerequisite: every loaded document is composed of **metadata** and **page content**.

In [None]:
pdfDocuments[1].metadata

In [None]:
pdfDocuments[1].page_content

In [None]:
import os
class VectorStore:
    """Persistent ChromaDB collection storing texts, metadata and embeddings."""

    def __init__(self, collection_name: str="pdf_documents", persist_directory="../data/vector_store"):
        """Remember the settings and open (or create) the collection.

        Args:
            collection_name: name of the ChromaDB collection to use.
            persist_directory: directory given to the persistent client.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the directory, the client and the collection; re-raise on error."""
        try:
            # chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # chromadb collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata= {"description" : "pdf documents embedded for RAG"}
            )
            print(f"vector store initialized . collection: {self.collection_name}")
            print(f"number of documents in the collection: {self.collection.count()}")
        except Exception as e:
            print(f"error initializing vectore store:{e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Args:
            documents: objects exposing ``metadata`` and ``page_content``.
            embeddings: one embedding per document, in the same order; each
                item must provide ``tolist()``.

        Raises:
            ValueError: if the number of documents and embeddings differ.
            Exception: any error raised by ``collection.add`` is printed and
                re-raised.
        """
        if len(documents)!=len(embeddings):
            raise ValueError("number of documents must match the number of embeddings")
        if len(documents) == 0:
            # An empty batch is an explicit no-op instead of a call to
            # collection.add with empty lists.
            print("no documents to add to the collection (vectore store)")
            return
        print(f"adding{len(documents)} to the collection (vectore store)")
        # preparing data for chromadb:
        ids = []
        metadatas = []
        documents_text = []
        embedding_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # step1: generate a unique ID (random prefix + position in the batch)
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")
            # step2: copy the metadata so the caller's document is not modified,
            # then record the position and the text length
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            # step3: keep the page content of the document
            documents_text.append(doc.page_content)
            # step4: keep the embedding of the current document as a plain list
            embedding_list.append(embedding.tolist())

        # add to collection:
        try:
            self.collection.add(
                ids= ids,
                embeddings= embedding_list,
                metadatas= metadatas,
                documents=documents_text
            )
            print(f"{len(ids)} documents were added to the vector  store")
            print(f"tatale documents in the collection: {self.collection.count()}")
        except Exception as e:
            print(f"error in adding new documents to the collection: {e}")
            raise



In [None]:
vectorestore=VectorStore()

In [None]:
chunks

### convert text to embeddings

In [None]:
# Collect the raw text of every chunk; this is what gets embedded.
texts= [doc.page_content for doc in chunks]
# generate the embeddings:
embeddings= embedding_manager.generate_embedding(texts)

# store the chunks and their embeddings in the vector store
# NOTE(review): add_documents builds fresh uuid-based IDs on every call, so
# re-running this cell presumably stores the same chunks again — confirm.
vectorestore.add_documents(chunks, embeddings)

In [None]:
print(f"Prepared {len(texts)} texts for embedding")
print(f"First text sample: {texts[0][:100]}...")


In [None]:
print(f"Embeddings shape: {embeddings.shape}")

## 2- data retrieval 

In [None]:
class RAGRetriever:
    """Query the vector store and keep the results above a similarity threshold."""

    def __init__(self, vectore_store: VectorStore, embedding_manager:EmbeddingManager):
        """Keep references to the vector store and the embedding manager."""
        self.vectore_store = vectore_store
        self.embedding_manager=embedding_manager

    def retrieve(self, query: str, top_k:int=5, score_threshold: float= -0.1)->List[Dict[str, Any]]:
        """Return the stored documents closest to ``query``.

        Args:
            query: question + context.
            top_k: number of top results to request from the collection.
            score_threshold: minimum similarity score for a result to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``distance``, ``similarity_score`` and ``rank``. The list is empty
            when nothing is found or when querying fails.

        Note:
            The score is computed as ``1 - distance``. This assumes the
            collection reports a distance for which that is a meaningful
            similarity (e.g. cosine distance) — TODO confirm the metric the
            collection was created with.
        """
        #generate query embedding:
        print("1")
        query_embedding=self.embedding_manager.generate_embedding([query])[0]
        try:
            print("2")
            results= self.vectore_store.collection.query(query_embeddings=[query_embedding.tolist()],
                                                         n_results=top_k)
            retrieved_docs=[]

            if results['documents'] and results['documents'][0]:
                # Results are nested per query; only one query is sent, so
                # index 0 holds its results.
                documents=results['documents'][0]
                print(documents[0])
                metadatas=results['metadatas'][0]
                distances = results['distances'][0]
                ids=results['ids'][0]
                for i, (doc_id, doc_distance, doc_metadata, doc_content) in enumerate(zip(ids, distances, metadatas, documents)):
                    similarity_score= 1 - doc_distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id':doc_id,
                            'content': doc_content,
                            'metadata': doc_metadata,
                            'distance': doc_distance,
                            # expose the score that was used for filtering
                            'similarity_score': similarity_score,
                            # rank in the unfiltered result list
                            'rank': i+1
                        })
                    else:
                        # report the actual threshold rather than a hard-coded 0
                        print(f"the: document: {doc_content} is not admitted beecause it has the similarity score under {score_threshold}: {similarity_score}")
                print(f"retrieved: {len(retrieved_docs)} documents after filtering")
                # Check how the documents are counted
                print(f"Nombre réel de documents récupérés: {len(retrieved_docs)}")
            else:
                print("no documents found")
            return retrieved_docs

        except Exception as e:
            # The embedding step runs before this try block, so failures
            # caught here come from querying or processing the results.
            print(f"error in querying the vector store: {e}")
            return []

In [None]:
class RAGRetriever:
    """Simplified retriever that returns only the text of the closest documents."""

    def __init__(self, vectore_store: VectorStore, embedding_manager:EmbeddingManager):
        """Keep references to the vector store and the embedding manager."""
        self.vectore_store = vectore_store
        self.embedding_manager=embedding_manager

    def retrieve(self, query: str, top_k:int=5, score_threshold: float= 0.5)->List[str]:
        """Return the texts of the stored documents closest to ``query``.

        Args:
            query: question + context.
            top_k: number of top results to request from the collection.
            score_threshold: currently unused; no score filtering is applied
                by this method.

        Returns:
            The document texts returned for the query, or an empty list when
            querying fails.
        """
        #generate query embedding:
        print("1")
        query_embedding=self.embedding_manager.generate_embedding([query])[0]
        try:
            print("2")
            results= self.vectore_store.collection.query(query_embeddings=[query_embedding.tolist()],
                                                         n_results=top_k)

            # Results are nested per query; only one query is sent, so
            # index 0 holds its documents.
            return results['documents'][0]

        except Exception as e:
            # The embedding step runs before this try block, so failures
            # caught here come from querying the collection.
            print(f"error in querying the vector store: {e}")
            return []

In [None]:
rag_retriever=RAGRetriever(vectorestore, embedding_manager)


In [None]:
rag_retriever

In [None]:
rag_retriever.retrieve("quelle est la definition de la big data")

## final step: tests

## 1-test1: question sur le premier document

In [None]:
rag_retriever.retrieve("qu'elle sont les different types des données")

## 2-test2: question sur le deuxième document

In [None]:
rag_retriever.retrieve("donne moi les differents concepts de kafka")

## 3-test3: question sur le 3eme document

In [None]:
rag_retriever.retrieve("donne moi une idée sur l'entreprise yazaki")

## query retrieval pipeline