### Generating Documents


In [3]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader,TextLoader,PyPDFLoader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def processing_pdf(pdf_directory:str):
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pathname = '**/*'
    files = (
        list(pdf_dir.glob(f'{pathname}.pdf'))+
        list(pdf_dir.glob(f'{pathname}.txt'))
    )
    
    print(f'Found {len(files)} files')
    
    for file in files:
        print(f'\nProcessing "{file.name}"')
        try:
            
            if str(file).endswith('pdf'):
                loader = PyPDFLoader(str(file))
                pdf_documents = loader.load()
                all_documents.extend(pdf_documents)
                print(f'Loaded {len(pdf_documents)} pages')
                
            elif str(file).endswith('txt'):
                loader = TextLoader(str(file),encoding='utf-8')
                txt_documents = loader.load()
                all_documents.extend(txt_documents)
                print(f'Loaded: {file.name} ')
        
        except Exception as e:
            print(f'Error: {e}')
            
    print(f'\nTotal documents loaded: {len(all_documents)}')
    return all_documents        


docs = processing_pdf('data')

# doc_loader = DirectoryLoader(
#     'data/pdf',
#     glob="*.pdf",
#     loader_cls=PyMuPDFLoader
# )

# doc = doc_loader.load()
# doc

Found 3 files

Processing "Important Backend Interview Questions.pdf"
Loaded 46 pages

Processing "Narasimha Karumanchi - Data Structures and Algorithms Made Easy_ Data Structures and Algorithmic Puzzles-CareerMonk Plublications (2017).pdf"
Loaded 828 pages

Processing "deep learning.txt"
Loaded: deep learning.txt 

Total documents loaded: 875


In [5]:
print(docs)




### Chunking - splitting into pieces

In [6]:
# split documents into chunks - splitting into small pieces/parts

def split_documents(documents,chunk_size=1000,chunk_overlap = 200):
    """splitting for better performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
        length_function = len,
        separators=["\n\n","\n"," ",""]
    )
    
    split_docs = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} into {len(split_docs)} chunks')
    
    return split_docs

split_docs = split_documents(docs)
print(split_docs)

Split 875 into 2722 chunks


In [7]:
texts = [doc.page_content for doc in split_docs]
texts[:10]

['I M P O R T A N T\nBa ckend\nConcept s f or Int er vie w\nInt er vie w Questions & Explanations\n</> <HTML>\n<PHP>\nCSSPY THON',
 'Ev er y one learns uniquely .  \r\n\r\nLearn Back end in a structur ed manner and \nmast er it b y practically applying y our skills. \n\r\n\r\nThis Doc will help y ou wit h t he same.\r\n*Disc laimer *\n2www .boss c oderac ademy . c om',
 'What ar e t he diff er ent languages pr esent \nin DBMS?\nQ . 1\nT he f o ur t ypes of DBMS l ang ua ges ar e as f ollo ws:\nDa t a M anip ul a t ion L ang ua ge (DML ): It is us ed t o manip ulat e the data \nand c onsist s of the c ommand f or the s ame . E. g.: SELE C T , INSERT , \nDELETE, UPD A TE, et c.\nDa t a D efinit ion L ang ua ge (DDL ): It is us ed t o define and updat e the \ndata.  E. g.: TRUNC A TE, AL TER , DR OP , CREA TE, RENAME, et c.\r\nDa t a C ont r ol L ang ua ge (DCL ): It is us ed t o c ont r ol the ac c ess t o the \ndata. E. g.: GR ANT , REV OKE, et c.\nT r ans a ct ion C ont r ol L ang ua g

### Embedding

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Tuple,Dict,Any
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
class EmbeddingManager:
    def __init__(self,model_name :str  = "all-MiniLM-L6-v2"):
        
        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        try:
            print(f'Loading Embedding Model {self.model_name}')
            self.model = SentenceTransformer(self.model_name)
            print(f'Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}')
        except Exception as e:
            print(f'Error loading Model {self.model_name}:e')
            raise            
    
    def generate_embeddings(self,texts:List[str]) ->np.ndarray :
        """
        Generate Embeddings for a list
        
        Args: 
            texts: list of text strings to embed
            
        Return:
            numpy array of embeddings with shape(len(texts),embedding_dim)"""
        
        if not self.model:
            raise ValueError("Model not found")
        print(f'Generating embedding for the {len(texts)} texts')
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f'Generated embeddings with shape {embeddings.shape}')
        return embeddings
        
        
embed_manager = EmbeddingManager()
embed_manager
        

Loading Embedding Model all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1aba3684bb0>

In [10]:
embeddings = embed_manager.generate_embeddings(texts)

Generating embedding for the 2722 texts


Batches: 100%|██████████| 86/86 [04:11<00:00,  2.92s/it]

Generated embeddings with shape (2722, 384)





### Ingesting to vector DB

In [11]:
class VectorDB:
    def __init__(self,collection_name:str='documents',persist_directory = 'data/vetor_db'):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        try:
            
            # create persistent client
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # get or create collection
            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={"description":"pdf embeddings for RAG"}
            )
            print(f'VectorDB initialized. Collection: "{self.collection_name}"')
            print(f'Existing documents in collection: {self.collection.count()}')
        except Exception as e:
            print(f'Error initializing vector db: {e}')
            raise
        
    def add_documents(self,documents:List[Any],embeddings:np.ndarray):
        if len(embeddings) != len(documents):
            raise ValueError("Number of embeddings must match Number of documents.")

        print(f'Adding {len(documents)} to vectorDB')
        
        # data for chromadb
        
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i,(doc,embedding) in enumerate(zip(documents,embeddings)):
            # unique id
            doc_id = f'doc_{uuid.uuid4().hex[:8]}_{i}'
            ids.append(doc_id)
            
            # metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            #document content
            documents_text.append(doc.page_content)
            
            # embedding
            embeddings_list.append(embedding.tolist())
        
        # add to collection
        try:
            self.collection.add(
                ids = ids,
                embeddings = embeddings_list,
                metadatas = metadatas,
                documents = documents_text
            )
            
            print(f'Successfully added {len(documents)} documents to vectorDB')
            print(f'Total documents in collection: {self.collection.count()}')
        except Exception as e:
            print(f'Error adding documents to vector db: {e}')
            raise

In [12]:
DB = VectorDB()
DB.add_documents(split_docs,embeddings)

VectorDB initialized. Collection: "documents"
Existing documents in collection: 2722
Adding 2722 to vectorDB
Successfully added 2722 documents to vectorDB
Total documents in collection: 5444


### Retriever Pipeline from vector db


In [13]:
class RAGRetriever:
    def __init__(self,vector_db:VectorDB,embedding_manager:EmbeddingManager):
        self.vector_db = vector_db
        self.embedding_manager = embedding_manager
        
    def retrieve(self,query:str,top_k:int = 5,score_threshold: float = 0.0) -> List[Dict[str,Any]]:
        print(f'Retriving documents for the Query: "{query}"')
        print(f'Top k: {top_k}, Score Threshold: {score_threshold}')
        
        query_embeddings = self.embedding_manager.generate_embeddings(query)
        
        try:
            results = self.vector_db.collection.query(
                query_embeddings = [query_embeddings.tolist()],
                n_results = top_k
            )
            
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (docs_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    similarity_score = 1-distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id':docs_id,
                            'content':document,
                            'metadata':metadata,
                            'similarity_score':similarity_score,
                            'distance':distance,
                            'rank':i+1
                        })
                print(f'Retrieved {len(retrieved_docs)} documents after filtering')
            else:
                print("No documents found")
            return retrieved_docs
        except Exception as e:
            print(f'Error during retrieval: {e}')
            return [] 
        
rag_retriever = RAGRetriever(DB,embed_manager)   

In [None]:
# prompt = rag_retriever.retrieve("what is recursion?")
# prompt

Retriving documents for the Query: "what is recursion?"
Top k: 5, Score Threshold: 0.0
Generating embedding for the 18 texts


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.92it/s]

Generated embeddings with shape (384,)
Retrieved 5 documents after filtering





[{'id': 'doc_6d35401c_152',
  'content': '2.3\tWhy\tRecursion?\nRecursion\tis\ta\tuseful\ttechnique\tborrowed\tfrom\tmathematics.\tRecursive\tcode\tis\tgenerally\tshorter\nand\teasier\tto\twrite\tthan\titerative\tcode.\tGenerally,\tloops\tare\tturned\tinto\trecursive\tfunctions\twhen\nthey\tare\tcompiled\tor\tinterpreted.\nRecursion\tis\tmost\tuseful\tfor\ttasks\tthat\tcan\tbe\tdefined\tin\tterms\tof\tsimilar\tsubtasks.\tFor\texample,\nsort,\tsearch,\tand\ttraversal\tproblems\toften\thave\tsimple\trecursive\tsolutions.\n2.4\tFormat\tof\ta\tRecursive\tFunction\nA\trecursive\tfunction\tperforms\ta\ttask\tin\tpart\tby\tcalling\titself\tto\tperform\tthe\tsubtasks.\tAt\tsome\npoint,\tthe\tfunction\tencounters\ta\tsubtask\tthat\tit\tcan\tperform\twithout\tcalling\titself.\tThis\tcase,\twhere\nthe\tfunction\tdoes\tnot\trecur,\tis\tcalled\tthe\t\nbase\tcase\n.\tThe\tformer,\twhere\tthe\tfunction\tcalls\titself\tto\nperform\ta\tsubtask,\tis\treferred\tto\tas\tthe\t\necursive\tcase\n.\tWe\tcan\t

In [25]:
# import re
# response = [res['content'] for res in prompt]
# titles = [res['metadata']['title'] for res in prompt]
# pattern = r'https?://\S+.com|in'
# refer = ["".join(re.findall(pattern,res['metadata']['producer'])) for res in prompt if 'http' in res['metadata']['producer'] ]

# references = list(set(titles+refer))
# print("".join(response))
# references

In [31]:
from transformers import pipeline

llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=1056
)

Device set to use cpu


In [41]:
def rag_simple(query,retriever,llm,top_k=3):
    results = retriever.retrieve(query,top_k=top_k)
    
    if not results:
        return "No relevant documents found"
    context = "\n\n".join([res['content'] for res in results]) if results else ""
    
    if not context:
        return "No relevant context found for the question"
    
    prompt = f"""Use the following context to answer the question concisely.
        Context: {context}
        Question: {query}
        Answer: """
    response = llm([prompt.format(context=context,query=query)])
    return response[0]['generated_text']

In [51]:
answer = rag_simple("what is integer datatype",rag_retriever,llm)
print(answer)

Retriving documents for the Query: "what is integer datatype"
Top k: 3, Score Threshold: 0.0
Generating embedding for the 24 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.96it/s]


Generated embeddings with shape (384,)
Retrieved 3 documents after filtering
data types that are defined by system are called primitive data types. The primitive data types provided by many programming languages are: int, float, char, double, bool, etc. The number of bits allocated for each primitive data type depends on the programming languages, the compiler and the operating system. For the same primitive data type, different languages may use different sizes. Depending on the size of the data types, the total available values (domain) will also change. For example, “int” may take 2 bytes or 4 bytes. If it takes 2 bytes (16 bits), then the total possible values are minus 32,768 to plus 32,767 (-2 15 to 2 15 -1). If it takes 4 bytes (32 bits), then the possible values are between -2,147,483,648 and +2,147,483,647 (-2 31 to 2 31 -1). The same is the case with other data types. User defined data types If the system-defined data types are not enough, then most programming languages allo