# rag pipeline - data ingestion to vector DB PIPELINE

In [15]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [16]:
## reading the pdfs 
def process_all_pdfs(pdf_directory):
    """process all the pdf files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)  # Fixed: path -> Path

    # find all pdfs recursively 
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"\nFound {len(pdf_files)} PDF files")  # Fixed: added count before loop

    for pdf_file in pdf_files:  # Fixed: odf_files -> pdf_file, pdf_files -> pdf_file
        try:
            print(f"\nProcessing: {pdf_file.name}")  # Fixed: moved inside loop
            loader = PyPDFLoader(str(pdf_file))  # Fixed: pdf_files -> pdf_file
            documents = loader.load()
            
            # add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name  # Fixed: source_files -> source_file
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)  # Fixed: indentation
            print(f"Loaded {len(documents)} pages")  # Fixed: indentation and spacing
            
        except Exception as e:
            print(f"Error processing {pdf_file.name}: {e}")  # Fixed: added filename

    print(f"\nTotal documents loaded: {len(all_documents)}")  # Fixed: |n -> \n
    return all_documents 

all_pdf_documents = process_all_pdfs("../data")


Found 3 PDF files

Processing: Parvath_Reddy_Quality_analyst.pdf
Loaded 1 pages

Processing: Parvath_Reddy_Machine_Learning_Engineer.pdf
Loaded 1 pages

Processing: ParvathReddy_DataScience_Resume.pdf
Loaded 1 pages

Total documents loaded: 3


In [17]:
### Text splitting get into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

In [18]:
chunks=split_documents(all_pdf_documents)
chunks

Split 3 documents into 16 chunks

Example chunk:
Content: PARVATH REDDY
Quality Analyst
Bengaluru, KA|♂phone+91-7675043309/envel⌢peparvathreddy50@gmail.com/linkedinlinkedin.com/in/parvath-reddy
Summary
Quality Analyst with expertise in data validation, proce...
Metadata: {'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-09-16T04:28:48+00:00', 'author': '', 'keywords': '', 'moddate': '2025-09-16T04:28:48+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf_files/Parvath_Reddy_Quality_analyst.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Parvath_Reddy_Quality_analyst.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-09-16T04:28:48+00:00', 'author': '', 'keywords': '', 'moddate': '2025-09-16T04:28:48+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf_files/Parvath_Reddy_Quality_analyst.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Parvath_Reddy_Quality_analyst.pdf', 'file_type': 'pdf'}, page_content='PARVATH REDDY\nQuality Analyst\nBengaluru, KA|♂phone+91-7675043309/envel⌢peparvathreddy50@gmail.com/linkedinlinkedin.com/in/parvath-reddy\nSummary\nQuality Analyst with expertise in data validation, process improvement, and quality assurance methodologies.\nProficient in SQL, Python, and statistical analysis for identifying defects, ensuring data integrity, and implementing\nquality control measures to enhance operational efficiency and re

## embadings and vectorstoredb

In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
class EmbeddingManager:
    """Handling documents embedding generation using SentenceTransformer"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initializes the embedding manager

        Args:
            model_name: HuggingFace model name from sentence-transformers
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Loads the SentenceTransformer model."""
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Model '{self.model_name}' loaded successfully.")
            print(f"Model loaded successfully. Embedding dimensions: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model: {e}")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        Args:
            texts: List of text strings to embed
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


# Initializing the embedding manager
embedding_manager = EmbeddingManager()

# Test it
sample_texts = ["Hello world", "This is a test"]
embeddings = embedding_manager.generate_embeddings(sample_texts)
print(f"Final embeddings shape: {embeddings.shape}")

Model 'all-MiniLM-L6-v2' loaded successfully.
Model loaded successfully. Embedding dimensions: 384
Generating embeddings for 2 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.53it/s]

Generated embeddings with shape: (2, 384)
Final embeddings shape: (2, 384)





In [21]:
import os
import uuid
from typing import List, Any
import numpy as np
import chromadb

class VectorStore:
    """Manages document embeddings in a ChromaDB vector store"""
    
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store
        
        Args:
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            # Create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            
            # Get or create collection WITH COSINE SIMILARITY
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG",
                    "hnsw:space": "cosine"  # ← THIS IS THE FIX
                }
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Initialize the vector store
vectorstore = VectorStore()
print(vectorstore)

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 16
<__main__.VectorStore object at 0x116e5d540>


In [22]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-09-16T04:28:48+00:00', 'author': '', 'keywords': '', 'moddate': '2025-09-16T04:28:48+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/pdf_files/Parvath_Reddy_Quality_analyst.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'Parvath_Reddy_Quality_analyst.pdf', 'file_type': 'pdf'}, page_content='PARVATH REDDY\nQuality Analyst\nBengaluru, KA|♂phone+91-7675043309/envel⌢peparvathreddy50@gmail.com/linkedinlinkedin.com/in/parvath-reddy\nSummary\nQuality Analyst with expertise in data validation, process improvement, and quality assurance methodologies.\nProficient in SQL, Python, and statistical analysis for identifying defects, ensuring data integrity, and implementing\nquality control measures to enhance operational efficiency and re

In [23]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]
texts

## Generate the Embeddings

embeddings=embedding_manager.generate_embeddings(texts)

##store int he vector dtaabase
vectorstore.add_documents(chunks,embeddings)

Generating embeddings for 16 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  7.20it/s]

Generated embeddings with shape: (16, 384)
Adding 16 documents to vector store...
Successfully added 16 documents to vector store
Total documents in collection: 32





In [24]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever
        
        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [25]:
rag_retriever

<__main__.RAGRetriever at 0x1168fab00>

In [26]:
rag_retriever.retrieve("machine learning experience")

Retrieving documents for query: 'machine learning experience'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 49.98it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_5383b99d_4',
  'content': 'PARVATH REDDY\nMachine Learning Engineer\nBengaluru, KA | ♂phone+91-7675043309 /envel⌢peparvathreddy50@gmail.com /linkedinlinkedin.com/in/parvath-reddy /githubgithub.com/parvath-reddy\nSummary\nMachine Learning Engineer with expertise in building production-ready ML systems, deep learning models, and RAG-based\napplications. Proficient in Python, TensorFlow, PyTorch, and MLOps, with experience deploying scalable models and\ncomputer vision solutions achieving 95% accuracy.\nTechnical Skills\nMachine Learning: Scikit-learn, XGBoost, LightGBM, Random Forest, SVM, Ensemble Methods\nDeep Learning & AI:TensorFlow, PyTorch, Keras, OpenAI API, Transformers, LSTM, CNN\nComputer Vision & NLP:OpenCV, YOLO, Object Detection, OpenAI Whisper, RAG, Embeddings\nMLOps & Deployment:Streamlit, Docker, Model Versioning, Pipeline Optimization\nProgramming & Tools:Python, SQL, Git, Jupyter, Pandas, NumPy, Matplotlib, Seaborn\nDatabases: MySQL, PostgreSQL, Vector Data

In [27]:
rag_retriever.retrieve("parvath reddy")

Retrieving documents for query: 'parvath reddy'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 52.88it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_4cb176fe_0',
  'content': 'PARVATH REDDY\nQuality Analyst\nBengaluru, KA|♂phone+91-7675043309/envel⌢peparvathreddy50@gmail.com/linkedinlinkedin.com/in/parvath-reddy\nSummary\nQuality Analyst with expertise in data validation, process improvement, and quality assurance methodologies.\nProficient in SQL, Python, and statistical analysis for identifying defects, ensuring data integrity, and implementing\nquality control measures to enhance operational efficiency and reduce error rates.\nSkills\nQuality Assurance:Test Case Design, Defect Management, Root Cause Analysis, Quality Control, Process\nValidation, Compliance Testing\nTechnical:SQL, Python, Excel, Statistical Analysis, Data Validation, Quality Metrics, MySQL, PostgreSQL,\nPower BI, Test Automation\nExperience\nQuality Analyst InternMay 2025 – July 2025\nMileto Foods (Remote) Tools Used: Excel, SQL, Power BI, Statistical Analysis\n•Performed comprehensive quality auditsacross 5+ state operations, validating data integr

# integration vector db context pipeline with LLM output

In [41]:
### Simple RAG pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

### Initialize the Groq LLM (set your GROQ_API_KEY in environment)
groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.3-70b-versatile",  # Currently supported model (replaced llama3-70b-8192)
    temperature=0.1,
    max_tokens=1024
)

## 2. Simple RAG function: retrieve context + generate response
def rag_simple(query, retriever, llm, top_k=3):
    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context:
        return "No relevant context found to answer the question."
    
    ## generate the answer using GROQ LLM
    prompt = f"""Use the following context to answer the question concisely.
Context:
{context}

Question: {query}
Answer:"""
    
    response = llm.invoke([prompt])
    return response.content

In [42]:
answer=rag_simple("What is the best project in all of it ?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'What is the best project in all of it ?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]


Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
The RAG AI Teaching Assistant project stands out, achieving 85-90% query relevance accuracy and sub-3-second response time, making it a notable project.


In [44]:
# --- Enhanced RAG Pipeline Features ---
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("summrise all the files", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'summrise all the files'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
Answer: The context appears to be a resume or a portfolio of a Data Scientist, Parvath Reddy. Here's a concise summary:

**Summary:**
Parvath Reddy is a Data Scientist with skills in machine learning, deep learning, and AI development. He has experience in building production-ready ML systems and financial risk models.

**Key Experience:**

* Developed a credit risk prediction model using Python, Scikit-learn, and Extra Trees Classifier, achieving 78.5% accuracy.
* Built a production-ready Streamlit application with real-time prediction capabilities and interactive Plotly visualizations.
* Optimized credit evaluation time by 60% through a mechanized risk scoring system.

**Skills:**

* Programming: Python, SQL, R, JavaScript
* ML/DL & AI: Scikit-learn, XGBoost, TensorFlow, PyTorch, OpenAI API, RAG, NLP
* Data Science: Pandas, NumPy, Matplotlib, Seaborn, Plotly, Jupyter, Streamlit

**Certifications:**

* O