In [1]:
# Banner line marking the start of the notebook run.
print('villam RAG chatbot setup')

villam RAG chatbot setup


In [2]:
import os 
from dotenv import load_dotenv  # to load API keys from .env file 

# Load the .env file content (GOOGLE_API_KEY, PINECONE_API_KEY).
load_dotenv()

# Report only whether each key is present -- never print the secret values.
# The inner double quotes keep these f-strings valid on Python < 3.12, where
# reusing the outer quote character inside a replacement field is a SyntaxError.
print(f'GOOGLE_API_KEY Loaded: {bool(os.getenv("GOOGLE_API_KEY"))}')
print(f'PINECONE_API_KEY loaded: {bool(os.getenv("PINECONE_API_KEY"))}')

GOOGLE_API_KEY Loaded: True
PINECONE_API_KEY loaded: True


In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the source PDF, relative to the notebook's working directory.
pdf_path = 'Villam_Hub.pdf'
pdf_loader = PyPDFLoader(pdf_path)

# Read the PDF into a list of documents.
# NOTE(review): presumably one entry per page -- the print below reports
# len(documents) as the page count; confirm against the loader's docs.
documents = pdf_loader.load()

# Sanity check: report how many entries were loaded.
print(f'Loaded {len(documents)} pages from {pdf_path}')

Ignoring wrong pointing object 7 0 (offset 0)


Loaded 4 pages from Villam_Hub.pdf


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Function to split the loaded documents into chunks
def split_docs(documents, chunk_size=1500, chunk_overlap=100):
    """
    Break `documents` into smaller chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: The documents to split, passed to `split_documents` as-is.
        chunk_size: Forwarded to the splitter as its `chunk_size`.
        chunk_overlap: Forwarded to the splitter as its `chunk_overlap`.

    Returns:
        The result of the splitter's `split_documents(documents)` call.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Split the loaded documents into chunks (the values passed here equal the function defaults).
docs = split_docs(documents,chunk_size=1500, chunk_overlap=100)

# Sanity check: report how many chunks were produced.
print(f'split into {len(docs)} chunks')

split into 7 chunks


In [33]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Initialize the Google embedding model (expected to produce 768-dimensional vectors).
# NOTE(review): presumably authenticates with the GOOGLE_API_KEY loaded earlier -- confirm.
embedding_model = GoogleGenerativeAIEmbeddings(model= 'models/embedding-001')

# Smoke test: embed one sample sentence and check the size of the returned vector.
test_text = "Villam Hub empowers urban farmers."
embedding = embedding_model.embed_query(test_text)

print(f"Length of vector: {len(embedding)}")

Length of vector: 768


In [35]:
from tqdm import tqdm
import concurrent.futures
import time   #used to pause between retries if embedding fails

# Function to retry embedding in case of temporary errors
def embed_batch_with_retry(embed_model, batch_contents, max_attempts=3, retry_delay=10):
    """
    Embed one batch of text chunks, retrying when the embedding call raises.

    Args:
        embed_model: Object exposing `embed_documents(list_of_texts)`.
        batch_contents: The texts to embed; passed to `embed_documents` unchanged.
        max_attempts: Total number of tries before giving up. Must be at least 1.
        retry_delay: Seconds to sleep between a failed attempt and the next one.
            Defaults to 10, the pause that was previously hard-coded.

    Returns:
        Whatever `embed_model.embed_documents(batch_contents)` returns on the
        first attempt that succeeds.

    Raises:
        ValueError: If `max_attempts` is less than 1.
        Exception: The error raised by the final attempt is re-raised unchanged
            once every attempt has failed.
    """
    if max_attempts < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of embeddings.
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    for attempt in range(1, max_attempts + 1):
        try:
            # Try to embed the batch and return the result.
            return embed_model.embed_documents(batch_contents)
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_attempts:
                # Out of attempts: propagate the original error to the caller.
                raise
            # Wait before retrying (in case of rate limits).
            time.sleep(retry_delay)
                
# In short: the function sends a batch of text chunks to be embedded and, if the call fails, waits and tries again -- up to 3 attempts in total by default.

In [39]:
# Function to embed all document chunks using concurrent threads for speed
def concurrent_embed_documents(embed_model, documents, batch_size=50, max_workers=4):
    """
    Embed `documents` batch by batch on a thread pool.

    The documents are sliced into consecutive groups of `batch_size`; the
    `page_content` texts of each group are handed to `embed_batch_with_retry`
    on a worker thread, and the results are gathered in submission order.
    A batch whose task raises is reported with a printed message and left out
    of both returned lists.

    Returns:
        A pair of lists: the embeddings returned for the successful batches,
        and the texts of those same batches.
    """
    vectors = []
    texts = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit every batch before waiting on any result.
        pending = []
        for start in range(0, len(documents), batch_size):
            chunk_texts = [doc.page_content for doc in documents[start:start + batch_size]]
            task = pool.submit(embed_batch_with_retry, embed_model, chunk_texts)
            pending.append((task, chunk_texts))

        # Collect results in submission order; the progress bar advances once per batch.
        for task, chunk_texts in tqdm(pending, total=len(pending), desc="Embedding batches"):
            try:
                batch_vectors = task.result()   # Blocks until this batch is finished
                vectors.extend(batch_vectors)
                texts.extend(chunk_texts)
            except Exception as e:
                print(f"Batch failed: {e}")

    return vectors, texts

# The function breaks `documents` into batches (50 chunks each by default) and embeds them in parallel for speed.
# Each batch is retried on failure; the function returns all the vectors together with their matching texts.

In [41]:
# Usage: embed every chunk in `docs` with the default batch size and worker count.
# NOTE(review): concurrent_embed_documents skips batches that fail, so the count
# printed below can be smaller than len(docs).
print("Generating embeddings for all document chunks...")
all_embeddings, all_batch_content = concurrent_embed_documents(embedding_model, docs)
print(f" Generated {len(all_embeddings)} embeddings.")


Generating embeddings for all document chunks...


Embedding batches: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.45s/it]

 Generated 7 embeddings.



