Imports

In [1]:
import os
import re
import chromadb
import uuid

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

Configuration

In [2]:
load_dotenv()
# Configure API key for Google Generative AI
genai_api_key = os.getenv("GOOGLE_API_KEY")
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

Data ingestion

In [3]:
with open('manappuram_content.md','r',encoding='utf-8') as a:
    text = a.read()

text[:500]

"# Manappuram Website Content\n\n## Source: https://www.manappuram.com/\n\nNBFC Loans - Business, Gold, Personal, Vehicle & More | Manappuram Finance Skip to main content Happy being a 'farmer'Play Video Most affordable Gold LoanRead more Business AssociatesEnquire now Vehicle & Equipment FinanceRead More ONLINE GOLD LOANWith our new online gold loan facility, you can avail gold loan 24x7, from anywhere in the world!Get startedGold loan in 3 easy stepsQuick and simpleInstant loan DisbursementMinimum "

Text-splitting


In [4]:
chunks = []

# Initialize text splitter 
splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000, 
    chunk_overlap=500,
    separators = ["\n---\n", "\n\n", "\n", " ", ""]
    )

In [5]:
# Split the text by the separator pattern to get sections
sections = re.split(r'\n---\n', text.strip())

for section in sections:
    section = section.strip()
    if not section:
        continue
    
    # Extract URL from each section
    url_match = re.match(r'## Source: (https?://[^\n]+)', section)
    
    if url_match:
        url = url_match.group(1)
        # Remove the URL line from the content
        content = re.sub(r'## Source: https?://[^\n]+\n*', '', section, count=1)
        content = content.strip()
        
        if content:  # Only process if there's actual content
            # Split the content into smaller chunks
            content_chunks = splitter.split_text(content)
            
            # Add each chunk with its URL metadata
            for chunk in content_chunks:
                chunks.append({
                    'text': chunk,
                    'url': url
                })

print(f"Split into {len(chunks)} chunks with URL metadata")

Split into 1108 chunks with URL metadata


In [6]:
# Extract just the text and metadata for ChromaDB
chunk_texts = [chunk['text'] for chunk in chunks]
chunk_urls = [chunk['url'] for chunk in chunks]

In [7]:
# Create embeddings
print("Creating embeddings...")
vectors = embeddings.embed_documents(chunk_texts)
print(f"Created {len(vectors)} embeddings")

Creating embeddings...
Created 1108 embeddings


In [8]:
# Generate IDs and metadata with URLs
ids = [str(uuid.uuid4()) for _ in range(len(chunks))]
metadata = [{"source": "manappuram_website", "chunk_index": i, "url": chunk_urls[i]} 
           for i in range(len(chunks))]

Vector database-setup


In [9]:
# Save into chromadb
client = chromadb.PersistentClient(path='./chromadb')
collection = client.get_or_create_collection(name="data")

try:
    # Store the embedding in ChromaDB
    collection.add(
        documents=chunk_texts, 
        embeddings=vectors, 
        ids=ids, 
        metadatas=metadata
    )
    print("Vector database created successfully.")
    print(f"Stored {len(chunks)} chunks in ChromaDB with URL metadata")
    
except Exception as e:
    print(f"Error creating Vector database: {e}")

Vector database created successfully.
Stored 1108 chunks in ChromaDB with URL metadata


In [10]:
# Optional: Preview first few chunks with their URLs
print("\nFirst 3 chunks preview:")
for i in range(min(3, len(chunks))):
    print(f"Chunk {i+1} URL: {chunk_urls[i]}")
    print(f"Content preview: {chunk_texts[i][:100]}...")
    print("---")


First 3 chunks preview:
Chunk 1 URL: https://www.manappuram.com/blogs/flexibility-gold-loans
Content preview: The flexibility of gold loans | Manappuram Finance Limited Skip to main content BlogRead our blogs f...
---
Chunk 2 URL: https://www.manappuram.com/blogs/flexibility-gold-loans
Content preview: the quality and quantity of the gold you provide. However, the amount you get will not go above the ...
---
Chunk 3 URL: https://www.manappuram.com/blogs/reasons-why-gold-loan-inexpensive
Content preview: Reasons why a gold loan is inexpensive | Manappuram Finance Limited Skip to main content BlogRead ou...
---


Testing

In [12]:
from langchain_chroma import Chroma
from dotenv import load_dotenv

load_dotenv()
# Configure API key for Google Generative AI
genai_api_key = os.getenv("GOOGLE_API_KEY")

In [13]:
# Initialize ChromaDB
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
vectorstore = Chroma(collection_name="data",embedding_function=embeddings, persist_directory='./chromadb')

In [14]:
results = vectorstore.similarity_search(
    "bitcoin gold"
)
for res in results:
    print(f"document =  {res.page_content} \n metadata = {res.metadata['url']}")

document =  for now, the fact is, investors who put money into gold are often those seeking a safe haven investment, especially at a time of valid fears about currency debasement causing a resurgence of inflation. The US Fed has printed money at a frenetic pace after the pandemic, and the run-up in inflation has already begun. In April this year, US consumer price inflation registered an increase of 4.2% over the year, with equally hefty increases in the months following. Yet, gold price has not gained as conventional wisdom would suggest.The Bitcoin FactorThat brings us to Bitcoin and the outsized gains this cryptocurrency has posted over the last one to one-and-half years. Consider the period from 7 August 2020 onwards. That was the day that gold price had hit a peak of US$ 2,062 per ounce and Bitcoin was trading at US$11,592 per digital coin. Fast forward to recent days (first week of September) when gold trades at about US$ 1,812, a decline of 12% from the peak, while Bitcoin is fe