In [1]:
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [None]:
from langchain_groq import ChatGroq
import os
import hashlib
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatGroq

def initialize_llm():
    llm = ChatGroq(
        temperature=0,
        groq_api_key="API-KEY",
        model_name="deepseek-r1-distill-llama-70b"  #llama-3.3-70b-versatile
    )
    return llm

def hash_file_content(filepath):
    """Generate a hash for the content of a file."""
    with open(filepath, "rb") as file:
        return hashlib.md5(file.read()).hexdigest()

def create_vector_db():
    loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text = text_splitter.split_documents(documents)
    embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vector_db = Chroma.from_documents(text, embeddings, persist_directory='./chroma_db')
    vector_db.persist()

    print("ChromaDB created and data saved.")
    return vector_db

def update_vector_db():
    """Only add new documents to the vector database."""
    # Initialize database and retriever
    embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vector_db = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)
    
    # Load existing metadata
    existing_metadata = {doc.metadata.get("file_hash", "") for doc in vector_db.similarity_search("", k=vector_db._collection.count())}

    # Load new documents
    loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    
    # Filter out already processed files
    new_docs = []
    for doc in documents:
        file_hash = hash_file_content(doc.metadata["source"])
        if file_hash not in existing_metadata:
            doc.metadata["file_hash"] = file_hash  # Add hash to metadata
            new_docs.append(doc)
    
    # Add only new documents to the vector DB
    if new_docs:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        text = text_splitter.split_documents(new_docs)
        vector_db.add_documents(text)
        vector_db.persist()
        print(f"Added {len(new_docs)} new documents to the vector database.")
    else:
        print("No new documents to add.")
    
    return vector_db

def setup_qa_chain(vector_db, llm):
    retriever = vector_db.as_retriever()
    prompt_templates = """You are an expert financial analyst specializing in the Indian Budget. Provide clear, insightful, and data-driven responses to the following question:  
    {context}  
    User: {question}  
    BudgetBot:"""

    PROMPT = PromptTemplate(template=prompt_templates, input_variables=['context', 'question'])
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return qa_chain

def main():
    print("Initializing Chatbot.........")
    llm = initialize_llm()
    db_path = "./chroma_db"

    if not os.path.exists(db_path):
        print("Creating a new vector database...")
        vector_db = create_vector_db()
    else:
        print("Updating the vector database with new documents...")
        vector_db = update_vector_db()
    
    qa_chain = setup_qa_chain(vector_db, llm)

    while True:
        query = input("\nHuman: ")
        if query.lower() == "exit":
            print("Chatbot: Take care of yourself, goodbye!")
            break
        response = qa_chain.run(query)
        print(f"Chatbot: {response}")

if __name__ == "__main__":
    main()


Initializing Chatbot.........
Updating the vector database with new documents...


  embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')





  vector_db = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)


No new documents to add.



Human:  exit


Chatbot: Take care of yourself, goodbye!



Human:  summary of the budget


Chatbot: <think>
Okay, so I need to summarize the budget for 2025-26 based on the information provided. Let me go through the details step by step.

First, the total receipts other than borrowings are estimated at ₹34.96 lakh crore. That's a significant figure. Then, the total expenditure is ₹50.65 lakh crore. So, the government is spending more than it's receiving, which makes sense because there's a fiscal deficit mentioned.

The net tax receipts are ₹28.37 lakh crore. That's a key part of the receipts. Now, the fiscal deficit is 4.4% of GDP. I remember that the previous year's revised estimate was 4.8%, so this is a slight improvement, moving towards the fiscal deficit target.

To finance this deficit, the government plans to borrow ₹11.54 lakh crore through net market borrowings from dated securities. The rest of the financing will come from other sources, but the exact details aren't provided here.

Looking at the objectives of the budget, it's focused on accelerating growth, incl

# weaviate

In [3]:
# !pip install weaviate-client
# !pip install langchain
# !pip install openai

In [None]:
groq_api_key="API-KEY"
WEAVIATE_URL="URL"
WEAVIATE_API_KEY="API-KEY"

In [5]:
loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
data = loader.load()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_chunks = text_splitter.split_documents(data)
embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [7]:
len(embeddings.embed_query("what is you"))

384

In [8]:
# Extract the page_content from the Document objects
texts = [t.page_content for t in text_chunks]

# Step 3: Initialize the HuggingFaceBgeEmbeddings model
embedding_model = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Step 4: Generate embeddings for the text chunks
embeddings = embedding_model.embed_documents(texts)  # Use .embed_documents()

# Debugging: Print the first embedding
# print("First embedding:", embeddings[0])

In [9]:
# print(texts)

In [None]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key="API-KEY"
    )

index_name = "rag" # put in the name of your pinecone index here
# Connect to the existing index
index = pc.Index(index_name)

In [24]:


# Step 6: Prepare vectors for upserting
vectors = [
    {
        "id": f"vec_{i}",  # Replace with a unique ID for each vector
        "values": embedding,  # No need to convert to float; it's already a list of floats
        "metadata": {"text": t.page_content}  # Optional: Add metadata (e.g., the text chunk)
    }
    for i, (t, embedding) in enumerate(zip(text_chunks, embeddings))
]

# Debugging: Print the first vector to verify its structure
# print("First vector:", vectors[0])


In [25]:
# Step 7: Upsert the embeddings into the Pinecone index
index.upsert(vectors=vectors)

print("Embeddings upserted successfully!")

Embeddings upserted successfully!


In [11]:
from langchain.vectorstores import Pinecone
# from langchain.embeddings.openai import OpenAIEmbeddings

import pinecone  

# Initialize Pinecone vector store
docsearch = Pinecone(index, embedding_model.embed_query, "text")

  docsearch = Pinecone(index, embedding_model.embed_query, "text")


In [12]:
# !pip install pinecone-client langchain
# Perform a similarity search
query = "What is the budget allocation for incometax?"
results = docsearch.similarity_search(query, k=5)

# Display results
for result in results:
    print(result.page_content)

19  
 
Budget Estimates 2025-26 
112. Coming to 2025 -26, the total receipts other than  borrowings and the 
total expenditure are estimated at ` 34.96 lakh crore and ` 50.65 lakh crore 
respectively. The net tax receipts are estimated at ` 28.37 lakh crore. 
113. The fiscal deficit is estimated to be 4.4 per cent of GDP. 
114. To finance the fiscal deficit, the net market borrowings from dated 
securities are estimated at ` 11.54 lakh crore. The balance financing is expected
statement.           
Revised Estimates 2024-25 
110. The Revised Estimate of the total receipts other than borrowings is  
` 31.47 lakh crore, of which the net tax receipts are ` 25.57 lakh crore. The 
Revised Estimate of the total expenditure is ` 47.16 lakh crore, of which the 
capital expenditure is about ` 10.18 lakh crore. 
111. The Revised Estimate of the fiscal deficit is 4.8 per cent of GDP.
applicable for income marginally higher than ` 12,00,000.  
• A few examples for calculation of tax benefit are giv