In [1]:
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [None]:
from langchain_groq import ChatGroq
import os
import hashlib
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatGroq

def initialize_llm():
    llm = ChatGroq(
        temperature=0,
        groq_api_key="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",
        model_name="deepseek-r1-distill-llama-70b"  #llama-3.3-70b-versatile
    )
    return llm

def hash_file_content(filepath):
    """Generate a hash for the content of a file."""
    with open(filepath, "rb") as file:
        return hashlib.md5(file.read()).hexdigest()

def create_vector_db():
    loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text = text_splitter.split_documents(documents)
    embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vector_db = Chroma.from_documents(text, embeddings, persist_directory='./chroma_db')
    vector_db.persist()

    print("ChromaDB created and data saved.")
    return vector_db

def update_vector_db():
    """Only add new documents to the vector database."""
    # Initialize database and retriever
    embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    vector_db = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)
    
    # Load existing metadata
    existing_metadata = {doc.metadata.get("file_hash", "") for doc in vector_db.similarity_search("", k=vector_db._collection.count())}

    # Load new documents
    loader = DirectoryLoader("data/", glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    
    # Filter out already processed files
    new_docs = []
    for doc in documents:
        file_hash = hash_file_content(doc.metadata["source"])
        if file_hash not in existing_metadata:
            doc.metadata["file_hash"] = file_hash  # Add hash to metadata
            new_docs.append(doc)
    
    # Add only new documents to the vector DB
    if new_docs:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        text = text_splitter.split_documents(new_docs)
        vector_db.add_documents(text)
        vector_db.persist()
        print(f"Added {len(new_docs)} new documents to the vector database.")
    else:
        print("No new documents to add.")
    
    return vector_db

def setup_qa_chain(vector_db, llm):
    retriever = vector_db.as_retriever()
    prompt_templates = """You are an expert financial analyst specializing in the Indian Budget. Provide clear, insightful, and data-driven responses to the following question:  
    {context}  
    User: {question}  
    BudgetBot:"""

    PROMPT = PromptTemplate(template=prompt_templates, input_variables=['context', 'question'])
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return qa_chain

def main():
    print("Initializing Chatbot.........")
    llm = initialize_llm()
    db_path = "./chroma_db"

    if not os.path.exists(db_path):
        print("Creating a new vector database...")
        vector_db = create_vector_db()
    else:
        print("Updating the vector database with new documents...")
        vector_db = update_vector_db()
    
    qa_chain = setup_qa_chain(vector_db, llm)

    while True:
        query = input("\nHuman: ")
        if query.lower() == "exit":
            print("Chatbot: Take care of yourself, goodbye!")
            break
        response = qa_chain.run(query)
        print(f"Chatbot: {response}")

if __name__ == "__main__":
    main()


Initializing Chatbot.........
Creating a new vector database...


  embeddings = HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')





  vector_db.persist()


ChromaDB created and data saved.



Human:  hello 


  response = qa_chain.run(query)


Chatbot: <think>
Okay, so I'm trying to understand the Indian Budget based on the information provided. Let me break it down step by step.

First, the budget seems to have a theme that's built around four pillars: Exports, Reforms (referred to as "the fuel"), Inclusivity ("our guiding spirit"), and the destination being a Developed India or "Viksit Bharat." The vision is to achieve greater prosperity and a stronger global position under Prime Minister Modi's leadership.

The document mentions that as we're in the first quarter of the 21st century, there are geopolitical issues that might slow down global economic growth in the medium term. However, the aspiration for a developed India and the work done in the government's first two terms are guiding them to move forward resolutely.

Looking at the specific points, there are some amendments to the Customs Act, 1962. Section 127C is getting a new sub-section (12) after sub-section (11), making the sub-sections of 127C applicable to the I


Human:  summary of the budget


Chatbot: <think>
Okay, so I need to summarize the budget for 2025-26 based on the information provided. Let me go through the details step by step.

First, the total receipts other than borrowings are estimated at ₹34.96 lakh crore. That's a significant figure. Then, the total expenditure is ₹50.65 lakh crore. So, the government is spending more than it's receiving, which makes sense because there's a fiscal deficit mentioned.

The net tax receipts are ₹28.37 lakh crore. That's a key part of the receipts. Now, the fiscal deficit is 4.4% of GDP. I remember that the previous year's revised estimate was 4.8%, so this is a slight improvement, moving towards the fiscal deficit target.

To finance this deficit, the government plans to borrow ₹11.54 lakh crore through net market borrowings from dated securities. The rest of the financing will come from other sources, but the exact details aren't provided here.

Looking at the objectives of the budget, it's focused on accelerating growth, incl