In [None]:
import os
import torch
from torch import bfloat16
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    GenerationConfig,
    BitsAndBytesConfig,
)
from langchain_groq import ChatGroq
from langchain.document_loaders import TextLoader
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from rich import print as rprint
from rich.panel import Panel
from tqdm import tqdm
import warnings
import re

In [None]:
# Select the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

In [None]:
# Read the source text file and cut it into overlapping chunks for retrieval.
source_path = "/Pytorch_Codes/BD Police.txt"
loader = TextLoader(source_path, encoding="utf8")

# chunk_size=300 with chunk_overlap=50: neighbouring chunks share some text so
# content at a chunk boundary is not lost entirely.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)

# `texts` ends up holding the split chunks (the unsplit documents are only an
# intermediate value here).
texts = character_splitter.split_documents(loader.load())

print(f"Number of chunks: {len(texts)}")
print("Document created successfully!")

In [None]:
# Keep only the text payload of each chunk; this list is what gets embedded.
documents = list(map(lambda chunk: chunk.page_content, texts))
len(documents)

In [None]:
# Build the embedding model on the selected device and index the chunk texts
# in a Chroma vector store persisted under "chroma_db".
embed_model_id = "BAAI/bge-m3"
model_kwargs = {"device": device}

embeddings = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=model_kwargs,
)
print(f"Embedding Model: {embed_model_id} has been loaded!")

# NOTE(review): with a persist_directory that already exists, re-running this
# cell presumably appends the same chunks again — confirm and clear "chroma_db"
# between runs if duplicates are not wanted.
db = Chroma.from_texts(
    texts=documents,
    embedding=embeddings,
    persist_directory="chroma_db",
)
print("Chroma database updated successfully!")

In [None]:
# Prompt text sent as a single human message. It has two placeholders:
# {context} (the retrieved chunks) and {question} (the user's query).
# NOTE(review): the "[INST]" / "<>" markers look like instruction-format tags
# and the bare "<>" line may be a truncated tag — confirm the intended prompt.
template = """[INST]
<>
You are a helpful Bangla AI assistant.

Use the following pieces of 'context' to answer the user's questions. Only Respond in Bangla.

Context:
    {context}

Question: {question}[/INST]
Helpful Answer (in Bangla):
"""

# The f-string placeholders ("context", "question") are picked up from the
# template itself, so they do not have to be listed by hand.
human_message = HumanMessagePromptTemplate.from_template(template)
prompt_template = ChatPromptTemplate.from_messages([human_message])

In [None]:
def format_docs(docs):
    """Join the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in order, separated by a blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

In [None]:

# Read the Groq API key from the environment rather than embedding it in the
# notebook, so the key never ends up in a saved or shared copy of this file.
api_key = os.environ.get("GROQ_API_KEY", "")  # Groq API Key
if not api_key:
    # Fail here with a clear message instead of later, at the first request.
    raise ValueError(
        "GROQ_API_KEY is not set. Export it in the environment before running this cell."
    )

# Chat model used as the generator in the RAG chain.
groq_chat = ChatGroq(
    groq_api_key=api_key,
    model_name='llama-3.1-8b-instant',  # OR 'gemma2-9b-it'
)

In [None]:
# Step 1: replace the retrieved documents under "context" with one formatted
# string, then run prompt -> chat model -> plain-string output.
stringify_context = RunnablePassthrough.assign(
    context=lambda inputs: format_docs(inputs["context"])
)
rag_chain_from_docs = (
    stringify_context | prompt_template | groq_chat | StrOutputParser()
)

# Step 2: retriever over the Chroma store, configured with search_type "mmr",
# k=4 and fetch_k=10.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 10},
)

print("Retreiver initialized successfully!")

# Step 3: feed the raw query to the retriever ("context") and pass it through
# unchanged ("question"); the generated text is then attached under "answer".
retrieval_inputs = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)
rag_chain_with_source = retrieval_inputs.assign(answer=rag_chain_from_docs)

chain = rag_chain_with_source
print("RAG chain created successfully!")

## Evaluation

In [None]:
import pandas as pd

# Evaluation spreadsheet; the evaluation loop below reads its 'Question' and
# 'Answer' columns.
evalset_path = "/kaggle/input/finaldataset/FinalRegNLP.xlsx"
evalset = pd.read_excel(evalset_path)
evalset.head()

In [None]:
!pip install pandas openpyxl

In [None]:
# Run every evaluation question through the RAG chain and collect the generated
# answer alongside the reference answer from the spreadsheet.
questions = []
response_answers = []
actual_answers = []

# Iterate over the column values directly rather than label-indexing with a
# counter, so the loop does not depend on the DataFrame having a 0..n-1 index.
for user_query, actual_answer in zip(evalset['Question'], evalset['Answer']):
    response = chain.invoke(user_query)

    # The chain output is a dict with "context", "question" and "answer";
    # the generated text lives under "answer" (there is no "result" key).
    answer = response["answer"]

    questions.append(user_query)
    response_answers.append(answer)
    actual_answers.append(actual_answer)

print("Completed!")

In [None]:
# Sanity check: the three result lists should all have the same length.
for label, items in (
    ("Number of questions:", questions),
    ("Number of generated responses:", response_answers),
    ("Number of actual answers:", actual_answers),
):
    print(label, len(items))

In [None]:
# Write questions, generated responses and reference answers side by side to
# an Excel file, but only when the three lists line up one-to-one.
list_lengths = {len(questions), len(response_answers), len(actual_answers)}

if len(list_lengths) != 1:
    print("Error: Lengths of the lists do not match. Please check the data.")
else:
    result_columns = {
        'Question': questions,
        'Generated Response': response_answers,
        'Answer': actual_answers,
    }
    new_df = pd.DataFrame(result_columns)

    new_df.to_excel('LegalRAG_v1.xlsx', index=False, engine='openpyxl')
    print("Results saved successfully!")