In [1]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from langchain.chains import ConversationChain
from typing import Optional, List, Any
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
import requests
import re
from bs4 import BeautifulSoup
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer



In [2]:
embedder = SentenceTransformer("bhavyagiri/InLegal-Sbert")

In [3]:
def load_index_and_meta(index_path, meta_path):
    index = faiss.read_index(index_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    return index, meta

def search_faiss(query, index, meta, top_k=2):
    query_vec = embedder.encode([query])
    query_vec = np.array(query_vec).astype("float32")
    D, I = index.search(query_vec, top_k)
    results = []
    for idx in I[0]:
        if idx < len(meta):
            results.append(meta[idx])
    return results
class ollama(LLM):
    model_name: str = "mistral"

    @property
    def _llm_type(self) -> str:
        return "ollama"
        
    def _call(self,prompt: str, stop: Optional[List[str]] = None, **kwargs: Any):
        response = requests.post(
            "http://127.0.0.1:11434/api/generate",
            json={"model": self.model_name, "prompt": prompt, "stream": False}
        )
        return response.json()["response"]

def get_top_judgment_text(query):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    url = f"https://indiankanoon.org/search/?formInput={query}&type=judgement"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="result_title")
    for result in results:
        link_tag = result.find('a')
        if not link_tag:
            continue
        link = link_tag['href']
        full_link = "https://indiankanoon.org" + link
        judgment_page = requests.get(full_link, headers=headers)
        judgment_soup = BeautifulSoup(judgment_page.text, 'html.parser')
        judgment_div = judgment_soup.find("div", {"id": "judgment"})
        if judgment_div:
            judgment_text = judgment_div.get_text(separator="\n", strip=True)
        else:
            judgment_text = judgment_soup.get_text(separator="\n", strip=True)
        return judgment_text
    return None
def summarize_long_text(text, tokenizer, model, chunk_size=1024, overlap=100):
    # Split into words for rough chunking
    words = text.split()
    summaries = []

    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        inputs = tokenizer(chunk, return_tensors="pt", max_length=chunk_size, truncation=True)
        summary_ids = model.generate(
            **inputs,
            max_length=200,
            num_beams=4,
            early_stopping=True
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Optionally combine and summarize again for final condensation
    combined_summary = " ".join(summaries)
    final_inputs = tokenizer(combined_summary, return_tensors="pt", truncation=True)
    final_summary_ids = model.generate(
        **final_inputs,
        max_length=300,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(final_summary_ids[0], skip_special_tokens=True)



In [4]:
ipc_index, ipc_meta = load_index_and_meta("ipc.index", "ipc.json")

model_dir = "./legal_summarizer_model"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [5]:
custom=PromptTemplate(input_variables=["history","input"],template=""" You are an legal Chatbot assistant specializing in India Penal Code.
                      
                      Previous conversations:{history}
                      
                      Current question and also similar judgements :{input}
                      
                      Read user query carefully and Speak to them like an chatbot.
                      Utilize judgements given and also previous conversations for your knowledge.
                      DO NOT REPEAT ABOUT THE SAME JUDGEMENT FOR EVERY REPLY 
                      BE CONCISE BUT ALSO SPEAK WELL.
                      DO NOT HALLUCINATE WITH YOUR LEGAL KNOWLEDGE.
                      CONSIDER YOURSELF WELL TRAINED LEGAL CHATBOT.""")

In [6]:
llm=ollama()
memory=ConversationBufferMemory()
conversion = ConversationChain(llm=llm,memory=memory,prompt=custom,verbose=False)

  memory=ConversationBufferMemory()
  conversion = ConversationChain(llm=llm,memory=memory,prompt=custom,verbose=False)


In [7]:
print("‚öñÔ∏è IPC + Judgement Chatbot (type 'exit' to quit)\n")

user_input = input("You: ").strip()


ipc_results = search_faiss(user_input, ipc_index, ipc_meta)
context = f"User query : {user_input} \n "
for item in ipc_results:
    context+=f"\nIPC \n {item['section_no']} \n Description:{item['Description']} \n Punishments : {item['punishment_raw']} \n Judgement Summary: \n"
    if 'section_no' in item and ':-' in item['section_no']:
        item['section_no'] = item['section_no'].split(':-', 1)[1].strip()
        context += summarize_long_text(get_top_judgment_text(item["section_no"]), tokenizer, model)


print("Bot:",conversion.predict(input=context))
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        print("Bot: Goodbye!")
        break

    answer = conversion.predict(input=user_input)

    print(f"\nBot: {answer}\n")





‚öñÔ∏è IPC + Judgement Chatbot (type 'exit' to quit)

Bot:  In India, the Indian Penal Code (IPC) Section 80 discusses accidents in doing a lawful act. If you've accidentally caused harm, such as driving under the influence and causing an accident, without any criminal intention or knowledge, it may be considered excusable if proper care and caution were taken. However, this does not mean you are completely exempt from legal consequences.

IPC Section 159 defines affray as two or more persons fighting in a public place that disturbs the peace. This could potentially apply if multiple people are involved in an altercation due to your actions.

Remember, every case is unique and the outcome depends on various factors such as evidence, intent, and specific circumstances. It's crucial to consult with a legal professional for advice tailored to your situation. I'm here to help guide you, but my responses should not be taken as legal advice.
Bot: Goodbye!


In [8]:
print(context)

User query : i killed someone while drunk driving 
 
IPC 
 Section80:- Accident in doing a lawful act 
 Description:Nothing is an offence which is done by accident or misfortune, and without any criminal intention or knowledge in the doing of a lawful act in a lawful manner by lawful means and with proper care and caution. Illustrations A is at work with a hatchet; the head flies off and kills a man who is standing by. Here, if there was no want of proper caution on the part of A, his act is excusable and not an offence. 
 Punishments :  
 Judgement Summary: 
The first respondent B purchased a Touzi in 24 Parganas Collectorate at a revenue sale held on 9th January 1942. As such purchaser he acquired under section 37 of the Bengal Revenue Sales Act 1859 the right "to avoid and annul all under tenures and forthwith to eject all under tenants" with certain exceptions which are not material here. In exercise of that right he gave notices of ejectment and brought a suit in 1946 to evict cer

In [10]:
summary = tokenizer(get_top_judgment_text(item["section_no"]), return_tensors="pt", max_length=1024, truncation=True)

ConnectionError: HTTPSConnectionPool(host='indiankanoon.org', port=443): Max retries exceeded with url: /search/?formInput=Affray&type=judgement (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001BE422EFCA0>: Failed to resolve 'indiankanoon.org' ([Errno 11001] getaddrinfo failed)"))

In [11]:
summary

NameError: name 'summary' is not defined

In [13]:
summary_ids = model.generate(
    **summary,
    max_length=200,   
    num_beams=4,
    early_stopping=True
)

In [14]:
tokenizer.decode(summary_ids[0], skip_special_tokens=True)

'The first respondent B purchased a Touzi in 24 Parganas Collectorate at a revenue sale held on 9th January 1942. As such purchaser he acquired under section 37 of the Bengal Revenue Sales Act 1859 the right "to avoid and annul all under tenures and forthwith to eject all under tenants" with certain exceptions which are not material here. In exercise of that right he gave notices of ejectment and brought a suit in 1946 to evict certain under tenants including the second respondent herein and to recover possession of the lands. The suit was decreed against the second respondent who preferred an appeal to the District Judge 24 Parganas contending that his under tenure came within one of the exceptions referred to in section 37. When the appeal was pending the Bill which was later passed as the West Bengal Revenue Sales (West Bengal Amendment) Act 1950 was introduced in the West Bengal'

In [11]:
get_top_judgment_text(item["section_no"])

'Arun Singh vs State Of U.P. on 10 February, 2020\nShare Link\nMobile View\nFree features\nPremium features\nCase removal\nView Complete document\nArun Singh vs State Of U.P. on 10 February, 2020\nShowing the contexts in which\nCohabitation caused by a man deceitfully inducing a belief of lawful marriage\nappears in the document\nChange context size\nCurrent\n‚Äú\nCohabit\na\ntion\ncaused\nby\na\nman\ndeceit\nfully\ninducing\na\nbelief\nof\nlaw\nful\nmarriage\n. - Every\nman\nwho by\ndeceit\ncauses\na\nny\n         wo\nman\nwho is not\nlaw\nfully m\na\nrried to him to believe th\na\nt she is\nlaw\nfully m\na\nrried to him\na\nnd to\ncohabit\nor h\na\nve sexu\na\nl\n         intercourse with him in th\na\nt\nbelief\n, sh\na\nll be punished with\n         imprisonment of either description for\na\nterm which m\nay\nextend\n         to ten ye\na\nrs,\na\nnd sh\na\nll\na\nlso be li\na\nble to fine.‚Äù\n18.\nA\npl\na\nin re\na\nding of the Section goes to show th\na\nt in order to constitut

In [15]:
get_top_judgment_text(item["section_no"])

'Arun Singh vs State Of U.P. on 10 February, 2020\nShare Link\nMobile View\nFree features\nPremium features\nCase removal\nView Complete document\nArun Singh vs State Of U.P. on 10 February, 2020\nShowing the contexts in which\nCohabitation caused by a man deceitfully inducing a belief of lawful marriage\nappears in the document\nChange context size\nCurrent\n‚Äú\nCohabit\na\ntion\ncaused\nby\na\nman\ndeceit\nfully\ninducing\na\nbelief\nof\nlaw\nful\nmarriage\n. - Every\nman\nwho by\ndeceit\ncauses\na\nny\n         wo\nman\nwho is not\nlaw\nfully m\na\nrried to him to believe th\na\nt she is\nlaw\nfully m\na\nrried to him\na\nnd to\ncohabit\nor h\na\nve sexu\na\nl\n         intercourse with him in th\na\nt\nbelief\n, sh\na\nll be punished with\n         imprisonment of either description for\na\nterm which m\nay\nextend\n         to ten ye\na\nrs,\na\nnd sh\na\nll\na\nlso be li\na\nble to fine.‚Äù\n18.\nA\npl\na\nin re\na\nding of the Section goes to show th\na\nt in order to constitut

In [None]:
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        print("Bot: Goodbye!")
        break

    

    answer = conversion.predict(input=user_input)

    print(f"\nBot: {answer}\n")

Bot: Goodbye!


In [24]:
# Test queries for evaluation
test_queries = [
    "What is the punishment for murder?",
    "Explain Section 302 IPC",
    "What are the differences between theft and robbery?",
    "Punishment for dowry death",
    "What is culpable homicide?",
    "Explain Section 420 IPC about cheating",
    "What is the punishment for kidnapping?",
    "Define Section 376 IPC",
    "What are the provisions for defamation?",
    "Explain attempt to murder under IPC"
]

# Use the SAME model you're already using (no need to load a new one)
eval_model = embedder  # Reuse your existing InLegal-Sbert model

print(f"‚úÖ Ready with {len(test_queries)} test queries")
print(f"‚úÖ Using evaluation model: bhavyagiri/InLegal-Sbert (your existing embedder)")

‚úÖ Ready with 10 test queries
‚úÖ Using evaluation model: bhavyagiri/InLegal-Sbert (your existing embedder)


In [25]:
# EVALUATION 1: RETRIEVAL CONSISTENCY
print("\n" + "="*60)
print("RETRIEVAL CONSISTENCY")
print("="*60)

test_query = "What is the punishment for murder?"
num_runs = 5

ipc_retrieved = []
judg_retrieved = []

for run in range(num_runs):
    ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
    judg_results = search_faiss(test_query, judg_index, judg_meta, top_k=2)
    
    ipc_ids = tuple([r['section_no'] for r in ipc_results])
    judg_ids = tuple([r['id'] for r in judg_results])
    
    ipc_retrieved.append(ipc_ids)
    judg_retrieved.append(judg_ids)

ipc_consistent = len(set(ipc_retrieved)) == 1
judg_consistent = len(set(judg_retrieved)) == 1

print(f"\nQuery: '{test_query}'")
print(f"üìä IPC Retrieval Consistent: {'‚úÖ YES' if ipc_consistent else '‚ùå NO'}")
print(f"üìä Judgement Retrieval Consistent: {'‚úÖ YES' if judg_consistent else '‚ùå NO'}")
print(f"   Retrieved: {ipc_retrieved[0]}")


RETRIEVAL CONSISTENCY

Query: 'What is the punishment for murder?'
üìä IPC Retrieval Consistent: ‚úÖ YES
üìä Judgement Retrieval Consistent: ‚úÖ YES
   Retrieved: ('Section302:- Punishment for murder', 'Section507:- Criminal intimidation by an anonymous communication')


In [26]:
# EVALUATION 2: ANSWER CONSISTENCY
print("\n" + "="*60)
print("ANSWER CONSISTENCY")
print("="*60)

test_query = "What is the punishment for murder?"
num_samples = 3

answers = []
for i in range(num_samples):
    ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    prompt = f"User Query: {test_query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    answers.append(answer)

embeddings = eval_model.encode(answers, convert_to_tensor=True)
similarities = []

for i in range(len(answers)):
    for j in range(i+1, len(answers)):
        sim = util.cos_sim(embeddings[i], embeddings[j]).item()
        similarities.append(sim)

consistency_score = np.mean(similarities)

print(f"\nüìä Consistency Score: {consistency_score:.3f}")
print(f"   Min Similarity: {np.min(similarities):.3f}")
print(f"   Max Similarity: {np.max(similarities):.3f}")

if consistency_score > 0.8:
    print(f"   ‚úÖ HIGH consistency")
elif consistency_score > 0.6:
    print(f"   ‚ö†Ô∏è MODERATE consistency")
else:
    print(f"   ‚ùå LOW consistency")


ANSWER CONSISTENCY

üìä Consistency Score: 0.970
   Min Similarity: 0.964
   Max Similarity: 0.976
   ‚úÖ HIGH consistency


In [27]:
# EVALUATION 3: CONTEXT RELEVANCE
print("\n" + "="*60)
print("CONTEXT RELEVANCE")
print("="*60)

relevance_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    query_emb = eval_model.encode(query, convert_to_tensor=True)
    context_emb = eval_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(query_emb, context_emb).item()
    
    relevance_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Context Relevance: {np.mean(relevance_scores):.3f}")
print(f"   Range: [{np.min(relevance_scores):.3f}, {np.max(relevance_scores):.3f}]")


CONTEXT RELEVANCE
Query: What is the punishment for murder?... ‚Üí Score: 0.659
Query: Explain Section 302 IPC... ‚Üí Score: 0.673
Query: What are the differences between theft and robbery... ‚Üí Score: 0.743
Query: Punishment for dowry death... ‚Üí Score: 0.698
Query: What is culpable homicide?... ‚Üí Score: 0.680

üìä Mean Context Relevance: 0.691
   Range: [0.659, 0.743]


In [28]:
# EVALUATION 4: ANSWER RELEVANCE
print("\n" + "="*60)
print("ANSWER RELEVANCE")
print("="*60)

relevance_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    
    query_emb = eval_model.encode(query, convert_to_tensor=True)
    answer_emb = eval_model.encode(answer, convert_to_tensor=True)
    similarity = util.cos_sim(query_emb, answer_emb).item()
    
    relevance_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Answer Relevance: {np.mean(relevance_scores):.3f}")
print(f"   Range: [{np.min(relevance_scores):.3f}, {np.max(relevance_scores):.3f}]")


ANSWER RELEVANCE
Query: What is the punishment for murder?... ‚Üí Score: 0.671
Query: Explain Section 302 IPC... ‚Üí Score: 0.619
Query: What are the differences between theft and robbery... ‚Üí Score: 0.753
Query: Punishment for dowry death... ‚Üí Score: 0.703
Query: What is culpable homicide?... ‚Üí Score: 0.693

üìä Mean Answer Relevance: 0.688
   Range: [0.619, 0.753]


In [29]:
# EVALUATION 5: FAITHFULNESS
print("\n" + "="*60)
print("FAITHFULNESS (Answer grounded in Context)")
print("="*60)

faithfulness_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    judg_results = search_faiss(query, judg_index, judg_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    context += "\nRelevant Judgements:\n"
    for case in judg_results:
        context += f"- Case facts: {case['case_facts']}\n"
    
    prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    
    answer_emb = eval_model.encode(answer, convert_to_tensor=True)
    context_emb = eval_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(answer_emb, context_emb).item()
    
    faithfulness_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Faithfulness: {np.mean(faithfulness_scores):.3f}")

if np.mean(faithfulness_scores) > 0.7:
    print(f"   ‚úÖ Answers well-grounded in context")
else:
    print(f"   ‚ö†Ô∏è Possible hallucination detected")


FAITHFULNESS (Answer grounded in Context)
Query: What is the punishment for murder?... ‚Üí Score: 0.932
Query: Explain Section 302 IPC... ‚Üí Score: 0.926
Query: What are the differences between theft and robbery... ‚Üí Score: 0.945
Query: Punishment for dowry death... ‚Üí Score: 0.943
Query: What is culpable homicide?... ‚Üí Score: 0.956

üìä Mean Faithfulness: 0.940
   ‚úÖ Answers well-grounded in context


In [30]:
# EVALUATION 6: BERTSCORE
print("\n" + "="*60)
print("BERTSCORE")
print("="*60)

try:
    from bert_score import score as bert_score
    
    generated_answers = []
    contexts = []
    
    for query in test_queries[:5]:
        ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
        
        context = "Relevant IPC Sections:\n"
        for sec in ipc_results:
            context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
        
        prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
        answer = llm._call(prompt)
        
        generated_answers.append(answer)
        contexts.append(context)
    
    P, R, F1 = bert_score(generated_answers, contexts, lang='en', verbose=False)
    
    print(f"\nüìä BERTScore (Answer vs Context):")
    print(f"   Precision: {P.mean().item():.3f}")
    print(f"   Recall: {R.mean().item():.3f}")
    print(f"   F1: {F1.mean().item():.3f}")
    
    if F1.mean().item() > 0.7:
        print(f"   ‚úÖ GOOD semantic similarity")
    else:
        print(f"   ‚ö†Ô∏è MODERATE semantic similarity")
        
except ImportError:
    print("‚ùå Install: pip install bert-score")


BERTSCORE


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìä BERTScore (Answer vs Context):
   Precision: 0.834
   Recall: 0.852
   F1: 0.843
   ‚úÖ GOOD semantic similarity


In [46]:
# EVALUATION 7: LLM-AS-JUDGE (Using Google Gemini)
print("\n" + "="*60)
print("LLM-AS-JUDGE (Google Gemini)")
print("="*60)

import google.generativeai as genai

# Configure Gemini API (replace with your API key)
GEMINI_API_KEY = "AIzaSyDbxRX-ytJxp6yDmw2rQr8IHUNWnO4nB7w"  # Get from https://makersuite.google.com/app/apikey
genai.configure(api_key=GEMINI_API_KEY)

test_query = "What is the punishment for murder?"

ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
context = "Relevant IPC Sections:\n"
for sec in ipc_results:
    context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"

prompt = f"User Query: {test_query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
answer = llm._call(prompt)

judge_prompt = f"""You are an expert legal evaluator. Rate this Indian legal chatbot response.

Query: {test_query}
Context: {context}
Answer: {answer}

Rate 1-5 for: relevance, faithfulness, legal_accuracy, completeness, clarity

Respond ONLY in JSON format (no markdown, no other text):
{{"relevance": 4, "faithfulness": 5, "legal_accuracy": 4, "completeness": 3, "clarity": 5}}
"""

try:
    # Call Gemini
    model = genai.GenerativeModel('gemini-2.0-flash-lite')
    judge_response = model.generate_content(judge_prompt)
    judge_text = judge_response.text
    
    print(f"\nüí¨ Answer: {answer[:150]}...")
    print(f"\n‚öñÔ∏è Judge Response: {judge_text}")
    
    # Parse JSON
    json_match = re.search(r'\{.*\}', judge_text, re.DOTALL)
    if json_match:
        scores = json.loads(json_match.group())
        print(f"\nüìä Scores:")
        for key, value in scores.items():
            print(f"   {key}: {value}/5")
        avg_score = np.mean(list(scores.values()))
        print(f"\n   Average: {avg_score:.2f}/5 {'‚úÖ' if avg_score >= 3.5 else '‚ö†Ô∏è'}")
    else:
        print("   ‚ö†Ô∏è Could not parse JSON from response")
        
except Exception as e:
    print(f"‚ùå Gemini API Error: {e}")
    print("   Make sure you've set GEMINI_API_KEY and installed: pip install google-generativeai")


LLM-AS-JUDGE (Google Gemini)

üí¨ Answer:  Based on your user query, the relevant Indian Penal Code (IPC) section for murder is Section 302. The punishment for this crime includes:

1. Death p...

‚öñÔ∏è Judge Response: ```json
{"relevance": 4, "faithfulness": 5, "legal_accuracy": 4, "completeness": 3, "clarity": 5}
```

üìä Scores:
   relevance: 4/5
   faithfulness: 5/5
   legal_accuracy: 4/5
   completeness: 3/5
   clarity: 5/5

   Average: 4.20/5 ‚úÖ
