In [16]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from langchain.chains import ConversationChain
from typing import Optional, List, Any
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
import requests
import re

In [17]:
embedder = SentenceTransformer("bhavyagiri/InLegal-Sbert")

In [None]:
def load_index_and_meta(index_path, meta_path):
    index = faiss.read_index(index_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    return index, meta

def search_faiss(query, index, meta, top_k=2):
    query_vec = embedder.encode([query])
    query_vec = np.array(query_vec).astype("float32")
    D, I = index.search(query_vec, top_k)
    results = []
    for idx in I[0]:
        if idx < len(meta):
            results.append(meta[idx])
    return results
class ollama(LLM):
    model_name: str = "mistral"

    @property
    def _llm_type(self) -> str:
        return "ollama"
        
    def _call(self,prompt: str, stop: Optional[List[str]] = None, **kwargs: Any):
        response = requests.post(
            "http://127.0.0.1:11434/api/generate",
            json={"model": self.model_name, "prompt": prompt, "stream": False}
        )
        return response.json()["response"]

In [19]:
ipc_index, ipc_meta = load_index_and_meta("ipc.index", "ipc.json")
judg_index, judg_meta = load_index_and_meta("judgments.index", "judgments.json")

In [20]:
custom=PromptTemplate(input_variables=["history","input"],template=""" You are an legal Chatbot assistant specializing in India Penal Code.
                      
                      Previous conversations:{history}
                      
                      Current question and also judgements :{input}
                      
                    provide an suitable reply for the current question by utilizing previous conversations and keep it around 20 to 30 words .
                      Utilize similar case judgement and compare it with the user case summary and then reply but do not repeat about judgements in every replies.
                      Only use the given text as knowledge, Do Not Retrieve your own knowledge
                      Only Reply using the given IPC sections, STICK TO ONLY THIS TEXT AND PREVIOUS CONVERSATION FOR KNOWLEDGE.
                      Consider youself as an legal advisor and you are the professional.""")

In [21]:
llm=ollama()
memory=ConversationBufferMemory()
conversion = ConversationChain(llm=llm,memory=memory,prompt=custom,verbose=False)

In [22]:
print("‚öñÔ∏è IPC + Judgement Chatbot (type 'exit' to quit)\n")

user_input = input("You: ").strip()


ipc_results = search_faiss(user_input, ipc_index, ipc_meta)
judg_results = search_faiss(user_input, judg_index, judg_meta)

        # Step 2: Build context
context = "Relevant IPC Sections:\n"
for sec in ipc_results:
    context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"

prompt = f"User Query: {user_input}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
Reply_1=llm._call(prompt)

context += "\nRelevant Judgements:\n"
for case in judg_results:
    context += f"""- Case facts: {case['case_facts']}
Issues: {case['issues']}
Arguments: {case['arguments']}
Decision: {case['decision']}
"""



print(f"\nBot: {Reply_1}\n")

    


‚öñÔ∏è IPC + Judgement Chatbot (type 'exit' to quit)


Bot:  Based on the user's query, it seems like there might be a combination of offenses that could apply in this hypothetical situation. Here are the relevant IPC sections and their potential punishments:

1. Section 300 (Murder): This section applies when a person causes someone's death with the intention to cause death or to cause such bodily injury as is likely to cause death, or with the knowledge that he is likely by such act to cause death. The punishment for murder is imprisonment for life or capital punishment.

   However, the act of killing might not be intentional but a probable consequence of the abetment, as mentioned in Section 111. In such a case, the abettors could potentially be liable under this section.

2. Section 304 Part I (Punishment for culpable homicide not amounting to murder): This section applies when a person causes someone's death without any intention to cause death or to cause such bodily injury as i

In [23]:
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit"]:
        print("Bot: Goodbye!")
        break

    user_input+=f"\n\n similar case judgements : {context}"

    answer = conversion.predict(input=user_input)

    print(f"\nBot: {answer}\n")

Bot: Goodbye!


In [24]:
# Test queries for evaluation
test_queries = [
    "What is the punishment for murder?",
    "Explain Section 302 IPC",
    "What are the differences between theft and robbery?",
    "Punishment for dowry death",
    "What is culpable homicide?",
    "Explain Section 420 IPC about cheating",
    "What is the punishment for kidnapping?",
    "Define Section 376 IPC",
    "What are the provisions for defamation?",
    "Explain attempt to murder under IPC"
]

# Use the SAME model you're already using (no need to load a new one)
eval_model = embedder  # Reuse your existing InLegal-Sbert model

print(f"‚úÖ Ready with {len(test_queries)} test queries")
print(f"‚úÖ Using evaluation model: bhavyagiri/InLegal-Sbert (your existing embedder)")

‚úÖ Ready with 10 test queries
‚úÖ Using evaluation model: bhavyagiri/InLegal-Sbert (your existing embedder)


In [25]:
# EVALUATION 1: RETRIEVAL CONSISTENCY
print("\n" + "="*60)
print("RETRIEVAL CONSISTENCY")
print("="*60)

test_query = "What is the punishment for murder?"
num_runs = 5

ipc_retrieved = []
judg_retrieved = []

for run in range(num_runs):
    ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
    judg_results = search_faiss(test_query, judg_index, judg_meta, top_k=2)
    
    ipc_ids = tuple([r['section_no'] for r in ipc_results])
    judg_ids = tuple([r['id'] for r in judg_results])
    
    ipc_retrieved.append(ipc_ids)
    judg_retrieved.append(judg_ids)

ipc_consistent = len(set(ipc_retrieved)) == 1
judg_consistent = len(set(judg_retrieved)) == 1

print(f"\nQuery: '{test_query}'")
print(f"üìä IPC Retrieval Consistent: {'‚úÖ YES' if ipc_consistent else '‚ùå NO'}")
print(f"üìä Judgement Retrieval Consistent: {'‚úÖ YES' if judg_consistent else '‚ùå NO'}")
print(f"   Retrieved: {ipc_retrieved[0]}")


RETRIEVAL CONSISTENCY

Query: 'What is the punishment for murder?'
üìä IPC Retrieval Consistent: ‚úÖ YES
üìä Judgement Retrieval Consistent: ‚úÖ YES
   Retrieved: ('Section302:- Punishment for murder', 'Section507:- Criminal intimidation by an anonymous communication')


In [26]:
# EVALUATION 2: ANSWER CONSISTENCY
print("\n" + "="*60)
print("ANSWER CONSISTENCY")
print("="*60)

test_query = "What is the punishment for murder?"
num_samples = 3

answers = []
for i in range(num_samples):
    ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    prompt = f"User Query: {test_query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    answers.append(answer)

embeddings = eval_model.encode(answers, convert_to_tensor=True)
similarities = []

for i in range(len(answers)):
    for j in range(i+1, len(answers)):
        sim = util.cos_sim(embeddings[i], embeddings[j]).item()
        similarities.append(sim)

consistency_score = np.mean(similarities)

print(f"\nüìä Consistency Score: {consistency_score:.3f}")
print(f"   Min Similarity: {np.min(similarities):.3f}")
print(f"   Max Similarity: {np.max(similarities):.3f}")

if consistency_score > 0.8:
    print(f"   ‚úÖ HIGH consistency")
elif consistency_score > 0.6:
    print(f"   ‚ö†Ô∏è MODERATE consistency")
else:
    print(f"   ‚ùå LOW consistency")


ANSWER CONSISTENCY

üìä Consistency Score: 0.970
   Min Similarity: 0.964
   Max Similarity: 0.976
   ‚úÖ HIGH consistency


In [27]:
# EVALUATION 3: CONTEXT RELEVANCE
print("\n" + "="*60)
print("CONTEXT RELEVANCE")
print("="*60)

relevance_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    query_emb = eval_model.encode(query, convert_to_tensor=True)
    context_emb = eval_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(query_emb, context_emb).item()
    
    relevance_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Context Relevance: {np.mean(relevance_scores):.3f}")
print(f"   Range: [{np.min(relevance_scores):.3f}, {np.max(relevance_scores):.3f}]")


CONTEXT RELEVANCE
Query: What is the punishment for murder?... ‚Üí Score: 0.659
Query: Explain Section 302 IPC... ‚Üí Score: 0.673
Query: What are the differences between theft and robbery... ‚Üí Score: 0.743
Query: Punishment for dowry death... ‚Üí Score: 0.698
Query: What is culpable homicide?... ‚Üí Score: 0.680

üìä Mean Context Relevance: 0.691
   Range: [0.659, 0.743]


In [28]:
# EVALUATION 4: ANSWER RELEVANCE
print("\n" + "="*60)
print("ANSWER RELEVANCE")
print("="*60)

relevance_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    
    query_emb = eval_model.encode(query, convert_to_tensor=True)
    answer_emb = eval_model.encode(answer, convert_to_tensor=True)
    similarity = util.cos_sim(query_emb, answer_emb).item()
    
    relevance_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Answer Relevance: {np.mean(relevance_scores):.3f}")
print(f"   Range: [{np.min(relevance_scores):.3f}, {np.max(relevance_scores):.3f}]")


ANSWER RELEVANCE
Query: What is the punishment for murder?... ‚Üí Score: 0.671
Query: Explain Section 302 IPC... ‚Üí Score: 0.619
Query: What are the differences between theft and robbery... ‚Üí Score: 0.753
Query: Punishment for dowry death... ‚Üí Score: 0.703
Query: What is culpable homicide?... ‚Üí Score: 0.693

üìä Mean Answer Relevance: 0.688
   Range: [0.619, 0.753]


In [29]:
# EVALUATION 5: FAITHFULNESS
print("\n" + "="*60)
print("FAITHFULNESS (Answer grounded in Context)")
print("="*60)

faithfulness_scores = []

for query in test_queries[:5]:
    ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
    judg_results = search_faiss(query, judg_index, judg_meta, top_k=2)
    
    context = "Relevant IPC Sections:\n"
    for sec in ipc_results:
        context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
    
    context += "\nRelevant Judgements:\n"
    for case in judg_results:
        context += f"- Case facts: {case['case_facts']}\n"
    
    prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
    answer = llm._call(prompt)
    
    answer_emb = eval_model.encode(answer, convert_to_tensor=True)
    context_emb = eval_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(answer_emb, context_emb).item()
    
    faithfulness_scores.append(similarity)
    print(f"Query: {query[:50]}... ‚Üí Score: {similarity:.3f}")

print(f"\nüìä Mean Faithfulness: {np.mean(faithfulness_scores):.3f}")

if np.mean(faithfulness_scores) > 0.7:
    print(f"   ‚úÖ Answers well-grounded in context")
else:
    print(f"   ‚ö†Ô∏è Possible hallucination detected")


FAITHFULNESS (Answer grounded in Context)
Query: What is the punishment for murder?... ‚Üí Score: 0.932
Query: Explain Section 302 IPC... ‚Üí Score: 0.926
Query: What are the differences between theft and robbery... ‚Üí Score: 0.945
Query: Punishment for dowry death... ‚Üí Score: 0.943
Query: What is culpable homicide?... ‚Üí Score: 0.956

üìä Mean Faithfulness: 0.940
   ‚úÖ Answers well-grounded in context


In [30]:
# EVALUATION 6: BERTSCORE
print("\n" + "="*60)
print("BERTSCORE")
print("="*60)

try:
    from bert_score import score as bert_score
    
    generated_answers = []
    contexts = []
    
    for query in test_queries[:5]:
        ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)
        
        context = "Relevant IPC Sections:\n"
        for sec in ipc_results:
            context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"
        
        prompt = f"User Query: {query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
        answer = llm._call(prompt)
        
        generated_answers.append(answer)
        contexts.append(context)
    
    P, R, F1 = bert_score(generated_answers, contexts, lang='en', verbose=False)
    
    print(f"\nüìä BERTScore (Answer vs Context):")
    print(f"   Precision: {P.mean().item():.3f}")
    print(f"   Recall: {R.mean().item():.3f}")
    print(f"   F1: {F1.mean().item():.3f}")
    
    if F1.mean().item() > 0.7:
        print(f"   ‚úÖ GOOD semantic similarity")
    else:
        print(f"   ‚ö†Ô∏è MODERATE semantic similarity")
        
except ImportError:
    print("‚ùå Install: pip install bert-score")


BERTSCORE


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìä BERTScore (Answer vs Context):
   Precision: 0.834
   Recall: 0.852
   F1: 0.843
   ‚úÖ GOOD semantic similarity


In [46]:
# EVALUATION 7: LLM-AS-JUDGE (Using Google Gemini)
print("\n" + "="*60)
print("LLM-AS-JUDGE (Google Gemini)")
print("="*60)

import google.generativeai as genai

# Configure Gemini API (replace with your API key)
GEMINI_API_KEY = "AIzaSyDbxRX-ytJxp6yDmw2rQr8IHUNWnO4nB7w"  # Get from https://makersuite.google.com/app/apikey
genai.configure(api_key=GEMINI_API_KEY)

test_query = "What is the punishment for murder?"

ipc_results = search_faiss(test_query, ipc_index, ipc_meta, top_k=2)
context = "Relevant IPC Sections:\n"
for sec in ipc_results:
    context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"

prompt = f"User Query: {test_query}\n\n{context}\n\nBased on the above IPC sections, Name the sections relating to the user query and also list possible punishments."
answer = llm._call(prompt)

judge_prompt = f"""You are an expert legal evaluator. Rate this Indian legal chatbot response.

Query: {test_query}
Context: {context}
Answer: {answer}

Rate 1-5 for: relevance, faithfulness, legal_accuracy, completeness, clarity

Respond ONLY in JSON format (no markdown, no other text):
{{"relevance": 4, "faithfulness": 5, "legal_accuracy": 4, "completeness": 3, "clarity": 5}}
"""

try:
    # Call Gemini
    model = genai.GenerativeModel('gemini-2.0-flash-lite')
    judge_response = model.generate_content(judge_prompt)
    judge_text = judge_response.text
    
    print(f"\nüí¨ Answer: {answer[:150]}...")
    print(f"\n‚öñÔ∏è Judge Response: {judge_text}")
    
    # Parse JSON
    json_match = re.search(r'\{.*\}', judge_text, re.DOTALL)
    if json_match:
        scores = json.loads(json_match.group())
        print(f"\nüìä Scores:")
        for key, value in scores.items():
            print(f"   {key}: {value}/5")
        avg_score = np.mean(list(scores.values()))
        print(f"\n   Average: {avg_score:.2f}/5 {'‚úÖ' if avg_score >= 3.5 else '‚ö†Ô∏è'}")
    else:
        print("   ‚ö†Ô∏è Could not parse JSON from response")
        
except Exception as e:
    print(f"‚ùå Gemini API Error: {e}")
    print("   Make sure you've set GEMINI_API_KEY and installed: pip install google-generativeai")


LLM-AS-JUDGE (Google Gemini)

üí¨ Answer:  Based on your user query, the relevant Indian Penal Code (IPC) section for murder is Section 302. The punishment for this crime includes:

1. Death p...

‚öñÔ∏è Judge Response: ```json
{"relevance": 4, "faithfulness": 5, "legal_accuracy": 4, "completeness": 3, "clarity": 5}
```

üìä Scores:
   relevance: 4/5
   faithfulness: 5/5
   legal_accuracy: 4/5
   completeness: 3/5
   clarity: 5/5

   Average: 4.20/5 ‚úÖ


In [41]:
for model in genai.list_models():
    if 'generateContent' in model.supported_generation_methods:
        print(model.name)

models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/learnlm-2.0-flash-experimental
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models

In [66]:
class ollama(LLM):
    model_name: str = "mistral"

    @property
    def _llm_type(self) -> str:
        return "ollama"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, *args, **kwargs: Any):
        import requests
        response = requests.post(
            "http://127.0.0.1:11434/api/generate",
            json={"model": self.model_name, "prompt": prompt, "stream": False},
            timeout=4000

        )
        return response.json()["response"]
llm=ollama()

In [67]:
# EVALUATION 8: RAGAS
print("\n" + "="*60)
print("RAGAS FRAMEWORK")
print("="*60)

try:
    import torch
    from ragas import evaluate
    from ragas.metrics import faithfulness, answer_relevancy
    from datasets import Dataset
    from sentence_transformers import SentenceTransformer
    from langchain.embeddings import HuggingFaceEmbeddings

    # ‚úÖ Use your custom sentence embedder
    embedder = SentenceTransformer("bhavyagiri/InLegal-Sbert")

    # --- Optional: wrap it for RAGAS compatibility (if it expects LangChain embedding interface)
    class CustomEmbedder:
        def embed_documents(self, texts):
            return embedder.encode(texts, convert_to_numpy=True).tolist()
        
        def embed_query(self, text):
            return embedder.encode([text], convert_to_numpy=True).tolist()[0]

    embedding_model = CustomEmbedder()

    # ‚úÖ Create your data
    data = {'question': [], 'contexts': [], 'answer': []}

    for query in test_queries[:5]:
        ipc_results = search_faiss(query, ipc_index, ipc_meta, top_k=2)

        context = "Relevant IPC Sections:\n"
        for sec in ipc_results:
            context += f"- Section {sec['section_no']}: {sec['Description']} (Punishment: {sec['punishment_raw']})\n"

        prompt = (
            f"User Query: {query}\n\n{context}\n\n"
            "Based on the above IPC sections, name the relevant sections and list possible punishments."
        )
        answer = llm._call(prompt)

        data['question'].append(query)
        data['contexts'].append([context])
        data['answer'].append(answer)

    dataset = Dataset.from_dict(data)

    print("‚è≥ Evaluating...")

    # ‚úÖ Pass custom embedding model explicitly
    result = evaluate(
        dataset,
        metrics=[faithfulness, answer_relevancy],
        llm=llm,
        embeddings=embedding_model
    )

    faith = result["faithfulness"]
    relev = result["answer_relevancy"]

# If they're lists, take the mean
    if isinstance(faith, list):
        faith = sum(faith) / len(faith)
    if isinstance(relev, list):
        relev = sum(relev) / len(relev)

    print("\nüìä RAGAS Results:")
    print(f"   Faithfulness:      {faith:.3f}")
    print(f"   Answer Relevancy:  {relev:.3f}")

    avg = (faith + relev) / 2
    print(f"\n   Average: {avg:.3f} {'‚úÖ' if avg > 0.7 else '‚ö†Ô∏è'}")


except ImportError:
    print("‚ùå Install required packages: pip install ragas datasets sentence-transformers langchain")
except Exception as e:
    print(f"‚ùå Error: {e}")



RAGAS FRAMEWORK
‚è≥ Evaluating...


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[2]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[1]: TimeoutError()
Exception raised in Job[3]: TimeoutError()



üìä RAGAS Results:
   Faithfulness:      nan
   Answer Relevancy:  nan

   Average: nan ‚ö†Ô∏è


In [64]:
# FINAL SUMMARY
print("\n" + "="*60)
print("‚úÖ EVALUATION COMPLETE")
print("="*60)
print(f"""
Completed Evaluations:
‚úÖ Retrieval Consistency
‚úÖ Answer Consistency  
‚úÖ Context Relevance
‚úÖ Answer Relevance
‚úÖ Faithfulness
‚úÖ BERTScore
‚úÖ LLM-as-Judge
‚úÖ RAGAS

Review scores above. Scores < 0.6 need improvement.
""")


‚úÖ EVALUATION COMPLETE

Completed Evaluations:
‚úÖ Retrieval Consistency
‚úÖ Answer Consistency  
‚úÖ Context Relevance
‚úÖ Answer Relevance
‚úÖ Faithfulness
‚úÖ BERTScore
‚úÖ LLM-as-Judge
‚úÖ RAGAS

Review scores above. Scores < 0.6 need improvement.

