In [35]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import sys
import os

In [39]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)
from services.scenario_service import ScenarioService

In [41]:
test_df = pd.read_csv('test_data/ipc_scenarios.csv')    
test_df.head()

Unnamed: 0,id,text,sections
0,1,A group of individuals conspire to overthrow t...,"121A, 122"
1,2,A person collects weapons and ammunition with ...,122
2,3,An individual is aware of a plot to wage war a...,123
3,4,A person assaults the Governor of a state to c...,124
4,5,An individual makes a speech inciting hatred a...,124A


In [42]:
ipc_to_bns = {}
with open("test_data/ipc_bns_mapping.csv", mode="r", encoding="utf-8") as f:
    import csv
    reader = csv.DictReader(f)
    for row in reader:
        ipc = row["IPC"].strip()
        bns = row["BNS"].strip()
        ipc_to_bns[ipc] = bns

def dcg_at_k(relevance_scores, k):
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores[:k]))

def ndcg_at_k(predicted, ground_truth, k):
    relevance_scores = [1 if sec in ground_truth else 0 for sec in predicted]
    dcg = dcg_at_k(relevance_scores, k)
    ideal_relevance = sorted(relevance_scores, reverse=True)
    idcg = dcg_at_k(ideal_relevance, k)
    return dcg / idcg if idcg > 0 else 0.0

In [57]:
def evaluate_scenarios(service, test_df, top_k=5):
    precision_scores = []
    recall_scores = []
    ndcg_scores = []
    st = set()

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        query = row["text"]

        # Clean and split section string
        ipc_sections = [s.strip() for s in str(row["sections"]).split(",") if s.strip()]
        # print(ipc_sections)

        # Map to BNS sections using your mapping
        bns_mapped_sections = {ipc_to_bns[s] for s in ipc_sections if s in ipc_to_bns}
        ground_truth = bns_mapped_sections
        # print(ground_truth)

        if not ground_truth:
            # print(f"⚠️ Skipping: No ground truth found for query: {query}")
            for sec in ipc_sections:
                st.add(sec)
            continue

        # Get predictions
        top_sections = service.get_top_scenarios(query, history=[], top_k=top_k, validate_with_api=False)
        predicted = [str(sec["Section Number"]) for sec in top_sections if "Section Number" in sec]
        # print(predicted)
        if not predicted:
            # print(f"⚠️ No predictions for query: {query}")
            precision = 0.0
            recall = 0.0
            ndcg = 0.0
        else:
            hits = sum(1 for sec in predicted if sec in ground_truth)
            precision = hits / len(predicted) if predicted else 0.0
            recall = hits / len(ground_truth) if ground_truth else 0.0
            ndcg = ndcg_at_k(predicted, ground_truth, top_k)

        precision_scores.append(precision)
        recall_scores.append(recall)
        ndcg_scores.append(ndcg)

    total_evals = len(precision_scores)  # may be < len(test_df) due to skips

    print(st)
    if total_evals == 0:
        print("❌ No valid evaluations could be performed.")
        return {}, [], [], []
    results = {
        "Mean Precision@k": round(sum(precision_scores) / total_evals, 4),
        "Mean Recall@k": round(sum(recall_scores) / total_evals, 4),
        "Mean NDCG@k": round(sum(ndcg_scores) / total_evals, 4),
    }

    return results, precision_scores, recall_scores, ndcg_scores

In [58]:
scenario_service = ScenarioService(dataset_path="../data/Updated_BNS_Dataset.csv")
results, precisions, recalls, mrrs = evaluate_scenarios(scenario_service, test_df, top_k=5)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'497', '124A', '264', '444', '237', '377', 'nan', '246', '265', '266', '311', '238', '421', '309', '236', '310', '267'}





In [59]:
print("Results: ", results)

Results:  {'Mean Precision@k': 0.1414, 'Mean Recall@k': 0.6317, 'Mean NDCG@k': np.float64(0.5016)}
