In [47]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import sys
import os

In [48]:
# Put the parent of the current working directory at the front of the import
# search path before importing the project package below
# (presumably that is where `services` lives -- confirm against the repo layout).
cwd = os.getcwd()
parent_dir = os.path.abspath(os.path.join(cwd, os.pardir))
sys.path.insert(0, parent_dir)

from services.scenario_service import ScenarioService

In [49]:
# Load the test scenarios, record the length of each text, and keep the
# 10 shortest scenarios among those with at least 1000 characters.
# The sort happens before the length filter, exactly as in a step-by-step version.
test_df = (
    pd.read_csv('test_data/test_scenarios.csv')
    .assign(text_length=lambda frame: frame['text'].apply(len))
    .sort_values(by='text_length', ascending=True)
    .loc[lambda frame: frame['text_length'] >= 1000]
    .head(10)
)

In [50]:
import csv

# Lookup table built from test_data/ipc_bns_mapping.csv: the stripped value of
# each row's "IPC" column maps to the stripped value of its "BNS" column.
# When an IPC value appears more than once, the last row wins.
ipc_to_bns = {}
# newline="" is what the csv module documentation asks for on files handed to
# a reader, so that newlines embedded in quoted fields are parsed correctly.
with open("test_data/ipc_bns_mapping.csv", mode="r", encoding="utf-8", newline="") as f:
    for row in csv.DictReader(f):
        # DictReader fills the cells missing from a short row with None;
        # treat those like blank cells instead of crashing on .strip().
        ipc = (row["IPC"] or "").strip()
        bns = (row["BNS"] or "").strip()
        if not ipc or not bns:
            # Skip incomplete rows so an empty string never becomes a section.
            continue
        ipc_to_bns[ipc] = bns

def dcg_at_k(relevance_scores, k):
    """Return the discounted cumulative gain of the first ``k`` relevance scores.

    The score at 0-based position ``idx`` is divided by ``log2(idx + 2)``,
    so the first position is undiscounted.

    Args:
        relevance_scores: Sequence of numeric gains in ranked order.
        k: Number of leading positions to include.

    Returns:
        The summed discounted gain; ``0`` when no positions are included.
    """
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores[:k]))


def ndcg_at_k(predicted, ground_truth, k):
    """Return binary-relevance NDCG@k of ``predicted`` against ``ground_truth``.

    A predicted item has gain 1 when it is in ``ground_truth`` and 0 otherwise.
    The ideal DCG is that of a ranking which places ``min(len(ground_truth), k)``
    relevant items first. It is derived from the ground truth rather than from
    the hits that happen to be in ``predicted``; otherwise a single hit would
    be normalised against an ideal of only one hit.

    Args:
        predicted: Ranked sequence of predicted (hashable) items.
        ground_truth: Collection of relevant (hashable) items.
        k: Cut-off rank.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the ideal DCG is zero (for example
        when ``ground_truth`` is empty).
    """
    relevant = set(ground_truth)

    # Credit each relevant item only the first time it is predicted, so a
    # repeated prediction cannot push the score above the ideal.
    credited = set()
    gains = []
    for sec in predicted[:k]:
        is_new_hit = sec in relevant and sec not in credited
        gains.append(1 if is_new_hit else 0)
        credited.add(sec)

    dcg = dcg_at_k(gains, k)
    ideal_hits = max(min(len(relevant), k), 0)
    idcg = dcg_at_k([1] * ideal_hits, k)
    return dcg / idcg if idcg > 0 else 0.0

In [51]:
def evaluate_scenarios(service, test_df, top_k=5, verbose=True):
    """Score ``service`` on every row of ``test_df`` with precision, recall and NDCG at ``top_k``.

    For each row, the comma-separated ``"sections"`` value is split into
    section identifiers. Those present in the module-level ``ipc_to_bns``
    mapping are translated, and the translated set is the ground truth.
    Predictions are the ``"Section Number"`` values (as strings) of the items
    returned by ``service.get_top_scenarios``.

    Args:
        service: Object exposing ``get_top_scenarios(query, history=...,
            top_k=..., validate_with_api=...)`` that returns an iterable of
            mappings with a ``"Section Number"`` key.
        test_df: DataFrame with ``"text"`` and ``"sections"`` columns.
        top_k: Number of predictions requested per query; also the
            denominator of precision.
        verbose: When true (the default), print each row's ground-truth set
            and predicted list.

    Returns:
        A tuple ``(results, precision_scores, recall_scores, ndcg_scores)``.
        ``results`` maps ``"Mean Precision@k"``, ``"Mean Recall@k"`` and
        ``"Mean NDCG@k"`` to means rounded to 4 decimals, and the three lists
        hold the per-row scores. The means are ``0.0`` when ``test_df`` has
        no rows.
    """
    precision_scores = []
    recall_scores = []
    ndcg_scores = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        query = row["text"]

        # A missing cell yields no sections; str() tolerates a column that
        # pandas parsed as a non-string dtype.
        raw_sections = row["sections"]
        if pd.isna(raw_sections):
            ipc_sections = []
        else:
            ipc_sections = [s.strip() for s in str(raw_sections).split(",")]
        ground_truth = {ipc_to_bns[s] for s in ipc_sections if s in ipc_to_bns}
        if verbose:
            print(ground_truth)

        top_sections = service.get_top_scenarios(query, history=[], top_k=top_k, validate_with_api=False)
        predicted = [str(sec["Section Number"]) for sec in top_sections]
        if verbose:
            print(predicted)

        # Count distinct hits so a repeated prediction is not rewarded twice.
        hits = len(ground_truth.intersection(predicted))
        precision = hits / top_k
        # None of the row's sections may be in the mapping; score such a row
        # as 0.0 recall instead of dividing by zero.
        recall = hits / len(ground_truth) if ground_truth else 0.0
        ndcg = ndcg_at_k(predicted, ground_truth, top_k)

        precision_scores.append(precision)
        recall_scores.append(recall)
        ndcg_scores.append(ndcg)

    n_rows = len(precision_scores)

    def _mean(scores):
        # Guard the empty-frame case, which would otherwise divide by zero.
        return round(sum(scores) / n_rows, 4) if n_rows else 0.0

    results = {
        "Mean Precision@k": _mean(precision_scores),
        "Mean Recall@k": _mean(recall_scores),
        "Mean NDCG@k": _mean(ndcg_scores),
    }

    return results, precision_scores, recall_scores, ndcg_scores

In [52]:
# Build the service from the BNS dataset and score it on the selected test scenarios.
scenario_service = ScenarioService(dataset_path="../data/Updated_BNS_Dataset.csv")
# The fourth value returned by evaluate_scenarios is the list of per-row NDCG scores.
results, precisions, recalls, ndcgs = evaluate_scenarios(scenario_service, test_df, top_k=5)
# Alias kept for any code that still reads the old, misleading name.
mrrs = ndcgs

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

{'318', '60'}
['248', '229', '112', '148', '212']
{'345', '116', '316', '86'}
['80', '82', '64', '69', '8']
{'319', '60'}
['8', '127', '358', '229', '248']
{'351', '191', '115', '117'}
['248', '229', '199', '127', '211']
{'327', '221', '121', '132', '189'}
['127', '8', '248', '208', '193']
{'318', '60', '340', '338', '345', '1'}
['358', '8', '248', '229', '127']
{'105'}
['109', '8', '358', '106', '127']
{'3', '296', '115'}
['127', '8', '206', '229', '248']
{'223', '340'}
['303', '8', '243', '317', '248']
{'309', '109', '115', '191', '193', '329', '190'}
['248', '353', '229', '197', '199']
