In [11]:
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
import os
import json
import asyncio

In [4]:
# Locations of the three evaluation inputs: the ground-truth annotations and the
# retrieval output of the sparse and the dense retriever.
gt_dir = "C:\\Users\\nerea\\Documents\\MasterDTU\\masterThesis\\masterThesis\\testing\\gt.json"
sparse_dir = "C:\\Users\\nerea\\Documents\\MasterDTU\\masterThesis\\masterThesis\\testing\\retrieval_results_sparse.json"
dense_dir = "C:\\Users\\nerea\\Documents\\MasterDTU\\masterThesis\\masterThesis\\testing\\retrieval_results_dense.json"


def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Load JSON files
ground_truth_data = _read_json(gt_dir)
retrieval_results = _read_json(sparse_dir)  # sparse retriever output
retrieval_results_dense = _read_json(dense_dir)

In [8]:
# Instantiate the ragas NonLLMContextPrecisionWithReference metric with its default
# settings. This single instance is shared by every scoring call in evaluate_all().
# NOTE(review): the defaults (similarity measure / threshold) come from ragas and are
# not visible here — confirm they suit the length of the retrieved windows.
context_precision = NonLLMContextPrecisionWithReference()

In [9]:
def find_ground_truth(question_text):
    """Look up the ground-truth entry that belongs to a question.

    The entries of the module-level ``ground_truth_data`` are scanned in order and
    their ``"question"`` field is compared with ``question_text``; surrounding
    whitespace and letter case are ignored on both sides.

    Args:
        question_text: The query string to look up.

    Returns:
        The first matching entry, or ``None`` when no entry matches.
    """

    def _same_question(entry):
        return entry["question"].strip().lower() == question_text.strip().lower()

    # filter() is lazy, so scanning stops at the first hit.
    return next(filter(_same_question, ground_truth_data), None)

In [22]:
async def evaluate_all(retrieval_data=None):
    """Score retrieval output against the ground truth with ``context_precision``.

    For every query, the retrieved windows are scored once as a whole and once
    individually (each window as its own single-item sample) against the
    ground-truth segments found via ``find_ground_truth``.

    Args:
        retrieval_data: Iterable of per-query dicts. Each dict is read through
            ``["query"]`` and ``["results"]``; every result item is read through
            ``["window"]``. Defaults to the module-level ``retrieval_results`` so
            the zero-argument call keeps its previous behaviour; pass another
            result set (e.g. ``retrieval_results_dense``) to score it with the
            same procedure.

    Returns:
        A list with one dict per scored query, holding ``"query"``,
        ``"query_precision"`` and ``"context_scores"`` (a list of
        ``{"retrieved_text", "precision_score"}`` dicts, in retrieval order).
        Queries without a ground-truth match are left out of the list.

    Warns:
        UserWarning: once per call, when at least one query had no ground-truth
        match, so that dropped queries do not go unnoticed.
    """
    import warnings  # local import: only needed to report skipped queries

    if retrieval_data is None:
        # Keep the original default: the results loaded into `retrieval_results`.
        retrieval_data = retrieval_results

    results = []
    unmatched_queries = []

    for query_data in retrieval_data:
        query = query_data["query"]
        retrieved_contexts = [r["window"] for r in query_data["results"]]

        gt_item = find_ground_truth(query)
        if not gt_item:
            # Nothing to compare against; remember the query so it can be reported.
            unmatched_queries.append(query)
            continue
        reference_contexts = [seg["text"] for seg in gt_item["ground_truth_segments"]]

        # Make RAGAS sample
        sample = SingleTurnSample(
            retrieved_contexts=retrieved_contexts,
            reference_contexts=reference_contexts,
        )

        # Compute precision for the full query
        query_score = await context_precision.single_turn_ascore(sample)

        # Evaluate each retrieved chunk individually
        per_context_scores = []
        for ctx in retrieved_contexts:
            partial_sample = SingleTurnSample(
                retrieved_contexts=[ctx],
                reference_contexts=reference_contexts,
            )
            score = await context_precision.single_turn_ascore(partial_sample)
            per_context_scores.append({
                "retrieved_text": ctx,
                "precision_score": score
            })

        results.append({
            "query": query,
            "query_precision": query_score,
            "context_scores": per_context_scores
        })

    if unmatched_queries:
        warnings.warn(
            f"{len(unmatched_queries)} query(ies) had no matching ground truth "
            f"and were skipped: {unmatched_queries!r}"
        )

    return results

# Run the evaluation with a top-level await (supported by Jupyter/IPython) rather than
# asyncio.run(): inside a notebook, asyncio.run() fails with
# "RuntimeError: asyncio.run() cannot be called from a running event loop".
# Called without arguments, evaluate_all() scores `retrieval_results` (loaded from the
# sparse results file); `final_results` is what the display cell below prints.
final_results = await evaluate_all()

In [20]:
# Show results: one section per query with its overall precision, followed by the
# score and a preview (first 120 characters) of every retrieved context.
for q in final_results:
    # The header emoji is U+1F9E0; it had been stored as cp1252-decoded UTF-8 bytes.
    print(f"\n🧠 Query: {q['query']}")
    print(f"Overall Precision: {q['query_precision']:.3f}")
    print("Retrieved Contexts:")
    for ctx in q["context_scores"]:
        print(f"  - Score: {ctx['precision_score']:.3f}")
        print(f"    Text: {ctx['retrieved_text'][:120]}...")


🧠 Query: Which solution should a researcher use if they want High accuracy and data export?
Overall Precision: 0.000
Retrieved Contexts:
  - Score: 0.000
    Text: It is possible to override all data.  Adjust 
the slope of the decay, if needed, or edit data to test theories 
about th...
  - Score: 0.000
    Text: 3.  Specify the name and location of the export file in the Destination file field.
 4.  Choose the format into which yo...

🧠 Query: Which solution should I use if I want compatibility with analysis tools like matlab?
Overall Precision: 0.000
Retrieved Contexts:
  - Score: 0.000
    Text: 3.  Specify the name and location of the export file in the Destination file field.
 4.  Choose the format into which yo...
  - Score: 0.000
    Text: 3.  Specify the name and location of the export file in the Destination file field.
 4.  Choose the format into which yo...

ðŸ§  Query: Which solution is better for a university group? They want to use this solution in different applic