Imports

In [1]:
import json
from typing import List, Dict, Tuple
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithReference


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_ollama import OllamaLLM  # Updated import
   
evaluator_llm = OllamaLLM(model="llama3.1")

Read Ground truth and get var

In [3]:
path = r"C:/Users/NCABALLERO/OneDrive - HBK/Thesis/Code/masterThesis/testing/gt.json"

# Load the JSON safely
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a list of tuples: (query, [list of texts])
query_texts_pairs = [
    (
        item["question"].strip(),
        [seg["text"].strip() for seg in item.get("ground_truth_segments", [])]
    )
    for item in data
]

# Example: show the first query and its texts
first_query, first_texts = query_texts_pairs[0]
print("Query:", first_query)
print("Number of ground truth texts:", len(first_texts))
print("First text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")

Query: Which solution should a researcher use if they want High accuracy and data export?
Number of ground truth texts: 17
First text snippet: You can simultaneously measure source (L1) and receiving room levels (L2) by connecting two HBK 2255 Sound Level Meters to the app.  This feature is designed to save time and enhance efficiency, espec ...



read dense json and get question and texts

In [4]:
# Path to your RAG output JSON
path = r"C:/Users/NCABALLERO/OneDrive - HBK/Thesis/Code/masterThesis/testing/retrieval_results.json"

# Load the JSON
with open(path, "r", encoding="utf-8") as f:
    rag_data_dense = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts])
query_retrieved_pairs_dense = [
    (
        query,
        [item["text"].strip() for item in texts]
    )
    for query, texts in rag_data_dense.items()
]

# Example: show the first query and its retrieved texts
first_query, first_texts = query_retrieved_pairs_dense[0]
print("Query:", first_query)
print("Number of retrieved texts:", len(first_texts))
print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")

Query: Which solution should a researcher use if they want High accuracy and data export?
Number of retrieved texts: 2
First retrieved text snippet: Pros and cons of measurement methods Scans can be faster than measuring at fixed pos- You are able to listen to the sound field as you You can control measurements from outside the room, without intro ...



Same with the sparse

In [5]:
# Path to your BM25 output JSON
path = r"C:/Users/NCABALLERO/OneDrive - HBK/Thesis/Code/masterThesis/testing/retrieval_results_sparse copy.json"

# Load the JSON
with open(path, "r", encoding="utf-8") as f:
    bm25_data = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts])
query_retrieved_pairs_sparse = [
    (
        item["query"],
        [res["window"].strip() for res in item["results"]]
    )
    for item in bm25_data
]

# Example: show the first query and its retrieved texts
first_query, first_texts = query_retrieved_pairs_sparse[0]
print("Query:", first_query)
print("Number of retrieved texts:", len(first_texts))
print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")

Query: Which solution should a researcher use if they want High accuracy and data export?
Number of retrieved texts: 2
First retrieved text snippet: It is possible to override all data.  Adjust  the slope of the decay, if needed, or edit data to test theories  about the effects of changes you can make to get specific results.   Data edited in this ...



LLM responses

In [8]:
path = r"C:/Users/NCABALLERO/OneDrive - HBK/Thesis/Code/masterThesis/testing/chat_logs.json"

# Load the JSON
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

dense_pairs = []
sparse_pairs = []

for item in data:
    query = item.get("query", "").strip()
    mode = item.get("mode", "").strip().lower()  # normalize to lowercase
    
    if mode == "dense rag":
        response = item.get("response", "").strip()
        dense_pairs.append((query, response))
    elif mode == "sparse rag":
        results = item.get("results", [])
        texts = [res.get("window", "").strip() for res in results]  # even if empty
        sparse_pairs.append((query, texts))

print("Dense pairs found:", len(dense_pairs))
print("Sparse pairs found:", len(sparse_pairs))


Dense pairs found: 18
Sparse pairs found: 18


### Context Precision

In [9]:
# Initialize metric
context_precision = NonLLMContextPrecisionWithReference()

# Store scores
dense_scores = []
sparse_scores = []

# Loop over all queries for Dense retrieval
for (query_gt, gt_texts), (query_dense, dense_texts) in zip(query_texts_pairs, query_retrieved_pairs_dense):
    # Sanity check: queries should match
    assert query_gt == query_dense, f"Query mismatch: {query_gt} vs {query_dense}"

    # Build sample
    sample = SingleTurnSample(
        retrieved_contexts=dense_texts, 
        reference_contexts=gt_texts
    )

    # Compute score
    score = await context_precision.single_turn_ascore(sample)
    dense_scores.append((query_gt, score))

# Loop over all queries for Sparse retrieval
for (query_gt, gt_texts), (query_sparse, sparse_texts) in zip(query_texts_pairs, query_retrieved_pairs_sparse):
    assert query_gt == query_sparse, f"Query mismatch: {query_gt} vs {query_sparse}"

    sample = SingleTurnSample(
        retrieved_contexts=sparse_texts, 
        reference_contexts=gt_texts
    )

    score = await context_precision.single_turn_ascore(sample)
    sparse_scores.append((query_gt, score))

# Example: show results
print("Dense RAG scores:")
for q, s in dense_scores:
    print(q, "->", s)

print("\nSparse RAG scores:")
for q, s in sparse_scores:
    print(q, "->", s)

Dense RAG scores:
Which solution should a researcher use if they want High accuracy and data export? -> 0.0
Which solution should I use if I want compatibility with analysis tools like matlab? -> 0.0
Which solution is better for a university group? They want to use this solution in different applications. -> 0.0
I need a solution that complies with noise regulations and does automated reports. -> 0.0
Which solution should I use to measure noise levels in a factory floor? -> 0.0
What solution should I use to see if a construction site follows the noise regulations? -> 0.0
Which solution should I use to conduct noise impact assessments? I need GPS tagging and the device to be weatherproof. -> 0.0
I want to monitore a contruction site. What should I use? I need long term logging and report generation. -> 0.0
I need a solution that covers, Long term logging, GPS tagging, weatherproofing and report generation. -> 0.0
Which solution should I use to measure a building design? -> 0.0
I need a 

In [None]:
# Assume you already have an LLM evaluator instance
context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)

dense_llm_scores = []
sparse_llm_scores = []

# --- Dense retrieval with LLM answer ---
for (query_gt, gt_texts), (query_dense, dense_texts) in zip(query_texts_pairs, query_retrieved_pairs_dense):
    # Sanity check
    if query_gt != query_dense:
        print(f"Warning: query mismatch: {query_gt} vs {query_dense}")

    # Wrap the LLM answer if you also want to score it
    dense_llm_answer = next((resp for q, resp in dense_pairs if q == query_gt), None)

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=" ".join(gt_texts),          # combine all ground truth texts
        retrieved_contexts=dense_texts + ([dense_llm_answer] if dense_llm_answer else [])
    )

    score = await context_precision.single_turn_ascore(sample)
    dense_llm_scores.append((query_gt, score))

# --- Sparse retrieval with LLM answer ---
for (query_gt, gt_texts), (query_sparse, sparse_texts) in zip(query_texts_pairs, query_retrieved_pairs_sparse):
    if query_gt != query_sparse:
        print(f"Warning: query mismatch: {query_gt} vs {query_sparse}")

    sparse_llm_answer = next((resp for q, resp in sparse_pairs if q == query_gt), None)

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=" ".join(gt_texts),
        retrieved_contexts=sparse_texts + ([sparse_llm_answer] if sparse_llm_answer else [])
    )

    score = await context_precision.single_turn_ascore(sample)
    sparse_llm_scores.append((query_gt, score))

# --- Example outputs ---
print("Dense LLM scores (first 3):")
for q, s in dense_llm_scores:
    print(q, "->", s)

print("\nSparse LLM scores (first 3):")
for q, s in sparse_llm_scores:
    print(q, "->", s)

ValidationError: 1 validation error for Verification
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='This output is a JSON (J... and measurement modes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/json_invalid