Imports

In [1]:
import json
from typing import List, Dict, Tuple
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithoutReference


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.chat_models import ChatOllama
from ragas.llms import LangchainLLMWrapper

# Ollama chat model ("llama3.1") that acts as the judge for the LLM-based metrics below.
llm = ChatOllama(model="llama3.1")
# Wrap the LangChain chat model in the ragas LLM wrapper; this is the object
# passed as `llm=` when the LLM context-precision metrics are constructed.
evaluator_llm = LangchainLLMWrapper(llm)

  llm = ChatOllama(model="llama3.1")
  evaluator_llm = LangchainLLMWrapper(llm)


Load the ground truth (questions, reference answers and reference text segments)

In [3]:
# Ground-truth file. It is expected to hold a JSON list of objects, each with
# "question", "ground_truth_answer" and (optionally) "ground_truth_segments",
# where every segment carries a "text" field.
path = r"gt.json"

with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a list of tuples: (query, ground_truth_answer, [list of reference texts]).
# Items without "ground_truth_segments" get an empty reference list.
query_texts_pairs = [
    (
        item["question"].strip(),
        item["ground_truth_answer"].strip(),
        [seg["text"].strip() for seg in item.get("ground_truth_segments", [])]
    )
    for item in data
]

# Preview the first entry. Both the list of entries and the list of reference
# texts may legitimately be empty, so guard the indexing instead of crashing
# the cell with an IndexError.
if query_texts_pairs:
    first_query, first_gt_answer, first_texts = query_texts_pairs[0]
    print("Query:", first_query)
    print("Ground truth answer:", first_gt_answer)
    print("Number of ground truth texts:", len(first_texts))
    if first_texts:
        print("First text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First text snippet: <no ground truth segments>\n")
else:
    print("No ground truth entries found in", path)

Query: Which solution should a researcher use if they want High accuracy and data export?
Ground truth answer: Potential products are Sound Level Meter 2245, Sound Level Meter 2255, Building Acoustic Software and Accessories and DIRAC software.
Number of ground truth texts: 17
First text snippet: You can simultaneously measure source (L1) and receiving room levels (L2) by connecting two HBK 2255 Sound Level Meters to the app.  This feature is designed to save time and enhance efficiency, espec ...



Load the dense retrieval results (each query and its retrieved texts)

In [4]:
# Dense retriever output: a JSON object whose keys are the queries and whose
# values are lists of hits, each hit exposing a "text" field.
path = r"retrieval_results.json"

with open(path, "r", encoding="utf-8") as f:
    rag_data_dense = json.load(f)

# Flatten into (query, [retrieved texts]) pairs, keeping the file's key order.
# Only the retrieved texts are stripped; the query key is used as-is.
query_retrieved_pairs_dense = [
    (question, [hit["text"].strip() for hit in hits])
    for question, hits in rag_data_dense.items()
]

# Quick look at the first pair to confirm the file parsed as expected.
first_query, first_texts = query_retrieved_pairs_dense[0]
print("Query:", first_query)
print("Number of retrieved texts:", len(first_texts))
print(
    "First retrieved text snippet:",
    first_texts[0][:200].replace("\n", " "),
    "...\n",
)

Query: Which solution should a researcher use if they want High accuracy and data export?
Number of retrieved texts: 2
First retrieved text snippet: Pros and cons of measurement methods Scans can be faster than measuring at fixed pos- You are able to listen to the sound field as you You can control measurements from outside the room, without intro ...



Load the sparse (BM25) retrieval results in the same way

In [5]:
# BM25 output: a JSON list of entries, each with a "query" and a "results"
# list whose elements expose the retrieved passage under "window".
path = r"retrieval_results_sparse copy.json"

with open(path, "r", encoding="utf-8") as f:
    bm25_data = json.load(f)

# Reduce every entry to (query, [retrieved texts]); only the texts are stripped.
query_retrieved_pairs_sparse = [
    (entry["query"], [hit["window"].strip() for hit in entry["results"]])
    for entry in bm25_data
]

# Quick look at the first pair to confirm the file parsed as expected.
first_query, first_texts = query_retrieved_pairs_sparse[0]
print("Query:", first_query)
print("Number of retrieved texts:", len(first_texts))
print(
    "First retrieved text snippet:",
    first_texts[0][:200].replace("\n", " "),
    "...\n",
)

Query: Which solution should a researcher use if they want High accuracy and data export?
Number of retrieved texts: 2
First retrieved text snippet: It is possible to override all data.  Adjust  the slope of the decay, if needed, or edit data to test theories  about the effects of changes you can make to get specific results.   Data edited in this ...



LLM responses

In [6]:
# Chat logs: one record per generated answer, tagged with the retrieval mode
# that produced it. Split them into (query, response) lists per mode.
path = r"chat_logs.json"

with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

dense_pairs, sparse_pairs = [], []

# Normalised "mode" value -> list collecting its records; any other mode is ignored.
pairs_by_mode = {"dense rag": dense_pairs, "sparse rag": sparse_pairs}

for item in data:
    # Missing fields fall back to "" so the record is simply skipped or stored empty.
    query = item.get("query", "").strip()
    mode = item.get("mode", "").strip().lower()
    response = item.get("response", "").strip()

    target = pairs_by_mode.get(mode)
    if target is not None:
        target.append((query, response))

print("Dense pairs found:", len(dense_pairs))
print("Sparse pairs found:", len(sparse_pairs))

Dense pairs found: 18
Sparse pairs found: 18


### Context Precision

#### DENSE

Context Precision WITH reference - NON LLM

In [7]:
# Ensure prerequisite variables are available
if 'query_texts_pairs' not in globals() or 'query_retrieved_pairs_dense' not in globals():
    raise NameError(
        "query_texts_pairs and/or query_retrieved_pairs_dense are not defined. "
        "Please run the cells that load ground truth (cell that creates query_texts_pairs) "
        "and dense retrieval results (cell that creates query_retrieved_pairs_dense) before this cell."
    )

# Initialize metric
context_precision = NonLLMContextPrecisionWithReference()

# Store scores
dense_scores = []
sparse_scores = []

# Loop over all queries for Dense retrieval
# Note: query_texts_pairs elements are (query, ground_truth_answer, [reference_texts])
for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts) in zip(query_texts_pairs, query_retrieved_pairs_dense):
    # Sanity check: queries should match
    assert query_gt == query_dense, f"Query mismatch: {query_gt} vs {query_dense}"

    # Build sample (NonLLM metric expects reference_contexts + retrieved_contexts)
    sample = SingleTurnSample(
        retrieved_contexts=dense_texts,
        reference_contexts=gt_texts
    )

    # Compute score (async)
    score = await context_precision.single_turn_ascore(sample)
    dense_scores.append((query_gt, score))
    print(f"Dense - Query: {query_gt}\nScore: {score}\n")

Dense - Query: Which solution should a researcher use if they want High accuracy and data export?
Score: 0.0

Dense - Query: Which solution should I use if I want compatibility with analysis tools like matlab?
Score: 0.0

Dense - Query: Which solution is better for a university group? They want to use this solution in different applications.
Score: 0.0

Dense - Query: I need a solution that complies with noise regulations and does automated reports.
Score: 0.0

Dense - Query: Which solution should I use to measure noise levels in a factory floor?
Score: 0.0

Dense - Query: What solution should I use to see if a construction site follows the noise regulations?
Score: 0.0

Dense - Query: Which solution should I use to conduct noise impact assessments? I need GPS tagging and the device to be weatherproof.
Score: 0.0

Dense - Query: I want to monitore a contruction site. What should I use? I need long term logging and report generation.
Score: 0.0

Dense - Query: I need a solution that cov

Context Precision WITH Reference - LLM

In [8]:
context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)

# Evaluate Dense RAG
print("\n=== Evaluating Dense RAG (With Reference) ===")
dense_scores = []

for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_dense
):
    assert query_gt == query_dense, f"Query mismatch"

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=gt_answer,  # Ground truth answer
        retrieved_contexts=dense_texts
    )

    score = await context_precision.single_turn_ascore(sample)
    print(f"Dense - Query: {query_gt[:60]}... Score: {score}")
    dense_scores.append((query_gt, score))


=== Evaluating Dense RAG (With Reference) ===
Dense - Query: Which solution should a researcher use if they want High acc... Score: 0.0
Dense - Query: Which solution should I use if I want compatibility with ana... Score: 0.0
Dense - Query: Which solution is better for a university group? They want t... Score: 0.0
Dense - Query: I need a solution that complies with noise regulations and d... Score: 0.99999999995
Dense - Query: Which solution should I use to measure noise levels in a fac... Score: 0.0
Dense - Query: What solution should I use to see if a construction site fol... Score: 0.99999999995
Dense - Query: Which solution should I use to conduct noise impact assessme... Score: 0.0
Dense - Query: I want to monitore a contruction site. What should I use? I ... Score: 0.0
Dense - Query: I need a solution that covers, Long term logging, GPS taggin... Score: 0.0
Dense - Query: Which solution should I use to measure a building design?... Score: 0.0
Dense - Query: I need a solution for

Context Precision Without Reference - LLM

In [9]:
context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)

# Evaluate Dense RAG
print("\n=== Evaluating Dense RAG ===")
dense_scores = []

for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_dense, 
    dense_pairs
):
    # Sanity check
    assert query_gt == query_dense == query_response, f"Query mismatch: {query_gt} vs {query_dense} vs {query_response}"

    # Build sample
    sample = SingleTurnSample(
        user_input=query_gt,
        response=response,  # LLM's actual response
        retrieved_contexts=dense_texts
    )

    # Compute score
    score = await context_precision.single_turn_ascore(sample)
    print(f"Dense - Query: {query_gt[:60]}... Score: {score}")
    dense_scores.append((query_gt, score))


=== Evaluating Dense RAG ===
Dense - Query: Which solution should a researcher use if they want High acc... Score: 0.0
Dense - Query: Which solution should I use if I want compatibility with ana... Score: 0.9999999999
Dense - Query: Which solution is better for a university group? They want t... Score: 0.0
Dense - Query: I need a solution that complies with noise regulations and d... Score: 0.99999999995
Dense - Query: Which solution should I use to measure noise levels in a fac... Score: 0.99999999995
Dense - Query: What solution should I use to see if a construction site fol... Score: 0.99999999995
Dense - Query: Which solution should I use to conduct noise impact assessme... Score: 0.0
Dense - Query: I want to monitore a contruction site. What should I use? I ... Score: 0.49999999995
Dense - Query: I need a solution that covers, Long term logging, GPS taggin... Score: 0.0
Dense - Query: Which solution should I use to measure a building design?... Score: 0.0
Dense - Query: I need a 

#### Sparse

In [10]:
# Ensure prerequisite variables are available
if 'query_texts_pairs' not in globals() or 'query_retrieved_pairs_dense' not in globals():
    raise NameError(
        "query_texts_pairs and/or query_retrieved_pairs_dense are not defined. "
        "Please run the cells that load ground truth (cell that creates query_texts_pairs) "
        "and dense retrieval results (cell that creates query_retrieved_pairs_dense) before this cell."
    )

# Initialize metric
context_precision = NonLLMContextPrecisionWithReference()


sparse_scores = []

# Loop over all queries for Sparse retrieval
# Note: query_texts_pairs elements are (query, ground_truth_answer, [reference_texts])
for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts) in zip(query_texts_pairs, query_retrieved_pairs_sparse):
    # Sanity check: queries should match
    assert query_gt == query_sparse, f"Query mismatch: {query_gt} vs {query_sparse}"

    # Build sample (NonLLM metric expects reference_contexts + retrieved_contexts)
    sample = SingleTurnSample(
        retrieved_contexts=sparse_texts,
        reference_contexts=gt_texts
    )

    # Compute score (async)
    score = await context_precision.single_turn_ascore(sample)
    sparse_scores.append((query_gt, score))
    print(f"Sparse - Query: {query_gt}\nScore: {score}\n")

Sparse - Query: Which solution should a researcher use if they want High accuracy and data export?
Score: 0.0

Sparse - Query: Which solution should I use if I want compatibility with analysis tools like matlab?
Score: 0.0

Sparse - Query: Which solution is better for a university group? They want to use this solution in different applications.
Score: 0.0

Sparse - Query: I need a solution that complies with noise regulations and does automated reports.
Score: 0.0

Sparse - Query: Which solution should I use to measure noise levels in a factory floor?
Score: 0.0

Sparse - Query: What solution should I use to see if a construction site follows the noise regulations?
Score: 0.0

Sparse - Query: Which solution should I use to conduct noise impact assessments? I need GPS tagging and the device to be weatherproof.
Score: 0.0

Sparse - Query: I want to monitore a contruction site. What should I use? I need long term logging and report generation.
Score: 0.0

Sparse - Query: I need a solution

In [11]:
context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)

# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG (With Reference) ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse
):
    assert query_gt == query_sparse, f"Query mismatch: {query_gt} vs {query_sparse}"

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=gt_answer,  # Ground truth answer
        retrieved_contexts=sparse_texts
    )

    score = await context_precision.single_turn_ascore(sample)
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))


=== Evaluating Sparse RAG (With Reference) ===
Sparse - Query: Which solution should a researcher use if they want High acc... Score: 0.0
Sparse - Query: Which solution should I use if I want compatibility with ana... Score: 0.0
Sparse - Query: Which solution is better for a university group? They want t... Score: 0.0
Sparse - Query: I need a solution that complies with noise regulations and d... Score: 0.9999999999
Sparse - Query: Which solution should I use to measure noise levels in a fac... Score: 0.0
Sparse - Query: What solution should I use to see if a construction site fol... Score: 0.49999999995
Sparse - Query: Which solution should I use to conduct noise impact assessme... Score: 0.9999999999
Sparse - Query: I want to monitore a contruction site. What should I use? I ... Score: 0.0
Sparse - Query: I need a solution that covers, Long term logging, GPS taggin... Score: 0.0
Sparse - Query: Which solution should I use to measure a building design?... Score: 0.0
Sparse - Query: I

In [12]:
context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)

# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse, 
    sparse_pairs
):
    # Sanity check
    assert query_gt == query_sparse == query_response, f"Query mismatch: {query_gt} vs {query_sparse} vs {query_response}"

    # Build sample
    sample = SingleTurnSample(
        user_input=query_gt,
        response=response,  # LLM's actual response
        retrieved_contexts=sparse_texts
    )

    # Compute score
    score = await context_precision.single_turn_ascore(sample)
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))


=== Evaluating Sparse RAG ===
Sparse - Query: Which solution should a researcher use if they want High acc... Score: 0.9999999999
Sparse - Query: Which solution should I use if I want compatibility with ana... Score: 0.0
Sparse - Query: Which solution is better for a university group? They want t... Score: 0.99999999995
Sparse - Query: I need a solution that complies with noise regulations and d... Score: 0.99999999995
Sparse - Query: Which solution should I use to measure noise levels in a fac... Score: 0.0
Sparse - Query: What solution should I use to see if a construction site fol... Score: 0.49999999995
Sparse - Query: Which solution should I use to conduct noise impact assessme... Score: 0.9999999999
Sparse - Query: I want to monitore a contruction site. What should I use? I ... Score: 0.9999999999
Sparse - Query: I need a solution that covers, Long term logging, GPS taggin... Score: 0.0
Sparse - Query: Which solution should I use to measure a building design?... Score: 0.9999999