Imports

In [1]:
import json
from typing import List, Dict, Tuple
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithoutReference
from ragas.metrics import NoiseSensitivity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.chat_models import ChatOllama
from ragas.llms import LangchainLLMWrapper

# Name of the Ollama model used as the evaluation judge. Kept in one place so
# the whole notebook can be re-run against a different model by editing a
# single line.
EVALUATOR_MODEL_NAME = "llama3.1"

# Chat model created through the LangChain Ollama integration, then wrapped in
# the ragas LangChain adapter; the wrapped object is what the LLM-based
# metrics below receive as their `llm` argument.
llm = ChatOllama(model=EVALUATOR_MODEL_NAME)
evaluator_llm = LangchainLLMWrapper(llm)

  llm = ChatOllama(model="llama3.1")
  evaluator_llm = LangchainLLMWrapper(llm)


Read the ground truth and build (question, reference answer, context texts) tuples

In [3]:
import json

path = r"gt.json"

# Load the ground-truth JSON (UTF-8); the context manager closes the file.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a list of tuples: (question, reference_answer, [list of context texts]).
# "question" and "reference" are required keys; a record without a "context"
# key yields an empty context list.
query_texts_pairs = [
    (
        item["question"].strip(),
        item["reference"].strip(),
        [seg.strip() for seg in item.get("context", [])]
    )
    for item in data
]

# Example: show the first query and its data.
# Guarded so that an empty file, or a first record without any context texts,
# prints a notice instead of raising IndexError.
if query_texts_pairs:
    first_query, first_reference, first_contexts = query_texts_pairs[0]
    print("Query:", first_query)
    print("Reference answer:", first_reference)
    print("Number of context texts:", len(first_contexts))
    if first_contexts:
        print("First context snippet:", first_contexts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First context snippet: <no context texts>\n")
else:
    print("No ground-truth records found in", path)


Query: Which solution is suitable for measuring room acoustics and speech intelligibility in compliance with ISO standards?
Reference answer: DIRAC Room Acoustics Software with HBK 2255 and HBK 2755.
Number of context texts: 3
First context snippet: DIRAC Room Acoustics Software is used for measuring a wide range of room acoustical parameters. It supports wireless measurements using HBK 2255 Sound Level Meter and HBK 2755 Smart Power Amplifier. I ...



Read the dense retrieval results JSON and extract each query with its retrieved texts

In [4]:
# Path to your RAG output JSON
path = r"retrieval_results_dense.json"

# Load the JSON (UTF-8); the context manager closes the file.
with open(path, "r", encoding="utf-8") as f:
    rag_data_dense = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts]).
# The loaded object is iterated with .items(), i.e. it maps each query string
# to a list of hit records, each of which carries a "text" field.
query_retrieved_pairs_dense = [
    (
        query,
        [item["text"].strip() for item in texts]
    )
    for query, texts in rag_data_dense.items()
]

# Example: show the first query and its retrieved texts.
# Guarded so that an empty result file, or a first query with zero hits,
# prints a notice instead of raising IndexError.
if query_retrieved_pairs_dense:
    first_query, first_texts = query_retrieved_pairs_dense[0]
    print("Query:", first_query)
    print("Number of retrieved texts:", len(first_texts))
    if first_texts:
        print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First retrieved text snippet: <no retrieved texts>\n")
else:
    print("No dense retrieval results found in", path)

Query: Which solution is suitable for measuring room acoustics and speech intelligibility in compliance with ISO standards?
Number of retrieved texts: 2
First retrieved text snippet: Speech intelligibility measurements can be carried out in compliance with the IEC 60268-16 standard, for male and female voices, through an artificial mouth-directional loudspeaker sound source or thr ...



Do the same for the sparse (BM25) retrieval results

In [5]:
# Path to your BM25 output JSON
path = r"retrieval_results_sparse.json"

# Load the JSON (UTF-8); the context manager closes the file.
with open(path, "r", encoding="utf-8") as f:
    bm25_data = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts]).
# Each record in the loaded list carries a "query" string and a "results"
# list; the retrieved text of every result is read from its "window" field.
query_retrieved_pairs_sparse = [
    (
        item["query"],
        [res["window"].strip() for res in item["results"]]
    )
    for item in bm25_data
]

# Example: show the first query and its retrieved texts.
# Guarded so that an empty result file, or a first query with zero hits,
# prints a notice instead of raising IndexError.
if query_retrieved_pairs_sparse:
    first_query, first_texts = query_retrieved_pairs_sparse[0]
    print("Query:", first_query)
    print("Number of retrieved texts:", len(first_texts))
    if first_texts:
        print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First retrieved text snippet: <no retrieved texts>\n")
else:
    print("No sparse retrieval results found in", path)

Query: Which solution is suitable for measuring room acoustics and speech intelligibility in compliance with ISO standards?
Number of retrieved texts: 2
First retrieved text snippet: 5  Regression line through the speech sound levels, indicating the spatial decay  of sound A similar graph can be generated for the speech transmission  index (STI) with a regression line to calculate ...



Load the LLM responses from the chat logs and split them by retrieval mode

In [6]:
# Read the chat logs and bucket (query, response) pairs by retrieval mode
path = r"chat_logs.json"

with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

dense_pairs = []
sparse_pairs = []

# Normalised mode label -> list that collects the pairs for that mode.
# Records whose mode matches neither label are skipped.
pairs_by_mode = {
    "dense rag": dense_pairs,
    "sparse rag": sparse_pairs,
}

for entry in data:
    # Missing keys fall back to an empty string; all three fields are trimmed
    # and the mode is additionally lower-cased before the lookup.
    question = entry.get("query", "").strip()
    mode_label = entry.get("mode", "").strip().lower()
    answer = entry.get("response", "").strip()

    bucket = pairs_by_mode.get(mode_label)
    # Compare against None explicitly: an empty bucket is falsy but valid.
    if bucket is not None:
        bucket.append((question, answer))

print("Dense pairs found:", len(dense_pairs))
print("Sparse pairs found:", len(sparse_pairs))

Dense pairs found: 24
Sparse pairs found: 24


### Context Precision

#### DENSE

Context Precision WITH reference - NON LLM

In [7]:
# Ensure prerequisite variables are available
if 'query_texts_pairs' not in globals() or 'query_retrieved_pairs_dense' not in globals():
    raise NameError(
        "query_texts_pairs and/or query_retrieved_pairs_dense are not defined. "
        "Please run the cells that load ground truth (cell that creates query_texts_pairs) "
        "and dense retrieval results (cell that creates query_retrieved_pairs_dense) before this cell."
    )

# Initialize metric
context_precision = NonLLMContextPrecisionWithReference()

# Store scores
dense_scores = []
sparse_scores = []

# Loop over all queries for Dense retrieval
# Note: query_texts_pairs elements are (query, ground_truth_answer, [reference_texts])
for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts) in zip(query_texts_pairs, query_retrieved_pairs_dense):
    # Sanity check: queries should match
    assert query_gt == query_dense, f"Query mismatch: {query_gt} vs {query_dense}"

    # Build sample (NonLLM metric expects reference_contexts + retrieved_contexts)
    sample = SingleTurnSample(
        retrieved_contexts=dense_texts,
        reference_contexts=gt_texts
    )

    # Compute score (async)
    score = await context_precision.single_turn_ascore(sample)
    dense_scores.append((query_gt, score))
    print(f"Dense - Query: {query_gt}\nScore: {score}\n")

Dense - Query: Which solution is suitable for measuring room acoustics and speech intelligibility in compliance with ISO standards?
Score: 0.99999999995

Dense - Query: What product should be used for façade sound insulation testing on a construction site?
Score: 0.0

Dense - Query: Which sound source is recommended for calibrated speech intelligibility measurements using DIRAC?
Score: 0.0

Dense - Query: What sound source should be used for ISO 3382-compliant room acoustics measurements?
Score: 0.0

Dense - Query: Which product supports compliance with ISO 9612 for workplace noise exposure?
Score: 0.99999999995

Dense - Query: Which product is suitable for investigating environmental noise complaints?
Score: 0.0

Dense - Query: Which product is designed for measuring exhaust noise in vehicles?
Score: 0.0

Dense - Query: Which product helps verify safe noise emissions from toys and machinery?
Score: 0.99999999995

Dense - Query: Which HBK 2255 variant is best suited for long-term envir

Context Precision WITH Reference - LLM

In [8]:
context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)

# Evaluate Dense RAG
print("\n=== Evaluating Dense RAG (With Reference) ===")
dense_scores = []

for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_dense
):
    assert query_gt == query_dense, f"Query mismatch"

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=gt_answer,  # Ground truth answer
        retrieved_contexts=dense_texts
    )

    score = await context_precision.single_turn_ascore(sample)
    print(f"Dense - Query: {query_gt[:60]}... Score: {score}")
    dense_scores.append((query_gt, score))


=== Evaluating Dense RAG (With Reference) ===
Dense - Query: Which solution is suitable for measuring room acoustics and ... Score: 0.0
Dense - Query: What product should be used for façade sound insulation test... Score: 0.0
Dense - Query: Which sound source is recommended for calibrated speech inte... Score: 0.9999999999
Dense - Query: What sound source should be used for ISO 3382-compliant room... Score: 0.99999999995
Dense - Query: Which product supports compliance with ISO 9612 for workplac... Score: 0.99999999995
Dense - Query: Which product is suitable for investigating environmental no... Score: 0.99999999995
Dense - Query: Which product is designed for measuring exhaust noise in veh... Score: 0.99999999995
Dense - Query: Which product helps verify safe noise emissions from toys an... Score: 0.99999999995
Dense - Query: Which HBK 2255 variant is best suited for long-term environm... Score: 0.9999999999
Dense - Query: Which HBK 2255 variant should be used for evaluating workpla

Context Precision Without Reference - LLM

In [9]:
context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)

# Evaluate Dense RAG
print("\n=== Evaluating Dense RAG ===")
dense_scores = []

for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_dense, 
    dense_pairs
):
    # Sanity check
    assert query_gt == query_dense == query_response, f"Query mismatch: {query_gt} vs {query_dense} vs {query_response}"

    # Build sample
    sample = SingleTurnSample(
        user_input=query_gt,
        response=response,  # LLM's actual response
        retrieved_contexts=dense_texts
    )

    # Compute score
    score = await context_precision.single_turn_ascore(sample)
    print(f"Dense - Query: {query_gt[:60]}... Score: {score}")
    dense_scores.append((query_gt, score))


=== Evaluating Dense RAG ===
Dense - Query: Which solution is suitable for measuring room acoustics and ... Score: 0.9999999999
Dense - Query: What product should be used for façade sound insulation test... Score: 0.0
Dense - Query: Which sound source is recommended for calibrated speech inte... Score: 0.99999999995
Dense - Query: What sound source should be used for ISO 3382-compliant room... Score: 0.99999999995
Dense - Query: Which product supports compliance with ISO 9612 for workplac... Score: 0.99999999995
Dense - Query: Which product is suitable for investigating environmental no... Score: 0.99999999995
Dense - Query: Which product is designed for measuring exhaust noise in veh... Score: 0.99999999995
Dense - Query: Which product helps verify safe noise emissions from toys an... Score: 0.99999999995
Dense - Query: Which HBK 2255 variant is best suited for long-term environm... Score: 0.0
Dense - Query: Which HBK 2255 variant should be used for evaluating workpla... Score: 0.0
D

#### Sparse

In [10]:
# Ensure prerequisite variables are available
if 'query_texts_pairs' not in globals() or 'query_retrieved_pairs_dense' not in globals():
    raise NameError(
        "query_texts_pairs and/or query_retrieved_pairs_dense are not defined. "
        "Please run the cells that load ground truth (cell that creates query_texts_pairs) "
        "and dense retrieval results (cell that creates query_retrieved_pairs_dense) before this cell."
    )

# Initialize metric
context_precision = NonLLMContextPrecisionWithReference()


sparse_scores = []

# Loop over all queries for Sparse retrieval
# Note: query_texts_pairs elements are (query, ground_truth_answer, [reference_texts])
for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts) in zip(query_texts_pairs, query_retrieved_pairs_sparse):
    # Sanity check: queries should match
    assert query_gt == query_sparse, f"Query mismatch: {query_gt} vs {query_sparse}"

    # Build sample (NonLLM metric expects reference_contexts + retrieved_contexts)
    sample = SingleTurnSample(
        retrieved_contexts=sparse_texts,
        reference_contexts=gt_texts
    )

    # Compute score (async)
    score = await context_precision.single_turn_ascore(sample)
    sparse_scores.append((query_gt, score))
    print(f"Sparse - Query: {query_gt}\nScore: {score}\n")

Sparse - Query: Which solution is suitable for measuring room acoustics and speech intelligibility in compliance with ISO standards?
Score: 0.0

Sparse - Query: What product should be used for façade sound insulation testing on a construction site?
Score: 0.0

Sparse - Query: Which sound source is recommended for calibrated speech intelligibility measurements using DIRAC?
Score: 0.0

Sparse - Query: What sound source should be used for ISO 3382-compliant room acoustics measurements?
Score: 0.0

Sparse - Query: Which product supports compliance with ISO 9612 for workplace noise exposure?
Score: 0.0

Sparse - Query: Which product is suitable for investigating environmental noise complaints?
Score: 0.0

Sparse - Query: Which product is designed for measuring exhaust noise in vehicles?
Score: 0.0

Sparse - Query: Which product helps verify safe noise emissions from toys and machinery?
Score: 0.0

Sparse - Query: Which HBK 2255 variant is best suited for long-term environmental noise monito

In [11]:
context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)

# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG (With Reference) ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse
):
    assert query_gt == query_sparse, f"Query mismatch: {query_gt} vs {query_sparse}"

    sample = SingleTurnSample(
        user_input=query_gt,
        reference=gt_answer,  # Ground truth answer
        retrieved_contexts=sparse_texts
    )

    score = await context_precision.single_turn_ascore(sample)
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))


=== Evaluating Sparse RAG (With Reference) ===
Sparse - Query: Which solution is suitable for measuring room acoustics and ... Score: 0.99999999995
Sparse - Query: What product should be used for façade sound insulation test... Score: 0.0
Sparse - Query: Which sound source is recommended for calibrated speech inte... Score: 0.99999999995
Sparse - Query: What sound source should be used for ISO 3382-compliant room... Score: 0.99999999995
Sparse - Query: Which product supports compliance with ISO 9612 for workplac... Score: 0.99999999995
Sparse - Query: Which product is suitable for investigating environmental no... Score: 0.99999999995
Sparse - Query: Which product is designed for measuring exhaust noise in veh... Score: 0.99999999995
Sparse - Query: Which product helps verify safe noise emissions from toys an... Score: 0.99999999995
Sparse - Query: Which HBK 2255 variant is best suited for long-term environm... Score: 0.99999999995
Sparse - Query: Which HBK 2255 variant should be used

In [12]:
context_precision = LLMContextPrecisionWithoutReference(llm=evaluator_llm)

# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse, 
    sparse_pairs
):
    # Sanity check
    assert query_gt == query_sparse == query_response, f"Query mismatch: {query_gt} vs {query_sparse} vs {query_response}"

    # Build sample
    sample = SingleTurnSample(
        user_input=query_gt,
        response=response,  # LLM's actual response
        retrieved_contexts=sparse_texts
    )

    # Compute score
    score = await context_precision.single_turn_ascore(sample)
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))


=== Evaluating Sparse RAG ===
Sparse - Query: Which solution is suitable for measuring room acoustics and ... Score: 0.99999999995
Sparse - Query: What product should be used for façade sound insulation test... Score: 0.0
Sparse - Query: Which sound source is recommended for calibrated speech inte... Score: 0.99999999995
Sparse - Query: What sound source should be used for ISO 3382-compliant room... Score: 0.99999999995
Sparse - Query: Which product supports compliance with ISO 9612 for workplac... Score: 0.99999999995
Sparse - Query: Which product is suitable for investigating environmental no... Score: 0.99999999995
Sparse - Query: Which product is designed for measuring exhaust noise in veh... Score: 0.99999999995
Sparse - Query: Which product helps verify safe noise emissions from toys an... Score: 0.99999999995
Sparse - Query: Which HBK 2255 variant is best suited for long-term environm... Score: 0.99999999995
Sparse - Query: Which HBK 2255 variant should be used for evaluating w

### Noise Sensitivity

#### Dense

#### Sparse

In [None]:
# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG (NoiseSensitivity) ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse, 
    sparse_pairs
):
    # Sanity check
    assert query_gt == query_sparse == query_response, f"Query mismatch: {query_gt} vs {query_sparse} vs {query_response}"

    sample = SingleTurnSample(
    user_input=query_gt,
    response=response,
    reference=gt_answer,
    retrieved_contexts=sparse_texts
    )

    scorer = NoiseSensitivity(llm=evaluator_llm)
    score = await scorer.single_turn_ascore(sample)
    
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))


=== Evaluating Sparse RAG (NoiseSensitivity) ===
Sparse - Query: Which solution is suitable for measuring room acoustics and ... Score: 0.6
Sparse - Query: What product should be used for façade sound insulation test... Score: 0.0
Sparse - Query: Which sound source is recommended for calibrated speech inte... Score: 1.0
Sparse - Query: What sound source should be used for ISO 3382-compliant room... Score: 0.3333333333333333
Sparse - Query: Which product supports compliance with ISO 9612 for workplac... Score: 0.25
Sparse - Query: Which product is suitable for investigating environmental no... Score: 0.5
Sparse - Query: Which product is designed for measuring exhaust noise in veh... Score: 0.0
Sparse - Query: Which product helps verify safe noise emissions from toys an... Score: 0.6666666666666666
Sparse - Query: Which HBK 2255 variant is best suited for long-term environm... Score: 0.2
Sparse - Query: Which HBK 2255 variant should be used for evaluating workpla... Score: 0.0
Sparse - 

CancelledError: 