In [None]:
import json
from typing import List, Dict, Tuple
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import LLMContextPrecisionWithoutReference
from ragas.metrics import NoiseSensitivity

In [None]:
from langchain_community.chat_models import ChatOllama
from ragas.llms import LangchainLLMWrapper

# Judge model for the LLM-based metric: a ChatOllama chat model ("llama3.1")
# wrapped in LangchainLLMWrapper. `evaluator_llm` is what the evaluation
# cells pass as `llm=` to NoiseSensitivity.
# NOTE(review): presumably requires a reachable Ollama instance with the
# "llama3.1" model available — confirm before running.
llm = ChatOllama(model="llama3.1")
evaluator_llm = LangchainLLMWrapper(llm)

In [None]:
import json

path = r"gt.json"

# Load the ground-truth file. The comprehension below reads every item as a
# mapping with "question", "reference" and an optional "context" list.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build a list of tuples: (question, reference_answer, [list of context texts]).
# All strings are stripped; the evaluation cells compare the question text
# for exact equality against the retrieval results and the chat logs.
query_texts_pairs = [
    (
        item["question"].strip(),
        item["reference"].strip(),
        [seg.strip() for seg in item.get("context", [])],
    )
    for item in data
]

# Preview the first entry. Guarded because an empty file, or a first item
# without "context" (which defaults to []), would otherwise raise IndexError.
if query_texts_pairs:
    first_query, first_reference, first_contexts = query_texts_pairs[0]
    print("Query:", first_query)
    print("Reference answer:", first_reference)
    print("Number of context texts:", len(first_contexts))
    if first_contexts:
        print("First context snippet:", first_contexts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First context snippet: <none>\n")
else:
    print(f"No ground-truth items found in {path}")


In [None]:
# Path to your RAG output JSON
path = r"retrieval_results_dense.json"

# Load the JSON. It is iterated below as a mapping of
# query -> list of records, each carrying a "text" field.
with open(path, "r", encoding="utf-8") as f:
    rag_data_dense = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts]).
# The query is stripped so it is normalized the same way as the ground-truth
# and chat-log queries; the dense evaluation cell asserts exact equality
# between them, so stray whitespace would trigger a false "Query mismatch".
query_retrieved_pairs_dense = [
    (
        query.strip(),
        [item["text"].strip() for item in texts],
    )
    for query, texts in rag_data_dense.items()
]

# Preview the first query. Guarded so an empty result file, or a query with
# no retrieved texts, does not raise IndexError.
if query_retrieved_pairs_dense:
    first_query, first_texts = query_retrieved_pairs_dense[0]
    print("Query:", first_query)
    print("Number of retrieved texts:", len(first_texts))
    if first_texts:
        print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First retrieved text snippet: <none>\n")
else:
    print(f"No dense retrieval results found in {path}")

In [None]:
# Path to your BM25 output JSON
path = r"retrieval_results_sparse.json"

# Load the JSON. It is iterated below as a list of records, each with a
# "query" string and a "results" list whose entries carry a "window" field.
with open(path, "r", encoding="utf-8") as f:
    bm25_data = json.load(f)

# Build a list of tuples: (query, [list of retrieved texts]).
# The query is stripped so it is normalized the same way as the ground-truth
# and chat-log queries; the sparse evaluation cell asserts exact equality
# between them, so stray whitespace would trigger a false "Query mismatch".
query_retrieved_pairs_sparse = [
    (
        item["query"].strip(),
        [res["window"].strip() for res in item["results"]],
    )
    for item in bm25_data
]

# Preview the first query. Guarded so an empty result file, or a query with
# no retrieved windows, does not raise IndexError.
if query_retrieved_pairs_sparse:
    first_query, first_texts = query_retrieved_pairs_sparse[0]
    print("Query:", first_query)
    print("Number of retrieved texts:", len(first_texts))
    if first_texts:
        print("First retrieved text snippet:", first_texts[0][:200].replace("\n", " "), "...\n")
    else:
        print("First retrieved text snippet: <none>\n")
else:
    print(f"No sparse retrieval results found in {path}")

In [None]:
# Read the chat log and split its (query, response) pairs by retrieval mode.
path = r"chat_logs.json"

with open(path, "r", encoding="utf-8") as log_file:
    data = json.load(log_file)

dense_pairs = []
sparse_pairs = []

# Dispatch table: normalized mode label -> list collecting that mode's pairs.
# Records with any other mode label are ignored.
pairs_by_mode = {
    "dense rag": dense_pairs,
    "sparse rag": sparse_pairs,
}

for entry in data:
    # Missing fields fall back to "" so the strip calls are always valid.
    query = entry.get("query", "").strip()
    mode = entry.get("mode", "").strip().lower()
    response = entry.get("response", "").strip()

    bucket = pairs_by_mode.get(mode)
    if bucket is not None:
        bucket.append((query, response))

print("Dense pairs found:", len(dense_pairs))
print("Sparse pairs found:", len(sparse_pairs))

In [None]:
# Evaluate Sparse RAG
print("\n=== Evaluating Sparse RAG (NoiseSensitivity) ===")
sparse_scores = []

for (query_gt, gt_answer, gt_texts), (query_sparse, sparse_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_sparse, 
    sparse_pairs
):
    # Sanity check
    assert query_gt == query_sparse == query_response, f"Query mismatch: {query_gt} vs {query_sparse} vs {query_response}"

    sample = SingleTurnSample(
    user_input=query_gt,
    response=response,
    reference=gt_answer,
    retrieved_contexts=sparse_texts
    )

    scorer = NoiseSensitivity(llm=evaluator_llm)
    score = await scorer.single_turn_ascore(sample)
    
    print(f"Sparse - Query: {query_gt[:60]}... Score: {score}")
    sparse_scores.append((query_gt, score))

In [None]:
# Evaluate Sparse RAG
print("\n=== Evaluating Dense RAG (NoiseSensitivity) ===")
dense_scores = []

for (query_gt, gt_answer, gt_texts), (query_dense, dense_texts), (query_response, response) in zip(
    query_texts_pairs, 
    query_retrieved_pairs_dense, 
    dense_pairs
):
    # Sanity check
    assert query_gt == query_dense == query_response, f"Query mismatch: {query_gt} vs {query_dense} vs {query_response}"
    sample = SingleTurnSample(
    user_input=query_gt,
    response=response,
    reference=gt_answer,
    retrieved_contexts=dense_texts
    )

    scorer = NoiseSensitivity(llm=evaluator_llm)
    score = await scorer.single_turn_ascore(sample)
    
    print(f"Dense - Query: {query_gt[:60]}... Score: {score}")
    dense_scores.append((query_gt, score))