In [None]:
# --- Path Setup ---

import os
import sys
from pathlib import Path

# Resolve the project root: when the kernel starts inside "notebooks/", step up one level.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

# Prepend the root first and "src" second, so "src" ends up at the very front of sys.path.
for import_root in (project_root, project_root / "src"):
    sys.path.insert(0, str(import_root))

# From here on, relative paths are resolved against the project root.
os.chdir(project_root)

In [None]:
# --- Imports ---

import ast
import logging
import voyageai
import anthropic
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.notebook import tqdm
from src.query_rag import RAGSystem  
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from src.query_rag_retrieval import RetrievalEvaluationSystem

from ragas_modified.evaluation import evaluate
from ragas_modified.run_config import RunConfig
from ragas_modified.llms import LangchainLLMWrapper
from ragas_modified.dataset_schema import EvaluationDataset
from ragas_modified.embeddings import LangchainEmbeddingsWrapper
from ragas_modified.metrics import LLMContextPrecisionWithReference, LLMContextRecall, Faithfulness, ResponseRelevancy

# Silence log output from the RAG system.
# NOTE: logging.disable(logging.CRITICAL) is process-wide -- it suppresses every
# logging call at CRITICAL severity and below, from all libraries, not only the RAG code.
logging.disable(logging.CRITICAL)

# Initialize Voyage AI client (no API key is passed explicitly here;
# presumably it is picked up from the environment -- TODO confirm).
vo = voyageai.Client()

In [None]:
# Load the testing dataset. Its "Question" and "Answer" columns are consumed by the
# cells further down; the path is relative to the current working directory.
df_QA = pd.read_csv("data/NICEQA_utf8.csv", encoding="utf-8")
df_QA.head()

In [None]:
# Inspect the reference answer stored under index label 0.
df_QA["Answer"][0]

## RAG Responses

In [None]:
# Function for running RAG queries

def query_rag_for_evaluation(
    query_text: str,
    model_weights: dict,
    rag_system: RAGSystem,
    llm_model: str = "anthropic/claude-sonnet-4",
    similarity_k: int = 25,
    common_sections_n: int = 15,
    filename_type_filter: str = "CG, NG",
    info_source: str = "NICE",
    use_hybrid_search: bool = True,
    wrrf_k: int = 40,
    use_reranker: bool = True,
    reranker_model: str = "rerank-2",
    reranker_top_k: int = 5,
):
    """Run one query through the streaming RAG pipeline and collect the full answer.

    The stream returned by ``rag_system.query_rag_stream`` is consumed to the end:
    every text chunk is concatenated into the response, while the sources and
    context values are taken from the last item yielded (both remain "" if the
    stream yields nothing). All retrieval/generation options are forwarded
    unchanged as keyword arguments.

    Returns:
        ``(response, sources, context)`` on success.
        If anything raises, the error is printed and
        ``("Error: <message>", "", [])`` is returned instead -- note that the
        context slot is an empty list in that case, not a string.
    """
    stream_options = dict(
        query_text=query_text,
        llm_model=llm_model,
        similarity_k=similarity_k,
        common_sections_n=common_sections_n,
        info_source=info_source,
        model_weights=model_weights,
        filename_type_filter=filename_type_filter,
        use_hybrid_search=use_hybrid_search,
        wrrf_k=wrrf_k,
        use_reranker=use_reranker,
        reranker_model=reranker_model,
        reranker_top_k=reranker_top_k,
    )

    try:
        answer_parts = []
        latest_sources = ""
        latest_context = ""

        # Each stream item is (text chunk, sources, context, sources data);
        # the fourth element is not needed for evaluation.
        for piece, piece_sources, piece_context, _sources_data in rag_system.query_rag_stream(**stream_options):
            answer_parts.append(piece)
            latest_sources, latest_context = piece_sources, piece_context

        return ''.join(answer_parts), latest_sources, latest_context

    except Exception as e:
        # Best-effort: one failing query must not abort a whole evaluation run.
        print(f"Error in query_rag_for_evaluation: {e}")
        return f"Error: {e}", "", []

In [None]:
# Run RAG system for each query in the dataset

# Collect one evaluation record per question: RAG answer, retrieved context and reference answer.
dataset = []
sample_queries = df_QA["Question"].tolist()
expected_responses = df_QA["Answer"].tolist()

rag_system = RAGSystem()

qa_pairs = zip(sample_queries, expected_responses)
for query, reference in tqdm(qa_pairs, desc="Building evaluation dataset", total=len(sample_queries)):
    # Per-retriever weights forwarded to the RAG system.
    model_weights = {"voyage-3-large": 5.0, "BM25": 1.0}

    response, sources, relevant_docs = query_rag_for_evaluation(query, rag_system=rag_system, model_weights=model_weights)

    # The record stores contexts as a list, so wrap anything that is not already one.
    relevant_docs = relevant_docs if isinstance(relevant_docs, list) else [relevant_docs]

    dataset.append(dict(user_input=query, retrieved_contexts=relevant_docs, response=response, reference=reference))

df = pd.DataFrame(dataset)
print(f"Created evaluation dataset with {len(df)} samples")

In [None]:
# Preview the first rows of the evaluation dataset held in `df`.
df.head()

In [None]:
# Persist the RAG evaluation dataset.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (otherwise this cell fails on a fresh checkout).
rag_dataset_path = "ragas_results2/rag_evaluation_dataset_claude_sonnet4_5chunks.csv"
os.makedirs(os.path.dirname(rag_dataset_path), exist_ok=True)
df.to_csv(rag_dataset_path, index=False)

## LLM-Only Responses

In [None]:
# Define function for LLM-only answers

def llm_only_answer(query_text: str, llm_model: str = "gpt-4.1-nano", system_prompt: str | None = None, use_web_search: bool = False) -> str:
    system_prompt = (
        "You are a medical AI assistant tasked with answering clinical questions strictly based on NICE clinical guidelines. "
        "Follow these rules:\n"
        "1. Only use information from NICE guidelines.\n"
        "2. If no relevant NICE guideline information is available, reply: 'There are no relevant NICE guidelines for this request.'\n"
        "3. Be concise. Use markdown for lists and tables.\n"
        "4. Never fabricate sources or references.\n"
    )
    
    if llm_model.startswith("claude"):
        if use_web_search:
            anthropic_client = anthropic.Anthropic()
            
            full_query = f"{system_prompt}\n\nUser question: {query_text}"
            
            resp = anthropic_client.messages.create(
                model=llm_model,
                max_tokens=1024,
                messages=[
                    {
                        "role": "user",
                        "content": full_query
                    }
                ],
                tools=[{
                    "type": "web_search_20250305",
                    "name": "web_search",
                    "max_uses": 5,
                    "allowed_domains": ["nice.org.uk"]
                }]
            )
            
            if resp.content:
                content_parts = []
                for content in resp.content:
                    if hasattr(content, 'text'):
                        content_parts.append(content.text)
                return '\n'.join(content_parts) if content_parts else str(resp.content)
            return str(resp)
        else:
            anthropic_client = OpenAI(
                api_key=os.getenv("ANTHROPIC_API_KEY"),
                base_url="https://api.anthropic.com/v1/"
            )
            resp = anthropic_client.chat.completions.create(
                model=llm_model,
                temperature=0,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": query_text},
                ],
            )
            return resp.choices[0].message.content
    else:
        if "o4-mini" in llm_model:
            client = OpenAI()
            resp = client.chat.completions.create(
                model=llm_model,
                messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query_text},
                ]
            )
            return resp.choices[0].message.content
        else:
            client = OpenAI()
            resp = client.chat.completions.create(
                model=llm_model,
                temperature=0,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": query_text},
                ]
            )
            return resp.choices[0].message.content

def make_query_embeddings(q: str):
    """Embed a query with the Voyage client and key the result by model name.

    Returns a one-entry dict ``{"voyage-3-large": array}`` where the array is a
    float32 NumPy array built from the ``embeddings`` attribute of the client's
    response.
    """
    embed_response = vo.embed(q, input_type="query", model="voyage-3-large", output_dimension=2048)
    query_vectors = np.array(embed_response.embeddings, dtype=np.float32)
    return {"voyage-3-large": query_vectors}

# Module-level retrieval system instance; get_retrieved_context_texts() reads this global.
retrieval = RetrievalEvaluationSystem()

def get_retrieved_context_texts(query: str, k: int = 15, n_common: int = 15, info_source: str = "NICE", filename_type_filter: str | None = None):
    docs = retrieval.retrieve_documents(
        query_embeddings=make_query_embeddings(query),
        query_text=query, similarity_k=k,
        common_sections_n=n_common, info_source=info_source,
        model_weights={"voyage-3-large": 5.0, "BM25": 1.0}, filename_type_filter=filename_type_filter,
        use_hybrid_search=True, use_reranker=True, reranker_model="rerank-2",
        reranker_top_k=10,
        return_docs=True, 
    )
    return [d.get("document", "") for d in docs if d.get("document")]

# Baseline dataset: the answer comes from the LLM alone, while the retrieved
# contexts are stored next to it in the same record.
llm_only_with_rag_contexts = []

qa_iter = zip(df_QA["Question"], df_QA["Answer"])
for query, reference in tqdm(qa_iter, total=len(df_QA), desc="LLM-only + RAG contexts + Web Search"):
    contexts = get_retrieved_context_texts(query, k=15, n_common=15, info_source="NICE")
    # The contexts are not passed to the model here; web search is switched off.
    response = llm_only_answer(query, llm_model="o4-mini", use_web_search=False)
    llm_only_with_rag_contexts.append(
        dict(user_input=query, retrieved_contexts=contexts, response=response, reference=reference)
    )

df_llm_only = pd.DataFrame(llm_only_with_rag_contexts)
print(f"Created dataset with {len(df_llm_only)} samples")
df_llm_only.head()

In [None]:
# Persist the LLM-only baseline dataset.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (otherwise this cell fails on a fresh checkout).
llm_only_dataset_path = "ragas_results2/rag_evaluation_dataset_o4_mini_baseline10_search.csv"
os.makedirs(os.path.dirname(llm_only_dataset_path), exist_ok=True)
df_llm_only.to_csv(llm_only_dataset_path, index=False, encoding='utf-8')

In [None]:
# Spot-check the reference answer of the row at position 4.
df_llm_only["reference"].iloc[4]

## RAGAs

In [None]:
# Load a saved evaluation dataset. The 'retrieved_contexts' column is expected to
# hold Python-literal text (e.g. the repr of a list), which is parsed back into
# Python objects with ast.literal_eval.
df = pd.read_csv("baseline_results/rag_evaluation_dataset_gpt_4_1_nano_baseline10.csv").assign(
    retrieved_contexts=lambda frame: frame['retrieved_contexts'].apply(ast.literal_eval)
)


In [None]:
# Set up RAGAS evaluation: the judge model is reached through the OpenRouter
# endpoint, authenticated with the OPENROUTER_API_KEY environment variable.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
judge_chat_model = ChatOpenAI(
    model="openai/gpt-4.1-mini",
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1/",
)
evaluator_llm = LangchainLLMWrapper(judge_chat_model)
evaluation_dataset = EvaluationDataset.from_pandas(df)

In [None]:
# Run Evaluation
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
ragas_embeddings = LangchainEmbeddingsWrapper(embeddings)

# Alternative metric sets, kept so they can be switched in between runs:
#metrics = [LLMContextPrecisionWithReference(), LLMContextRecall(), ResponseRelevancy(), Faithfulness()]
#metrics = [ResponseRelevancy(), Faithfulness()]
metrics = [Faithfulness()]

print("Starting RAGAS evaluation")
# One worker at a time; exceptions raised during scoring are propagated.
single_worker_config = RunConfig(max_workers=1)
result = evaluate(
    dataset=evaluation_dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=ragas_embeddings,
    raise_exceptions=True,
    run_config=single_worker_config,
)

print("Evaluation Results:")
print(result)

In [None]:
# Export results to CSV 
def export_ragas_results_to_csv(result, llm_model, embedding_model, num_chunks, evaluation_llm, filename):
    """Append one summary row of RAGAS scores, plus run metadata, to a CSV file.

    Args:
        result: Evaluation result. If it has a ``_scores_dict`` attribute, each
            entry becomes a column (lists are averaged with ``np.mean``).
            Otherwise every public attribute holding an int, float or list is
            exported the same way.
        llm_model, embedding_model, num_chunks, evaluation_llm: Metadata values
            written verbatim into the row.
        filename: Target CSV path. Missing parent directories are created; a
            bare filename (no directory part) is written to the working directory.

    Returns:
        The one-row DataFrame that was appended.

    Note:
        Rows are appended without re-checking the existing header, so runs with
        a different set of metrics will not line up with earlier columns.
    """
    results_data = {
        'llm_model': llm_model,
        'embedding_model': embedding_model,
        'num_chunks': num_chunks,
        'evaluation_llm': evaluation_llm,
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    if hasattr(result, '_scores_dict'):
        for metric_name, score in result._scores_dict.items():
            results_data[metric_name] = np.mean(score) if isinstance(score, list) else score
    else:
        for attr in dir(result):
            if not attr.startswith('_') and hasattr(result, attr):
                value = getattr(result, attr)
                if isinstance(value, (int, float, list)):
                    results_data[attr] = np.mean(value) if isinstance(value, list) else value

    results_df = pd.DataFrame([results_data])

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when there is one.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Write the header for a new file, and also for an existing but empty one.
    write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    results_df.to_csv(filename, mode='a', header=write_header, index=False)

    print(f"Results exported to: {filename}")
    return results_df

# Configuration
# These values are only recorded as metadata columns next to the scores; they
# do not configure the evaluation itself, so keep them in sync with the run
# whose `result` is being exported.
llm_model = "gpt-4.1-nano" 
embedding_model = "text-embedding-3-small"  
num_chunks = 10
evaluation_llm = "gpt-4.1-mini"  

# Append the summary row for the current `result` to the shared results file.
results_df = export_ragas_results_to_csv(result=result, llm_model=llm_model, embedding_model=embedding_model, num_chunks=num_chunks,
                                            evaluation_llm=evaluation_llm, filename="baseline_results/baseline_evaluation_results.csv")

print("\nExported data:")
print(results_df)

In [None]:
# One row per evaluated sample: its inputs plus a "<metric>_score" column for
# every metric in the evaluation result.
per_metric_scores = result._scores_dict

detailed_results = [
    {
        'question': sample.user_input,
        'answer': sample.response,
        'reference': sample.reference,
        'retrieved_contexts': sample.retrieved_contexts,
        # Scores are looked up by the sample's position in the dataset.
        **{f'{metric_name}_score': values[i] for metric_name, values in per_metric_scores.items()},
    }
    for i, sample in enumerate(evaluation_dataset.samples)
]

detailed_df = pd.DataFrame(detailed_results)
print(detailed_df.head())
detailed_df.to_csv("baseline_results/rag_evaluation_dataset_gpt_4.1_nano_baseline10_results.csv", index=False)

In [None]:
# Load saved per-sample RAGAS results for failure analysis; the cells below read
# its 'answer_relevancy_score' and 'faithfulness_score' columns.
ragas_failure_df = pd.read_csv("ragas_results2/rag_evaluation_dataset_o4_mini10chunks_results.csv")
ragas_failure_df.head()

In [None]:
# Show the sample whose answer relevancy score is the lowest, next to its reference answer.
relevancy_scores = ragas_failure_df['answer_relevancy_score']
lowest_idx = relevancy_scores.idxmin()
lowest_sample = ragas_failure_df.loc[lowest_idx]

relevancy_report = [
    f"Lowest score: {lowest_sample['answer_relevancy_score']:.4f}",
    f"\nQuestion: {lowest_sample['question']}",
    f"\nSystem Answer: \n\n{lowest_sample['answer']}",
    "---------------------------------------------------------",
    f"\nReference: \n\n{lowest_sample['reference']}",
]
print("\n".join(relevancy_report))

In [None]:
# Print the system answers of the three samples with the smallest faithfulness scores.
worst_indices = ragas_failure_df['faithfulness_score'].nsmallest(3).index.tolist()

section_rule = "=" * 80
answer_rule = "-" * 60

print("SYSTEM ANSWERS FOR THE 3 WORST FAITHFULNESS SCORES:")
print(section_rule)

for position, idx in enumerate(worst_indices):
    rank = position + 1  # ranks are reported starting at 1
    sample = ragas_failure_df.loc[idx]

    print(
        f"\nRANK {rank} WORST FAITHFULNESS SCORE",
        f"Index: {idx}",
        f"Faithfulness score: {sample['faithfulness_score']:.4f}",
        f"Question: {sample['question']}",
        "\nSystem Answer:",
        answer_rule,
        sample['answer'],
        section_rule,
        sep="\n",
    )

In [None]:
# Show the sample whose faithfulness score is the lowest, together with its retrieved context.
faithfulness_scores = ragas_failure_df['faithfulness_score']
lowest_faith_idx = faithfulness_scores.idxmin()
lowest_faith_sample = ragas_failure_df.loc[lowest_faith_idx]

faithfulness_report = [
    f"Lowest faithfulness score: {lowest_faith_sample['faithfulness_score']:.4f}",
    f"\nQuestion: {lowest_faith_sample['question']}",
    f"\nSystem Answer: \n\n {lowest_faith_sample['answer']}",
    "---------------------------------------------------------",
    f"\nContext: \n\n {lowest_faith_sample['retrieved_contexts']}",
]
print("\n".join(faithfulness_report))

In [None]:
# Analyse sentence by sentence impact on faithfulness

import ast
import re

# Embedding model for the evaluate() calls in this cell (wrapped in LangchainEmbeddingsWrapper at the call site).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  

def split_into_sentences(text):
    """Split an answer into sentence-like units, keeping list items whole.

    The stripped text is first cut in front of every bullet ("-", "•", "*") or
    numbered ("1. ") item that starts a line. A piece that begins with such a
    marker is kept as a single unit; any other piece is split after ".", "!"
    or "?" followed by whitespace. Empty pieces are dropped and every unit is
    returned stripped, in original order.
    """
    list_item_boundary = r'(?=\n\s*[-•*]\s+|^\s*[-•*]\s+|\n\s*\d+\.\s+|^\s*\d+\.\s+)'
    list_item_start = r'^\s*[-•*]\s+|^\s*\d+\.\s+'
    sentence_boundary = r'(?<=[.!?])\s+'

    units = []
    for raw_chunk in re.split(list_item_boundary, text.strip()):
        chunk = raw_chunk.strip()
        if not chunk:
            continue

        # Bullet / numbered items stay intact, even if they contain several sentences.
        if re.match(list_item_start, chunk):
            units.append(chunk)
            continue

        # Plain prose: break after sentence-ending punctuation.
        for piece in re.split(sentence_boundary, chunk):
            piece = piece.strip()
            if piece:
                units.append(piece)

    return units

# Sentence-by-sentence faithfulness analysis of the worst-scoring answer.
# The answer is rebuilt one sentence at a time and re-scored after each addition,
# so a drop in the score can be attributed to the sentence that was just added.

# Select the sample again here so that this cell does not depend on the previous one.
lowest_faith_idx = ragas_failure_df['faithfulness_score'].idxmin()
lowest_faith_sample = ragas_failure_df.loc[lowest_faith_idx]

print(f"Lowest faithfulness score: {lowest_faith_sample['faithfulness_score']:.4f}")
print(f"Question: {lowest_faith_sample['question']}")
print(f"System Answer: {lowest_faith_sample['answer']}")
print("\n" + "="*80)

# Test the function first
sentences = split_into_sentences(lowest_faith_sample['answer'])
print(f"Number of sentences found: {len(sentences)}")
for i, sent in enumerate(sentences, 1):
    print(f"{i}: {sent[:100]}{'...' if len(sent) > 100 else ''}")

print("\n" + "="*80)
print("PROCEEDING WITH ANALYSIS:")
print("="*80)

# Loop-invariant inputs: parse the stored contexts and build the judge wrappers once
# instead of once per sentence.
retrieved_contexts = ast.literal_eval(lowest_faith_sample['retrieved_contexts'])
faithfulness_judge = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))
faithfulness_embeddings = LangchainEmbeddingsWrapper(embeddings)

results = []
current_answer = ""

for i, sentence in enumerate(sentences):
    # Grow the answer by one sentence per step.
    current_answer = sentence if i == 0 else current_answer + " " + sentence

    test_data = [{
        "user_input": lowest_faith_sample['question'],
        "response": current_answer,
        # Each evaluation gets its own copy of the context list.
        "retrieved_contexts": list(retrieved_contexts)
    }]

    test_dataset = EvaluationDataset.from_pandas(pd.DataFrame(test_data))
    faith_result = evaluate(test_dataset, [Faithfulness()],
                           faithfulness_judge,
                           faithfulness_embeddings)

    score = faith_result['faithfulness'][0]
    results.append({'sentence_num': i + 1, 'faithfulness_score': score, 'sentence_added': sentence})
    print(f"Sentence {i+1}: {score:.4f}")

print("\n" + "="*80)
print("INCREMENTAL ANALYSIS:")

biggest_drop = 0
problematic_sentence_idx = None

# NOTE: the loop variable is deliberately not called `result`. That name holds the
# notebook-level RAGAS evaluation result, which the export cells read; rebinding it
# here would silently break them when they are re-run after this cell.
for i, step in enumerate(results):
    if i == 0:
        print(f"Sentence 1: {step['faithfulness_score']:.4f} (baseline)")
    else:
        change = step['faithfulness_score'] - results[i-1]['faithfulness_score']
        print(f"Sentence {step['sentence_num']}: {step['faithfulness_score']:.4f} (change: {change:+.4f})")

        # Track the single largest decrease between consecutive steps.
        if change < biggest_drop:
            biggest_drop = change
            problematic_sentence_idx = i

        # Flag every sentence that lowers the score by more than 0.05.
        if change < -0.05:
            print(f"  ⚠️  ISSUE: {step['sentence_added']}")

print(f"\n{'='*80}")
print("RETRIEVED CONTEXTS:")
print(f"Number of contexts: {len(retrieved_contexts)}")

if problematic_sentence_idx is not None:
    print(f"Most problematic sentence (drop: {biggest_drop:.4f}):")
    print(f"{results[problematic_sentence_idx]['sentence_added']}")

print(f"\n{'-'*40}")
for i, context in enumerate(retrieved_contexts, 1):
    print(f"CONTEXT {i}: {context}")
    print(f"{'-'*40}")

In [None]:
# Detailed faithfulness analysis
# Embedding model read as a global by get_detailed_faithfulness_sync() below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  

def get_detailed_faithfulness_sync(question, answer, contexts):
    """Score one sample for faithfulness and print every statement and verdict.

    A single-row evaluation dataset is built from the arguments and evaluated
    with a Faithfulness subclass that records and prints its intermediate
    results. The judge is ChatOpenAI(model="gpt-4.1-mini"); the embeddings come
    from the module-level `embeddings` variable.

    Args:
        question: Stored as "user_input" of the sample.
        answer: Stored as "response" of the sample.
        contexts: Stored as "retrieved_contexts" of the sample.

    Returns:
        A tuple (statements, verdicts, score): the statements and verdicts
        recorded during scoring, and the first 'faithfulness' value of the
        evaluation result.
    """
    
    # Create a custom version that exposes intermediate results
    class VerboseFaithfulness(Faithfulness):
        """Faithfulness metric that keeps and prints the details of its latest scoring run."""

        def __init__(self):
            super().__init__()
            # Filled in by _ascore; empty until a row has been scored.
            self.last_statements = []
            self.last_verdicts = []
            
        async def _ascore(self, row, callbacks):
            """Score one row, printing the generated statements and their verdicts.

            Relies on the base class's private helpers _create_statements,
            _create_verdicts and _compute_score -- NOTE(review): their exact
            contracts are defined in ragas_modified and are not visible here.
            """
            # Generate statements
            statements_result = await self._create_statements(row, callbacks)
            self.last_statements = statements_result.statements
            
            print(f"Generated {len(self.last_statements)} statements:")
            for i, stmt in enumerate(self.last_statements, 1):
                print(f"{i}: {stmt}")
            
            # Nothing to judge: report NaN rather than a numeric score.
            if not self.last_statements:
                return float('nan')
            
            # Get verdicts
            verdicts_result = await self._create_verdicts(row, self.last_statements, callbacks)
            self.last_verdicts = verdicts_result.statements
            
            print(f"\n{'='*80}")
            print("DETAILED VERDICT ANALYSIS:")
            print("="*80)
            
            # A verdict value of 1 is counted as faithful; anything else is shown as unfaithful.
            faithful_count = 0
            for i, verdict in enumerate(self.last_verdicts, 1):
                status = "✅ FAITHFUL" if verdict.verdict == 1 else "❌ UNFAITHFUL"
                print(f"\nStatement {i}: {status}")
                print(f"Text: {verdict.statement}")
                print(f"Reason: {verdict.reason}")
                print(f"Verdict: {verdict.verdict}")
                print("-" * 60)
                
                if verdict.verdict == 1:
                    faithful_count += 1
            
            # The returned score comes from the base class, not from faithful_count,
            # which is only used for the printed summary.
            score = self._compute_score(verdicts_result)
            print(f"\nFINAL FAITHFULNESS SCORE: {score:.4f}")
            print(f"({faithful_count}/{len(self.last_verdicts)} statements faithful)")
            
            return score
    
    # Create the test data
    test_data = [{
        "user_input": question,
        "response": answer,
        "retrieved_contexts": contexts
    }]
    
    # Run the verbose analysis
    verbose_metric = VerboseFaithfulness()
    test_dataset = EvaluationDataset.from_pandas(pd.DataFrame(test_data))
    
    result = evaluate(test_dataset, [verbose_metric], 
                     LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini")), 
                     LangchainEmbeddingsWrapper(embeddings))
    
    # The statements/verdicts are read back from the metric instance used for the run.
    return verbose_metric.last_statements, verbose_metric.last_verdicts, result['faithfulness'][0]

# Run the verbose faithfulness analysis on the lowest-scoring sample, then list
# the statements that were judged unfaithful.
report_rule = "=" * 80

print("ANALYZING PROBLEMATIC RESPONSE WITH DETAILED REASONING:")
print(report_rule)

# Question, answer and contexts all come from lowest_faith_sample; the contexts
# are stored as text and parsed back into a Python object first.
contexts = ast.literal_eval(lowest_faith_sample['retrieved_contexts'])
statements, verdicts, final_score = get_detailed_faithfulness_sync(
    lowest_faith_sample['question'],
    lowest_faith_sample['answer'],
    contexts,
)

print("\n" + report_rule)
print("SUMMARY OF UNFAITHFUL STATEMENTS:")
print(report_rule)

# Keep (1-based position, verdict) pairs for every verdict equal to 0.
unfaithful_statements = [
    (position, verdict)
    for position, verdict in enumerate(verdicts, 1)
    if verdict.verdict == 0
]

for position, verdict in unfaithful_statements:
    print(f"\n❌ Statement {position}: UNFAITHFUL")
    print(f"Text: {verdict.statement}")
    print(f"Reason: {verdict.reason}")

if unfaithful_statements:
    print(f"\n{len(unfaithful_statements)} out of {len(verdicts)} statements were unfaithful")
    print(f"This resulted in a faithfulness score of {final_score:.4f}")
else:
    print("No unfaithful statements found!")