In [17]:
# Install required packages
%pip install faiss-cpu python-dotenv openai sentence-transformers pdfplumber
import os
import re
import json
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional
from dotenv import load_dotenv
from openai import AzureOpenAI
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')


# Load environment variables
load_dotenv()

def init_client():
    """Initialize Azure OpenAI client with environment variables."""
    api_key = os.getenv("AZURE_OPENAI_API_KEY")
    if not api_key:
        raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")

    endpoint = os.getenv("ENDPOINT_URL")
    if not endpoint:
        raise ValueError("ENDPOINT_URL environment variable not set.")

    client = AzureOpenAI(
        api_key=api_key,
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_endpoint=endpoint
    )
    return client

# Initialize client
client = init_client()



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [18]:
def extract_json_from_response(response: str) -> str:
    """Extract JSON from response, handling various formats"""
    # Handle markdown code blocks
    if '```json' in response:
        match = re.search(r'```json\s*\n?(.*?)\n?```', response, re.DOTALL)
        if match:
            return match.group(1).strip()

    if '```' in response:
        match = re.search(r'```\s*\n?(.*?)\n?```', response, re.DOTALL)
        if match:
            return match.group(1).strip()

    # Find JSON objects
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    matches = re.findall(json_pattern, response, re.DOTALL)
    if matches:
        return max(matches, key=len).strip()

    return response.strip()

def load_pdf_with_advanced_chunking(pdf_dir: str, pdf_files: List[str]) -> List[Dict]:
    """
    Load PDFs with advanced chunking strategies for better retrieval.
    Returns list of chunks with metadata.
    """
    chunks = []

    for pdf_file in pdf_files:
        file_path = os.path.join(pdf_dir, pdf_file)
        if not os.path.exists(file_path):
            print(f"Warning: {pdf_file} not found.")
            continue

        try:
            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    text = page.extract_text() or ""
                    if not text.strip():
                        continue

                    # Advanced chunking strategies
                    page_chunks = create_advanced_chunks(text, page_num, pdf_file)
                    chunks.extend(page_chunks)

            print(f"Extracted {len([c for c in chunks if c['source'] == pdf_file])} chunks from {pdf_file}")

        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")

    return chunks


In [19]:
def create_advanced_chunks(text: str, page_num: int, source: str) -> List[Dict]:
    """
    Create advanced chunks using multiple strategies:
    1. Sentence-based chunking
    2. Paragraph-based chunking
    3. Overlapping chunks
    4. Semantic chunks based on headers/sections
    """
    chunks = []

    # Clean text
    text = re.sub(r'\s+', ' ', text).strip()

    # Strategy 1: Sentence-based chunking
    sentences = sent_tokenize(text)
    sentence_chunks = create_sentence_chunks(sentences, chunk_size=3, overlap=1)

    for i, chunk in enumerate(sentence_chunks):
        chunks.append({
            'text': chunk,
            'source': source,
            'page': page_num,
            'chunk_id': f"{source}_p{page_num}_sent_{i}",
            'chunk_type': 'sentence',
            'length': len(chunk)
        })

    # Strategy 2: Paragraph-based chunking
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    paragraph_chunks = create_paragraph_chunks(paragraphs, max_length=800, overlap=100)

    for i, chunk in enumerate(paragraph_chunks):
        chunks.append({
            'text': chunk,
            'source': source,
            'page': page_num,
            'chunk_id': f"{source}_p{page_num}_para_{i}",
            'chunk_type': 'paragraph',
            'length': len(chunk)
        })

    # Strategy 3: Semantic chunking (based on headers/sections)
    semantic_chunks = create_semantic_chunks(text)

    for i, chunk in enumerate(semantic_chunks):
        chunks.append({
            'text': chunk,
            'source': source,
            'page': page_num,
            'chunk_id': f"{source}_p{page_num}_semantic_{i}",
            'chunk_type': 'semantic',
            'length': len(chunk)
        })

    return chunks

In [20]:
def create_sentence_chunks(sentences: List[str], chunk_size: int = 3, overlap: int = 1) -> List[str]:
    """Create overlapping sentence chunks"""
    chunks = []
    for i in range(0, len(sentences), chunk_size - overlap):
        chunk_sentences = sentences[i:i + chunk_size]
        if chunk_sentences:
            chunks.append(' '.join(chunk_sentences))
    return chunks

def create_paragraph_chunks(paragraphs: List[str], max_length: int = 800, overlap: int = 100) -> List[str]:
    """Create overlapping paragraph chunks"""
    chunks = []
    current_chunk = ""

    for para in paragraphs:
        if len(current_chunk) + len(para) <= max_length:
            current_chunk += para + "\n\n"
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
                # Create overlap
                current_chunk = current_chunk[-overlap:] + para + "\n\n"
            else:
                current_chunk = para + "\n\n"

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def create_semantic_chunks(text: str) -> List[str]:
    """Create chunks based on semantic sections (headers, numbered sections, etc.)"""
    chunks = []

    # Pattern for detecting sections/headers
    section_pattern = r'(?:^|\n)(?:\d+\.?\s+|[A-Z][^.]*:|\n[A-Z][A-Z\s]+\n|Chapter\s+\d+|Section\s+\d+)'

    sections = re.split(section_pattern, text)

    for section in sections:
        section = section.strip()
        if len(section) > 100:  # Only keep substantial sections
            chunks.append(section)

    return chunks

def build_enhanced_vector_index(chunks: List[Dict]) -> Tuple[faiss.IndexFlatL2, SentenceTransformer, List[Dict]]:
    """Build enhanced vector index with multiple embedding strategies"""

    # Initialize embedder
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Extract texts for embedding
    texts = [chunk['text'] for chunk in chunks]

    # Create embeddings
    embeddings = embedder.encode(texts, convert_to_tensor=False, show_progress_bar=True)

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings, dtype=np.float32))

    print(f"Built vector index with {len(chunks)} chunks")

    return index, embedder, chunks

def retrieve_enhanced_context(query: str, index: faiss.IndexFlatL2, embedder: SentenceTransformer,
                            chunks: List[Dict], k: int = 8) -> List[Dict]:
    """
    Enhanced context retrieval with multiple strategies:
    1. Semantic similarity search
    2. Keyword matching
    3. Diversity-based reranking
    """

    # Semantic search
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    distances, indices = index.search(np.array(query_embedding, dtype=np.float32), k * 2)

    # Get candidate chunks
    candidate_chunks = []
    for i in indices[0]:
        if i < len(chunks):
            chunk = chunks[i].copy()
            chunk['similarity_score'] = float(1 / (1 + distances[0][len(candidate_chunks)]))
            candidate_chunks.append(chunk)

    # Keyword matching boost
    query_keywords = set(query.lower().split())
    for chunk in candidate_chunks:
        chunk_keywords = set(chunk['text'].lower().split())
        keyword_overlap = len(query_keywords.intersection(chunk_keywords))
        chunk['keyword_score'] = keyword_overlap / len(query_keywords) if query_keywords else 0

    # Diversity-based reranking
    final_chunks = rerank_for_diversity(candidate_chunks, k)

    return final_chunks

In [21]:
def rerank_for_diversity(chunks: List[Dict], k: int) -> List[Dict]:
    """Rerank chunks to ensure diversity in sources and content"""

    # Sort by combined score (similarity + keyword)
    for chunk in chunks:
        chunk['combined_score'] = chunk['similarity_score'] + chunk['keyword_score'] * 0.3

    chunks.sort(key=lambda x: x['combined_score'], reverse=True)

    # Ensure diversity
    selected_chunks = []
    used_sources = set()
    used_chunk_types = set()

    for chunk in chunks:
        if len(selected_chunks) >= k:
            break

        # Prioritize diversity in sources and chunk types
        source_penalty = 0.1 if chunk['source'] in used_sources else 0
        type_penalty = 0.05 if chunk['chunk_type'] in used_chunk_types else 0

        chunk['final_score'] = chunk['combined_score'] - source_penalty - type_penalty

        selected_chunks.append(chunk)
        used_sources.add(chunk['source'])
        used_chunk_types.add(chunk['chunk_type'])

    return selected_chunks

# Simple retrieve context function from openai_llm_as_a_judge
def retrieve_simple_context(query: str, embedder: SentenceTransformer, chunks: List[Dict], k: int = 3) -> List[str]:
    """Simple context retrieval function from openai_llm_as_a_judge"""
    sentences = [chunk['text'] for chunk in chunks]
    if not sentences:
        print("Warning: No valid sentences extracted from chunks.")
        return ["No relevant context found."]

    embeddings = embedder.encode(sentences, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k=k)

    retrieved_context = [sentences[i] for i in indices[0] if i < len(sentences)]
    print(f"Query: {query}, Sentences Count: {len(sentences)}, Retrieved Context: {retrieved_context[:100]}...")

    return retrieved_context if retrieved_context else ["No relevant context found."]

def generate_response_with_rag(question: str, retrieved_chunks: List[Dict]) -> str:
    """Generate response using retrieved context (from llm_as_a_judge)"""

    # Combine context from chunks
    context_parts = []
    for chunk in retrieved_chunks:
        context_parts.append(f"[{chunk['source']} - Page {chunk['page']}]: {chunk['text']}")

    context = "\n\n".join(context_parts)

    prompt = f"""Context: {context}

Question: {question}

Based on the provided context, please answer the question accurately and concisely. If the context doesn't contain enough information to answer the question, please state that clearly."""

    deployment = os.getenv("DEPLOYMENT_NAME")
    if not deployment:
        raise ValueError("DEPLOYMENT_NAME is missing in .env")

    response = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant that answers questions based on provided context from insurance policy documents."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,
        temperature=0.3
    )

    return response.choices[0].message.content.strip()

def generate_simple_response(query: str) -> str:
    """Generate response without RAG (from openai_llm_as_a_judge)"""
    response = client.chat.completions.create(
        model=os.getenv("DEPLOYMENT_NAME"),
        messages=[{"role": "user", "content": query}],
        max_tokens=50,
        temperature=0.7
    )
    return response.choices[0].message.content.strip()

In [22]:
def judge_response_quality(question: str, response: str, ground_truth: str, context: str,
                         use_ground_truth: bool = True) -> Dict[str, any]:
    """
    Evaluate response quality using multiple criteria (from llm_as_a_judge)
    """

    if use_ground_truth:
        evaluation_prompt = f"""You are an expert evaluator. Please evaluate the following response across multiple dimensions:

Question: {question}
Context Provided: {context}
Generated Response: {response}
Ground Truth Answer: {ground_truth}

Please evaluate on these criteria (each on a scale of 0-1):
1. **Accuracy**: How factually correct is the response compared to the ground truth?
2. **Completeness**: Does the response cover all important points from the ground truth?
3. **Relevance**: Is the response relevant to the question and uses the context appropriately?
4. **Conciseness**: Is the response clear and concise without unnecessary information?

Provide your evaluation in the following JSON format:
{{
    "accuracy": <float>,
    "completeness": <float>,
    "relevance": <float>,
    "conciseness": <float>,
    "overall_score": <float>,
    "explanation": "<brief explanation of the scores>"
}}"""
    else:
        evaluation_prompt = f"""You are an expert evaluator. Please evaluate the following response across multiple dimensions:

Question: {question}
Context Provided: {context}
Generated Response: {response}

Please evaluate on these criteria (each on a scale of 0-1):
1. **Accuracy**: How factually correct is the response based on the context?
2. **Completeness**: Does the response adequately address the question?
3. **Relevance**: Is the response relevant to the question and uses the context appropriately?
4. **Conciseness**: Is the response clear and concise without unnecessary information?

Provide your evaluation in the following JSON format:
{{
    "accuracy": <float>,
    "completeness": <float>,
    "relevance": <float>,
    "conciseness": <float>,
    "overall_score": <float>,
    "explanation": "<brief explanation of the scores>"
}}"""

    deployment = os.getenv("DEPLOYMENT_NAME")
    completion = client.chat.completions.create(
        model=deployment,
        messages=[{"role": "user", "content": evaluation_prompt}],
        max_tokens=300,
        temperature=0.3
    )

    result = completion.choices[0].message.content.strip()

    try:
        json_content = extract_json_from_response(result)
        evaluation = json.loads(json_content)

        # Ensure all required fields exist
        required_fields = ['accuracy', 'completeness', 'relevance', 'conciseness', 'overall_score', 'explanation']
        for field in required_fields:
            if field not in evaluation:
                evaluation[field] = 0.0 if field != 'explanation' else 'Field missing from evaluation'

        return evaluation

    except json.JSONDecodeError:
        print(f"Failed to parse JSON response: {result}")
        return {
            "accuracy": 0.0,
            "completeness": 0.0,
            "relevance": 0.0,
            "conciseness": 0.0,
            "overall_score": 0.0,
            "explanation": "Failed to parse evaluation response"
        }

In [23]:
def judge_context_relevance(question: str, retrieved_chunks: List[Dict]) -> Dict[str, any]:
    """
    Evaluate how relevant the retrieved context is to the question (from llm_as_a_judge)
    """

    # Combine context from chunks
    context_parts = []
    for chunk in retrieved_chunks:
        context_parts.append(f"[{chunk['source']}]: {chunk['text'][:200]}...")

    context = "\n\n".join(context_parts)

    prompt = f"""You are an expert evaluator. Please evaluate how relevant the following context is to the question.

Question: {question}
Retrieved Context: {context}

Please evaluate:
1. **Relevance Score** (0-1): How relevant is the context to answering the question?
2. **Coverage Score** (0-1): How well does the context cover the information needed to answer the question?
3. **Noise Level** (0-1): How much irrelevant information is in the context? (0 = lots of noise, 1 = no noise)

Provide your evaluation in the following JSON format:
{{
    "relevance_score": <float>,
    "coverage_score": <float>,
    "noise_level": <float>,
    "explanation": "<brief explanation>"
}}"""

    deployment = os.getenv("DEPLOYMENT_NAME")
    completion = client.chat.completions.create(
        model=deployment,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
        temperature=0.3
    )

    result = completion.choices[0].message.content.strip()

    try:
        json_content = extract_json_from_response(result)
        evaluation = json.loads(json_content)

        # Ensure all required fields exist
        required_fields = ['relevance_score', 'coverage_score', 'noise_level', 'explanation']
        for field in required_fields:
            if field not in evaluation:
                evaluation[field] = 0.0 if field != 'explanation' else 'Field missing from evaluation'

        return evaluation
    except json.JSONDecodeError:
        print(f"Failed to parse JSON response: {result}")
        return {
            "relevance_score": 0.0,
            "coverage_score": 0.0,
            "noise_level": 0.0,
            "explanation": "Failed to parse evaluation response"
        }

In [24]:
def judge_retriever(query: str, retrieved_context: List[str]) -> Tuple[float, str]:
    """Evaluate the relevance of retrieved context to the query (from openai_llm_as_a_judge)"""
    context_text = "\n".join(retrieved_context)
    prompt = f"""
    You are an expert evaluator. Assess the relevance of the following retrieved context to the query.
    Query: {query}
    Retrieved Context: {context_text}
    Provide a relevance score from 0 to 1 (0 = completely irrelevant, 1 = perfectly relevant) and a brief explanation.
    Format your response exactly as: 'Score: <number>\nExplanation: <text>' where <number> is a float (e.g., 0.9) and <text> is the explanation.
    """

    completion = client.chat.completions.create(
        model=os.getenv("DEPLOYMENT_NAME"),
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.3
    )
    result = completion.choices[0].message.content.strip()

    # Parse score and explanation
    try:
        score_line = next((line for line in result.split("\n") if line.startswith("Score:")), "").strip()
        explanation_line = next((line for line in result.split("\n") if line.startswith("Explanation:")), "").strip()

        if score_line:
            score = float(score_line.replace("Score:", "").strip())
        else:
            score = 0.0

        if explanation_line:
            explanation = explanation_line.replace("Explanation:", "").strip()
        else:
            explanation = "No explanation provided"
    except (ValueError, StopIteration) as e:
        score = 0.0
        explanation = f"Failed to parse retriever response. Error: {str(e)}"

    return score, explanation


In [31]:
def load_qa_dataset(file_path: str) -> Tuple[List[str], List[str]]:
    """Load questions and ground truths from Excel dataset"""

    if file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path, header=None)
    elif file_path.endswith('.csv'):
        df = pd.read_csv(file_path, header=None)
    else:
        raise ValueError("Unsupported file format. Use Excel (.xlsx) or CSV (.csv)")

    # Find the row with "question" to locate the start of data
    start_row = df[df[0].str.contains("question", na=False)].index[0]
    data = df.iloc[start_row + 1:].dropna(how='all')

    print("Raw data preview:", data.head())

    questions = data[0].dropna().tolist()
    ground_truths = data[1].dropna().tolist()

    return questions, ground_truths

def run_enhanced_evaluation(questions: List[str], ground_truths: List[str],
                          index: faiss.IndexFlatL2, embedder: SentenceTransformer,
                          chunks: List[Dict], use_ground_truth: bool = True) -> pd.DataFrame:
    """
    Run comprehensive RAG evaluation with enhanced metrics
    """

    results = []

    for i, (question, ground_truth) in enumerate(zip(questions, ground_truths)):
        print(f"\nProcessing {i + 1}/{len(questions)}: {question[:50]}...")

        # 1. Retrieve context using enhanced retrieval
        retrieved_chunks = retrieve_enhanced_context(question, index, embedder, chunks, k=5)

        # 2. Also get simple context for comparison
        simple_context = retrieve_simple_context(question, embedder, chunks, k=3)

        # 3. Evaluate context relevance
        context_eval = judge_context_relevance(question, retrieved_chunks)

        # 4. Evaluate simple retriever (from openai_llm_as_a_judge style)
        retriever_score, retriever_explanation = judge_retriever(question, simple_context)

        # 5. Generate response with enhanced RAG
        generated_response = generate_response_with_rag(question, retrieved_chunks)

        # 6. Evaluate response quality
        context_text = "\n\n".join([f"[{chunk['source']}]: {chunk['text']}" for chunk in retrieved_chunks])
        response_eval = judge_response_quality(question, generated_response, ground_truth,
                                             context_text, use_ground_truth)

        # Compile results
        result = {
            'question': question,
            'ground_truth': ground_truth,
            'generated_response': generated_response,
            'retrieved_chunks_count': len(retrieved_chunks),
            'context_sources': list(set([chunk['source'] for chunk in retrieved_chunks])),

            # Enhanced context evaluation metrics
            'context_relevance': context_eval.get('relevance_score', 0.0),
            'context_coverage': context_eval.get('coverage_score', 0.0),
            'context_noise_level': context_eval.get('noise_level', 0.0),
            'context_explanation': context_eval.get('explanation', 'N/A'),

            # Simple retriever evaluation (openai style)
            'simple_retriever_score': retriever_score,
            'simple_retriever_explanation': retriever_explanation,

            # Response evaluation metrics
            'response_accuracy': response_eval.get('accuracy', 0.0),
            'response_completeness': response_eval.get('completeness', 0.0),
            'response_relevance': response_eval.get('relevance', 0.0),
            'response_conciseness': response_eval.get('conciseness', 0.0),
            'response_overall_score': response_eval.get('overall_score', 0.0),
            'response_explanation': response_eval.get('explanation', 'N/A'),

            # Chunk details
            'chunk_details': [{
                'source': chunk['source'],
                'chunk_type': chunk['chunk_type'],
                'similarity_score': chunk.get('similarity_score', 0.0),
                'text_preview': chunk['text'][:100] + '...' if len(chunk['text']) > 100 else chunk['text']
            } for chunk in retrieved_chunks]
        }

        results.append(result)

        # Print progress
        print(f"Context Relevance: {context_eval['relevance_score']:.2f}")
        print(f"Simple Retriever Score: {retriever_score:.2f}")
        print(f"Response Overall: {response_eval['overall_score']:.2f}")

    return pd.DataFrame(results)

def calculate_evaluation_metrics(results_df: pd.DataFrame) -> Dict[str, float]:
    """Calculate comprehensive evaluation metrics"""

    metrics = {
        # Context metrics
        'avg_context_relevance': results_df['context_relevance'].mean(),
        'avg_context_coverage': results_df['context_coverage'].mean(),
        'avg_context_noise_level': results_df['context_noise_level'].mean(),
        'avg_simple_retriever_score': results_df['simple_retriever_score'].mean(),

        # Response metrics
        'avg_response_accuracy': results_df['response_accuracy'].mean(),
        'avg_response_completeness': results_df['response_completeness'].mean(),
        'avg_response_relevance': results_df['response_relevance'].mean(),
        'avg_response_conciseness': results_df['response_conciseness'].mean(),
        'avg_response_overall': results_df['response_overall_score'].mean(),

        # Standard deviations
        'std_context_relevance': results_df['context_relevance'].std(),
        'std_response_overall': results_df['response_overall_score'].std(),

        # Performance distribution
        'excellent_responses': (results_df['response_overall_score'] >= 0.8).sum() / len(results_df),
        'good_responses': ((results_df['response_overall_score'] >= 0.6) &
                          (results_df['response_overall_score'] < 0.8)).sum() / len(results_df),
        'poor_responses': (results_df['response_overall_score'] < 0.6).sum() / len(results_df),

        # Retrieval metrics
        'avg_chunks_retrieved': results_df['retrieved_chunks_count'].mean(),
        'avg_sources_used': results_df['context_sources'].apply(len).mean(),
    }

    return metrics

def generate_evaluation_report(results_df: pd.DataFrame, metrics: Dict[str, float],
                             output_dir: str = '.') -> None:
    """Generate comprehensive evaluation report"""

    import os
    os.makedirs(output_dir, exist_ok=True)

    # Save detailed results
    results_df.to_csv(f"{output_dir}/merged_rag_evaluation_detailed.csv", index=False)

    # Create summary report
    report = f"""
# Merged Enhanced RAG Evaluation Report

## Dataset Summary
- Total samples evaluated: {len(results_df)}
- Evaluation date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
- Average chunks retrieved per query: {metrics['avg_chunks_retrieved']:.1f}
- Average sources used per query: {metrics['avg_sources_used']:.1f}

## Context Retrieval Quality
- Average Relevance: {metrics['avg_context_relevance']:.3f} (±{metrics['std_context_relevance']:.3f})
- Average Coverage: {metrics['avg_context_coverage']:.3f}
- Average Noise Level: {metrics['avg_context_noise_level']:.3f}
- Simple Retriever Score: {metrics['avg_simple_retriever_score']:.3f}

## Response Generation Quality
- Average Accuracy: {metrics['avg_response_accuracy']:.3f}
- Average Completeness: {metrics['avg_response_completeness']:.3f}
- Average Relevance: {metrics['avg_response_relevance']:.3f}
- Average Conciseness: {metrics['avg_response_conciseness']:.3f}
- **Overall Score: {metrics['avg_response_overall']:.3f} (±{metrics['std_response_overall']:.3f})**

## Performance Distribution
- Excellent (≥0.8): {metrics['excellent_responses']:.1%}
- Good (0.6-0.8): {metrics['good_responses']:.1%}
- Poor (<0.6): {metrics['poor_responses']:.1%}

## Top Performing Examples
{results_df.nlargest(3, 'response_overall_score')[['question', 'response_overall_score', 'response_explanation']].to_string()}

## Worst Performing Examples
{results_df.nsmallest(3, 'response_overall_score')[['question', 'response_overall_score', 'response_explanation']].to_string()}

## Retrieval Analysis
- Context relevance vs Response quality correlation: {results_df['context_relevance'].corr(results_df['response_overall_score']):.3f}
- Simple retriever vs Enhanced retriever correlation: {results_df['simple_retriever_score'].corr(results_df['context_relevance']):.3f}
- Sources most frequently used: {results_df.explode('context_sources')['context_sources'].value_counts().head().to_dict()}
"""

    # Save report
    with open(f"{output_dir}/merged_rag_evaluation_report.txt", 'w') as f:
        f.write(report)

    # Save metrics as JSON
    with open(f"{output_dir}/merged_rag_metrics.json", 'w') as f:
        json.dump(metrics, f, indent=2)

    print(report)
    print(f"\nResults saved to {output_dir}/")

# Main execution function
def main():
    """Main execution function"""

    # Configuration
    pdf_dir = "/content/"
    pdf_files = ["indoor_asr.pdf"]  # or ["aegon_policy.pdf", "asr_policy.pdf"]
    qa_file = "asr_indoor2.xlsx"  # or "Groundtruth inboedel translated.xlsx"
    output_dir = "merged_evaluation_results"

    # Whether to use ground truth in evaluation
    use_ground_truth = False  # Set to False to evaluate without ground truth

    try:
        print("=== Merged Enhanced RAG Evaluation System ===")

        # 1. Load and chunk documents
        print("\n1. Loading and chunking documents...")
        chunks = load_pdf_with_advanced_chunking(pdf_dir, pdf_files)
        print(f"Created {len(chunks)} chunks total")

        # 2. Build vector index
        print("\n2. Building enhanced vector index...")
        index, embedder, chunks = build_enhanced_vector_index(chunks)

        # 3. Load QA dataset
        print("\n3. Loading QA dataset...")
        questions, ground_truths = load_qa_dataset(qa_file)
        print(f"Loaded {len(questions)} questions")

        # 4. Run evaluation
        print("\n4. Running merged enhanced evaluation...")
        results_df = run_enhanced_evaluation(questions, ground_truths, index, embedder, chunks, use_ground_truth)

        # 5. Calculate metrics
        print("\n5. Calculating evaluation metrics...")
        metrics = calculate_evaluation_metrics(results_df)

        # 6. Generate report
        print("\n6. Generating evaluation report...")
        generate_evaluation_report(results_df, metrics, output_dir)

        print("\n=== Evaluation Complete ===")

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        raise

if __name__ == "__main__":
    main()

=== Merged Enhanced RAG Evaluation System ===

1. Loading and chunking documents...
Extracted 746 chunks from indoor_asr.pdf
Created 746 chunks total

2. Building enhanced vector index...


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Built vector index with 746 chunks

3. Loading QA dataset...
Raw data preview:                                                    0  \
1  A ball went through my window. Will this be co...   
2               I dropped my phone. Is this covered?   
3  My roof is leaking, causing damage to the ceil...   
4  I tripped outside and broke my glasses. Is thi...   
5  My phone slipped out of my hands and fell into...   

                                                   1  
1  Yes, glass breakage due to sudden and unforese...  
2  Yes, damage to your phone due to sudden and un...  
3  No, painting the ceiling due to water damage f...  
4  No, damage to your glasses due to tripping out...  
5  No, this is not insured. Loss of your mobile e...  
Loaded 12 questions

4. Running merged enhanced evaluation...

Processing 1/12: A ball went through my window. Will this be covere...
Query: A ball went through my window. Will this be covered?, Sentences Count: 746, Retrieved Context: ['Then you will re