In [1]:
import os
import pandas as pd
import numpy as np
%pip install python-dotenv openai faiss-cpu sentence-transformers pdfplumber
from typing import List, Dict, Tuple
from dotenv import load_dotenv
from openai import AzureOpenAI
import json
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import re
import warnings
warnings.filterwarnings('ignore')

# Make sure each NLTK resource is present locally; a resource is downloaded
# only when looking it up fails.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Read settings from a .env file into the process environment
load_dotenv()

# Initialize Azure OpenAI client
def init_client():
    """Build an Azure OpenAI client from environment variables.

    Reads ``AZURE_OPENAI_API_KEY``, ``ENDPOINT_URL`` and ``OPENAI_API_VERSION``
    from the environment and validates all three up front, so a missing
    setting is reported by name before any client is constructed.

    Returns:
        AzureOpenAI: a client configured with the key, endpoint and API version.

    Raises:
        ValueError: if any of the three variables is unset or empty. The
            message names the first missing variable.
    """
    settings = {}
    for variable in ("AZURE_OPENAI_API_KEY", "ENDPOINT_URL", "OPENAI_API_VERSION"):
        value = os.getenv(variable)
        if not value:
            raise ValueError(f"{variable} environment variable not set.")
        settings[variable] = value

    client = AzureOpenAI(
        api_key=settings["AZURE_OPENAI_API_KEY"],
        api_version=settings["OPENAI_API_VERSION"],
        azure_endpoint=settings["ENDPOINT_URL"]
    )
    return client

# Create the module-level Azure OpenAI client once; the response-generation
# and judging functions defined later in this notebook read this `client` name.
client = init_client()

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [None]:
def create_sentence_chunks(sentences: List[str], chunk_size: int = 3, overlap: int = 1) -> List[str]:
    """Group sentences into overlapping windows joined by single spaces.

    Args:
        sentences: Sentences in document order.
        chunk_size: Maximum number of sentences per chunk (must be >= 1).
        overlap: Number of sentences shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The list of chunk strings. The walk stops as soon as a window reaches
        the end of ``sentences``, so no trailing chunk is emitted that only
        repeats sentences already contained in the previous chunk.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap`` would make the window
            step zero or negative.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(sentences), step):
        chunks.append(' '.join(sentences[start:start + chunk_size]))
        # Once the window covers the last sentence, any further window would
        # contain only sentences that are already part of this chunk.
        if start + chunk_size >= len(sentences):
            break
    return chunks

def create_paragraph_chunks(paragraphs: List[str], max_length: int = 800, overlap: int = 100) -> List[str]:
    """Pack paragraphs into chunks of roughly ``max_length`` characters.

    Paragraphs are appended (separated by a blank line) to the current chunk
    while the chunk length plus the paragraph length stays within
    ``max_length``. When a paragraph does not fit, the current chunk is
    emitted and the next chunk starts with the last ``overlap`` characters of
    the emitted chunk followed by that paragraph.

    Args:
        paragraphs: Paragraph strings in document order.
        max_length: Soft character budget per chunk. A single paragraph longer
            than this still becomes its own chunk; it is not split.
        overlap: Number of trailing characters carried into the next chunk.
            ``0`` disables the carry-over.

    Returns:
        The list of stripped chunk strings.

    Raises:
        ValueError: if ``overlap`` is negative.
    """
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    chunks = []
    current_chunk = ""

    for para in paragraphs:
        if len(current_chunk) + len(para) <= max_length:
            current_chunk += para + "\n\n"
            continue

        if current_chunk:
            chunks.append(current_chunk.strip())
            # A plain `current_chunk[-overlap:]` would return the WHOLE chunk
            # when overlap == 0 (because -0 == 0), so guard that case.
            carried_over = current_chunk[-overlap:] if overlap > 0 else ""
            current_chunk = carried_over + para + "\n\n"
        else:
            current_chunk = para + "\n\n"

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def create_semantic_chunks(text: str) -> List[str]:
    """Split text at header-like boundaries and keep only the longer pieces.

    The text is split wherever the boundary regex matches (numbered items,
    capitalised labels ending in a colon, all-caps lines, ``Chapter N`` /
    ``Section N``). Each piece is stripped, and pieces of 100 characters or
    fewer are discarded.
    """
    # Regex describing where one section ends and the next begins
    boundary = r'(?:^|\n)(?:\d+\.?\s+|[A-Z][^.]*:|\n[A-Z][A-Z\s]+\n|Chapter\s+\d+|Section\s+\d+)'

    stripped_pieces = (piece.strip() for piece in re.split(boundary, text))

    # Short fragments are dropped as not substantial enough to index
    return [piece for piece in stripped_pieces if len(piece) > 100]

def create_advanced_chunks(text: str, page_num: int, source: str) -> List[Dict]:
    """Chunk one page of text with three complementary strategies.

    1. Sentence windows (3 sentences, 1 sentence overlap) over the
       whitespace-flattened text.
    2. Paragraph chunks (about 800 characters, 100 characters overlap) built
       from blank-line separated paragraphs.
    3. Semantic chunks split at header / numbered-section boundaries.

    Line breaks are kept for strategies 2 and 3 because both rely on them:
    paragraphs are separated by blank lines, and the section regex in
    ``create_semantic_chunks`` anchors on newlines. Flattening all whitespace
    first would leave a single "paragraph" and no newline-based section
    boundaries. Whitespace inside each resulting chunk is still normalised to
    single spaces.

    Args:
        text: Raw text of the page.
        page_num: Page number stored in the metadata and in ``chunk_id``.
        source: Source identifier (e.g. the file name) stored in the metadata.

    Returns:
        A list of dicts with keys ``text``, ``source``, ``page``, ``chunk_id``,
        ``chunk_type`` and ``length``; sentence chunks first, then paragraph
        chunks, then semantic chunks.
    """
    # Collapse horizontal whitespace only, so line and paragraph breaks survive
    structured_text = re.sub(r'[^\S\n]+', ' ', text).strip()
    # Fully flattened variant for sentence tokenisation
    flat_text = re.sub(r'\s+', ' ', text).strip()

    # Strategy 1: sentence-based chunking
    sentences = sent_tokenize(flat_text)
    sentence_chunks = create_sentence_chunks(sentences, chunk_size=3, overlap=1)

    # Strategy 2: paragraph-based chunking (paragraph = blank-line separated block)
    paragraphs = [re.sub(r'\s+', ' ', block).strip()
                  for block in re.split(r'\n\s*\n', structured_text)]
    paragraphs = [block for block in paragraphs if block]
    paragraph_chunks = create_paragraph_chunks(paragraphs, max_length=800, overlap=100)

    # Strategy 3: semantic chunking on the line-preserving text
    semantic_chunks = [re.sub(r'\s+', ' ', section).strip()
                       for section in create_semantic_chunks(structured_text)]

    # (chunk_type value, tag used inside chunk_id, chunk texts)
    strategies = (
        ('sentence', 'sent', sentence_chunks),
        ('paragraph', 'para', paragraph_chunks),
        ('semantic', 'semantic', semantic_chunks),
    )

    chunks = []
    for chunk_type, id_tag, chunk_texts in strategies:
        for i, chunk in enumerate(chunk_texts):
            chunks.append({
                'text': chunk,
                'source': source,
                'page': page_num,
                'chunk_id': f"{source}_p{page_num}_{id_tag}_{i}",
                'chunk_type': chunk_type,
                'length': len(chunk)
            })

    return chunks

def load_pdf_with_advanced_chunking(pdf_dir: str, pdf_files: List[str]) -> List[Dict]:
    """Extract text from each PDF page and turn it into metadata-tagged chunks.

    Files that do not exist under ``pdf_dir`` are reported and skipped. Pages
    whose extracted text is empty or whitespace-only are skipped. The page
    number handed to the chunker is the zero-based position of the page in
    the document. An error while processing a file is printed and processing
    continues with the next file; chunks collected before the error are kept.

    Returns:
        All chunks from all files, in processing order.
    """
    collected = []

    for pdf_file in pdf_files:
        location = os.path.join(pdf_dir, pdf_file)
        if not os.path.exists(location):
            print(f"Warning: {pdf_file} not found.")
            continue

        try:
            with pdfplumber.open(location) as document:
                for page_index, page in enumerate(document.pages):
                    page_text = page.extract_text() or ""
                    if page_text.strip():
                        # Run every chunking strategy on this page
                        collected.extend(create_advanced_chunks(page_text, page_index, pdf_file))

            print(f"Extracted {len([c for c in collected if c['source'] == pdf_file])} chunks from {pdf_file}")

        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")

    return collected

In [None]:
# Directory that holds the source PDFs ("." = the notebook's working directory)
pdf_dir = "."
# PDF files to chunk and index
pdf_files = ["aegon_travel.pdf"]
# `chunks` is read by the index-building cell further down
chunks = load_pdf_with_advanced_chunking(pdf_dir, pdf_files)

In [None]:
def build_enhanced_vector_index(chunks: List[Dict]) -> Tuple[faiss.IndexFlatL2, SentenceTransformer, List[Dict]]:
    """Embed every chunk's text and store the vectors in a FAISS L2 index.

    Args:
        chunks: Chunk dicts; each must have a ``'text'`` entry.

    Returns:
        ``(index, embedder, chunks)`` where ``index`` holds one vector per
        chunk in the same order as ``chunks``, ``embedder`` is the
        ``all-MiniLM-L6-v2`` SentenceTransformer used to create the vectors,
        and ``chunks`` is the input list, returned unchanged.

    Raises:
        ValueError: if ``chunks`` is empty (there would be no embedding
            dimension to size the index with).
    """
    if not chunks:
        raise ValueError("Cannot build a vector index from an empty chunk list.")

    # Initialize embedder
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Extract texts for embedding
    texts = [chunk['text'] for chunk in chunks]

    # Create embeddings as a contiguous float32 matrix, the layout FAISS expects
    embeddings = np.ascontiguousarray(
        embedder.encode(texts, convert_to_tensor=False, show_progress_bar=True),
        dtype=np.float32,
    )

    # Create FAISS index sized to the embedding dimension
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    print(f"Built vector index with {len(chunks)} chunks")

    return index, embedder, chunks

def rerank_for_diversity(chunks: List[Dict], k: int, keyword_weight: float = 0.3,
                         source_penalty: float = 0.1, type_penalty: float = 0.05) -> List[Dict]:
    """Greedily pick up to ``k`` chunks, trading relevance against diversity.

    Every chunk first gets ``combined_score = similarity_score +
    keyword_score * keyword_weight``. Chunks are then selected one at a time:
    at each step the remaining chunk with the highest *penalised* score wins,
    where a chunk loses ``source_penalty`` if its source is already
    represented in the selection and ``type_penalty`` if its chunk type is.
    The penalised score at the moment of selection is stored as
    ``final_score``.

    Args:
        chunks: Candidate dicts with ``similarity_score``, ``keyword_score``,
            ``source`` and ``chunk_type``. The dicts are annotated in place
            with ``combined_score`` (all) and ``final_score`` (selected ones);
            the list itself is not reordered.
        k: Maximum number of chunks to return.
        keyword_weight: Weight of the keyword score in the combined score.
        source_penalty: Deduction for a source that was already selected.
        type_penalty: Deduction for a chunk type that was already selected.

    Returns:
        The selected chunks in selection order (at most ``k``).
    """
    if k <= 0 or not chunks:
        return []

    for chunk in chunks:
        chunk['combined_score'] = chunk['similarity_score'] + chunk['keyword_score'] * keyword_weight

    # Work on a sorted copy so ties resolve in favour of the higher combined
    # score and the caller's list order is left untouched.
    remaining = sorted(chunks, key=lambda x: x['combined_score'], reverse=True)

    selected_chunks = []
    used_sources = set()
    used_chunk_types = set()

    while remaining and len(selected_chunks) < k:
        best_position = 0
        best_score = None
        for position, chunk in enumerate(remaining):
            score = chunk['combined_score']
            if chunk['source'] in used_sources:
                score -= source_penalty
            if chunk['chunk_type'] in used_chunk_types:
                score -= type_penalty
            # Strict comparison keeps the earlier (higher combined score) chunk on ties
            if best_score is None or score > best_score:
                best_position, best_score = position, score

        winner = remaining.pop(best_position)
        winner['final_score'] = best_score
        selected_chunks.append(winner)
        used_sources.add(winner['source'])
        used_chunk_types.add(winner['chunk_type'])

    return selected_chunks

def retrieve_enhanced_context(query: str, index: faiss.IndexFlatL2, embedder: SentenceTransformer,
                            chunks: List[Dict], k: int = 8) -> List[Dict]:
    """Retrieve up to ``k`` chunks for ``query``.

    Steps:
    1. Semantic search: embed the query and fetch up to ``2 * k`` nearest
       neighbours from ``index`` (never more than there are chunks).
    2. Keyword matching: score each candidate by the fraction of query words
       that also occur in the chunk text.
    3. Diversity-based reranking via ``rerank_for_diversity``.

    Args:
        query: The user question.
        index: FAISS index whose vector positions correspond to ``chunks``.
        embedder: Model used to embed the query (same one used for the index).
        chunks: Chunk dicts with at least a ``'text'`` entry.
        k: Number of chunks to return.

    Returns:
        Copies of the selected chunk dicts, annotated with
        ``similarity_score`` and ``keyword_score`` plus whatever the reranker
        adds. Empty if ``k <= 0`` or there are no chunks.
    """
    if k <= 0 or not chunks:
        return []

    # Semantic search; asking for more neighbours than exist makes FAISS pad
    # the result with label -1, so cap the request at the corpus size.
    query_embedding = np.asarray(embedder.encode([query], convert_to_tensor=False), dtype=np.float32)
    n_candidates = min(k * 2, len(chunks))
    distances, indices = index.search(query_embedding, n_candidates)

    # Pair every label with ITS OWN distance. Negative labels are FAISS
    # padding and must be skipped (as a list index, -1 would silently pick
    # the last chunk).
    candidate_chunks = []
    for distance, chunk_index in zip(distances[0], indices[0]):
        chunk_index = int(chunk_index)
        if chunk_index < 0 or chunk_index >= len(chunks):
            continue
        chunk = chunks[chunk_index].copy()
        chunk['similarity_score'] = float(1 / (1 + distance))
        candidate_chunks.append(chunk)

    # Keyword matching boost; tokenising on word characters lets "covered?"
    # in the query match "covered" in the text.
    query_keywords = set(re.findall(r'\w+', query.lower()))
    for chunk in candidate_chunks:
        chunk_keywords = set(re.findall(r'\w+', chunk['text'].lower()))
        keyword_overlap = len(query_keywords.intersection(chunk_keywords))
        chunk['keyword_score'] = keyword_overlap / len(query_keywords) if query_keywords else 0

    # Diversity-based reranking
    return rerank_for_diversity(candidate_chunks, k)

def retrieve_enhanced_context_for_df(df: pd.DataFrame, index, embedder, chunks, k = 5):
    """Retrieve context for every question and store it in a ``context`` column.

    For each value of ``df['question']`` the top-``k`` chunks are retrieved
    and their texts are joined with blank lines. The column is assigned in a
    single step, so it is created even when ``df`` has no rows and the result
    does not depend on the index labels being unique.

    Args:
        df: DataFrame with a ``'question'`` column. Modified in place.
        index: Vector index passed through to ``retrieve_enhanced_context``.
        embedder: Embedding model passed through to ``retrieve_enhanced_context``.
        chunks: Chunk dicts passed through to ``retrieve_enhanced_context``.
        k: Number of chunks to retrieve per question.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    contexts = []
    for question in df['question']:
        retrieved_chunks = retrieve_enhanced_context(question, index, embedder, chunks, k)
        # Format context as one string, chunks separated by blank lines
        contexts.append("\n\n".join(c['text'] for c in retrieved_chunks))

    df['context'] = contexts
    return df


In [None]:
# Build index
index_ready = False
if 'chunks' in locals() and chunks:
    index, embedder, chunks = build_enhanced_vector_index(chunks)
    index_ready = True
    print("Index built successfully.")
else:
    print("Chunks not found. Please run the PDF loading cell first.")

# Apply to DataFrame -- only when the index exists; otherwise `index` and
# `embedder` would be undefined and the call below would raise NameError.
if not index_ready:
    print("Skipping context retrieval because the vector index was not built.")
elif 'df' in locals():
    print("Retrieving context for DataFrame...")
    retrieve_enhanced_context_for_df(df, index, embedder, chunks)
    print("Context retrieval complete. Added 'context' column to DataFrame.")
    print(df[['question', 'context']].head())
else:
    print("DataFrame 'df' not found. Please load the dataset first.")


In [2]:
# Copy this code into your notebook
import re

# Enhanced JSON extraction function
def extract_json_from_response(response: str) -> str:
    """Extract the JSON payload from a model response.

    Tried in order:
    1. The contents of a fenced ``json`` markdown code block.
    2. The contents of any fenced markdown code block.
    3. The longest substring starting at a ``{`` that parses as valid JSON.
       A real JSON decoder is used here, so arbitrarily nested objects and
       braces inside string values are handled.
    4. The longest brace-delimited span found by a regex (one nesting level),
       as a last resort for text that is not strictly valid JSON.
    5. The stripped response itself.

    Returns:
        The extracted text, stripped. It is not guaranteed to be valid JSON;
        callers should still handle ``json.JSONDecodeError``.
    """
    import json  # local import so this cell also works when pasted on its own

    # Markdown code fences, most specific marker first
    fenced_patterns = (
        ('```json', r'```json\s*\n?(.*?)\n?```'),
        ('```', r'```\s*\n?(.*?)\n?```'),
    )
    for marker, pattern in fenced_patterns:
        if marker in response:
            match = re.search(pattern, response, re.DOTALL)
            if match:
                return match.group(1).strip()

    # Scan for JSON values that actually decode, starting at each '{'
    decoder = json.JSONDecoder()
    best_candidate = ""
    position = response.find('{')
    while position != -1:
        try:
            _, end = decoder.raw_decode(response, position)
        except json.JSONDecodeError:
            position = response.find('{', position + 1)
            continue
        candidate = response[position:end]
        if len(candidate) > len(best_candidate):
            best_candidate = candidate
        # Continue after this object so its nested objects are not revisited
        position = response.find('{', end)
    if best_candidate:
        return best_candidate.strip()

    # Heuristic fallback for brace-delimited text that is not valid JSON
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    matches = re.findall(json_pattern, response, re.DOTALL)
    if matches:
        return max(matches, key=len).strip()

    return response.strip()




In [3]:
def load_evaluation_dataset(file_path: str) -> pd.DataFrame:
    """
    Load evaluation dataset containing questions, ground truth answers, and retrieved contexts.

    Expected format:
    - CSV/Excel with columns: 'question', 'ground_truth', 'retrieved_context'
    - Or JSON with same structure

    The format is chosen from the file extension, compared case-insensitively
    (``.csv``, ``.xlsx``/``.xls``, ``.json``).

    Returns:
        The loaded DataFrame (extra columns are kept).

    Raises:
        ValueError: if the extension is not supported or a required column
            is missing.
    """
    readers = {
        '.csv': pd.read_csv,
        '.xlsx': pd.read_excel,
        '.xls': pd.read_excel,
        '.json': pd.read_json,
    }
    # Lower-case the extension so "DATA.CSV" or "data.Xlsx" are accepted too
    extension = os.path.splitext(os.fspath(file_path))[1].lower()
    reader = readers.get(extension)
    if reader is None:
        raise ValueError("Unsupported file format. Use CSV, Excel, or JSON.")
    df = reader(file_path)

    # Validate required columns
    required_columns = ['question', 'ground_truth', 'retrieved_context']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    print(f"Loaded {len(df)} evaluation samples")
    print(f"Sample data:\n{df.head(2)}")
    return df

In [4]:
def generate_response_with_context(question: str, context: str) -> str:
    """
    Generate a response using the provided context.

    Sends the context and question to the chat deployment named by the
    ``DEPLOYMENT_NAME`` environment variable, using the module-level
    ``client``.

    Returns:
        The stripped answer text, or an empty string when the service returns
        no choices or no message content.

    Raises:
        ValueError: if ``DEPLOYMENT_NAME`` is not set.
    """
    deployment = os.getenv("DEPLOYMENT_NAME")
    if not deployment:
        raise ValueError("DEPLOYMENT_NAME is missing in .env")

    prompt = f"""Context: {context}

Question: {question}

Please answer the question based on the provided context. Be concise and accurate."""

    response = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.3  # Lower temperature for more consistent responses
    )
    if not response.choices:
        return ""
    # `content` may be None (no text returned); calling .strip() on it would raise
    return (response.choices[0].message.content or "").strip()

In [5]:
def judge_response_quality(question: str, response: str, ground_truth: str, context: str) -> Dict[str, any]:
    """
    Evaluate response quality against ground truth using multiple criteria.

    Asks the chat deployment named by ``DEPLOYMENT_NAME`` (through the
    module-level ``client``) to score the response and parses the JSON reply
    with ``extract_json_from_response``, the same way
    ``judge_context_relevance`` does.

    Returns:
        A dict that always contains ``accuracy``, ``completeness``,
        ``relevance``, ``conciseness``, ``overall_score`` and ``explanation``.
        Criteria missing from the reply default to 0.0; a missing
        ``overall_score`` is the mean of the four criteria (0.0 if they are
        not numeric). If the reply cannot be parsed into a JSON object, every
        score is 0.0.

    Raises:
        ValueError: if ``DEPLOYMENT_NAME`` is not set.
    """
    deployment = os.getenv("DEPLOYMENT_NAME")
    if not deployment:
        raise ValueError("DEPLOYMENT_NAME is missing in .env")

    prompt = f"""You are an expert evaluator. Please evaluate the following response across multiple dimensions:

Question: {question}
Context Provided: {context}
Generated Response: {response}
Ground Truth Answer: {ground_truth}

Please evaluate on these criteria (each on a scale of 0-1):
1. **Accuracy**: How factually correct is the response compared to the ground truth?
2. **Completeness**: Does the response cover all important points from the ground truth?
3. **Relevance**: Is the response relevant to the question and uses the context appropriately?
4. **Conciseness**: Is the response clear and concise without unnecessary information?

Provide your evaluation in the following JSON format:
{{
    "accuracy": <float>,
    "completeness": <float>,
    "relevance": <float>,
    "conciseness": <float>,
    "overall_score": <float>,
    "explanation": "<brief explanation of the scores>"
}}"""

    completion = client.chat.completions.create(
        model=deployment,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3
    )

    # `content` may be None; treat that like an unparseable reply
    result = (completion.choices[0].message.content or "").strip()

    parse_failure = {
        "accuracy": 0.0,
        "completeness": 0.0,
        "relevance": 0.0,
        "conciseness": 0.0,
        "overall_score": 0.0,
        "explanation": "Failed to parse evaluation response"
    }

    try:
        # Extract the JSON payload (strips markdown fences and surrounding text)
        evaluation = json.loads(extract_json_from_response(result))
    except json.JSONDecodeError:
        print(f"Failed to parse JSON response: {result}")
        return parse_failure

    # Valid JSON that is not an object (e.g. a bare list) is unusable as well
    if not isinstance(evaluation, dict):
        print(f"Failed to parse JSON response: {result}")
        return parse_failure

    # Ensure all required fields exist so callers can index them directly
    criteria = ('accuracy', 'completeness', 'relevance', 'conciseness')
    for field in criteria:
        evaluation.setdefault(field, 0.0)
    if 'overall_score' not in evaluation:
        try:
            evaluation['overall_score'] = sum(float(evaluation[field]) for field in criteria) / len(criteria)
        except (TypeError, ValueError):
            evaluation['overall_score'] = 0.0
    evaluation.setdefault('explanation', 'Field missing from evaluation')
    return evaluation


In [6]:
def judge_context_relevance(question: str, context: str) -> Dict[str, any]:
    """
    Evaluate how relevant the retrieved context is to the question.

    Asks the chat deployment named by ``DEPLOYMENT_NAME`` (through the
    module-level ``client``) for a JSON evaluation and parses it with
    ``extract_json_from_response``.

    Returns:
        A dict that always contains ``relevance_score``, ``coverage_score``,
        ``noise_level`` and ``explanation``. Scores missing from the reply
        default to 0.0. If the reply cannot be parsed into a JSON object,
        every score is 0.0.

    Raises:
        ValueError: if ``DEPLOYMENT_NAME`` is not set.
    """
    deployment = os.getenv("DEPLOYMENT_NAME")
    if not deployment:
        raise ValueError("DEPLOYMENT_NAME is missing in .env")

    prompt = f"""You are an expert evaluator. Please evaluate how relevant the following context is to the question.

Question: {question}
Retrieved Context: {context}

Please evaluate:
1. **Relevance Score** (0-1): How relevant is the context to answering the question?
2. **Coverage Score** (0-1): How well does the context cover the information needed to answer the question?
3. **Noise Level** (0-1): How much irrelevant information is in the context? (0 = lots of noise, 1 = no noise)

Provide your evaluation in the following JSON format:
{{
    "relevance_score": <float>,
    "coverage_score": <float>,
    "noise_level": <float>,
    "explanation": "<brief explanation>"
}}"""

    completion = client.chat.completions.create(
        model=deployment,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
        temperature=0.3
    )

    # `content` may be None; treat that like an unparseable reply
    result = (completion.choices[0].message.content or "").strip()

    parse_failure = {
        "relevance_score": 0.0,
        "coverage_score": 0.0,
        "noise_level": 0.0,
        "explanation": "Failed to parse evaluation response"
    }

    try:
        evaluation = json.loads(extract_json_from_response(result))
    except json.JSONDecodeError:
        print(f"Failed to parse JSON response: {result}")
        return parse_failure

    # Valid JSON that is not an object (e.g. a bare list) cannot hold the fields
    if not isinstance(evaluation, dict):
        print(f"Failed to parse JSON response: {result}")
        return parse_failure

    # Ensure all required fields exist
    required_fields = ['relevance_score', 'coverage_score', 'noise_level', 'explanation']
    for field in required_fields:
        if field not in evaluation:
            evaluation[field] = 0.0 if field != 'explanation' else 'Field missing from evaluation'
    return evaluation


In [7]:
def run_baseline_evaluation(df: pd.DataFrame, sample_size: int = None) -> pd.DataFrame:
    """
    Run comprehensive baseline evaluation on the dataset.

    For every row the retrieved context is judged, a response is generated
    from it, and the response is judged against the ground truth.

    Args:
        df: DataFrame with 'question', 'ground_truth' and 'retrieved_context'.
        sample_size: If given (and non-zero), evaluate a reproducible random
            sample of at most this many rows instead of the whole frame.

    Returns:
        One row per evaluated sample with the question, ground truth, a
        context preview (first 200 characters), the generated response and
        the context/response scores. Scores are stored as floats; values that
        are missing or not numeric become 0.0.
    """
    if sample_size:
        df = df.sample(n=min(sample_size, len(df)), random_state=42)
        print(f"Evaluating {len(df)} samples...")

    def _score(evaluation, key):
        """Return evaluation[key] as a float, or 0.0 if absent or not numeric."""
        try:
            return float(evaluation.get(key, 0.0))
        except (TypeError, ValueError):
            return 0.0

    results = []
    total = len(df)

    # Count positions ourselves: index labels are arbitrary after sampling,
    # so `label + 1` would print misleading progress (or fail for non-integer labels).
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"\nProcessing {position}/{total}...")

        question = row['question']
        ground_truth = row['ground_truth']
        context = row['retrieved_context']
        # An empty cell is read as NaN (a float); normalise it so len() and slicing work
        context = "" if pd.isna(context) else str(context)

        # 1. Evaluate context relevance
        context_eval = judge_context_relevance(question, context)

        # 2. Generate response using context
        generated_response = generate_response_with_context(question, context)

        # 3. Evaluate response quality
        response_eval = judge_response_quality(question, generated_response, ground_truth, context)

        # Compile results
        result = {
            'question': question,
            'ground_truth': ground_truth,
            'retrieved_context': context[:200] + '...' if len(context) > 200 else context,
            'generated_response': generated_response,

            # Context evaluation metrics
            'context_relevance': _score(context_eval, 'relevance_score'),
            'context_coverage': _score(context_eval, 'coverage_score'),
            'context_noise_level': _score(context_eval, 'noise_level'),
            'context_explanation': context_eval.get('explanation', 'N/A'),

            # Response evaluation metrics
            'response_accuracy': _score(response_eval, 'accuracy'),
            'response_completeness': _score(response_eval, 'completeness'),
            'response_relevance': _score(response_eval, 'relevance'),  # note: the judge's key is 'relevance', not 'response_relevance'
            'response_conciseness': _score(response_eval, 'conciseness'),
            'response_overall_score': _score(response_eval, 'overall_score'),
            'response_explanation': response_eval.get('explanation', 'N/A')
        }

        results.append(result)

        # Print summary for this sample from the sanitised values, so a
        # missing or non-numeric judge field cannot break the loop
        print(f"Context Relevance: {result['context_relevance']:.2f}")
        print(f"Response Overall Score: {result['response_overall_score']:.2f}")

    return pd.DataFrame(results)

In [8]:
def calculate_baseline_metrics(results_df: pd.DataFrame) -> Dict[str, float]:
    """
    Summarise per-sample evaluation scores into aggregate metrics.

    Produces the mean of every context and response score column, the
    standard deviation of context relevance and of the overall response
    score, and the share of responses whose overall score is excellent
    (>= 0.8), good (>= 0.6 and < 0.8) or poor (< 0.6).
    """
    # metric name -> results column it is the mean of (context first, then response)
    mean_sources = {
        'avg_context_relevance': 'context_relevance',
        'avg_context_coverage': 'context_coverage',
        'avg_context_noise_level': 'context_noise_level',
        'avg_response_accuracy': 'response_accuracy',
        'avg_response_completeness': 'response_completeness',
        'avg_response_relevance': 'response_relevance',
        'avg_response_conciseness': 'response_conciseness',
        'avg_response_overall': 'response_overall_score',
    }
    summary = {name: results_df[column].mean() for name, column in mean_sources.items()}

    # Spread of the two headline scores
    summary['std_context_relevance'] = results_df['context_relevance'].std()
    summary['std_response_overall'] = results_df['response_overall_score'].std()

    # Share of samples falling in each quality band
    overall = results_df['response_overall_score']
    sample_count = len(results_df)
    in_good_band = (overall >= 0.6) & (overall < 0.8)
    summary['excellent_responses'] = (overall >= 0.8).sum() / sample_count
    summary['good_responses'] = in_good_band.sum() / sample_count
    summary['poor_responses'] = (overall < 0.6).sum() / sample_count

    return summary

In [9]:
def generate_evaluation_report(results_df: pd.DataFrame, metrics: Dict[str, float], output_dir: str = '.'):
    """
    Generate comprehensive evaluation report.

    Writes three files into ``output_dir`` (created if it does not exist):
    ``baseline_evaluation_detailed.csv`` (all per-sample results),
    ``baseline_evaluation_report.txt`` (the summary report) and
    ``baseline_metrics.json`` (the aggregate metrics). The report is also
    printed.

    Args:
        results_df: Per-sample results; the report reads the 'question',
            'response_overall_score' and 'response_explanation' columns.
        metrics: Aggregate metrics as produced by ``calculate_baseline_metrics``.
        output_dir: Target directory for the three files.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save detailed results
    results_df.to_csv(f"{output_dir}/baseline_evaluation_detailed.csv", index=False)

    # Create summary report
    report = f"""
# LLM-as-a-Judge Baseline Evaluation Report

## Dataset Summary
- Total samples evaluated: {len(results_df)}
- Evaluation date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

## Context Retrieval Quality
- Average Relevance: {metrics['avg_context_relevance']:.3f} (±{metrics['std_context_relevance']:.3f})
- Average Coverage: {metrics['avg_context_coverage']:.3f}
- Average Noise Level: {metrics['avg_context_noise_level']:.3f}

## Response Generation Quality
- Average Accuracy: {metrics['avg_response_accuracy']:.3f}
- Average Completeness: {metrics['avg_response_completeness']:.3f}
- Average Relevance: {metrics['avg_response_relevance']:.3f}
- Average Conciseness: {metrics['avg_response_conciseness']:.3f}
- **Overall Score: {metrics['avg_response_overall']:.3f} (±{metrics['std_response_overall']:.3f})**

## Performance Distribution
- Excellent (≥0.8): {metrics['excellent_responses']:.1%}
- Good (0.6-0.8): {metrics['good_responses']:.1%}
- Poor (<0.6): {metrics['poor_responses']:.1%}

## Top Performing Examples
{results_df.nlargest(3, 'response_overall_score')[['question', 'response_overall_score', 'response_explanation']].to_string()}

## Worst Performing Examples
{results_df.nsmallest(3, 'response_overall_score')[['question', 'response_overall_score', 'response_explanation']].to_string()}
"""

    # Save report. The text contains non-ASCII characters ('±', '≥'), so the
    # encoding is fixed to UTF-8 instead of relying on the platform default.
    with open(f"{output_dir}/baseline_evaluation_report.txt", 'w', encoding='utf-8') as f:
        f.write(report)

    # Save metrics as JSON; `default=float` converts numeric scalar types the
    # json module does not serialise natively
    with open(f"{output_dir}/baseline_metrics.json", 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2, default=float)

    print(report)
    print(f"\nResults saved to {output_dir}/")



In [20]:
# Main execution
if __name__ == "__main__":
    # Path to the evaluation dataset; the loader accepts CSV, Excel or JSON
    # files. Replace with your actual data file path.
    dataset_path = "asr_travel.xlsx"
    # Directory that receives the detailed CSV, the text report and the metrics JSON
    output_dir = 'evaluation_results'

    try:
        # Load data
        df = load_evaluation_dataset(dataset_path)

        # Run evaluation (you can specify sample_size for testing)
        print("\nStarting baseline evaluation...")
        results_df = run_baseline_evaluation(df, sample_size=None)  # Use sample_size=10 for testing

        # Calculate metrics
        print("\nCalculating baseline metrics...")
        metrics = calculate_baseline_metrics(results_df)

        # Generate report (`os` is already imported in the notebook's first cell)
        print("\nGenerating evaluation report...")
        os.makedirs(output_dir, exist_ok=True)
        generate_evaluation_report(results_df, metrics, output_dir=output_dir)

    except Exception as e:
        # Surface a short message, then re-raise so the full traceback is shown
        print(f"Error during evaluation: {str(e)}")
        raise

Loaded 33 evaluation samples
Sample data:
                                            question  \
0  I have a concussion, I want to cancel, is this...   
1  There is a strike on the German railways next ...   

                                        ground_truth  \
0  Yes, cancellation due to a serious illness suc...   
1  Yes, you can cancel if there are strikes at th...   

                                   retrieved_context  
0  Event Restriction or exclusion 1. Death, serio...  
1  Do you have to stay longer at your travel dest...  

Starting baseline evaluation...

Processing 1/33...
Context Relevance: 0.80
Response Overall Score: 0.93

Processing 2/33...
Context Relevance: 0.80
Response Overall Score: 0.75

Processing 3/33...
Context Relevance: 0.90
Response Overall Score: 0.97

Processing 4/33...
Context Relevance: 0.20
Response Overall Score: 0.80

Processing 5/33...
Context Relevance: 1.00
Response Overall Score: 0.97

Processing 6/33...
Context Relevance: 0.90
Response Over