# Baseline vs Optimized Model Comparison

This notebook compares the performance of a baseline (untrained) model against our optimized PPO model for ad integration.

## Setup and Imports

In [1]:
import sys
import os
import json
import pandas as pd
from tqdm import tqdm
from rich.console import Console
from rich.table import Table
from rich.text import Text
from rich.padding import Padding

# Add the parent directory (project root) to sys.path so the `src` package is importable
sys.path.append(os.path.abspath('..'))

from src.judge.coherence import judge_coherence
from src.judge.helpfulness import judge_helpfulness
from src.judge.salience import judge_ad_salience
from src.judge.detectability import judge_detectability
from src.generate.generator import generate_responses

ModuleNotFoundError: No module named 'judge'

## Load Test Data

In [None]:
# Read the merged query/ad evaluation set (relative path) and report its size
df = pd.read_csv("../data/merged_queries_ads.csv")
row_count = len(df.index)
print(f"Loaded {row_count} test examples")

## Helper Functions

In [None]:
def get_ad_facts(entry):
    """Build the ad-facts dict for one dataset entry.

    Copies the ``ad_product``, ``brand`` and ``url`` fields of ``entry``
    unchanged and exposes its ``ad_description`` field under the shorter
    ``description`` key. ``entry`` may be any object supporting
    ``entry[key]`` lookup; whatever that lookup raises for a missing field
    propagates to the caller.
    """
    # (output key, source field) pairs, in the order the result is built.
    field_map = (
        ('ad_product', 'ad_product'),
        ('brand', 'brand'),
        ('url', 'url'),
        ('description', 'ad_description'),
    )
    return {out_key: entry[src_key] for out_key, src_key in field_map}

def evaluate_response(query, response_with_ad, response_without_ad, ad_facts):
    """Score one generated response with every judge.

    Returns a dict with the keys ``coherence``, ``helpfulness``,
    ``ad_salience`` and ``detectability``; each value is whatever the
    corresponding judge function returned. Only the detectability judge
    receives ``response_without_ad``; the other three look at the ad-bearing
    response alone.
    """
    # Dict values are evaluated top to bottom, so the judges are invoked in
    # the order listed here.
    return {
        'coherence': judge_coherence(response_with_ad, query),
        'helpfulness': judge_helpfulness(query, response_with_ad),
        'ad_salience': judge_ad_salience(query, response_with_ad, ad_facts),
        'detectability': judge_detectability(response_with_ad, response_without_ad),
    }

def print_comparison(baseline_results, optimized_results, query, ad_facts):
    """Print a side-by-side comparison of baseline vs optimized results.

    Args:
        baseline_results: Dict with the keys ``coherence``, ``helpfulness``,
            ``ad_salience`` and ``detectability``; each value must support
            ``.get(score_key, 0)``.
        optimized_results: Same structure as ``baseline_results``.
        query: The user query being evaluated; printed verbatim.
        ad_facts: Dict providing at least ``ad_product`` and ``brand``.

    A score missing from a judge result is treated as 0. The "Improvement"
    column is ``optimized - baseline``: positive values are shown in green
    with a leading ``+``, negative values in red, and unchanged values
    without colour.
    """
    # Local import so this block is self-contained. escape() makes Rich print
    # square brackets in data-derived text literally instead of parsing them
    # as style tags (which can raise MarkupError or silently drop text).
    from rich.markup import escape

    console = Console()

    console.rule("[bold blue]Query and Ad Info")
    console.print(f"[bold yellow]Query:[/bold yellow] {escape(str(query))}")
    console.print(
        f"[bold yellow]Ad Product:[/bold yellow] {escape(str(ad_facts['ad_product']))}"
    )
    console.print(f"[bold yellow]Brand:[/bold yellow] {escape(str(ad_facts['brand']))}")

    # Create comparison table
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Metric")
    table.add_column("Baseline")
    table.add_column("Optimized")
    table.add_column("Improvement")

    # Display name -> (key in the results dict, key of the score inside that
    # judge's result).
    metrics = {
        "Coherence": ("coherence", "Coherence Score"),
        "Helpfulness": ("helpfulness", "Helpfulness Score"),
        "Ad Salience": ("ad_salience", "Ad Salience Score"),
        "Detectability": ("detectability", "detectability_cosine")
    }

    for metric_name, (result_key, score_key) in metrics.items():
        baseline_score = baseline_results[result_key].get(score_key, 0)
        optimized_score = optimized_results[result_key].get(score_key, 0)
        improvement = optimized_score - baseline_score

        # NOTE(review): a positive delta is always shown green, including for
        # Detectability - confirm that a higher detectability_cosine is the
        # desired direction.
        if improvement > 0:
            improvement_cell = f"[green]+{improvement:.2f}[/green]"
        elif improvement < 0:
            improvement_cell = f"[red]{improvement:.2f}[/red]"
        else:
            # No change (or NaN): neutral, not flagged as a regression.
            improvement_cell = f"{improvement:.2f}"

        table.add_row(
            metric_name,
            f"{baseline_score:.2f}",
            f"{optimized_score:.2f}",
            improvement_cell,
        )

    console.print(table)
    console.rule("[bold blue]End of Comparison")

## Run Comparison

In [None]:
# Number of examples to evaluate
NUM_EXAMPLES = 5

# Store results
comparison_results = []

# Slice once so the progress bar total and the "x/N" counter reflect the rows
# actually evaluated; the dataset may contain fewer than NUM_EXAMPLES rows.
eval_rows = df.head(NUM_EXAMPLES)
num_to_evaluate = len(eval_rows)

# enumerate() supplies the 1-based position; the DataFrame index label is
# ignored because it is not guaranteed to be a 0-based row counter.
for position, (_, row) in enumerate(
    tqdm(eval_rows.iterrows(), total=num_to_evaluate), start=1
):
    query = row['vague_query']
    ad_facts = get_ad_facts(row)

    # Generate a (without-ad, with-ad) response pair from each model
    baseline_without_ad, baseline_with_ad = generate_responses(query, ad_facts, use_optimized=False)
    optimized_without_ad, optimized_with_ad = generate_responses(query, ad_facts, use_optimized=True)

    # Evaluate responses
    baseline_results = evaluate_response(query, baseline_with_ad, baseline_without_ad, ad_facts)
    optimized_results = evaluate_response(query, optimized_with_ad, optimized_without_ad, ad_facts)

    # Print comparison
    print(f"\nEvaluating example {position}/{num_to_evaluate}")
    print_comparison(baseline_results, optimized_results, query, ad_facts)

    # Store results for the aggregate summary
    comparison_results.append({
        'query': query,
        'ad_facts': ad_facts,
        'baseline_results': baseline_results,
        'optimized_results': optimized_results
    })

## Aggregate Results

In [None]:
def calculate_average_scores(results, model_type):
    """Calculate the average score of each metric for one model type.

    Args:
        results: List of per-example dicts. Each must contain the key
            ``f"{model_type}_results"`` mapping to a dict with the keys
            ``coherence``, ``helpfulness``, ``ad_salience`` and
            ``detectability``, whose values support ``.get(score_key, 0)``.
        model_type: Prefix selecting which results to average, e.g.
            ``'baseline'`` or ``'optimized'``.

    Returns:
        Dict mapping the display names ``Coherence``, ``Helpfulness``,
        ``Ad Salience`` and ``Detectability`` to their mean score. A score
        missing from an example counts as 0. When ``results`` is empty every
        average is 0.0 rather than raising ``ZeroDivisionError``.
    """
    # Display name -> (key in the results dict, key of the score inside that
    # judge's result).
    metrics = {
        "Coherence": ("coherence", "Coherence Score"),
        "Helpfulness": ("helpfulness", "Helpfulness Score"),
        "Ad Salience": ("ad_salience", "Ad Salience Score"),
        "Detectability": ("detectability", "detectability_cosine")
    }

    results_key = f'{model_type}_results'

    scores = {}
    for metric_name, (result_key, score_key) in metrics.items():
        values = [r[results_key][result_key].get(score_key, 0) for r in results]
        # Guard the division: an empty run has no examples to average.
        scores[metric_name] = sum(values) / len(values) if values else 0.0

    return scores

# Calculate per-metric averages for each model over all evaluated examples
baseline_avg = calculate_average_scores(comparison_results, 'baseline')
optimized_avg = calculate_average_scores(comparison_results, 'optimized')

# Print summary
console = Console()
console.rule("[bold blue]Overall Results Summary")

table = Table(show_header=True, header_style="bold magenta")
table.add_column("Metric")
table.add_column("Baseline Avg")
table.add_column("Optimized Avg")
table.add_column("Improvement")

for metric in baseline_avg:
    improvement = optimized_avg[metric] - baseline_avg[metric]

    # Green with a leading "+" for gains, red for regressions, and no colour
    # when the averages are equal so an unchanged metric is not shown as a
    # regression.
    if improvement > 0:
        improvement_cell = f"[green]+{improvement:.2f}[/green]"
    elif improvement < 0:
        improvement_cell = f"[red]{improvement:.2f}[/red]"
    else:
        improvement_cell = f"{improvement:.2f}"

    table.add_row(
        metric,
        f"{baseline_avg[metric]:.2f}",
        f"{optimized_avg[metric]:.2f}",
        improvement_cell,
    )

console.print(table)
console.rule("[bold blue]End of Summary")