# QCAL-LLM: Benchmark LLaMA 4 Maverick

Evaluación de coherencia usando Ψ = I × A_eff² y f₀ = 141.7001 Hz

**Modelos evaluados:**
- LLaMA 4 Maverick (17B Instruct / FP8)
- GPT-4 (comparativa opcional)
- Claude 3 (comparativa opcional)

**Métricas:**
- Ψ (coherencia vibracional)
- ∴-rate (tasa de conectores lógicos)
- SNR semántico
- KLD⁻¹ (divergencia inversa)
- Calidad global (0-100)


In [None]:
# Setup
import sys
from pathlib import Path
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Prepend the parent of the current working directory to the import path.
# NOTE(review): presumably the `qcal` package imported below lives there — confirm.
sys.path.insert(0, str(Path.cwd().parent))

from qcal.coherence import psi_score, analyze_text, evaluate_coherence
from qcal.metrics import comprehensive_metrics, quality_score

# Configure matplotlib.
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6;
# fall back to the pre-3.6 name so the notebook also runs on older installs.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Status messages (previously mis-encoded; restored to the intended symbols).
print("✅ Imports successful")
print(f"📊 Working directory: {Path.cwd()}")

## 1. Load Evaluation Results

Load results from the evaluation script.

In [None]:
# Load results written by the evaluation script, if present.
results_file = Path('../results/evaluation_results.json')

if results_file.exists():
    with open(results_file, 'r', encoding='utf-8') as f:
        results = json.load(f)
    print(f"✅ Loaded {len(results)} evaluation results")
else:
    print("⚠️  No results file found. Run: python scripts/qcal_llm_eval.py")
    print("Using sample data for demonstration...")

    # Fall back to scoring the responses stored alongside the prompts.
    with open('../data/prompts_qcal.json', 'r', encoding='utf-8') as f:
        prompts = json.load(f)

    results = []
    for prompt in prompts:
        # Only prompts that already carry a response can be scored here.
        if 'response' in prompt:
            metrics = evaluate_coherence(prompt['response'])
            results.append({
                'label': prompt['label'],
                'prompt': prompt['text'],
                'response': prompt['response'],
                'metrics': metrics
            })
    print(f"✅ Evaluated {len(results)} prompts")

# Display first result as example.
# Guarded: indexing results[0] on an empty list would raise IndexError.
if results:
    print("\n" + "="*80)
    print("Example result:")
    print("="*80)
    example = results[0]
    print(f"Label: {example['label']}")
    print(f"Prompt: {example['prompt'][:100]}...")
    print(f"Response: {example['response'][:150]}...")
    print("\nMetrics:")
    print(f"  Ψ: {example['metrics']['psi_standard']:.2f}")
    print(f"  Status: {example['metrics']['status']}")
else:
    print("\n⚠️  No results available to display.")

## 2. Statistical Analysis

Calculate summary statistics for all metrics.

In [None]:
# Extract metrics into a DataFrame, one row per evaluated prompt.
# Required metric keys are copied unconditionally; optional ones are copied
# only when present, so their columns exist only if some result has them.
optional_columns = {  # key in r['metrics'] -> DataFrame column name
    'quality_score': 'quality',
    'snr_db': 'snr_db',
    'kld_inv': 'kld_inv',
}

data = []
for r in results:
    result_metrics = r['metrics']
    row = {
        'label': r['label'],
        'psi': result_metrics['psi_standard'],
        'intention': result_metrics['intention'],
        'effectiveness': result_metrics['effectiveness'],
        'strich_rate': result_metrics['strich_rate'],
        'coherent': result_metrics['passes_threshold'],
    }

    # Add optional metrics if available
    for metric_key, column in optional_columns.items():
        if metric_key in result_metrics:
            row[column] = result_metrics[metric_key]

    data.append(row)

df = pd.DataFrame(data)

# Fail early with a clear message: an empty frame has no columns, so the
# statistics below would otherwise die with an opaque pandas error.
if df.empty:
    raise ValueError("No evaluation results to analyze: `results` is empty.")

# Display summary statistics
print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(df.describe())

# Calculate pass rate (percentage of rows flagged as coherent)
coherent_count = df['coherent'].sum()
pass_rate = coherent_count / len(df) * 100
print(f"\n✅ Coherent responses (Ψ ≥ 5.0): {coherent_count}/{len(df)} ({pass_rate:.1f}%)")

# Show per-prompt results
print("\n" + "="*80)
print("PER-PROMPT RESULTS")
print("="*80)
print(df[['label', 'psi', 'intention', 'effectiveness', 'coherent']].to_string(index=False))

## 3. Visualizations

Create comprehensive visualizations of the evaluation results.

In [None]:
# Figure 1: Ψ scores by prompt (2x2 overview grid)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
positions = range(len(df))  # one bar slot per prompt

# 1a. Ψ scores bar chart, coloured by whether the row is flagged coherent
ax = axes[0, 0]
colors = ['green' if c else 'red' for c in df['coherent']]
ax.bar(positions, df['psi'], color=colors, alpha=0.7)
ax.axhline(y=5.0, color='orange', linestyle='--', label='Threshold (Ψ=5.0)', linewidth=2)
ax.set_xlabel('Prompt')
ax.set_ylabel('Ψ Score')
ax.set_title('Coherence Scores (Ψ = I × A_eff²)')
ax.set_xticks(positions)
ax.set_xticklabels(df['label'], rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

# 1b. I vs A_eff scatter, points coloured by Ψ
ax = axes[0, 1]
scatter = ax.scatter(df['effectiveness'], df['intention'],
                     c=df['psi'], cmap='viridis', s=100, alpha=0.7)
ax.set_xlabel('A_eff (Effectiveness)')
ax.set_ylabel('I (Intention)')
ax.set_title('Intention vs Effectiveness')
plt.colorbar(scatter, ax=ax, label='Ψ')
ax.grid(True, alpha=0.3)

# 1c. Distribution of Ψ
ax = axes[1, 0]
ax.hist(df['psi'], bins=10, color='steelblue', alpha=0.7, edgecolor='black')
ax.axvline(x=5.0, color='orange', linestyle='--', label='Threshold', linewidth=2)
ax.set_xlabel('Ψ Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Coherence Scores')
ax.legend()
ax.grid(True, alpha=0.3)

# 1d. ∴-rate
ax = axes[1, 1]
ax.bar(positions, df['strich_rate'], color='purple', alpha=0.7)
ax.set_xlabel('Prompt')
ax.set_ylabel('∴-rate (per 100 words)')
ax.set_title('Logical Connector Rate')
ax.set_xticks(positions)
ax.set_xticklabels(df['label'], rotation=45, ha='right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# (it may not when the notebook ran on the "no results file" fallback path).
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/benchmark_llama4_analysis.png', dpi=150, bbox_inches='tight')
print("\n✅ Figure saved: results/benchmark_llama4_analysis.png")
plt.show()

## 4. Quality Metrics

Analyze additional quality metrics (SNR, KLD⁻¹, quality score).

In [None]:
# Check if quality metrics are available
has_quality = 'quality' in df.columns
has_snr = 'snr_db' in df.columns
has_kld = 'kld_inv' in df.columns

# One entry per available metric: (column, y-axis label, title, bar colour).
# Driving the plots from this table replaces three copy-pasted branches.
panels = [
    spec
    for available, spec in (
        (has_quality, ('quality', 'Quality Score', 'Overall Quality (0-100)', 'teal')),
        (has_snr, ('snr_db', 'SNR (dB)', 'Semantic Signal-to-Noise Ratio', 'orange')),
        (has_kld, ('kld_inv', 'KLD⁻¹', 'Inverse KL Divergence', 'green')),
    )
    if available
]

if panels:
    n_plots = len(panels)
    # squeeze=False always returns a 2-D array of axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(1, n_plots, figsize=(6*n_plots, 4), squeeze=False)
    axes = axes.ravel()

    positions = range(len(df))  # one bar slot per prompt
    for ax, (column, ylabel, title, color) in zip(axes, panels):
        ax.bar(positions, df[column], color=color, alpha=0.7)
        ax.set_xlabel('Prompt')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_xticks(positions)
        ax.set_xticklabels(df['label'], rotation=45, ha='right')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    # savefig does not create missing directories; make sure the target exists.
    Path('../results').mkdir(parents=True, exist_ok=True)
    plt.savefig('../results/benchmark_llama4_quality.png', dpi=150, bbox_inches='tight')
    print("\n✅ Figure saved: results/benchmark_llama4_quality.png")
    plt.show()
else:
    print("\n⚠️  Quality metrics not available in results")

## 5. Export Results

Export results to CSV and a JSON summary for publication.

In [None]:
# Make sure the output directory exists before writing anything into it.
Path('../results').mkdir(parents=True, exist_ok=True)

# Export to CSV
csv_file = '../results/benchmark_llama4_results.csv'
df.to_csv(csv_file, index=False)
print(f"\n✅ Results exported to: {csv_file}")

# Create summary report (values cast to built-in types so json can serialize them)
summary = {
    'model': 'LLaMA 4 Maverick (17B Instruct / FP8)',
    'total_prompts': len(df),
    'coherent_count': int(df['coherent'].sum()),
    'coherent_rate': float(df['coherent'].sum() / len(df)),
    'psi_mean': float(df['psi'].mean()),
    'psi_std': float(df['psi'].std()),
    'psi_min': float(df['psi'].min()),
    'psi_max': float(df['psi'].max()),
    'intention_mean': float(df['intention'].mean()),
    'effectiveness_mean': float(df['effectiveness'].mean()),
    'strich_rate_mean': float(df['strich_rate'].mean()),
}

# Optional metrics: checked against the DataFrame's columns directly so this
# cell does not depend on flags defined in another cell.
for column, summary_key in (
    ('quality', 'quality_mean'),
    ('snr_db', 'snr_mean_db'),
    ('kld_inv', 'kld_inv_mean'),
):
    if column in df.columns:
        summary[summary_key] = float(df[column].mean())

# Save summary
summary_file = '../results/benchmark_llama4_summary.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"✅ Summary exported to: {summary_file}")

# Display summary
print("\n" + "="*80)
print("BENCHMARK SUMMARY")
print("="*80)
for key, value in summary.items():
    print(f"{key:25s}: {value}")

print("\n∴ — QCAL Ψ∞³")
print("Benchmark complete. Ready for Zenodo publication.")

## 6. Comparison with Other Models (Optional)

If you have results from GPT-4, Claude, etc., load and compare them here.

In [None]:
# Example: load comparison data (uncomment once the result files exist).
# `with` is used so the file handles are closed after reading.
# with open('../results/gpt4_results.json', encoding='utf-8') as f:
#     gpt4_results = json.load(f)
# with open('../results/claude_results.json', encoding='utf-8') as f:
#     claude_results = json.load(f)

# Create comparison DataFrame and visualizations
# ...

print("\n⚠️  Comparison with other models not yet implemented.")
print("To compare: Run evaluation with different models and load results here.")