# TruthSim Results Analysis

This notebook analyzes the results from TruthSim diagnostic conversations.

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet and make 'husl'
# the default seaborn color palette for the figures drawn in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 1. Load Evaluation Results

In [None]:
# Load evaluation results
# `results` is the parsed evaluation JSON, or None when the file is missing;
# the cells below check `results` before using it.
eval_path = Path('../data/evaluations/evaluation_detailed.json')

if eval_path.is_file():
    # JSON is UTF-8 by specification; pass the encoding explicitly so the load
    # does not depend on the platform's default locale encoding.
    with open(eval_path, 'r', encoding='utf-8') as f:
        results = json.load(f)
    # .get() keeps the notebook running if the summary count is absent.
    print(f"Loaded results for {results.get('transcripts_evaluated', 'an unknown number of')} transcripts")
else:
    print("No evaluation results found. Run evaluate.py first.")
    results = None

## 2. Diagnostic Accuracy Analysis

In [None]:
if results and 'diagnosis_matching' in results:
    # Build a table from the diagnosis-matching records
    df_diagnosis = pd.DataFrame(results['diagnosis_matching'])

    # Overall accuracy is the mean of the 'match' column
    accuracy = df_diagnosis['match'].mean()
    print(f"Overall Diagnostic Accuracy: {accuracy:.2%}")

    # Preview the first ten rows, restricted to the columns of interest
    preview_columns = ['patient_id', 'doctor_diagnosis', 'ground_truth', 'match', 'reasoning']
    display(df_diagnosis[preview_columns].head(10))

## 3. Simulation Quality Analysis

In [None]:
if results and 'llm_judge' in results:
    # Create DataFrame from the LLM-judge records
    df_judge = pd.DataFrame(results['llm_judge'])

    def _judge_field(frame, section, key, default):
        """Return a per-row Series holding ``frame[section][key]``.

        Rows whose ``section`` cell is not a dict (for example NaN/None when a
        record lacks that section) yield ``default`` instead of raising, and a
        frame without the ``section`` column yields ``default`` for every row.
        """
        if section not in frame.columns:
            return pd.Series(default, index=frame.index)
        return frame[section].map(
            lambda entry: entry.get(key, default) if isinstance(entry, dict) else default
        )

    # Extract scores
    # NOTE(review): missing realism/utility averages fall back to 0 and are
    # therefore included in the means printed below — confirm this is intended.
    df_judge['truth_pass'] = _judge_field(df_judge, 'truth_preservation', 'pass', False)
    df_judge['avg_realism'] = _judge_field(df_judge, 'realism', 'average', 0)
    df_judge['avg_utility'] = _judge_field(df_judge, 'clinical_utility', 'average', 0)

    print(f"Truth Preservation Pass Rate: {df_judge['truth_pass'].mean():.2%}")
    print(f"Average Realism Score: {df_judge['avg_realism'].mean():.2f}/5.0")
    print(f"Average Clinical Utility: {df_judge['avg_utility'].mean():.2f}/5.0")

## 4. Visualizations

In [None]:
# Draws a three-panel figure from `df_judge` (built in the simulation-quality
# cell): realism histogram, clinical-utility histogram, and a pass/fail pie for
# truth preservation. The figure is saved as a PNG and then shown.
if results and 'llm_judge' in results:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Realism distribution, with the mean marked by a dashed line
    axes[0].hist(df_judge['avg_realism'].dropna(), bins=10, edgecolor='black')
    axes[0].set_xlabel('Realism Score')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Distribution of Realism Scores')
    axes[0].axvline(df_judge['avg_realism'].mean(), color='red', linestyle='--', label='Mean')
    axes[0].legend()

    # Utility distribution, with the mean marked by a dashed line
    axes[1].hist(df_judge['avg_utility'].dropna(), bins=10, edgecolor='black')
    axes[1].set_xlabel('Utility Score')
    axes[1].set_ylabel('Count')
    axes[1].set_title('Distribution of Clinical Utility Scores')
    axes[1].axvline(df_judge['avg_utility'].mean(), color='red', linestyle='--', label='Mean')
    axes[1].legend()

    # Truth preservation
    # Count passes and failures explicitly instead of relying on the
    # frequency-sorted order of value_counts(): that order puts the more common
    # outcome first, which mislabels the wedges when failures outnumber passes
    # and breaks the fixed two-label list when only one outcome occurs.
    truth_flags = df_judge['truth_pass'].dropna().astype(bool)
    pass_count = int(truth_flags.sum())
    fail_count = int(len(truth_flags) - pass_count)
    wedges = [
        (label, count, color)
        for label, count, color in (('Pass', pass_count, 'green'), ('Fail', fail_count, 'red'))
        if count > 0
    ]
    if wedges:
        wedge_labels, wedge_counts, wedge_colors = zip(*wedges)
        axes[2].pie(wedge_counts, labels=wedge_labels, autopct='%1.1f%%', colors=wedge_colors)
    else:
        # Nothing to plot: show a placeholder rather than failing on empty input
        axes[2].text(0.5, 0.5, 'No data', ha='center', va='center')
        axes[2].axis('off')
    axes[2].set_title('Truth Preservation')

    plt.tight_layout()
    plt.savefig('../data/evaluations/quality_distribution.png', dpi=150)
    plt.show()

## 5. Model Comparison (if multiple models evaluated)

In [None]:
# Load transcripts to get model information
# Each sub-directory of `conversations_dir` is treated as one model; its
# '*.json' files are counted as that model's conversations.
conversations_dir = Path('../data/conversations')

# is_dir() (rather than exists()) avoids calling iterdir() on a regular file
if conversations_dir.is_dir():
    model_results = {}

    # iterdir() yields entries in arbitrary order; sort so the listing is
    # reproducible from run to run.
    for model_dir in sorted(conversations_dir.iterdir()):
        if not model_dir.is_dir():
            continue
        transcripts = list(model_dir.glob('*.json'))

        # Directories without any transcript are left out of the summary
        if transcripts:
            model_results[model_dir.name] = {
                'count': len(transcripts),
            }

    if model_results:
        print("Models evaluated:")
        for model, info in model_results.items():
            print(f"  - {model}: {info['count']} conversations")

## 6. Summary Statistics

In [None]:
# Print the pre-computed metrics stored under results['metrics'], if any.
if results:
    banner = "=" * 50
    print(banner)
    print("EVALUATION SUMMARY")
    print(banner)

    metrics = results.get('metrics', {})

    # Diagnostic accuracy section (printed only when present)
    if 'diagnostic_accuracy' in metrics:
        acc = metrics['diagnostic_accuracy']
        print(f"\nDiagnostic Accuracy: {acc['top1_accuracy']:.2%} ({acc['correct']}/{acc['total']})")

    # Simulation quality section (printed only when present)
    if 'simulation_quality' in metrics:
        sq = metrics['simulation_quality']
        print(f"\nSimulation Quality:")

        truth_stats = sq['truth_preservation']
        print(f"  Truth Preservation: {truth_stats['pass_rate']:.2%}")
        print(f"  Hallucination Rate: {truth_stats['hallucination_rate']:.2%}")

        realism_stats = sq['realism']
        print(f"  Average Realism: {realism_stats['mean']:.2f} ± {realism_stats['std']:.2f}")

        utility_stats = sq['clinical_utility']
        print(f"  Average Utility: {utility_stats['mean']:.2f} ± {utility_stats['std']:.2f}")