# Pony LLM Evaluation - Interactive Analysis

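This notebook walks through an interactive analysis of the evaluation results: overall compilation success, breakdowns by prompting strategy, task category, and difficulty, execution time, and compilation errors. The cells below read the following fields from each result record: `task_id`, `strategy`, `category`, `difficulty`, `compilation_success`, `execution_time`, and `compilation_error`.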

In [None]:
# Import libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults: apply the seaborn whitegrid theme first, then set the
# default figure size on top of it.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Results

In [None]:
# Update this path to your results file: replace TIMESTAMP with the name of
# the evaluation run directory you want to analyze.
results_file = Path('../results/eval_TIMESTAMP/evaluation_results.json')

# Fail early with an actionable message (same exception type that open()
# would raise) when the placeholder path above has not been edited yet.
if not results_file.is_file():
    raise FileNotFoundError(
        f"Results file not found: {results_file.resolve()} - "
        "update `results_file` to point at your evaluation_results.json"
    )

# Load data. Read explicitly as UTF-8 instead of relying on the platform's
# default text encoding, which is not UTF-8 everywhere.
with results_file.open('r', encoding='utf-8') as f:
    results = json.load(f)

# Convert to DataFrame
# NOTE(review): later cells assume one row per evaluation record, i.e. that
# the JSON is a list of flat objects - confirm against the results writer.
df = pd.DataFrame(results)

print(f"Loaded {len(df)} evaluation results")
df.head()

## 2. Overall Statistics

In [None]:
# Headline numbers for the whole run: row count, number of successful
# compilations, and the resulting success percentage.
total = df.shape[0]
successful = df['compilation_success'].sum()
success_rate = successful / total * 100

print("Total Evaluations: {}".format(total))
print("Successful Compilations: {}".format(successful))
print("Overall Success Rate: {:.1f}%".format(success_rate))
print("\nAverage Execution Time: {:.2f}s".format(df['execution_time'].mean()))

## 3. Success Rate by Strategy

In [None]:
# Per-strategy aggregates of the compilation flag: mean (success rate),
# number of evaluations, and number of successes, best strategy first.
strategy_stats = (
    df.groupby('strategy')['compilation_success']
    .agg(['mean', 'count', 'sum'])
    .rename(columns={'mean': 'Success Rate', 'count': 'Total', 'sum': 'Successful'})
    .sort_values('Success Rate', ascending=False)
)

print("Success Rate by Strategy:")
print(strategy_stats)

# Bar chart on an explicit figure/axes pair; rates are fractions in [0, 1]
# and the tick formatter renders them as percentages.
fig, ax = plt.subplots(figsize=(12, 6))
strategy_stats['Success Rate'].plot(kind='bar', color='steelblue', ax=ax)
ax.set_ylabel('Success Rate')
ax.set_title('Compilation Success Rate by Prompting Strategy', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda frac, _pos: f'{frac*100:.0f}%'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 4. Success Rate by Category

In [None]:
# Per-category aggregates of the compilation flag: mean (success rate) and
# number of evaluations, best category first.
category_stats = (
    df.groupby('category')['compilation_success']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Success Rate', 'count': 'Total'})
    .sort_values('Success Rate', ascending=False)
)

print("Success Rate by Category:")
print(category_stats)

# Bar chart on an explicit figure/axes pair; rates are fractions in [0, 1]
# and the tick formatter renders them as percentages.
fig, ax = plt.subplots(figsize=(10, 6))
category_stats['Success Rate'].plot(kind='bar', color='coral', ax=ax)
ax.set_ylabel('Success Rate')
ax.set_title('Compilation Success Rate by Task Category', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda frac, _pos: f'{frac*100:.0f}%'))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 5. Heatmap: Strategy vs Category

In [None]:
# Mean compilation success for every (strategy, category) pair:
# strategies become rows, categories become columns.
pivot = pd.pivot_table(
    df,
    index='strategy',
    columns='category',
    values='compilation_success',
    aggfunc='mean',
)

# Heatmap of the rates scaled to percentages, annotated with one decimal.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pivot.mul(100),
    annot=True,
    fmt='.1f',
    cmap='YlGnBu',
    cbar_kws={'label': 'Success Rate (%)'},
    ax=ax,
)
ax.set_title('Success Rate Heatmap: Strategy vs Category', fontsize=14, fontweight='bold')
ax.set_xlabel('Task Category')
ax.set_ylabel('Prompting Strategy')
fig.tight_layout()
plt.show()

## 6. Difficulty Analysis

In [None]:
# Success rate by difficulty.
#
# A plain alphabetical sort orders string labels as easy, hard, medium, which
# does not match the green -> red colour ramp used for the bars below. Rank
# recognised labels explicitly; any other labels (for example numeric levels)
# keep their natural sort order, which is what sort_index() produced.
# NOTE(review): the label set below is an assumption - confirm it against the
# difficulty values actually present in the results file.
DIFFICULTY_RANK = {'easy': 0, 'medium': 1, 'hard': 2, 'expert': 3}


def _difficulty_sort_key(label):
    """Return a sort key that places recognised difficulty labels easiest-first.

    Recognised labels (matched case-insensitively, ignoring surrounding
    whitespace) sort by their rank in DIFFICULTY_RANK; every other label
    sorts after them, in its own natural order.
    """
    rank = DIFFICULTY_RANK.get(str(label).strip().lower())
    if rank is None:
        return (1, label)
    return (0, rank)


success_by_difficulty = df.groupby('difficulty')['compilation_success'].mean()
difficulty_stats = success_by_difficulty.reindex(
    sorted(success_by_difficulty.index, key=_difficulty_sort_key)
)

print("Success Rate by Difficulty:")
print(difficulty_stats)

# Visualize: one colour per bar, running from green (first) to red (last);
# the palette is trimmed to the number of difficulty levels present.
difficulty_palette = ['green', 'yellow', 'orange', 'red']
fig, ax = plt.subplots(figsize=(10, 6))
difficulty_stats.plot(kind='bar', color=difficulty_palette[:len(difficulty_stats)], ax=ax)
ax.set_ylabel('Success Rate')
ax.set_title('Success Rate by Difficulty Level', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y*100:.0f}%'))
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
plt.show()

## 7. Execution Time Analysis

In [None]:
# Mean and standard deviation of execution time for each strategy
time_by_strategy = df.groupby('strategy')['execution_time']
time_stats = time_by_strategy.agg(['mean', 'std'])

print("Execution Time by Strategy (seconds):")
print(time_stats)

# Bar chart of the means, with the standard deviation drawn as error bars
fig, ax = plt.subplots(figsize=(12, 6))
time_stats['mean'].plot.bar(yerr=time_stats['std'], color='mediumpurple', capsize=4, ax=ax)
ax.set_ylabel('Execution Time (seconds)')
ax.set_title('Average Execution Time by Strategy', fontsize=14, fontweight='bold')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 8. Error Analysis

In [None]:
# How many failed rows to sample, and how much of each message to show
N_SAMPLE_ERRORS = 3
MAX_ERROR_CHARS = 200

# Rows whose compilation flag is explicitly False
failed = df[df['compilation_success'].eq(False)]

# Guard the percentage so an empty frame reports 0.0% instead of raising
total_rows = len(df)
failed_pct = len(failed) / total_rows * 100 if total_rows else 0.0
print(f"Failed Compilations: {len(failed)} ({failed_pct:.1f}%)")

if not failed.empty:
    print("\nErrors by Strategy:")
    print(failed.groupby('strategy').size().sort_values(ascending=False))

    print("\nSample Error Messages:")
    for _, row in failed.head(N_SAMPLE_ERRORS).iterrows():
        print(f"\nTask: {row['task_id']} | Strategy: {row['strategy']}")
        # A record without an error message shows up as None or NaN here.
        # NaN is truthy and cannot be sliced, so only non-empty strings
        # are printed.
        error_text = row.get('compilation_error')
        if isinstance(error_text, str) and error_text:
            # Add the ellipsis only when the message was actually cut
            suffix = '...' if len(error_text) > MAX_ERROR_CHARS else ''
            print(f"Error: {error_text[:MAX_ERROR_CHARS]}{suffix}")

## 9. Best Strategy per Category

In [None]:
# For every category column of the strategy x category pivot, pick the
# strategy (row label) with the highest mean success rate and that rate.
best_strategies = pivot.idxmax()
best_rates = pivot.max()

print("Best Strategy per Category:")
for category in best_strategies.index:
    print("{}: {} ({:.1f}%)".format(
        category,
        best_strategies.loc[category],
        best_rates.loc[category] * 100,
    ))

## 10. Custom Analysis

Add your own analysis here!

In [None]:
# Example: Compare two specific strategies.
# Edit these names to match strategies that appear in your results.
strategy1 = 'zero_shot'
strategy2 = 'few_shot'

# Mean compilation success for just the two selected strategies
comparison = df[df['strategy'].isin([strategy1, strategy2])].groupby('strategy')['compilation_success'].mean()

# A strategy with no rows is absent from `comparison`; report that clearly
# instead of letting the lookups below raise KeyError.
missing = [name for name in (strategy1, strategy2) if name not in comparison.index]
if missing:
    available = sorted(str(name) for name in df['strategy'].dropna().unique())
    print(f"Cannot compare: no results for {missing}. Available strategies: {available}")
else:
    print(f"\n{strategy1}: {comparison[strategy1]*100:.1f}%")
    print(f"{strategy2}: {comparison[strategy2]*100:.1f}%")
    print(f"Improvement: {(comparison[strategy2] - comparison[strategy1])*100:.1f} percentage points")