# Cross-Task Analysis: Comparing Strategies Across Benchmarks

This notebook aggregates results from all task-specific evaluations and provides:

1. **Strategy effectiveness across tasks** - Which methods work best where?
2. **Compute-performance tradeoffs** - Pareto frontiers
3. **Task difficulty analysis** - How do tasks compare?
4. **Recommendations** - When to use which strategy

In [None]:
import sys
sys.path.insert(0, '..')

import json
import re
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.utils import load_results, compare_runs

# Notebook-wide plotting defaults: style sheet first, then the rcParams
# overrides applied on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

## 1. Load All Results

In [None]:
# Locate the per-benchmark result directories and load every run found in them.
RESULTS_BASE = "../results"

# Benchmark display name -> directory that holds that benchmark's run folders.
task_dirs = {
    "GSM8K": f"{RESULTS_BASE}/gsm8k_comparison",
    "AIME": f"{RESULTS_BASE}/aime_comparison",
    "HumanEval": f"{RESULTS_BASE}/humaneval_comparison",
    "IFEval": f"{RESULTS_BASE}/ifeval_comparison",
}

# task name -> {experiment name -> {"metrics", "config", "run_dir"}}
all_task_results = {}

for task_name, task_dir in task_dirs.items():
    task_path = Path(task_dir)
    if not task_path.exists():
        print(f"Warning: {task_dir} not found, skipping {task_name}")
        continue

    # iterdir() yields entries in arbitrary, filesystem-dependent order. Sort so
    # that, when two runs map to the same experiment name, the run that is kept
    # (the lexicographically last directory) is the same on every execution.
    run_dirs = sorted(d for d in task_path.iterdir() if d.is_dir())

    task_results = {}
    for run_dir in run_dirs:
        # Best-effort loading: a broken run is reported and skipped so that it
        # does not abort the whole analysis.
        try:
            results, metrics, config = load_results(str(run_dir))
            # Fallback name: the directory name up to the first "_2".
            # NOTE(review): presumably this strips a "_20YYMMDD..." timestamp
            # suffix; it also truncates names such as "temp_2.0" - confirm
            # against the actual run-directory naming scheme.
            exp_name = config.get("experiment", run_dir.name.split("_2")[0])
            if exp_name in task_results:
                # Make the replacement visible instead of silently dropping a run.
                print(
                    f"  Note: '{exp_name}' already loaded from "
                    f"{task_results[exp_name]['run_dir']}; replacing with {run_dir}"
                )
            task_results[exp_name] = {
                "metrics": metrics,
                "config": config,
                "run_dir": str(run_dir),
            }
        except Exception as e:
            print(f"  Error loading {run_dir}: {e}")

    if task_results:
        all_task_results[task_name] = task_results
        print(f"Loaded {len(task_results)} experiments for {task_name}")

print(f"\nTotal: {len(all_task_results)} tasks loaded")

## 2. Build Comparison DataFrame

In [None]:
# Flatten the nested results into one row per (task, experiment).
df_columns = ["Task", "Experiment", "Strategy Type", "Accuracy", "pass@5", "pass@10"]
rows = []

for task_name, experiments in all_task_results.items():
    for exp_name, data in experiments.items():
        metrics = data["metrics"]
        exp_key = exp_name.lower()

        # Classify by substring of the experiment name. Order matters: the
        # first matching family wins (e.g. a self-consistency run that also
        # mentions a temperature stays "Self-Consistency").
        if "greedy" in exp_key:
            strategy_type = "Greedy"
        elif any(tag in exp_key for tag in ("mcts", "tree", "best_first")):
            # "best_first" is included so this agrees with standardize_exp_name,
            # which groups best-first runs with tree search.
            strategy_type = "Tree Search"
        elif any(tag in exp_key for tag in ("consistency", "diverse", "voting")):
            strategy_type = "Self-Consistency"
        elif "temp" in exp_key or "nucleus" in exp_key:
            strategy_type = "Sampling"
        else:
            strategy_type = "Other"

        # Primary metric: pass@1 when present, otherwise accuracy
        # (0 when neither key exists in the metrics dict).
        accuracy = metrics.get("accuracy", 0)
        pass_at_1 = metrics.get("pass@1", accuracy)

        rows.append({
            "Task": task_name,
            "Experiment": exp_name,
            "Strategy Type": strategy_type,
            "Accuracy": pass_at_1,
            "pass@5": metrics.get("pass@5", np.nan),
            "pass@10": metrics.get("pass@10", np.nan),
        })

# Passing the column list keeps the expected columns even when no results were
# loaded, so column lookups in later cells do not raise KeyError on an empty frame.
df = pd.DataFrame(rows, columns=df_columns)
print(f"Total experiments: {len(df)}")
df.head(10)

## 3. Task Difficulty Comparison

In [None]:
# Greedy accuracy per task serves as the difficulty measure: the higher the
# greedy baseline scores, the easier the task is considered.
greedy_df = df[df["Experiment"].str.contains("greedy", case=False)]

if not greedy_df.empty:
    # Mean greedy accuracy per task, easiest task first.
    task_difficulty = (
        greedy_df
        .groupby("Task")["Accuracy"]
        .mean()
        .sort_values(ascending=False)
    )

    colors = ['#2ecc71', '#3498db', '#f39c12', '#e74c3c'][:len(task_difficulty)]

    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(task_difficulty.index, task_difficulty.values, color=colors)
    ax.set(
        ylabel="Greedy Accuracy",
        title="Task Difficulty (Greedy Baseline Performance)",
        ylim=(0, 1),
    )

    # Write each bar's value just above its top edge, centred horizontally.
    for bar, acc in zip(bars, task_difficulty.values):
        x_center = bar.get_x() + bar.get_width()/2
        ax.text(x_center, acc + 0.02, f'{acc:.3f}', ha='center', fontsize=12)

    plt.tight_layout()
    plt.savefig(f"{RESULTS_BASE}/task_difficulty.png", dpi=150, bbox_inches='tight')
    plt.show()

    print("Task Difficulty Ranking (easiest to hardest):")
    for task, acc in task_difficulty.items():
        print(f"  {task}: {acc:.4f}")

## 4. Strategy Effectiveness by Task

In [None]:
# Best accuracy reached by each strategy family on each task, laid out as a
# Task x Strategy Type table (combinations with no run are filled with 0).
pivot = (
    df
    .groupby(["Task", "Strategy Type"])["Accuracy"]
    .max()
    .unstack(fill_value=0)
)

fig, ax = plt.subplots(figsize=(12, 6))
pivot.plot(kind="bar", ax=ax, width=0.8)

ax.set_ylabel("Best Accuracy")
ax.set_title("Best Performance by Strategy Type Across Tasks")
ax.set_ylim(0, 1)
# Keep the task names horizontal.
ax.set_xticklabels(pivot.index, rotation=0)
# Legend goes outside the axes, to the right of the plot.
ax.legend(title="Strategy Type", bbox_to_anchor=(1.02, 1), loc='upper left')

plt.tight_layout()
plt.savefig(f"{RESULTS_BASE}/strategy_by_task.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Improvement of each strategy family's best run over the greedy baseline.
improvements = []

for task in df["Task"].unique():
    task_df = df[df["Task"] == task]
    is_greedy = task_df["Experiment"].str.contains("greedy", case=False)
    greedy_acc = task_df.loc[is_greedy, "Accuracy"].max()

    # Without a greedy run there is no baseline to compare against.
    # A baseline of exactly 0 is still a valid baseline: the absolute
    # improvement is well defined, so such tasks are kept.
    if pd.isna(greedy_acc):
        continue

    for strategy in task_df["Strategy Type"].unique():
        if strategy == "Greedy":
            continue
        best_acc = task_df.loc[task_df["Strategy Type"] == strategy, "Accuracy"].max()
        improvement = best_acc - greedy_acc
        improvements.append({
            "Task": task,
            "Strategy": strategy,
            "Improvement": improvement,
            # Percentage gain is undefined for a zero baseline; record NaN
            # rather than a made-up number.
            "Relative Improvement": improvement / greedy_acc * 100 if greedy_acc > 0 else np.nan,
        })

imp_df = pd.DataFrame(improvements)

if not imp_df.empty:
    # Task x Strategy table of absolute gains; strategies not run on a task plot as 0.
    pivot_imp = imp_df.pivot(index="Task", columns="Strategy", values="Improvement").fillna(0)

    fig, ax = plt.subplots(figsize=(12, 6))
    pivot_imp.plot(kind="bar", ax=ax, width=0.8)
    ax.set_ylabel("Accuracy Improvement over Greedy")
    ax.set_title("Strategy Improvement over Greedy Baseline")
    ax.set_xticklabels(pivot_imp.index, rotation=0)
    # Zero line separates gains from regressions relative to greedy.
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    ax.legend(title="Strategy", bbox_to_anchor=(1.02, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig(f"{RESULTS_BASE}/improvement_over_greedy.png", dpi=150, bbox_inches='tight')
    plt.show()

## 5. Best Strategy per Task

In [None]:
# For every task, pick the row of the experiment with the highest accuracy.
best_row_labels = df.groupby("Task")["Accuracy"].idxmax()
best_per_task = df.loc[best_row_labels]

print("Best Strategy per Task:")
print("=" * 60)
for _, row in best_per_task.iterrows():
    # One three-line report per task, preceded by a blank line.
    report_lines = [
        f"\n{row['Task']}:",
        f"  Best: {row['Experiment']} ({row['Strategy Type']})",
        f"  Accuracy: {row['Accuracy']:.4f}",
    ]
    print("\n".join(report_lines))

## 6. Heatmap: All Results

In [None]:
# Create heatmap of all results
# Standardize experiment names
def standardize_exp_name(name):
    """Map a raw experiment name to a canonical label used as a heatmap row.

    Matching is case-insensitive and by substring; the first rule that
    matches wins, in this order: greedy, mcts, best_first/tree,
    self-consistency, diverse, temperature.

    Args:
        name: Raw experiment name (string).

    Returns:
        The canonical label, e.g. ``'greedy'``, ``'mcts'``, ``'best_first'``,
        ``'self_consistency_<n>'``, ``'diverse'`` or ``'temp_<value>'``.
        Names that match no rule are returned lower-cased and otherwise
        unchanged.
    """
    name = name.lower()
    if 'greedy' in name:
        return 'greedy'
    if 'mcts' in name:
        return 'mcts'
    if 'best_first' in name or 'tree' in name:
        return 'best_first'
    if 'consistency' in name:
        # Prefer an explicit sample count written right after "consistency"
        # (e.g. "_32", "_n16"). The lookahead rejects the integer part of a
        # decimal such as a temperature ("consistency_0.7").
        count = re.search(r'consistency[_-]?[nk]?[_-]?(\d+)(?![\d.])', name)
        if count:
            return f'self_consistency_{int(count.group(1))}'
        # No adjacent count: keep the historical heuristic (default of 8).
        return 'self_consistency_16' if '16' in name else 'self_consistency_8'
    if 'diverse' in name:
        return 'diverse'
    if 'temp' in name:
        # Extract the temperature value; a trailing "." is not part of it.
        match = re.search(r'temp(?:erature)?_?(\d+(?:\.\d+)?)', name)
        if match:
            return f'temp_{match.group(1)}'
    return name

# Collapse raw run names to canonical labels so the same strategy lines up
# across tasks. This adds a column to df (and therefore to the saved CSV).
df['Std Experiment'] = df['Experiment'].apply(standardize_exp_name)

# Strategy x Task table holding the best accuracy per combination.
heatmap_raw = df.pivot_table(
    index='Std Experiment',
    columns='Task',
    values='Accuracy',
    aggfunc='max'
)
# Remember which combinations were never run before zero-filling, so that
# "no data" can be drawn differently from a genuine accuracy of 0.
heatmap_missing = heatmap_raw.isna()
heatmap_data = heatmap_raw.fillna(0)

if not heatmap_data.empty:
    fig, ax = plt.subplots(figsize=(12, 8))

    # Missing combinations are masked and rendered in neutral gray instead of
    # the deep red that the colormap assigns to 0.
    cmap = plt.get_cmap('RdYlGn').copy()
    cmap.set_bad('lightgray')
    cell_values = np.ma.masked_where(
        heatmap_missing.to_numpy(),
        heatmap_data.to_numpy(dtype=float),
    )
    im = ax.imshow(cell_values, cmap=cmap, aspect='auto', vmin=0, vmax=1)

    ax.set_xticks(range(len(heatmap_data.columns)))
    ax.set_xticklabels(heatmap_data.columns, fontsize=11)
    ax.set_yticks(range(len(heatmap_data.index)))
    ax.set_yticklabels(heatmap_data.index, fontsize=10)

    # Annotate every cell that has data, including true zeros.
    for i in range(len(heatmap_data.index)):
        for j in range(len(heatmap_data.columns)):
            if heatmap_missing.iloc[i, j]:
                continue
            val = heatmap_data.iloc[i, j]
            # RdYlGn is dark at both ends and pale in the middle: white text
            # on the dark extremes, black text elsewhere.
            color = 'white' if abs(val - 0.5) > 0.35 else 'black'
            ax.text(j, i, f'{val:.2f}', ha='center', va='center', color=color, fontsize=9)

    ax.set_title('Accuracy Heatmap: Strategies × Tasks', fontsize=14)
    plt.colorbar(im, ax=ax, label='Accuracy')

    plt.tight_layout()
    plt.savefig(f"{RESULTS_BASE}/accuracy_heatmap.png", dpi=150, bbox_inches='tight')
    plt.show()

## 7. Summary & Recommendations

In [None]:
# Text summary of the analysis. The first two sections are derived from the
# data computed in earlier cells; the recommendations and insights below are
# fixed text and are not recomputed from the loaded results.
print("\n" + "="*80)
print("CROSS-TASK ANALYSIS SUMMARY")
print("="*80)

print("\n📊 Task Difficulty (Greedy Baseline):")
# task_difficulty only exists when at least one greedy run was loaded.
if 'task_difficulty' in globals():
    for task, acc in task_difficulty.items():
        # Buckets: > 0.6 Easy, > 0.3 Medium, otherwise Hard.
        difficulty = "Easy" if acc > 0.6 else "Medium" if acc > 0.3 else "Hard"
        print(f"  • {task}: {acc:.4f} ({difficulty})")

print("\n🏆 Best Strategy per Task:")
for _, row in best_per_task.iterrows():
    print(f"  • {row['Task']}: {row['Experiment']} ({row['Accuracy']:.4f})")

print("\n📈 Strategy Recommendations:")
print("""
  1. GSM8K (Grade School Math):
     → Self-consistency with 8-16 samples works well
     → Higher temperatures (0.7) improve diversity
  
  2. AIME (Competition Math):
     → Tree search methods can help on hard problems
     → More compute generally helps
     → Consider MCTS for exploration
  
  3. HumanEval (Code Generation):
     → Temperature 0.8 is optimal for pass@k
     → Generate 20+ samples for reliable pass@10
     → Diversity matters more than greedy
  
  4. IFEval (Instruction Following):
     → Lower temperatures preserve instruction adherence
     → Greedy often competitive
     → Self-consistency helps on multi-constraint tasks
""")

print("\n💡 General Insights:")
print("""
  • Self-consistency provides consistent improvements across math tasks
  • Tree search is most beneficial for hard reasoning problems
  • Temperature tuning is task-specific
  • Compute-accuracy tradeoff varies by task difficulty
""")

In [None]:
# Persist the combined per-experiment table next to the per-task results
# (row index is not written).
df.to_csv(
    f"{RESULTS_BASE}/cross_task_summary.csv",
    index=False,
)
print(f"\nResults saved to {RESULTS_BASE}/cross_task_summary.csv")