In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import json
from pathlib import Path
from statistics import mean
from typing import Dict, List

In [4]:
def calculate_aggregate_metrics(results: List[Dict]) -> Dict[str, float]:
    """Average each precision/recall metric over the given results.

    Args:
        results: Per-example result dicts. ``None`` entries are ignored;
            every other entry must provide all four metric keys.

    Returns:
        A dict mapping each metric name to its mean over the non-``None``
        results, or ``0.0`` when no values were collected.

    Raises:
        KeyError: If a non-``None`` result lacks one of the metric keys.
    """
    metric_names = (
        "code_precision",
        "code_recall",
        "test_precision",
        "test_recall",
    )
    collected = {name: [] for name in metric_names}

    # Walk results in the outer loop so values are gathered (and any missing
    # key is reported) in result order.
    for entry in results:
        if entry is not None:
            for name, bucket in collected.items():
                bucket.append(entry[name])

    averages = {}
    for name, bucket in collected.items():
        # mean() raises on an empty sequence, so fall back to 0.0.
        averages[name] = mean(bucket) if bucket else 0.0
    return averages

results_path = Path("../results/2025-01-25-file-selection-o1-preview-2024-09-12")

# Load every per-example result file in the run directory. glob() does not
# guarantee an order, so sort to keep any error output reproducible.
all_results = []
for result_file in sorted(results_path.glob("*.json")):
    # Presumably a previously written summary rather than a per-example
    # result, so it is excluded from the aggregation.
    if result_file.name == "aggregate_metrics.json":
        continue

    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale-dependent default text encoding.
        with open(result_file, encoding="utf-8") as f:
            result = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: the file could not be read. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Report the file and
        # carry on with the remaining ones.
        print(f"Error loading {result_file}: {e}")
        continue
    all_results.append(result)

# Calculate and display aggregate metrics.
# NOTE: the count below is the number of files loaded successfully; a file
# whose content is JSON null is counted here but ignored by the averages.
aggregate_metrics = calculate_aggregate_metrics(all_results)
print("\nResults:")
print(f"Number of examples processed: {len(all_results)}")
print(f"Code Precision: {aggregate_metrics['code_precision']:.3f}")
print(f"Code Recall: {aggregate_metrics['code_recall']:.3f}")
print(f"Test Precision: {aggregate_metrics['test_precision']:.3f}")
print(f"Test Recall: {aggregate_metrics['test_recall']:.3f}")


Results:
Number of examples processed: 35
Code Precision: 0.283
Code Recall: 0.351
Test Precision: 0.233
Test Recall: 0.250
