In [None]:
import os
import matplotlib.pyplot as plt
import re
import numpy as np

# Directory that receives every figure written by this script.
output_folder = 'plots'
# Create it up front; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(name=output_folder, exist_ok=True)

def extract_metrics(file_path):
    """Extract Precision and Recall from a results .txt file.

    Each line is scanned for the substring "Precision" or, failing that,
    "Recall". The value is taken from the text after the last ':' on the
    line, with any '%' characters removed. When several lines match the
    same metric, the last parsable one wins.

    Lines that mention a metric but carry no parsable number (a header
    line, "N/A", ...) are ignored instead of raising ValueError.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(precision, recall)`` tuple of floats; an entry is None when
        no parsable value for that metric was found.
    """
    def _parse_value(line):
        # The metric value is whatever follows the last colon, minus '%'.
        raw = line.strip().split(":")[-1].replace('%', '').strip()
        try:
            return float(raw)
        except ValueError:
            # Keyword present but no number on this line -> nothing to record.
            return None

    precision, recall = None, None
    with open(file_path, 'r') as f:
        for line in f:
            if "Precision" in line:
                value = _parse_value(line)
                if value is not None:
                    precision = value
            elif "Recall" in line:
                value = _parse_value(line)
                if value is not None:
                    recall = value
    return precision, recall

def get_trailing_number(filename):
    """Return the integer formed by the digits right before a final ``.txt``.

    For example ``results_w500.txt`` gives 500. A name with no digits
    directly in front of the ``.txt`` suffix gives 0.
    """
    found = re.search(r'(\d+)(?=\.txt$)', filename)
    if found is None:
        return 0
    return int(found.group(1))

def plot_metrics():
    """Plot mean precision and recall per ensemble count with +/-1 std bands.

    Scans every sub-directory of ``evaluations`` whose name starts with
    "EC", reads each ``.txt`` file in it with ``extract_metrics`` and groups
    the values by the number returned by ``get_trailing_number`` for the
    file name. For each group the mean and ``np.std`` of precision and
    recall are computed and drawn as a line with a shaded band from
    ``mean - std`` to ``mean + std``.

    Files for which ``extract_metrics`` could not find both metrics are
    skipped with a printed message, because a None in a group would make
    ``np.mean`` / ``np.std`` raise TypeError.

    The figure is saved to ``<output_folder>/precision_recall_error_band.png``,
    shown, and then closed. Returns None.
    """
    parent_folder = 'evaluations'
    run_folders = [os.path.join(parent_folder, f) for f in os.listdir(parent_folder)
                   if f.startswith("EC") and os.path.isdir(os.path.join(parent_folder, f))]

    all_precisions = {}
    all_recalls = {}

    # Iterate over each run folder (EC1, EC2, ...)
    for run_folder in run_folders:
        txt_files = [f for f in os.listdir(run_folder) if f.endswith('.txt')]
        for filename in txt_files:
            file_path = os.path.join(run_folder, filename)
            precision, recall = extract_metrics(file_path)

            # Skip the whole file when either metric is missing so the two
            # dictionaries keep identical keys and contain only numbers.
            if precision is None or recall is None:
                print(f"Skipping {file_path}: missing Precision or Recall")
                continue

            ensemble_count = get_trailing_number(filename)
            all_precisions.setdefault(ensemble_count, []).append(precision)
            all_recalls.setdefault(ensemble_count, []).append(recall)

    # Sort ensemble counts so the x axis is increasing
    ensemble_counts = sorted(all_precisions)

    # Compute averages & stds once, as arrays, so the band edges below are
    # plain element-wise arithmetic.
    mean_precisions = np.array([np.mean(all_precisions[c]) for c in ensemble_counts])
    std_precisions = np.array([np.std(all_precisions[c]) for c in ensemble_counts])

    mean_recalls = np.array([np.mean(all_recalls[c]) for c in ensemble_counts])
    std_recalls = np.array([np.std(all_recalls[c]) for c in ensemble_counts])

    # Plot with error bands
    fig, ax = plt.subplots(figsize=(12, 8))

    ax.plot(ensemble_counts, mean_precisions, label='Precision (mean)', color='b', marker='o')
    ax.fill_between(ensemble_counts,
                    mean_precisions - std_precisions,
                    mean_precisions + std_precisions,
                    color='b', alpha=0.2)

    ax.plot(ensemble_counts, mean_recalls, label='Recall (mean)', color='r', marker='o')
    ax.fill_between(ensemble_counts,
                    mean_recalls - std_recalls,
                    mean_recalls + std_recalls,
                    color='r', alpha=0.2)

    ax.set_title("Precision and Recall with Variance across Runs")
    ax.set_xlabel("Ensemble Count")
    ax.set_ylabel("Percentage (%)")
    ax.set_xticks(ensemble_counts)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.legend()

    output_path = os.path.join(output_folder, 'precision_recall_error_band.png')
    plt.tight_layout()
    # Save before show(): the figure is closed right after it is displayed.
    plt.savefig(output_path)
    print(f"Plot saved to: {output_path}")
    plt.show()
    plt.close(fig)

# Run: build the precision/recall figure as soon as this cell/script executes.
plot_metrics()
