# Permutations Task Analysis

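This notebook compares agent types (`prefix-sum`, `maj-voting`, `coa`) on the permutations task. For each agent type and number of swaps it selects the best-performing hyperparameter setting, then reports and plots the resulting average element accuracy and exact match scores.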

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import scienceplots
# List every style name matplotlib currently knows about, so it is easy to confirm that
# 'science' is available (presumably registered by the scienceplots import above — confirm
# for the installed version).
print(plt.style.available)
# Initial look for the notebook. A later plotting cell switches to ['science', 'ieee'] and
# sets text.usetex=True, which needs a working LaTeX installation.
plt.style.use(['science', 'no-latex'])  # Optional: replace 'no-latex' with 'ieee' if using LaTeX

In [None]:
# Read the two result tables (paths are relative to the notebook's working directory)
element_accuracy_df = pd.read_csv('element_accuracy_permutations.csv')
exact_match_df = pd.read_csv('exact_match_permutations.csv')

# Quick sanity check: table sizes first ...
print(f"Element Accuracy DataFrame shape: {element_accuracy_df.shape}")
print(f"Exact Match DataFrame shape: {exact_match_df.shape}")

# ... then at most five metric column names from each table
element_metric_cols = list(filter(lambda name: 'avg_element_accuracy' in name, element_accuracy_df.columns))
print("\nElement Accuracy columns:", element_metric_cols[:5], sep="\n")

exact_metric_cols = list(filter(lambda name: 'avg_exact_match' in name, exact_match_df.columns))
print("\nExact Match columns:", exact_metric_cols[:5], sep="\n")

In [None]:
def extract_hyperparams(col_name):
    """Return the integer hyperparameter encoded in a column name, or None.

    The method keywords are checked in a fixed order and only the first keyword
    found in col_name decides which pattern is applied:

    * 'maj-voting' -> digits following 'agents'
    * 'coa'        -> digits following 'chunk'
    * 'prefix-sum' -> digits following 'b'

    None is returned when no keyword is present, or when the pattern belonging
    to the first matching keyword does not occur in the name.
    """
    keyword_patterns = (
        ('maj-voting', r'agents(\d+)'),
        ('coa', r'chunk(\d+)'),
        ('prefix-sum', r'b(\d+)'),
    )
    for keyword, pattern in keyword_patterns:
        if keyword not in col_name:
            continue
        # First keyword hit is final: never fall through to a later pattern.
        found = re.search(pattern, col_name)
        if found is None:
            return None
        return int(found.group(1))
    return None

def find_best_performance(df, metric_type='avg_element_accuracy'):
    """Pick, per method and per row of df, the highest score among that method's columns.

    Columns whose name contains 'MIN', 'MAX' or 'step' are ignored. For each of the
    methods 'prefix-sum', 'maj-voting' and 'coa', the candidate columns are those
    whose name contains both the method name and metric_type. Within a row, values
    that are NaN, empty strings or not convertible to float are skipped, as are
    columns for which extract_hyperparams returns None. If two candidate columns
    yield the same hyperparameter, the later column's value replaces the earlier one.

    Args:
        df: DataFrame with a 'num_swaps' column plus the metric columns.
        metric_type: Substring identifying the metric columns to consider.

    Returns:
        A tuple (best_performance, best_hyperparams, num_swaps) where
        best_performance maps method -> list with one score per row (0 when the
        row has no usable value), best_hyperparams maps method -> list with the
        winning hyperparameter per row (None when there is none), and num_swaps
        is df['num_swaps'].
    """
    # Drop MIN/MAX/step columns before looking for metric columns
    excluded_markers = ('MIN', 'MAX', 'step')
    kept_columns = [
        name for name in df.columns
        if not any(marker in name for marker in excluded_markers)
    ]
    filtered_df = df[kept_columns]

    methods = ['prefix-sum', 'maj-voting', 'coa']
    num_swaps = df['num_swaps']

    best_performance = {}
    best_hyperparams = {}

    for method in methods:
        candidate_columns = [
            name for name in filtered_df.columns
            if method in name and metric_type in name
        ]
        scores = best_performance.setdefault(method, [])
        settings = best_hyperparams.setdefault(method, [])

        for _, record in filtered_df.iterrows():
            # hyperparameter -> score for this method in this row
            by_setting = {}
            for name in candidate_columns:
                raw = record[name]
                if pd.isna(raw) or raw == '':
                    continue  # missing or empty cell
                try:
                    value = float(raw)
                    setting = extract_hyperparams(name)
                except (ValueError, TypeError):
                    continue  # unparsable cell
                if setting is not None:
                    by_setting[setting] = value

            if by_setting:
                # On ties the first-inserted hyperparameter wins
                winner = max(by_setting, key=by_setting.get)
                scores.append(by_setting[winner])
                settings.append(winner)
            else:
                scores.append(0)
                settings.append(None)

    return best_performance, best_hyperparams, num_swaps

In [None]:
# Find best element accuracy performance
best_element_acc, best_element_hyperparams, num_swaps = find_best_performance(element_accuracy_df, 'avg_element_accuracy')

# Name of the hyperparameter tuned for each method, used to label the report lines.
# Looked up once per method instead of being recomputed for every row.
param_names = {'prefix-sum': 'branching_factor', 'maj-voting': 'num_agents', 'coa': 'chunk_size'}

print("Best Element Accuracy Performance:")
print("=" * 50)
for method in ['prefix-sum', 'maj-voting', 'coa']:
    print(f"\n{method.upper().replace('-', ' ')}:")
    param_name = param_names[method]
    # The three sequences are parallel: one entry per row of the input table.
    for swaps, acc, hyperparam in zip(num_swaps, best_element_acc[method], best_element_hyperparams[method]):
        if hyperparam is None:
            # find_best_performance stores None when the row had no usable value for this method
            print(f"  {swaps:2d} swaps: No data available")
            continue
        print(f"  {swaps:2d} swaps: {param_name}={hyperparam:2d}, accuracy={acc:.3f}")

In [None]:
# Find best exact match performance
# NOTE(review): this rebinds num_swaps to the exact-match table's column. Later cells use it to
# filter the element-accuracy results as well, which assumes both CSVs list the same swap
# counts in the same row order — confirm.
best_exact_match, best_exact_hyperparams, num_swaps = find_best_performance(exact_match_df, 'avg_exact_match')

# Name of the hyperparameter tuned for each method, used to label the report lines.
# Looked up once per method instead of being recomputed for every row.
param_names = {'prefix-sum': 'branching_factor', 'maj-voting': 'num_agents', 'coa': 'chunk_size'}

print("Best Exact Match Performance:")
print("=" * 50)
for method in ['prefix-sum', 'maj-voting', 'coa']:
    print(f"\n{method.upper().replace('-', ' ')}:")
    param_name = param_names[method]
    # The three sequences are parallel: one entry per row of the input table.
    for swaps, acc, hyperparam in zip(num_swaps, best_exact_match[method], best_exact_hyperparams[method]):
        if hyperparam is None:
            # find_best_performance stores None when the row had no usable value for this method
            print(f"  {swaps:2d} swaps: No data available")
            continue
        print(f"  {swaps:2d} swaps: {param_name}={hyperparam:2d}, accuracy={acc:.3f}")

In [None]:
# Grouped bar charts: best element accuracy and best exact match per agent type
plt.style.use(['science', 'ieee'])  # Clean scientific styling

# Publication-style fonts; text.usetex=True routes all text through LaTeX
publication_rc = {
    'text.usetex': True,
    'font.family': 'serif',
    'font.size': 8,
    'axes.titlesize': 9,
    'axes.labelsize': 8,
    'legend.fontsize': 7,
    'xtick.labelsize': 7,
    'ytick.labelsize': 7
}
plt.rcParams.update(publication_rc)

# Also read by the later line-plot and summary cells
methods = ['prefix-sum', 'maj-voting', 'coa']
colors = ['#4C72B0', '#55A868', '#C44E52']

# Only rows with at most 16 swaps are plotted
max_swaps = 16
mask = num_swaps <= max_swaps
filtered_num_swaps = num_swaps[mask]


def _rows_within_limit(values):
    """Return the entries of values whose row position has num_swaps <= max_swaps."""
    return [values[pos] for pos in range(len(values)) if num_swaps.iloc[pos] <= max_swaps]


filtered_best_element_acc = {method: _rows_within_limit(best_element_acc[method]) for method in methods}
filtered_best_exact_match = {method: _rows_within_limit(best_exact_match[method]) for method in methods}

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 2.5))

bar_width = 0.25
x_pos = range(len(filtered_num_swaps))

# One descriptor per panel: (axis, per-method scores, y label, title)
panels = (
    (ax1, filtered_best_element_acc,
     r'\textbf{Element Accuracy}', r'\textbf{Best Element Accuracy by Agent Type}'),
    (ax2, filtered_best_exact_match,
     r'\textbf{Exact Match Accuracy}', r'\textbf{Best Exact Match Accuracy by Agent Type}'),
)

for ax, scores_by_method, y_label, panel_title in panels:
    for i, method in enumerate(methods):
        heights = scores_by_method[method]
        # Binomial standard error with a hard-coded sample size of 100; zero for empty bars
        error_bars = [np.sqrt(acc * (1 - acc) / 100) if acc > 0 else 0 for acc in heights]

        # Shift each method's bars by one bar width so the three sit side by side
        ax.bar(
            [p + i * bar_width for p in x_pos],
            heights,
            width=bar_width,
            label=method.replace('-', ' ').title(),
            color=colors[i],
            yerr=error_bars,
            capsize=2,
            error_kw={'linewidth': 0.5, 'capthick': 0.5}
        )

    # Ticks sit under the middle bar of each group of three
    ax.set_xticks([p + bar_width for p in x_pos])
    ax.set_xticklabels(filtered_num_swaps)
    ax.set_xlabel(r'\textbf{Number of Swaps}')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend(frameon=False, loc='upper right')
    ax.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.7)

fig.tight_layout()
fig.savefig("permutations_analysis.pdf", bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Create line plots with shaded error regions
import matplotlib.pyplot as plt
import numpy as np

# Draw with interactive mode off. Remember the previous mode so it can be restored afterwards
# instead of being forced on, and restore it even if plotting raises.
was_interactive = plt.isinteractive()
plt.ioff()
plt.close('all')  # Close any existing figures

# Sample size used for the binomial standard-error band (hard-coded, not read from the data)
n_trials = 100

try:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 2.5))

    # Convert to numpy arrays for proper matplotlib handling and filter to max 16 swaps
    max_swaps = 16
    mask = num_swaps <= max_swaps
    filtered_x_vals = np.array(list(num_swaps[mask]))

    # One descriptor per panel: (axis, per-method scores, y label, legend location)
    panels = (
        (ax1, best_element_acc, r'\textbf{Element Accuracy}', 'lower right'),
        (ax2, best_exact_match, r'\textbf{Exact Match Accuracy}', 'upper right'),
    )

    for ax, scores_by_method, y_label, legend_loc in panels:
        for i, method in enumerate(methods):
            # Keep only the rows with num_swaps <= max_swaps, matching filtered_x_vals
            y_vals = np.array([acc for j, acc in enumerate(scores_by_method[method]) if num_swaps.iloc[j] <= max_swaps])
            # Standard error assuming a binomial distribution; zero where the score is zero
            error_bars = np.array([np.sqrt(acc * (1 - acc) / n_trials) if acc > 0 else 0 for acc in y_vals])

            ax.plot(
                filtered_x_vals,
                y_vals,
                label=method.replace('-', ' ').title(),
                color=colors[i],
                marker='o',
                markersize=4,
                linewidth=1.5
            )

            # Shaded error region, clipped to the valid [0, 1] range
            lower_bound = np.maximum(0, y_vals - error_bars)
            upper_bound = np.minimum(1, y_vals + error_bars)

            ax.fill_between(
                filtered_x_vals,
                lower_bound,
                upper_bound,
                color=colors[i],
                alpha=0.2
            )

        ax.set_xlabel(r'\textbf{Number of Swaps}')
        ax.set_ylabel(y_label)
        ax.legend(frameon=False, loc=legend_loc)
        ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
        ax.set_ylim(0, 1)

    fig.tight_layout()
    fig.savefig("permutations_line_plots.pdf", bbox_inches='tight')
    plt.show()
    plt.close(fig)  # Explicitly close the figure
finally:
    # Interactive mode was switched off above; turn it back on only if it was on before
    if was_interactive:
        plt.ion()

In [None]:
# Summary tables of the best setting per method, limited to rows with at most 16 swaps
max_swaps = 16


def _summary_rows(hyperparams_by_method, scores_by_method, score_column):
    """Build one record per (method, row) with num_swaps <= max_swaps and a recorded best hyperparameter.

    Records are ordered by method (in the order of methods) and then by row position.
    score_column is the key under which the row's best score is stored.
    """
    return [
        {
            'Method': method.replace('-', '_'),
            'Num_Swaps': swaps,
            'Best_Hyperparam': hyperparams_by_method[method][i],
            score_column: scores_by_method[method][i],
        }
        for method in methods
        for i, swaps in enumerate(num_swaps)
        if swaps <= max_swaps and hyperparams_by_method[method][i] is not None
    ]


element_summary = _summary_rows(best_element_hyperparams, best_element_acc, 'Element_Accuracy')
exact_match_summary = _summary_rows(best_exact_hyperparams, best_exact_match, 'Exact_Match')

element_summary_df = pd.DataFrame(element_summary)
exact_match_summary_df = pd.DataFrame(exact_match_summary)

# Heading, rule and table for each metric, one item per output line
print(
    "\nElement Accuracy Summary (up to 16 swaps):",
    "=" * 50,
    element_summary_df.to_string(index=False),
    sep="\n",
)

print(
    "\n\nExact Match Summary (up to 16 swaps):",
    "=" * 50,
    exact_match_summary_df.to_string(index=False),
    sep="\n",
)