In [None]:
# Alternative 4: Bar plot showing performance at key hop values with 200 and 100 facts using hatching
# This creates 4 bars for each hop: Majority Voting (100), Iterative Query (100),
# Majority Voting (200), Iterative Query (200)
from matplotlib.patches import Patch  # legend proxy artists; imported once rather than inside the legend loop

key_hops = [4, 8, 12]  # Focus on these hop counts for cleaner visualization

# Update font sizes for 12pt axes and titles with Times New Roman
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': ['Times'],  # Use Times New Roman
    'text.latex.preamble': r'\usepackage{times}',  # LaTeX Times package
    'font.size': 10,
    'axes.titlesize': 12,  # 12pt for titles
    'axes.labelsize': 12,  # 12pt for axes labels
    'legend.fontsize': 10,  # Keep legend at 10pt
    'xtick.labelsize': 9,
    'ytick.labelsize': 9
})

# Focus on 100 and 200 facts for comparison
fact_counts_new = [100, 200]

# Bar order inside each hop group, and the display name used in the legend and
# the printed table. The names match the 'Majority Voting' / 'Iterative Query'
# labels used by the other bar-plot cells of this notebook.
hatched_methods = ['MajorityVoting', 'IterativeQuery']
method_display_names = {
    'MajorityVoting': 'Majority Voting',
    'IterativeQuery': 'Iterative Query',
}


def _accuracy_at_hop(series_key, hop):
    """Return the accuracy stored in combined_data[series_key] at `hop`.

    Falls back to 0 when the series is absent or has no entry for `hop`, so a
    missing measurement is drawn as an empty (zero-height) bar.
    """
    series = combined_data.get(series_key)
    if series is None or hop not in series['hops']:
        return 0
    return series['accuracy'][series['hops'].index(hop)]


# Create figure
fig, ax = plt.subplots(figsize=(10, 6))

# Extract data for both fact counts at key hop values:
# data_for_plot[num_facts][method] is a list aligned with key_hops.
data_for_plot = {
    num_facts: {
        method: [_accuracy_at_hop(f"{method}_{num_facts}facts", hop) for hop in key_hops]
        for method in hatched_methods
    }
    for num_facts in fact_counts_new
}

# Create grouped bar plot with 4 bars per hop
x = np.arange(len(key_hops))
width = 0.2  # Width of each bar

# Colors and patterns
colors = {'MajorityVoting': '#4C72B0', 'IterativeQuery': '#55A868'}
patterns = {100: '', 200: '///'}  # No pattern for 100 facts, hatching for 200 facts

# Draw the bars and build the matching legend handles in one pass so the bar
# styling and the legend swatches cannot drift apart.
legend_elements = []
for i, num_facts in enumerate(fact_counts_new):
    for j, method in enumerate(hatched_methods):
        # Slot index (i * 2 + j) runs 0..3; subtracting 1.5 centres the four
        # bars on the tick position.
        position = x + (i * 2 + j - 1.5) * width
        alpha = 0.9 if num_facts == 100 else 0.7  # Slightly different alpha for differentiation

        ax.bar(position, data_for_plot[num_facts][method], width,
               color=colors[method], alpha=alpha, edgecolor='black', linewidth=0.5,
               hatch=patterns[num_facts])

        legend_elements.append(Patch(facecolor=colors[method], alpha=alpha,
                                     hatch=patterns[num_facts],
                                     edgecolor='black', linewidth=0.5,
                                     label=f"{method_display_names[method]} ({num_facts} facts)"))

# Customize the plot
ax.set_xlabel(r'\textbf{Number of Hops}')
ax.set_ylabel(r'\textbf{Accuracy}')
ax.set_title(r'\textbf{K-Hop Reasoning: Performance Comparison with 100 and 200 Facts}')
ax.set_xticks(x)
ax.set_xticklabels(key_hops)

ax.legend(handles=legend_elements, frameon=True, loc='upper right', fontsize=10,
          fancybox=True, shadow=True, framealpha=0.95,
          edgecolor='black', facecolor='white')

ax.grid(True, axis='y', linestyle='--', alpha=0.7)
ax.set_ylim(0, 1.05)

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()

# Save the plot to figures folder
plt.savefig("figures/khop_100_200_facts_comparison_hatched.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Print the data values for reference
print("Data used in the plot:")
print("=" * 50)
for hop_idx, hop in enumerate(key_hops):
    print(f"Hop {hop}:")
    for num_facts in fact_counts_new:
        for method in hatched_methods:
            value = data_for_plot[num_facts][method][hop_idx]
            print(f"  {method_display_names[method]} ({num_facts} facts): {value:.3f}")
    print()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import scienceplots
# Print every style name matplotlib currently knows about, presumably to check
# that the 'science' styles requested below are available.
print(plt.style.available)
# NOTE(review): later plotting cells switch to ['science', 'ieee'] and enable
# 'text.usetex', so a LaTeX install is still needed further down the notebook.
plt.style.use(['science', 'no-latex'])  # Optional: replace 'no-latex' with 'ieee' if using LaTeX

# K-Hop Reasoning Task Analysis

This notebook analyzes the performance of different agent types on the k-hop reasoning task, plotting accuracy vs number of hops with error bars and combining related runs into continuous lines.

# K-Hop Reasoning Task Analysis

This notebook analyzes the performance of different agent types on the k-hop reasoning task, plotting accuracy vs number of hops with error bars and combining related runs into continuous lines.

In [None]:
# Read the per-hop average accuracies and their standard errors from the data/
# folder (these replace the older k_hop_accuracy.csv / k_hop_se.csv exports).
avg_acc_df = pd.read_csv('data/khop_avg_acc.csv')
se_df = pd.read_csv('data/khop_se.csv')

# Quick sanity check of what was loaded: frame shapes first ...
print("Average Accuracy DataFrame shape:", avg_acc_df.shape)
print("Standard Error DataFrame shape:", se_df.shape)

# ... then the first five metric columns of each frame.
print("\nAverage Accuracy columns:")
acc_metric_columns = [name for name in avg_acc_df.columns if 'avg_accuracy' in name]
print(acc_metric_columns[:5])

print("\nStandard Error columns:")
se_metric_columns = [name for name in se_df.columns if 'se_accuracy' in name]
print(se_metric_columns[:5])

In [None]:
# Read the per-hop average accuracies and their standard errors from the
# working directory (these replace the older k_hop_accuracy.csv / k_hop_se.csv
# exports).
# NOTE(review): the preceding cell loads the same two frames from 'data/';
# this cell overwrites them, so confirm which location is the current one.
avg_acc_df = pd.read_csv('khop_avg_acc.csv')
se_df = pd.read_csv('khop_se.csv')

# Quick sanity check of what was loaded: frame shapes first ...
print("Average Accuracy DataFrame shape:", avg_acc_df.shape)
print("Standard Error DataFrame shape:", se_df.shape)

# ... then the first five metric columns of each frame.
print("\nAverage Accuracy columns:")
acc_metric_columns = [name for name in avg_acc_df.columns if 'avg_accuracy' in name]
print(acc_metric_columns[:5])

print("\nStandard Error columns:")
se_metric_columns = [name for name in se_df.columns if 'se_accuracy' in name]
print(se_metric_columns[:5])

In [None]:
# Preview the first rows of the accuracy frame; the next cell parses
# experiment settings out of its column names.
avg_acc_df.head()

In [None]:
def extract_experiment_info(col_name):
    """Parse the experiment settings encoded in a results column name.

    Args:
        col_name: Column label to inspect.

    Returns:
        A tuple ``(agent_type, num_facts, hop_range)``:

        * ``agent_type`` is ``'MajorityVoting'`` or ``'IterativeQuery'``,
          chosen by whether the name contains ``MajorityVotingAgents`` or
          ``IterativeQueryAgents``.
        * ``num_facts`` is the integer following the first ``facts<digits>``
          token, or ``None`` if there is none.
        * ``hop_range`` is ``(start, end)`` taken from ``hops<start>-<end>``,
          or ``None`` if there is none.

        ``(None, None, None)`` is returned when the name matches neither
        agent family.
    """
    if 'MajorityVotingAgents' in col_name:
        agent_type = 'MajorityVoting'
    elif 'IterativeQueryAgents' in col_name:
        agent_type = 'IterativeQuery'
    else:
        return None, None, None

    # Total number of facts. The pattern needs digits directly after "facts",
    # so a "factsperworker<N>" token is skipped and the plain "facts<N>" token
    # is the one that matches.
    facts_match = re.search(r'facts(\d+)', col_name)
    num_facts = int(facts_match.group(1)) if facts_match else None

    # Inclusive hop range covered by this run, written as "hops<start>-<end>".
    hops_match = re.search(r'hops(\d+)-(\d+)', col_name)
    hop_range = (int(hops_match.group(1)), int(hops_match.group(2))) if hops_match else None

    return agent_type, num_facts, hop_range

def group_related_experiments(df, metric_type='avg_accuracy'):
    """Bucket metric columns by agent type and fact count.

    Columns whose name contains 'MIN', 'MAX' or 'step' are dropped first. Every
    remaining column whose name contains `metric_type` is parsed with
    extract_experiment_info; columns that yield an agent type, a fact count and
    a hop range are grouped under the key "<agent_type>_<num_facts>facts".

    Returns:
        (experiments, filtered_df): the mapping from group key to a list of
        run descriptors (keys 'column', 'hop_range', 'agent_type',
        'num_facts'), and the frame restricted to the retained columns.
    """
    excluded_markers = ['MIN', 'MAX', 'step']
    retained_columns = [
        name for name in df.columns
        if not any(marker in name for marker in excluded_markers)
    ]
    filtered_df = df[retained_columns]

    experiments = {}
    for column in filtered_df.columns:
        if metric_type not in column:
            continue

        agent_type, num_facts, hop_range = extract_experiment_info(column)
        if not (agent_type and num_facts and hop_range):
            continue  # not a recognised experiment column

        # Runs sharing an agent type and fact count belong on one line.
        group_key = f"{agent_type}_{num_facts}facts"
        experiments.setdefault(group_key, []).append({
            'column': column,
            'hop_range': hop_range,
            'agent_type': agent_type,
            'num_facts': num_facts
        })

    return experiments, filtered_df

# Bucket the accuracy and standard-error columns into per-line groups.
acc_experiments, filtered_acc_df = group_related_experiments(avg_acc_df, metric_type='avg_accuracy')
se_experiments, filtered_se_df = group_related_experiments(se_df, metric_type='se_accuracy')

# Report every accuracy group together with the hop range of each run in it.
print("Found experiment groups:")
for group_name, group_runs in acc_experiments.items():
    print(f"  {group_name}: {len(group_runs)} experiments")
    for run in group_runs:
        first_hop, last_hop = run['hop_range'][0], run['hop_range'][1]
        print(f"    - Hops {first_hop}-{last_hop}")

In [None]:
def combine_experiments_into_lines(experiments, acc_df, se_df, num_hops):
    """Combine related experiments into continuous lines.

    Args:
        experiments: Mapping of group key to a list of run descriptors, each a
            dict with 'column', 'hop_range', 'agent_type' and 'num_facts'.
        acc_df: Frame holding the accuracy columns named by each 'column'.
        se_df: Frame holding the matching standard-error columns. The column
            name is the accuracy name with 'avg_accuracy' replaced by
            'se_accuracy'.
        num_hops: Hop count for each row position; ``num_hops[i]`` describes
            row ``i`` of both frames.

    Returns:
        Mapping of group key to a dict with the parallel lists 'hops',
        'accuracy' and 'se', plus 'agent_type' and 'num_facts' taken from the
        run with the lowest hop-range start. Groups with no usable data point
        are omitted. A missing or empty standard error is recorded as 0.0.
        A row whose accuracy or standard error cannot be converted to float
        is skipped as a whole, so the three lists always stay the same length.
    """
    combined_data = {}

    for exp_key, exp_list in experiments.items():
        # Order runs by hop-range start. sorted() is used instead of
        # list.sort() so the caller's list is not reordered as a side effect.
        ordered_runs = sorted(exp_list, key=lambda run: run['hop_range'][0])

        combined_hops = []
        combined_acc = []
        combined_se = []

        for exp in ordered_runs:
            col_name = exp['column']
            se_col_name = col_name.replace('avg_accuracy', 'se_accuracy')
            has_acc_col = col_name in acc_df.columns
            has_se_col = se_col_name in se_df.columns

            hop_start, hop_end = exp['hop_range']

            # Only rows whose hop count lies inside this run's range belong to it.
            for i, hop in enumerate(num_hops):
                if not (hop_start <= hop <= hop_end):
                    continue

                acc_val = acc_df.iloc[i][col_name] if has_acc_col else None
                se_val = se_df.iloc[i][se_col_name] if has_se_col else None

                if pd.notna(acc_val) and acc_val != '':
                    # Convert both values before touching the output lists: a
                    # failed conversion must not leave 'hops' longer than
                    # 'accuracy' or 'se'.
                    try:
                        acc_float = float(acc_val)
                        se_float = float(se_val) if pd.notna(se_val) and se_val != '' else 0.0
                    except (ValueError, TypeError):
                        continue

                    combined_hops.append(hop)
                    combined_acc.append(acc_float)
                    combined_se.append(se_float)

        if combined_hops:  # Only add if we have data
            combined_data[exp_key] = {
                'hops': combined_hops,
                'accuracy': combined_acc,
                'se': combined_se,
                'agent_type': ordered_runs[0]['agent_type'],
                'num_facts': ordered_runs[0]['num_facts']
            }

    return combined_data

# Hop count of each row, in row order.
num_hops = filtered_acc_df['num_hops'].tolist()

# Stitch the grouped runs into one series per (agent type, fact count).
combined_data = combine_experiments_into_lines(
    acc_experiments,
    filtered_acc_df,
    filtered_se_df,
    num_hops,
)

# Report how many points each combined series holds and the hop span it covers.
print(f"Combined {len(combined_data)} experiment groups into continuous lines:")
for series_name, series in combined_data.items():
    hop_values = series['hops']
    print(f"  {series_name}: {len(hop_values)} data points (hops {min(hop_values)}-{max(hop_values)})")

In [None]:
# Flatten every combined series into one record per (experiment, hop) pair.
summary_data = [
    {
        'Experiment': series_name,
        'Agent_Type': series['agent_type'],
        'Num_Facts': series['num_facts'],
        'Num_Hops': hop_value,
        'Accuracy': series['accuracy'][position],
        'Standard_Error': series['se'][position]
    }
    for series_name, series in combined_data.items()
    for position, hop_value in enumerate(series['hops'])
]

# Build the frame in a single step and print it in full as plain text.
summary_df = pd.DataFrame(summary_data)
print("K-Hop Reasoning Summary:")
print("=" * 80)
print(summary_df.to_string(index=False))

In [None]:
# Create small square plots comparing Majority Voting vs Iterative Query for a fixed number of facts
plt.style.use(['science', 'ieee'])  # Clean scientific styling

# LaTeX font settings for publication quality
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.size': 8,
    'axes.titlesize': 9,
    'axes.labelsize': 8,
    'legend.fontsize': 7,
    'xtick.labelsize': 7,
    'ytick.labelsize': 7
})

# Target fact counts for comparison
fact_counts = [100, 200, 500]

# Create figure with 3 small subplots, one per fact count
fig, axes = plt.subplots(1, 3, figsize=(9, 2.8))

# Colors for the two agent types
colors = {'MajorityVoting': '#4C72B0', 'IterativeQuery': '#55A868'}

# Legend text per agent type, in plotting order. An explicit mapping is used
# so both names get a space, matching the 'Majority Voting' / 'Iterative Query'
# labels used by the bar-plot cells of this notebook.
agent_labels = {'MajorityVoting': 'Majority Voting', 'IterativeQuery': 'Iterative Query'}

for i, num_facts in enumerate(fact_counts):
    ax = axes[i]

    # Plot both agent types for this fact count
    for agent_type, legend_label in agent_labels.items():
        data = combined_data.get(f"{agent_type}_{num_facts}facts")
        if data is None:
            continue  # no series recorded for this agent / fact count

        # Use every available data point of the series
        x_vals = np.array(data['hops'])
        y_vals = np.array(data['accuracy'])
        se_vals = np.array(data['se'])
        if len(x_vals) == 0:
            continue  # nothing to draw

        color = colors[agent_type]

        # Plot line with markers
        ax.plot(
            x_vals,
            y_vals,
            label=legend_label,
            color=color,
            marker='o',
            markersize=4,
            linewidth=1.5
        )

        # Shaded +/- one standard error band, clipped to the valid accuracy range [0, 1]
        lower_bound = np.maximum(0, y_vals - se_vals)
        upper_bound = np.minimum(1, y_vals + se_vals)
        ax.fill_between(
            x_vals,
            lower_bound,
            upper_bound,
            color=color,
            alpha=0.2
        )

    # Customize subplot
    ax.set_xlabel(r'\textbf{Number of Hops}')
    if i == 0:
        ax.set_ylabel(r'\textbf{Accuracy}')  # y label only on the leftmost panel

    ax.set_title(r'\textbf{' + f'{num_facts} Facts' + '}')
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
    ax.set_ylim(0, 1)
    ax.set_xlim(1.5, 20.5)  # Show full range from 2 to 20 hops

    # Set x-ticks to even numbers from 2 to 20
    ax.set_xticks(range(2, 21, 2))

    # Add legend only to the rightmost plot
    if i == len(fact_counts) - 1:
        ax.legend(frameon=False, loc='upper right')

plt.tight_layout()
plt.savefig("figures/khop_accuracy_comparison_small_plots_exaone.pdf", bbox_inches='tight')
plt.show()
plt.close(fig)

In [None]:
# Alternative visualizations for conference paper (ICLR style)

# Alternative 1: Bar plot showing average accuracy across all hops for each method and fact count
plt.style.use(['science', 'ieee'])
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.size': 10,
    'axes.titlesize': 11,
    'axes.labelsize': 10,
    'legend.fontsize': 9,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9
})

fact_counts = [100, 200, 500]
methods = ['MajorityVoting', 'IterativeQuery']
colors = {'MajorityVoting': '#4C72B0', 'IterativeQuery': '#55A868'}

# Calculate average accuracy across all hops for each method and fact count.
# Both dicts map method -> list aligned with fact_counts; a missing series
# contributes 0 for both the mean and the error.
avg_accuracies = {}
std_errors = {}

for method in methods:
    avg_accuracies[method] = []
    std_errors[method] = []

    for num_facts in fact_counts:
        data = combined_data.get(f"{method}_{num_facts}facts")
        if data is None:
            avg_accuracies[method].append(0)
            std_errors[method].append(0)
            continue

        per_hop_accuracy = data['accuracy']
        # Mean accuracy across all hops
        mean_acc = np.mean(per_hop_accuracy)
        # Spread of the per-hop accuracies divided by sqrt(number of hops).
        # NOTE(review): np.std defaults to ddof=0, and this measures variation
        # across hop counts rather than the run-level error stored in
        # data['se'] -- confirm this is the intended error bar.
        se_mean = np.std(per_hop_accuracy) / np.sqrt(len(per_hop_accuracy))
        avg_accuracies[method].append(mean_acc)
        std_errors[method].append(se_mean)

# Create bar plot
fig, ax = plt.subplots(figsize=(6, 4))
x = np.arange(len(fact_counts))
width = 0.35

bars1 = ax.bar(x - width/2, avg_accuracies['MajorityVoting'], width,
               label='Majority Voting', color=colors['MajorityVoting'],
               yerr=std_errors['MajorityVoting'], capsize=5, alpha=0.9,
               edgecolor='black', linewidth=0.5)

bars2 = ax.bar(x + width/2, avg_accuracies['IterativeQuery'], width,
               label='Iterative Query', color=colors['IterativeQuery'],
               yerr=std_errors['IterativeQuery'], capsize=5, alpha=0.9,
               edgecolor='black', linewidth=0.5)

ax.set_xlabel(r'\textbf{Number of Facts}')
ax.set_ylabel(r'\textbf{Average Accuracy}')
ax.set_title(r'\textbf{K-Hop Reasoning: Average Performance}')
ax.set_xticks(x)
ax.set_xticklabels(fact_counts)
ax.legend(frameon=True, loc='upper right', fancybox=True, shadow=True)
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
ax.set_ylim(0, 1)

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.savefig("figures/khop_average_performance_bars.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Print the values for reference ("±" replaces a mis-encoded character in the original output string)
print("Average Performance Results:")
for method in methods:
    print(f"\n{method}:")
    for i, num_facts in enumerate(fact_counts):
        print(f"  {num_facts} facts: {avg_accuracies[method][i]:.3f} ± {std_errors[method][i]:.3f}")

In [None]:
# Alternative 2: Bar plot showing performance at key hop values (4, 8, 12, 16, 20)
# This focuses on specific points to reduce noise.
# Relies on `fact_counts`, `colors` and `combined_data` defined by earlier cells.

key_hops = [4, 8, 12, 16, 20]  # Focus on these hop counts for cleaner visualization

# Update font sizes for 12pt axes and titles with Times New Roman
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': ['Times'],  # Use Times New Roman
    'text.latex.preamble': r'\usepackage{times}',  # LaTeX Times package
    'font.size': 10,
    'axes.titlesize': 12,  # 12pt for titles
    'axes.labelsize': 12,  # 12pt for axes labels
    'legend.fontsize': 10,  # Keep legend at 10pt
    'xtick.labelsize': 9,
    'ytick.labelsize': 9
})


def _key_hop_series(agent_type, num_facts, hops):
    """Return (accuracies, standard_errors) of one series sampled at `hops`.

    Both lists are aligned with `hops`. A hop without a recorded value, or a
    series missing from combined_data altogether, contributes 0 to both lists.
    """
    series = combined_data.get(f"{agent_type}_{num_facts}facts")
    accuracies, standard_errors = [], []
    for hop in hops:
        if series is not None and hop in series['hops']:
            hop_idx = series['hops'].index(hop)
            accuracies.append(series['accuracy'][hop_idx])
            standard_errors.append(series['se'][hop_idx])
        else:
            accuracies.append(0)
            standard_errors.append(0)
    return accuracies, standard_errors


def _draw_key_hop_bars(ax, num_facts, hops, show_ylabel, show_legend):
    """Draw the Majority Voting / Iterative Query grouped bars for one fact count on `ax`."""
    maj_accs, maj_ses = _key_hop_series('MajorityVoting', num_facts, hops)
    iter_accs, iter_ses = _key_hop_series('IterativeQuery', num_facts, hops)

    x = np.arange(len(hops))
    width = 0.35

    # Bars carry a legend label only on the panel that actually shows a legend.
    ax.bar(x - width/2, maj_accs, width,
           label='Majority Voting' if show_legend else "",
           color=colors['MajorityVoting'],
           yerr=maj_ses, capsize=3, alpha=0.9,
           edgecolor='black', linewidth=0.5)
    ax.bar(x + width/2, iter_accs, width,
           label='Iterative Query' if show_legend else "",
           color=colors['IterativeQuery'],
           yerr=iter_ses, capsize=3, alpha=0.9,
           edgecolor='black', linewidth=0.5)

    ax.set_xlabel(r'\textbf{Number of Hops}')
    if show_ylabel:
        ax.set_ylabel(r'\textbf{Accuracy}')

    ax.set_title(r'\textbf{' + f'{num_facts} Facts' + '}')
    ax.set_xticks(x)
    ax.set_xticklabels(hops)

    if show_legend:
        legend = ax.legend(frameon=True, loc='upper right', fontsize=10,
                           fancybox=True, shadow=True, framealpha=0.95,
                           edgecolor='black', facecolor='white')
        legend.get_frame().set_linewidth(0.8)

    ax.grid(True, axis='y', linestyle='--', alpha=0.7)
    ax.set_ylim(0, 1.05)

    # Clean up spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


# Combined figure: one panel per fact count; y label on the first panel only,
# legend on the third (rightmost) panel only.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for subplot_idx, num_facts in enumerate(fact_counts):
    _draw_key_hop_bars(axes[subplot_idx], num_facts, key_hops,
                       show_ylabel=(subplot_idx == 0),
                       show_legend=(subplot_idx == 2))

plt.tight_layout()

# Save the combined plot
plt.savefig("figures/khop_key_hops_comparison.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Now create and save individual plots: same bars, one stand-alone figure per
# fact count, each with its own y label; the legend stays on the third one.
for subplot_idx, num_facts in enumerate(fact_counts):
    fig_individual, ax_individual = plt.subplots(figsize=(4, 4))

    _draw_key_hop_bars(ax_individual, num_facts, key_hops,
                       show_ylabel=True,
                       show_legend=(subplot_idx == 2))

    plt.tight_layout()

    # Save individual plot
    filename = f"figures/khop_key_hops_{num_facts}facts.pdf"
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    print(f"Saved individual plot: {filename}")
    plt.show()
    plt.close(fig_individual)

In [None]:
# Alternative 2: Bar plot showing performance at key hop values (4, 8, 12, 16, 20)
# This focuses on specific points to reduce noise.
# Relies on `fact_counts`, `colors` and `combined_data` defined by earlier cells.
# NOTE(review): the preceding cell draws the same figures and saves them under
# 'figures/'; this cell saves to the working directory instead. Confirm which
# output location is still wanted.

key_hops = [4, 8, 12, 16, 20]  # Focus on these hop counts for cleaner visualization

# Update font sizes for 12pt axes and titles with Times New Roman
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': ['Times'],  # Use Times New Roman
    'text.latex.preamble': r'\usepackage{times}',  # LaTeX Times package
    'font.size': 10,
    'axes.titlesize': 12,  # 12pt for titles
    'axes.labelsize': 12,  # 12pt for axes labels
    'legend.fontsize': 10,  # Keep legend at 10pt
    'xtick.labelsize': 9,
    'ytick.labelsize': 9
})


def _key_hop_series(agent_type, num_facts, hops):
    """Return (accuracies, standard_errors) of one series sampled at `hops`.

    Both lists are aligned with `hops`. A hop without a recorded value, or a
    series missing from combined_data altogether, contributes 0 to both lists.
    """
    series = combined_data.get(f"{agent_type}_{num_facts}facts")
    accuracies, standard_errors = [], []
    for hop in hops:
        if series is not None and hop in series['hops']:
            hop_idx = series['hops'].index(hop)
            accuracies.append(series['accuracy'][hop_idx])
            standard_errors.append(series['se'][hop_idx])
        else:
            accuracies.append(0)
            standard_errors.append(0)
    return accuracies, standard_errors


def _draw_key_hop_bars(ax, num_facts, hops, show_ylabel, show_legend):
    """Draw the Majority Voting / Iterative Query grouped bars for one fact count on `ax`."""
    maj_accs, maj_ses = _key_hop_series('MajorityVoting', num_facts, hops)
    iter_accs, iter_ses = _key_hop_series('IterativeQuery', num_facts, hops)

    x = np.arange(len(hops))
    width = 0.35

    # Bars carry a legend label only on the panel that actually shows a legend.
    ax.bar(x - width/2, maj_accs, width,
           label='Majority Voting' if show_legend else "",
           color=colors['MajorityVoting'],
           yerr=maj_ses, capsize=3, alpha=0.9,
           edgecolor='black', linewidth=0.5)
    ax.bar(x + width/2, iter_accs, width,
           label='Iterative Query' if show_legend else "",
           color=colors['IterativeQuery'],
           yerr=iter_ses, capsize=3, alpha=0.9,
           edgecolor='black', linewidth=0.5)

    ax.set_xlabel(r'\textbf{Number of Hops}')
    if show_ylabel:
        ax.set_ylabel(r'\textbf{Accuracy}')

    ax.set_title(r'\textbf{' + f'{num_facts} Facts' + '}')
    ax.set_xticks(x)
    ax.set_xticklabels(hops)

    if show_legend:
        legend = ax.legend(frameon=True, loc='upper right', fontsize=10,
                           fancybox=True, shadow=True, framealpha=0.95,
                           edgecolor='black', facecolor='white')
        legend.get_frame().set_linewidth(0.8)

    ax.grid(True, axis='y', linestyle='--', alpha=0.7)
    ax.set_ylim(0, 1.05)

    # Clean up spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)


# Combined figure: one panel per fact count; y label on the first panel only,
# legend on the third (rightmost) panel only.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for subplot_idx, num_facts in enumerate(fact_counts):
    _draw_key_hop_bars(axes[subplot_idx], num_facts, key_hops,
                       show_ylabel=(subplot_idx == 0),
                       show_legend=(subplot_idx == 2))

plt.tight_layout()

# Save the combined plot
plt.savefig("khop_key_hops_comparison.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Now create and save individual plots: same bars, one stand-alone figure per
# fact count, each with its own y label; the legend stays on the third one.
for subplot_idx, num_facts in enumerate(fact_counts):
    fig_individual, ax_individual = plt.subplots(figsize=(4, 4))

    _draw_key_hop_bars(ax_individual, num_facts, key_hops,
                       show_ylabel=True,
                       show_legend=(subplot_idx == 2))

    plt.tight_layout()

    # Save individual plot
    filename = f"khop_key_hops_{num_facts}facts.pdf"
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    print(f"Saved individual plot: {filename}")
    plt.show()
    plt.close(fig_individual)

In [None]:
# Alternative 3: Performance degradation analysis - shows how accuracy changes with increasing complexity
# This visualization tells the story of how methods degrade as the task becomes harder.
#
# Reads from earlier cells: `combined_data` (keyed "<method>_<n>facts"; each entry exposes
# parallel 'hops' and 'accuracy' sequences), `methods`, `fact_counts` and `colors`.

import os  # cell-local import: only needed to create the output directory before saving

# Hop groups whose mean accuracy defines "early" and "late" performance
early_hops = [4, 6]
late_hops = [16, 18, 20]

# Legend/report names. Methods not listed here fall back to the original
# "Voting" -> " Voting" spacing rule, so unknown method names still render.
display_names = {
    'MajorityVoting': 'Majority Voting',
    'IterativeQuery': 'Iterative Query',
}
method_labels = {m: display_names.get(m, m.replace('Voting', ' Voting')) for m in methods}

summary_columns = ['method', 'num_facts', 'early_performance', 'late_performance', 'degradation']
results_summary = []

for num_facts in fact_counts:
    for method in methods:
        key = f"{method}_{num_facts}facts"
        if key not in combined_data:
            # No run recorded for this combination: it is left out of the summary table
            continue
        data = combined_data[key]
        hops = list(data['hops'])

        # Accuracies measured at the early hops (4-6) and the late hops (16-20)
        early_acc = [data['accuracy'][hops.index(hop)] for hop in early_hops if hop in hops]
        late_acc = [data['accuracy'][hops.index(hop)] for hop in late_hops if hop in hops]

        # A hop group with no measured hops is reported as 0
        early_mean = np.mean(early_acc) if early_acc else 0
        late_mean = np.mean(late_acc) if late_acc else 0

        results_summary.append({
            'method': method,
            'num_facts': num_facts,
            'early_performance': early_mean,
            'late_performance': late_mean,
            'degradation': early_mean - late_mean,
        })

# Explicit columns keep the frame usable (and filterable by 'method') even when no key matched
df_summary = pd.DataFrame(results_summary, columns=summary_columns)

# Create a grouped bar plot showing early vs late performance
fig, ax = plt.subplots(figsize=(8, 5))
x_pos = np.arange(len(fact_counts))
bar_width = 0.15

for i, method in enumerate(methods):
    method_data = df_summary[df_summary['method'] == method]

    # Align bar heights with `fact_counts` so a missing method/fact-count combination
    # produces a zero-height bar instead of a shape mismatch inside ax.bar
    early_by_facts = dict(zip(method_data['num_facts'], method_data['early_performance']))
    late_by_facts = dict(zip(method_data['num_facts'], method_data['late_performance']))
    early_heights = [early_by_facts.get(n, 0) for n in fact_counts]
    late_heights = [late_by_facts.get(n, 0) for n in fact_counts]

    group_offset = x_pos + i * bar_width * 2

    # Plot early performance
    ax.bar(group_offset - bar_width / 2, early_heights,
           bar_width, label=f'{method_labels[method]} (Early)',
           color=colors[method], alpha=0.9, edgecolor='black', linewidth=0.5)

    # Plot late performance with hatching to distinguish
    ax.bar(group_offset + bar_width / 2, late_heights,
           bar_width, label=f'{method_labels[method]} (Late)',
           color=colors[method], alpha=0.6, edgecolor='black', linewidth=0.5, hatch='//')

ax.set_xlabel(r'\textbf{Number of Facts}')
ax.set_ylabel(r'\textbf{Accuracy}')
ax.set_title(r'\textbf{Performance Comparison: Early Hops (4-6) vs Late Hops (16-20)}')
# Bar centres of a group run from -bar_width/2 to (2*len(methods) - 1.5)*bar_width,
# so the middle of the group sits at (len(methods) - 1) * bar_width
ax.set_xticks(x_pos + (len(methods) - 1) * bar_width)
ax.set_xticklabels(fact_counts)
ax.legend(frameon=True, loc='upper right', ncol=2, fontsize=8)
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
ax.set_ylim(0, 1.05)

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists first
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/khop_early_vs_late_performance.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Print summary statistics
print("\nPerformance Analysis Summary:")
print("=" * 60)
print("Early Hops (4-6) vs Late Hops (16-20)")
print("=" * 60)

# itertuples keeps each column's dtype, so num_facts stays an integer for the ':3d' format
for row in df_summary.itertuples(index=False):
    method_name = method_labels.get(row.method, row.method)
    print(f"{method_name:15s} | {row.num_facts:3d} facts | Early: {row.early_performance:.3f} | Late: {row.late_performance:.3f} | Degradation: {row.degradation:.3f}")

# Calculate average degradation by method
print("\nAverage Performance Degradation:")
for method in methods:
    method_data = df_summary[df_summary['method'] == method]
    avg_degradation = method_data['degradation'].mean()
    print(f"{method_labels[method]:15s}: {avg_degradation:.3f}")

In [None]:
# Alternative 3: Performance degradation analysis - shows how accuracy changes with increasing complexity
# This visualization tells the story of how methods degrade as the task becomes harder.
#
# Reads from earlier cells: `combined_data` (keyed "<method>_<n>facts"; each entry exposes
# parallel 'hops' and 'accuracy' sequences), `methods`, `fact_counts` and `colors`.
# This cell writes the PDF into the current working directory.

# Hop groups whose mean accuracy defines "early" and "late" performance
early_hops = [4, 6]
late_hops = [16, 18, 20]

# Legend/report names. Methods not listed here fall back to the original
# "Voting" -> " Voting" spacing rule, so unknown method names still render.
display_names = {
    'MajorityVoting': 'Majority Voting',
    'IterativeQuery': 'Iterative Query',
}
method_labels = {m: display_names.get(m, m.replace('Voting', ' Voting')) for m in methods}

summary_columns = ['method', 'num_facts', 'early_performance', 'late_performance', 'degradation']
results_summary = []

for num_facts in fact_counts:
    for method in methods:
        key = f"{method}_{num_facts}facts"
        if key not in combined_data:
            # No run recorded for this combination: it is left out of the summary table
            continue
        data = combined_data[key]
        hops = list(data['hops'])

        # Accuracies measured at the early hops (4-6) and the late hops (16-20)
        early_acc = [data['accuracy'][hops.index(hop)] for hop in early_hops if hop in hops]
        late_acc = [data['accuracy'][hops.index(hop)] for hop in late_hops if hop in hops]

        # A hop group with no measured hops is reported as 0
        early_mean = np.mean(early_acc) if early_acc else 0
        late_mean = np.mean(late_acc) if late_acc else 0

        results_summary.append({
            'method': method,
            'num_facts': num_facts,
            'early_performance': early_mean,
            'late_performance': late_mean,
            'degradation': early_mean - late_mean,
        })

# Explicit columns keep the frame usable (and filterable by 'method') even when no key matched
df_summary = pd.DataFrame(results_summary, columns=summary_columns)

# Create a grouped bar plot showing early vs late performance
fig, ax = plt.subplots(figsize=(8, 5))
x_pos = np.arange(len(fact_counts))
bar_width = 0.15

for i, method in enumerate(methods):
    method_data = df_summary[df_summary['method'] == method]

    # Align bar heights with `fact_counts` so a missing method/fact-count combination
    # produces a zero-height bar instead of a shape mismatch inside ax.bar
    early_by_facts = dict(zip(method_data['num_facts'], method_data['early_performance']))
    late_by_facts = dict(zip(method_data['num_facts'], method_data['late_performance']))
    early_heights = [early_by_facts.get(n, 0) for n in fact_counts]
    late_heights = [late_by_facts.get(n, 0) for n in fact_counts]

    group_offset = x_pos + i * bar_width * 2

    # Plot early performance
    ax.bar(group_offset - bar_width / 2, early_heights,
           bar_width, label=f'{method_labels[method]} (Early)',
           color=colors[method], alpha=0.9, edgecolor='black', linewidth=0.5)

    # Plot late performance with hatching to distinguish
    ax.bar(group_offset + bar_width / 2, late_heights,
           bar_width, label=f'{method_labels[method]} (Late)',
           color=colors[method], alpha=0.6, edgecolor='black', linewidth=0.5, hatch='//')

ax.set_xlabel(r'\textbf{Number of Facts}')
ax.set_ylabel(r'\textbf{Accuracy}')
ax.set_title(r'\textbf{Performance Comparison: Early Hops (4-6) vs Late Hops (16-20)}')
# Bar centres of a group run from -bar_width/2 to (2*len(methods) - 1.5)*bar_width,
# so the middle of the group sits at (len(methods) - 1) * bar_width
ax.set_xticks(x_pos + (len(methods) - 1) * bar_width)
ax.set_xticklabels(fact_counts)
ax.legend(frameon=True, loc='upper right', ncol=2, fontsize=8)
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
ax.set_ylim(0, 1.05)

# Clean up spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.savefig("khop_early_vs_late_performance.pdf", bbox_inches='tight', dpi=300)
plt.show()
plt.close(fig)

# Print summary statistics
print("\nPerformance Analysis Summary:")
print("=" * 60)
print("Early Hops (4-6) vs Late Hops (16-20)")
print("=" * 60)

# itertuples keeps each column's dtype, so num_facts stays an integer for the ':3d' format
for row in df_summary.itertuples(index=False):
    method_name = method_labels.get(row.method, row.method)
    print(f"{method_name:15s} | {row.num_facts:3d} facts | Early: {row.early_performance:.3f} | Late: {row.late_performance:.3f} | Degradation: {row.degradation:.3f}")

# Calculate average degradation by method
print("\nAverage Performance Degradation:")
for method in methods:
    method_data = df_summary[df_summary['method'] == method]
    avg_degradation = method_data['degradation'].mean()
    print(f"{method_labels[method]:15s}: {avg_degradation:.3f}")

# Visualization Recommendations for Conference Papers (ICLR Style)

Based on the analysis above, here are the recommended visualizations for different scenarios:

## 1. **Average Performance Bar Plot** (Alternative 1)
**Best for:** Main results showing overall method comparison
- **Pros:** Clean, easy to understand, shows clear winner across fact counts
- **Cons:** Loses information about hop-specific performance
- **Use case:** Main results in paper body, executive summary figures

## 2. **Key Hops Bar Plot** (Alternative 2) 
**Best for:** Detailed analysis showing performance at critical points
- **Pros:** Reduces noise while maintaining hop-specific insights
- **Cons:** Arbitrary selection of "key" hops
- **Use case:** Supplementary material, detailed analysis sections

## 3. **Early vs Late Performance** (Alternative 3)
**Best for:** Highlighting degradation patterns and robustness
- **Pros:** Tells a clear story about method robustness to complexity
- **Cons:** Reduces each method/fact-count combination to two summary points (mean early-hop and mean late-hop accuracy)
- **Use case:** Discussion section, robustness analysis

## 4. **Original Line Plots** (Current)
**Best for:** Complete data visualization showing all trends
- **Pros:** Shows all data points, preserves complete information
- **Cons:** Noisy, overlapping lines make it hard to draw conclusions
- **Use case:** Appendix, complete results documentation

## Key Findings from the Data:

1. **Iterative Query** consistently outperforms **Majority Voting** across all fact counts
2. Performance degradation with increasing hops is more severe for **Majority Voting**
3. Both methods perform better with fewer facts (e.g., 100 facts vs. 500 facts)
4. **Iterative Query** maintains more stable performance at higher hop counts

## Recommendation:
For an ICLR paper, I'd recommend using **Alternative 1 (Average Performance)** in the main results and **Alternative 3 (Early vs Late)** in the analysis section to tell a compelling story about method robustness.