# K-Hop Reasoning Model Comparison: EXAONE vs Llama-8B

This notebook compares two language models (EXAONE and Llama-8B) on the k-hop reasoning task. It plots accuracy against the number of hops, with standard-error bars, separately for each fact count (100, 200, 500).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
# Never referenced by name in this notebook; presumably imported for its side
# effect of registering the 'science' matplotlib styles used below — confirm.
import scienceplots
# Diagnostic: list every style name matplotlib currently knows about, so a
# missing 'science' style is visible before plt.style.use fails on it.
print(plt.style.available)
# NOTE: the bar-plot cell later sets 'text.usetex': True through rcParams, so
# LaTeX text rendering is switched back on there despite 'no-latex' here.
plt.style.use(['science', 'no-latex'])  # Optional: replace 'no-latex' with 'ieee' if using LaTeX

In [None]:
# Read the accuracy table and the matching standard-error table
avg_acc_df = pd.read_csv('khop_comparison_acc.csv')
se_df = pd.read_csv('khop_comparison_se.csv')

# Quick sanity report: table sizes, then up to five metric column names each
print("Average Accuracy DataFrame shape:", avg_acc_df.shape)
print("Standard Error DataFrame shape:", se_df.shape)
print("\nAverage Accuracy columns:")
print(list(filter(lambda name: 'avg_accuracy' in name, avg_acc_df.columns))[:5])
print("\nStandard Error columns:")
print(list(filter(lambda name: 'se_accuracy' in name, se_df.columns))[:5])

In [None]:
# Preview the first rows of the accuracy table (rich display as the cell output)
avg_acc_df.head()

In [None]:
def extract_experiment_info_with_model(col_name):
    """Parse an experiment column name into the fields used for grouping.

    Recognised pieces of the name:
      * model  - 'exaone' -> 'EXAONE', 'llama8b' -> 'Llama-8B' (case-insensitive)
      * agent  - 'MajorityVotingAgents' -> 'MajorityVoting',
                 'IterativeQueryAgents' -> 'IterativeQuery' (case-sensitive)
      * facts  - the digits immediately following 'facts'; 'factsperworker<N>'
                 is not picked up because a letter follows 'facts' there
      * hops   - 'hops<start>-<end>'

    Args:
        col_name: Column label as a string.

    Returns:
        Tuple ``(agent_type, num_facts, hop_range, model_type)``.
        All four entries are None when the model or the agent type is not
        recognised. Otherwise ``num_facts`` (int) and ``hop_range``
        (``(start, end)`` ints) are individually None when their pattern is
        absent from the name.

    NOTE(review): the agent count ('agents<N>') and facts-per-worker
    ('factsperworker<N>') are not part of the result, so columns that differ
    only in those settings yield identical tuples. Confirm that this is the
    intended grouping.
    """
    # Model detection ignores case; agent detection below does not.
    lowered = col_name.lower()
    if 'exaone' in lowered:
        model_type = 'EXAONE'
    elif 'llama8b' in lowered:
        model_type = 'Llama-8B'
    else:
        return None, None, None, None

    if 'MajorityVotingAgents' in col_name:
        agent_type = 'MajorityVoting'
    elif 'IterativeQueryAgents' in col_name:
        agent_type = 'IterativeQuery'
    else:
        return None, None, None, None

    # Extract number of facts
    facts_match = re.search(r'facts(\d+)', col_name)
    num_facts = int(facts_match.group(1)) if facts_match else None

    # Extract hop range
    hops_match = re.search(r'hops(\d+)-(\d+)', col_name)
    hop_range = (int(hops_match.group(1)), int(hops_match.group(2))) if hops_match else None

    return agent_type, num_facts, hop_range, model_type

def group_related_experiments_with_models(df, metric_type='avg_accuracy'):
    """Bucket metric columns by (agent type, fact count, model).

    Columns whose name contains 'MIN', 'MAX' or 'step' are dropped first.
    Each remaining column whose name contains ``metric_type`` is parsed with
    ``extract_experiment_info_with_model``; columns for which any parsed field
    is falsy are skipped.

    Returns:
        ``(experiments, filtered_df)`` where ``experiments`` maps
        ``"<agent>_<facts>facts_<model>"`` to a list of per-column records and
        ``filtered_df`` is ``df`` restricted to the retained columns.
    """
    excluded_markers = ['MIN', 'MAX', 'step']
    kept_columns = [
        name for name in df.columns
        if not any(marker in name for marker in excluded_markers)
    ]
    filtered_df = df[kept_columns]

    experiments = {}
    for column in filtered_df.columns:
        if metric_type not in column:
            continue

        agent_type, num_facts, hop_range, model_type = extract_experiment_info_with_model(column)
        if not (agent_type and num_facts and hop_range and model_type):
            continue

        # Columns sharing this key are later stitched into one line
        group_key = f"{agent_type}_{num_facts}facts_{model_type}"
        record = {
            'column': column,
            'hop_range': hop_range,
            'agent_type': agent_type,
            'num_facts': num_facts,
            'model_type': model_type
        }
        experiments.setdefault(group_key, []).append(record)

    return experiments, filtered_df

# Bucket the accuracy and standard-error columns with the same grouping rule
acc_experiments, filtered_acc_df = group_related_experiments_with_models(avg_acc_df, 'avg_accuracy')
se_experiments, filtered_se_df = group_related_experiments_with_models(se_df, 'se_accuracy')

# List every accuracy group together with the hop ranges it covers
print("Found experiment groups:")
for group_key, group_exps in acc_experiments.items():
    print(f"  {group_key}: {len(group_exps)} experiments")
    for hop_range in (entry['hop_range'] for entry in group_exps):
        print(f"    - Hops {hop_range[0]}-{hop_range[1]}")

In [None]:
def combine_experiments_into_lines_with_models(experiments, acc_df, se_df, num_hops):
    """Stitch the experiments of each group into one accuracy-vs-hops series.

    Args:
        experiments: Mapping of group key -> list of records, each with
            'column', 'hop_range' (start, end), 'agent_type', 'num_facts'
            and 'model_type'. The lists are not modified.
        acc_df: DataFrame holding the accuracy columns named by 'column'.
        se_df: DataFrame holding the standard-error columns; the column name
            is derived by replacing 'avg_accuracy' with 'se_accuracy'.
        num_hops: Hop value for each row position; entry ``i`` is matched with
            row ``i`` of both frames. (Assumes ``se_df`` rows are in the same
            order as ``acc_df`` rows — TODO confirm.)

    Returns:
        Mapping of group key -> dict with parallel lists 'hops', 'accuracy',
        'se' plus 'agent_type', 'num_facts', 'model_type' taken from the
        experiment with the lowest hop-range start. Groups with no usable
        data point are omitted.

    A data point is kept only when its accuracy is present and numeric. A
    missing or blank standard error becomes 0.0; a non-numeric one drops the
    whole point, so the three lists always stay the same length.
    """
    combined_data = {}

    for exp_key, exp_list in experiments.items():
        # sorted() rather than list.sort() so the caller's lists keep their order
        ordered = sorted(exp_list, key=lambda exp: exp['hop_range'][0])

        combined_hops = []
        combined_acc = []
        combined_se = []

        for exp in ordered:
            col_name = exp['column']
            se_col_name = col_name.replace('avg_accuracy', 'se_accuracy')
            hop_start, hop_end = exp['hop_range']

            # Resolve the columns once per experiment instead of once per row
            if col_name not in acc_df.columns:
                continue
            acc_series = acc_df[col_name]
            se_series = se_df[se_col_name] if se_col_name in se_df.columns else None

            for i, hop in enumerate(num_hops):
                if not hop_start <= hop <= hop_end:
                    continue

                acc_val = acc_series.iloc[i]
                se_val = se_series.iloc[i] if se_series is not None else None

                if pd.isna(acc_val) or acc_val == '':
                    continue

                # Convert both values BEFORE appending anything: a failed
                # conversion must not leave the parallel lists misaligned.
                try:
                    acc_float = float(acc_val)
                    se_float = float(se_val) if pd.notna(se_val) and se_val != '' else 0.0
                except (ValueError, TypeError):
                    continue

                combined_hops.append(hop)
                combined_acc.append(acc_float)
                combined_se.append(se_float)

        if combined_hops:  # Only add if we have data
            first = ordered[0]
            combined_data[exp_key] = {
                'hops': combined_hops,
                'accuracy': combined_acc,
                'se': combined_se,
                'agent_type': first['agent_type'],
                'num_facts': first['num_facts'],
                'model_type': first['model_type']
            }

    return combined_data

# Hop value of every row, in row order
num_hops = filtered_acc_df.loc[:, 'num_hops'].tolist()

# Merge each experiment group into a single accuracy-vs-hops series
combined_data = combine_experiments_into_lines_with_models(
    acc_experiments,
    filtered_acc_df,
    filtered_se_df,
    num_hops,
)

# Report how many points each series has and the hop span it covers
print(f"Combined {len(combined_data)} experiment groups into continuous lines:")
for line_key, line in combined_data.items():
    line_hops = line['hops']
    print(f"  {line_key}: {len(line_hops)} data points (hops {min(line_hops)}-{max(line_hops)})")

In [None]:
# Flatten every series into one record per (experiment, hop) data point
summary_data = [
    {
        'Experiment': key,
        'Agent_Type': data['agent_type'],
        'Model_Type': data['model_type'],
        'Num_Facts': data['num_facts'],
        'Num_Hops': hop,
        'Accuracy': data['accuracy'][i],
        'Standard_Error': data['se'][i]
    }
    for key, data in combined_data.items()
    for i, hop in enumerate(data['hops'])
]

summary_df = pd.DataFrame(summary_data)

# Plain-text dump of the full table under a banner
print("K-Hop Reasoning Model Comparison Summary:")
print("=" * 90)
print(summary_df.to_string(index=False))

In [None]:
# Alternative 2: Bar plot showing performance at key hop values (4, 8, 12, 16, 20)
# This focuses on specific points to reduce noise and compares both models

key_hops = [4, 8, 12, 16, 20]  # Focus on these hop counts for cleaner visualization
fact_counts = [100, 200, 500]  # Target fact counts for comparison

# Update font sizes for 12pt axes and titles with Times New Roman.
# NOTE: 'text.usetex': True requires a working LaTeX installation and turns
# LaTeX rendering on even though the 'no-latex' style was selected earlier.
plt.rcParams.update({
    'text.usetex': True,
    'font.family': 'serif',
    'font.serif': ['Times'],  # Use Times New Roman
    'text.latex.preamble': r'\usepackage{times}',  # LaTeX Times package
    'font.size': 10,
    'axes.titlesize': 12,  # 12pt for titles
    'axes.labelsize': 12,  # 12pt for axes labels
    'legend.fontsize': 10,  # Keep legend at 10pt
    'xtick.labelsize': 9,
    'ytick.labelsize': 9
})

# Colors for the two models
model_colors = {'EXAONE': '#4C72B0', 'Llama-8B': '#55A868'}

# One entry per bar series:
# (model, agent type, legend label, offset in bar widths, alpha, hatch)
bar_series = [
    ('EXAONE', 'MajorityVoting', 'EXAONE MV', -1.5, 0.8, None),
    ('EXAONE', 'IterativeQuery', 'EXAONE IQ', -0.5, 0.6, '///'),
    ('Llama-8B', 'MajorityVoting', 'Llama-8B MV', 0.5, 0.8, None),
    ('Llama-8B', 'IterativeQuery', 'Llama-8B IQ', 1.5, 0.6, '///'),
]

# Check which model-fact combinations are available.
# Read the fields stored with each series instead of re-parsing the key string.
print("Available model-fact combinations:")
available_combinations = set()
for data in combined_data.values():
    model, facts = data['model_type'], data['num_facts']
    available_combinations.add((model, facts))
    print(f"  {model}: {facts} facts")

print("\nMissing combinations for 500 facts:")
for model in ['EXAONE', 'Llama-8B']:
    if (model, 500) not in available_combinations:
        print(f"  {model}: 500 facts - MISSING")

# Create subplots for each fact count (one column per entry of fact_counts)
fig, axes = plt.subplots(1, len(fact_counts), figsize=(4 * len(fact_counts), 4), squeeze=False)
axes = axes[0]

x = np.arange(len(key_hops))
width = 0.2

# First bar container drawn for each label, across ALL subplots. The legend is
# built from these so it stays complete even when the panel that hosts it
# (the rightmost one) has no data for some series.
legend_handles = {}

for ax, num_facts in zip(axes, fact_counts):
    for model, agent_type, label, offset, alpha, hatch in bar_series:
        data = combined_data.get(f"{agent_type}_{num_facts}facts_{model}")
        if data is None:
            continue

        # Keep only the key hops that were actually measured; a missing hop
        # gets no bar at all instead of a bar that reads as 0 accuracy.
        positions, heights, errors = [], [], []
        for pos, hop in zip(x, key_hops):
            if hop not in data['hops']:
                continue
            hop_idx = data['hops'].index(hop)
            positions.append(pos + offset * width)
            heights.append(data['accuracy'][hop_idx])
            errors.append(data['se'][hop_idx])

        if not positions:
            continue

        bars = ax.bar(positions, heights, width,
                      label=label,
                      color=model_colors[model], alpha=alpha, hatch=hatch,
                      yerr=errors, capsize=3,
                      edgecolor='black', linewidth=0.5)
        legend_handles.setdefault(label, bars)

    ax.set_xlabel(r'\textbf{Number of Hops}')
    if ax is axes[0]:
        ax.set_ylabel(r'\textbf{Accuracy}')

    # Add note if data is missing for this fact count
    title = f'{num_facts} Facts'
    missing_models = [m for m in ('EXAONE', 'Llama-8B')
                      if (m, num_facts) not in available_combinations]
    if missing_models:
        title += f' (missing: {", ".join(missing_models)})'

    ax.set_title(r'\textbf{' + title + '}')
    ax.set_xticks(x)
    ax.set_xticklabels(key_hops)

    ax.grid(True, axis='y', linestyle='--', alpha=0.7)
    ax.set_ylim(0, 1.05)

    # Clean up spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Add a single legend to the rightmost plot, listing every series that was
# drawn anywhere in the figure, in the fixed order of bar_series.
legend_labels = [spec[2] for spec in bar_series if spec[2] in legend_handles]
if legend_labels:
    legend = axes[-1].legend([legend_handles[lbl] for lbl in legend_labels], legend_labels,
                             frameon=True, loc='upper right', fontsize=8,
                             fancybox=True, shadow=True, framealpha=0.95,
                             edgecolor='black', facecolor='white')
    legend.get_frame().set_linewidth(0.8)

fig.tight_layout()

# Save the combined plot
fig.savefig("khop_model_comparison_exaone_llama8b.pdf", bbox_inches='tight', dpi=300)
plt.close(fig)

In [None]:
# Now create and save individual plots for each fact count.
# Relies on key_hops, fact_counts, model_colors and combined_data from the
# cells above.

# One entry per bar series:
# (model, agent type, legend label, offset in bar widths, alpha, hatch)
individual_series = [
    ('EXAONE', 'MajorityVoting', 'EXAONE MV', -1.5, 0.8, None),
    ('EXAONE', 'IterativeQuery', 'EXAONE IQ', -0.5, 0.6, '///'),
    ('Llama-8B', 'MajorityVoting', 'Llama-8B MV', 0.5, 0.8, None),
    ('Llama-8B', 'IterativeQuery', 'Llama-8B IQ', 1.5, 0.6, '///'),
]

x = np.arange(len(key_hops))
width = 0.2

for num_facts in fact_counts:
    fig_individual, ax_individual = plt.subplots(figsize=(4, 4))

    any_bars = False
    for model, agent_type, label, offset, alpha, hatch in individual_series:
        data = combined_data.get(f"{agent_type}_{num_facts}facts_{model}")
        if data is None:
            # No such experiment: draw nothing and add no legend entry,
            # rather than a row of bars that reads as 0 accuracy.
            continue

        # Keep only the key hops that were actually measured
        positions, heights, errors = [], [], []
        for pos, hop in zip(x, key_hops):
            if hop not in data['hops']:
                continue
            hop_idx = data['hops'].index(hop)
            positions.append(pos + offset * width)
            heights.append(data['accuracy'][hop_idx])
            errors.append(data['se'][hop_idx])

        if not positions:
            continue

        ax_individual.bar(positions, heights, width,
                          label=label,
                          color=model_colors[model], alpha=alpha, hatch=hatch,
                          yerr=errors, capsize=3,
                          edgecolor='black', linewidth=0.5)
        any_bars = True

    ax_individual.set_xlabel(r'\textbf{Number of Hops}')
    ax_individual.set_ylabel(r'\textbf{Accuracy}')

    # Flag models that have no experiment at all for this fact count
    title = f'{num_facts} Facts'
    missing_models = [
        m for m in ('EXAONE', 'Llama-8B')
        if not any(f"{a}_{num_facts}facts_{m}" in combined_data
                   for a in ('MajorityVoting', 'IterativeQuery'))
    ]
    if missing_models:
        title += f' (missing: {", ".join(missing_models)})'

    ax_individual.set_title(r'\textbf{' + title + '}')
    ax_individual.set_xticks(x)
    ax_individual.set_xticklabels(key_hops)

    # Add legend to every individual plot that has at least one series
    if any_bars:
        legend = ax_individual.legend(frameon=True, loc='upper right', fontsize=8,
                                      fancybox=True, shadow=True, framealpha=0.95,
                                      edgecolor='black', facecolor='white')
        legend.get_frame().set_linewidth(0.8)

    ax_individual.grid(True, axis='y', linestyle='--', alpha=0.7)
    ax_individual.set_ylim(0, 1.05)

    # Clean up spines
    ax_individual.spines['top'].set_visible(False)
    ax_individual.spines['right'].set_visible(False)

    fig_individual.tight_layout()

    # Save individual plot
    filename = f"khop_key_hops_{num_facts}facts_exaone_llama8b_comparison.pdf"
    fig_individual.savefig(filename, bbox_inches='tight', dpi=300)
    print(f"Saved individual plot: {filename}")
    # Close explicitly so figures do not accumulate across loop iterations
    plt.close(fig_individual)

# Model Comparison Analysis

This notebook compares the performance of the EXAONE and Llama-8B language models on the k-hop reasoning task. The points below summarize what the plots cover and how to read them:

## Key Findings:

1. **Model Performance**: The plots show how both models perform across different hop counts (4, 8, 12, 16, 20) and fact counts (100, 200, 500)

2. **Agent Types**: Two agent architectures are compared:
   - **MV**: Majority Voting Agents (solid bars)
   - **IQ**: Iterative Query Agents (hatched bars)

3. **Visualization**: Bar plots at key hop values provide a clear comparison between:
   - Different models (EXAONE vs Llama-8B)
   - Different agent types (MV vs IQ)
   - Different fact counts (100, 200, 500)

The plots are generated both as a combined view and as individual plots for each fact count, making them suitable for different presentation contexts.