# K-Hop Token Usage Analysis

This notebook analyzes how the average number of completion tokens varies with the number of hops in the k-hop reasoning task, comparing three models (ExaOne, Llama-8B, Llama-70B) across knowledge bases with different fact counts.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# Publication-style figure defaults: all text is rendered through LaTeX in a
# Times serif face. Base font size is 10pt, titles and axis labels are 12pt,
# the legend is 10pt and tick labels are 9pt.
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{times}')  # load the LaTeX Times package
plt.rc('font', family='serif', serif=['Times'], size=10)
plt.rc('axes', titlesize=12, labelsize=12)
plt.rc('legend', fontsize=10)
plt.rc('xtick', labelsize=9)
plt.rc('ytick', labelsize=9)

print("Notebook setup complete.")

In [None]:
# Read the k-hop token usage CSV into a DataFrame
token_df = pd.read_csv('data/khop_avg_completion_tokens.csv')

# Quick sanity summary: dimensions, leading column names, and a row preview
print("Token usage data loaded successfully.")
print(f"DataFrame shape: {token_df.shape}")
print("\nColumns (first 5):")
print(list(token_df.columns[:5]))
print("\nFirst few rows:")
print(token_df.head())

In [None]:
def extract_model_and_facts_info(col_name):
    """Parse a column name into a ``(model_type, num_facts)`` pair.

    The model is identified by the first of these case-sensitive substrings
    found in ``col_name``, checked in this order: ``'llama8B'``,
    ``'llama70B'``, ``'exaone'``. The fact count is the run of digits that
    directly follows the first ``'facts'`` in the name.

    Args:
        col_name: Column header to inspect.

    Returns:
        ``(None, None)`` when no known model substring is present. Otherwise
        ``(model_type, num_facts)``, where ``model_type`` is one of
        ``'llama8b'``, ``'llama70b'``, ``'exaone'`` and ``num_facts`` is an
        ``int``, or ``None`` when the name has no ``facts<digits>`` part.
    """
    # (substring to look for, normalized model name); order matters because
    # the first hit wins
    known_models = (
        ('llama8B', 'llama8b'),
        ('llama70B', 'llama70b'),
        ('exaone', 'exaone'),
    )
    model_type = next(
        (name for marker, name in known_models if marker in col_name),
        None,
    )
    if model_type is None:
        return None, None

    found = re.search(r'facts(\d+)', col_name)
    if found is None:
        return model_type, None
    return model_type, int(found.group(1))

def organize_token_data(df):
    """Group average-completion-token columns by model and fact count.

    A column is used when its name contains ``' - avg_completion_tokens'``
    and contains neither ``'MIN'`` nor ``'MAX'``. Each such column is mapped
    to a ``(model_type, num_facts)`` pair by ``extract_model_and_facts_info``;
    columns for which either part is falsy (``None``, or a fact count of 0)
    are skipped. Columns that map to the same pair are appended to the same
    series.

    Args:
        df: DataFrame with a ``'num_hops'`` column plus the token columns.

    Returns:
        dict keyed by ``"<model_type>_<num_facts>facts"``. Each value is a
        dict with ``'model_type'``, ``'num_facts'`` and two parallel lists:
        ``'hops'`` (values taken from ``df['num_hops']``) and ``'tokens'``
        (floats). Cells that are missing, empty strings, or not convertible
        to ``float`` are dropped from both lists. A series whose column has
        no usable cells is still present, with empty lists.
    """
    # Keep only the main avg_completion_tokens columns (not the MIN/MAX variants)
    token_cols = [
        col for col in df.columns
        if ' - avg_completion_tokens' in col and 'MIN' not in col and 'MAX' not in col
    ]

    organized_data = {}

    for col in token_cols:
        model_type, num_facts = extract_model_and_facts_info(col)
        if not (model_type and num_facts):
            continue

        series = organized_data.setdefault(
            f"{model_type}_{num_facts}facts",
            {
                'model_type': model_type,
                'num_facts': num_facts,
                'hops': [],
                'tokens': []
            },
        )

        # Walk the hop column and this token column in lockstep
        for hop, token_val in zip(df['num_hops'], df[col]):
            if not (pd.notna(token_val) and token_val != ''):
                continue
            try:
                token_count = float(token_val)
            except (ValueError, TypeError):
                continue
            # Append only after the conversion succeeded, so a bad cell can
            # never leave 'hops' longer than 'tokens'
            series['hops'].append(hop)
            series['tokens'].append(token_count)

    return organized_data

# Organize the data
organized_data = organize_token_data(token_df)

# Report each model/fact-count series with its size and hop range
print("Found the following model/fact combinations:")
for key, data in organized_data.items():
    if data['hops']:
        print(f"  {key}: {len(data['hops'])} data points (hops {min(data['hops'])}-{max(data['hops'])})")
    else:
        # A series is registered before any value is appended, so a column
        # with no usable values yields empty lists; min()/max() would raise
        # ValueError on them.
        print(f"  {key}: 0 data points")

In [None]:
# Plot avg_completion_tokens against num_hops, one figure per model type,
# saving a separate PDF for each model.

# Model types to plot, in plotting order
model_types = ['exaone', 'llama8b', 'llama70b']

# One fixed color per fact count so a given count looks the same in every plot
fact_colors = {
    100: '#2E5EAA',   # Blue
    200: '#2E8B57',   # Green
    500: '#B22222'    # Red
}

# Create individual plots for each model type
for model_type in model_types:
    fig, ax = plt.subplots(figsize=(4, 4))

    # Series belonging to this model
    model_data = {key: data for key, data in organized_data.items() if data['model_type'] == model_type}

    # Sort by fact count for consistent legend ordering
    sorted_keys = sorted(model_data.keys(), key=lambda x: model_data[x]['num_facts'])

    for key in sorted_keys:
        data = model_data[key]

        # Fact counts without an assigned color fall back to black
        color = fact_colors.get(data['num_facts'], '#000000')

        # X axis is num_hops, Y axis is avg_completion_tokens
        ax.plot(
            data['hops'],
            data['tokens'],
            label=f"{data['num_facts']} facts",
            color=color,
            marker='o',
            markersize=5,
            linewidth=2,
            alpha=0.9
        )

    # Axis labels (bold via LaTeX)
    ax.set_xlabel(r'\textbf{Number of Hops}')
    ax.set_ylabel(r'\textbf{Computation Depth}')

    # Display name of the model for the title
    model_title = model_type.replace('llama8b', 'Llama-8B').replace('llama70b', 'Llama-70B').replace('exaone', 'ExaOne')
    ax.set_title(r'\textbf{' + f'Token Usage Analysis: {model_title}' + '}')

    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

    # Set axis limits from the pooled data of every series of this model
    if model_data:
        all_tokens = [token for data in model_data.values() for token in data['tokens']]
        all_hops = [hop for data in model_data.values() for hop in data['hops']]

        if all_tokens and all_hops:
            ax.set_ylim(0, max(all_tokens) * 1.1)
            ax.set_xlim(min(all_hops) - 0.5, max(all_hops) + 0.5)
            # range() accepts only ints. Hop values come straight from the
            # 'num_hops' column, so cast in case pandas parsed it as float.
            ax.set_xticks(range(int(min(all_hops)), int(max(all_hops)) + 1, 2))  # Show every other hop

    # Boxed legend, only when there is at least one series
    if len(model_data) > 0:
        legend = ax.legend(frameon=True, loc='upper left', fontsize=10,
                          fancybox=True, shadow=True, framealpha=0.95,
                          edgecolor='black', facecolor='white')
        legend.get_frame().set_linewidth(0.8)

    # Clean up spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    plt.tight_layout()

    # Save this model's figure as PDF; address the figure explicitly rather
    # than relying on pyplot's "current figure"
    filename = f"khop_token_usage_{model_type}.pdf"
    fig.savefig(filename, bbox_inches='tight', dpi=300)
    print(f"Saved plot: {filename}")

    plt.show()
    # Close exactly this figure so figures do not accumulate across iterations
    plt.close(fig)

In [None]:
# Combined figure: every model and every fact count on one set of axes
fig, ax = plt.subplots(figsize=(12, 6))

# Marker shape identifies the model; line color (fact_colors) identifies the
# fact count
model_markers = {
    'exaone': 'o',     # Circle
    'llama8b': 's',    # Square
    'llama70b': '^'    # Triangle
}

# (model_type, num_facts) of every series drawn; also tells us whether a
# legend is needed
plotted_combinations = []

for model_type in model_types:
    model_data = {key: data for key, data in organized_data.items() if data['model_type'] == model_type}

    # Draw this model's series in ascending fact-count order
    for key, data in sorted(model_data.items(), key=lambda item: item[1]['num_facts']):
        marker = model_markers.get(model_type, 'o')
        color = fact_colors.get(data['num_facts'], '#000000')  # black if the count has no color

        model_label = model_type.replace('llama8b', 'Llama-8B').replace('llama70b', 'Llama-70B').replace('exaone', 'ExaOne')
        label = f"{model_label} ({data['num_facts']} facts)"

        ax.plot(
            data['hops'],
            data['tokens'],
            label=label,
            color=color,
            marker=marker,
            markersize=5,
            linewidth=2,
            alpha=0.8,
        )
        plotted_combinations.append((model_type, data['num_facts']))

# Axis labels and title (bold via LaTeX)
ax.set(
    xlabel=r'\textbf{Number of Hops}',
    ylabel=r'\textbf{Computation Depth}',
    title=r'\textbf{Token Usage Comparison: All Models and Fact Counts}',
)

ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

# Boxed two-column legend so all model/fact combinations fit
if plotted_combinations:
    legend = ax.legend(
        frameon=True, loc='upper left', fontsize=9, ncol=2,
        fancybox=True, shadow=True, framealpha=0.95,
        edgecolor='black', facecolor='white',
    )
    legend.get_frame().set_linewidth(0.8)

# Hide the top and right spines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig("khop_token_usage_comparison_all_models.pdf", bbox_inches='tight', dpi=300)
print("Saved combined comparison plot: khop_token_usage_comparison_all_models.pdf")
plt.show()
plt.close()

In [None]:
def extract_model_and_facts_info(col_name):
    """Return ``(model_type, num_facts)`` parsed from a column name.

    ``model_type`` is ``'llama8b'``, ``'llama70b'`` or ``'exaone'``, chosen by
    the first of the case-sensitive substrings ``'llama8B'``, ``'llama70B'``,
    ``'exaone'`` (tested in that order) that occurs in ``col_name``. When none
    occurs the result is ``(None, None)``.

    ``num_facts`` is the integer formed by the digits right after the first
    ``'facts'`` in ``col_name``, or ``None`` if there is no such match.

    Note:
        This notebook defines a function with this name in an earlier cell as
        well; whichever definition ran last is the one in effect.
    """
    model_type = None
    for marker, normalized in (
        ('llama8B', 'llama8b'),
        ('llama70B', 'llama70b'),
        ('exaone', 'exaone'),
    ):
        if marker in col_name:
            model_type = normalized
            break

    # Unknown model: report nothing, not even a fact count
    if model_type is None:
        return None, None

    facts_match = re.search(r'facts(\d+)', col_name)
    return model_type, (int(facts_match.group(1)) if facts_match else None)

def organize_token_data(df):
    """Group the ``_step`` columns by model and fact count.

    Despite its name, this definition reads the columns whose name contains
    ``' - _step'`` (and neither ``'MIN'`` nor ``'MAX'``), not the
    completion-token columns. It has the same name as a function defined in
    an earlier cell of this notebook and replaces it once this cell runs.

    Each selected column is mapped to a ``(model_type, num_facts)`` pair by
    ``extract_model_and_facts_info``; columns for which either part is falsy
    (``None``, or a fact count of 0) are skipped.

    Args:
        df: DataFrame with a ``'num_hops'`` column plus the ``_step`` columns.

    Returns:
        dict keyed by ``"<model_type>_<num_facts>facts"``. Each value is a
        dict with ``'model_type'``, ``'num_facts'`` and two parallel lists:
        ``'hops'`` (values from ``df['num_hops']``) and ``'tokens'``, which
        here holds the ``_step`` values as floats. Cells that are missing,
        empty strings, or not convertible to ``float`` are dropped from both
        lists.
    """
    # Keep only the main step columns (not the MIN/MAX variants)
    step_cols = [
        col for col in df.columns
        if ' - _step' in col and 'MIN' not in col and 'MAX' not in col
    ]

    organized_data = {}

    for col in step_cols:
        model_type, num_facts = extract_model_and_facts_info(col)
        if not (model_type and num_facts):
            continue

        series = organized_data.setdefault(
            f"{model_type}_{num_facts}facts",
            {
                'model_type': model_type,
                'num_facts': num_facts,
                'hops': [],
                'tokens': []
            },
        )

        # Walk the hop column and this step column in lockstep
        for hop, step_val in zip(df['num_hops'], df[col]):
            if not (pd.notna(step_val) and step_val != ''):
                continue
            try:
                step_count = float(step_val)
            except (ValueError, TypeError):
                continue
            # Append only after the conversion succeeded, so a bad cell can
            # never leave 'hops' longer than 'tokens'
            series['hops'].append(hop)
            series['tokens'].append(step_count)

    return organized_data

# Organize the data
# NOTE(review): this rebinds `organized_data`, which the plotting cells above
# read. After this cell runs it holds whatever the organize_token_data
# currently in effect produces, so re-running the plots without re-running
# the earlier organize cell would draw different data. Confirm this is
# intended.
organized_data = organize_token_data(token_df)

print("Found the following model/fact combinations:")
for key, data in organized_data.items():
    if data['hops']:
        print(f"  {key}: {len(data['hops'])} data points (hops {min(data['hops'])}-{max(data['hops'])})")
    else:
        # A series is registered before any value is appended, so a column
        # with no usable values yields empty lists; min()/max() would raise
        # ValueError on them.
        print(f"  {key}: 0 data points")

## Key Observations

From the token usage analysis, we can observe:

1. **Linear Growth**: Computation depth (measured in tokens) appears to grow linearly with the number of hops, which is expected as each hop requires additional reasoning steps.

2. **Model Differences**: The models differ in how computation depth grows with the number of hops, which may reflect differences in their reasoning strategies and efficiency.

3. **Fact Count Impact**: The number of facts in the knowledge base also affects computation depth, although the size and direction of the effect may differ from model to model.

4. **Scaling Behavior**: If the apparent linear relationship holds, computational cost scales predictably with problem complexity (the number of hops).

This analysis provides insights into the computational efficiency of different models on the k-hop reasoning task and can inform decisions about model selection and resource allocation for complex reasoning tasks.