# Parameter Sweeps: N=K vs N=K+D Algorithm Comparison

This notebook focuses on the core investigation: comparing N=K vs N=K+D algorithms using refined quality metrics.

## Setup

In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
from pathlib import Path
import asyncio

# Add paths
# Both entries are relative paths, so they resolve against the kernel's current
# working directory. They are appended (lowest priority) so that project modules
# can be imported by later cells -- presumably including generate_parameter_sweep;
# confirm which of the two directories actually holds it.
sys.path.append('../../puzzle-generation')
sys.path.append('../scripts')

# Set up plotting
# Start from matplotlib's default style, then make seaborn's 'husl' palette the
# active color cycle for subsequent plots.
plt.style.use('default')
sns.set_palette('husl')
%matplotlib inline

## Run Parameter Sweep

In [None]:
# Import and run parameter sweep
from generate_parameter_sweep import ParameterSweepGenerator

async def run_focused_sweep():
    """Run the focused algorithm comparison sweep.

    Creates a ``ParameterSweepGenerator``, initializes it, runs its algorithm
    comparison sweep and then asks it to save the results.

    Returns:
        ``sweep_gen.results`` when the sweep itself completed. This is also
        returned when only the save step failed, so a persistence problem does
        not throw away results that are already in memory. An empty list is
        returned when the generator could not be created, initialized or run.
    """
    # Local import keeps this cell self-contained; used to surface full
    # tracebacks instead of only the exception message.
    import traceback

    print("🔬 Starting Focused Parameter Sweep")
    print("=" * 50)

    try:
        # Initialize sweep generator
        sweep_gen = ParameterSweepGenerator()
        await sweep_gen.initialize()

        # Run algorithm comparison (primary focus)
        print(f"\n{'='*20} ALGORITHM COMPARISON SWEEP {'='*20}")
        await sweep_gen.run_algorithm_comparison_sweep()
    except Exception as e:
        # Notebook-level boundary: report the failure and let later cells see
        # an empty result instead of aborting the whole notebook run.
        print(f"\n❌ Parameter sweep failed: {e}")
        traceback.print_exc()
        return []

    # Saving is handled separately: a failure here must not discard the
    # results of a sweep that already finished.
    try:
        sweep_gen.save_results()
    except Exception as e:
        print(f"\n⚠️ Parameter sweep finished, but saving results failed: {e}")
        traceback.print_exc()
    else:
        print("\n✅ Parameter sweep completed successfully!")

    return sweep_gen.results

# Run the sweep
# NOTE: a top-level `await` is only accepted by an IPython/Jupyter kernel; in a
# plain Python script this would have to be asyncio.run(run_focused_sweep()).
# `sweep_results` is what the analysis cells below test and convert to a
# DataFrame; run_focused_sweep() returns [] when the sweep fails.
sweep_results = await run_focused_sweep()

## Analyze Results

In [None]:
# Load and analyze sweep results.
# Defines `results_df` (every attempt) and `successful_df` (attempts whose
# 'success' value equals True); the visualization and statistics cells below
# reuse `successful_df`.
if sweep_results:
    results_df = pd.DataFrame(sweep_results)
    total_attempts = len(results_df)

    print("📊 Sweep Results Summary:")
    print(f"Total attempts: {total_attempts}")

    # Element-wise comparison on purpose: `is True` does not work on a Series.
    successful_df = results_df[results_df['success'] == True]
    print(f"Successful: {len(successful_df)} ({len(successful_df)/total_attempts*100:.1f}%)")

    # Group by algorithm
    if 'algorithm' in results_df.columns:
        print("\n📈 Results by Algorithm:")
        # dropna(): rows without an algorithm label can never match an ==
        # comparison, so they cannot be attributed to any algorithm.
        for algorithm in results_df['algorithm'].dropna().unique():
            attempts = int((results_df['algorithm'] == algorithm).sum())
            alg_df = successful_df[successful_df['algorithm'] == algorithm]
            successes = len(alg_df)

            # Always report the success rate, including 0/n: an algorithm that
            # never succeeded is the most important case in a comparison and
            # must not silently disappear from the summary.
            print(f"\n{algorithm}:")
            print(f"  Success rate: {successes}/{attempts} "
                  f"({successes/attempts*100:.1f}%)")

            if alg_df.empty:
                # No successful attempts, so there is nothing to average.
                continue

            # Quality metrics
            if 'refined_overall_quality_score' in alg_df.columns:
                avg_quality = alg_df['refined_overall_quality_score'].mean()
                std_quality = alg_df['refined_overall_quality_score'].std()
                print(f"  Refined Quality: {avg_quality:.3f} ± {std_quality:.3f}")

            # Generation time
            if 'generation_time' in alg_df.columns:
                avg_time = alg_df['generation_time'].mean()
                print(f"  Avg Generation Time: {avg_time:.2f}s")
else:
    print("❌ No sweep results to analyze")

## Visualize Quality Metrics

In [None]:
# Plot each available refined quality metric for the successful attempts:
# a box plot per algorithm when an 'algorithm' column exists, otherwise a
# plain histogram.
if not sweep_results or len(successful_df) == 0:
    print("❌ No data to visualize")
else:
    # Metric columns this cell knows how to plot.
    quality_metrics = [
        'refined_intracategory_word_distinctiveness',
        'refined_intercategory_discoherence',
        'refined_intracategory_coherence',
        'refined_difficulty_progression',
        'refined_overall_quality_score'
    ]

    # Keep only the metrics actually present in the results.
    available_metrics = [name for name in quality_metrics
                         if name in successful_df.columns]

    if not available_metrics:
        print("❌ No quality metrics found in results")
    else:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()
        group_by_algorithm = 'algorithm' in successful_df.columns

        # zip() stops at the shorter sequence, so metrics beyond the number
        # of grid cells are simply not drawn.
        for panel, metric_name in zip(axes, available_metrics):
            if group_by_algorithm:
                successful_df.boxplot(column=metric_name, by='algorithm', ax=panel)
            else:
                panel.hist(successful_df[metric_name], bins=10, alpha=0.7)
            # Human-readable title, e.g. 'Overall Quality Score'.
            panel.set_title(metric_name.replace('refined_', '').replace('_', ' ').title())

        # Drop the grid cells that received no plot.
        for unused_panel in axes[len(available_metrics):]:
            unused_panel.remove()

        plt.tight_layout()
        plt.show()

## Statistical Analysis

In [None]:
# Statistical comparison between algorithms (successful attempts only).
# Missing metric values (NaN) are dropped before summarising and testing:
# mean()/std() skip them anyway, but ttest_ind would propagate them into a NaN
# p-value that then reads as "no significant difference".
if sweep_results and 'algorithm' in successful_df.columns and len(successful_df) > 0:

    algorithms = successful_df['algorithm'].unique()

    if len(algorithms) > 1:
        print("📊 Statistical Comparison Between Algorithms:")

        has_quality = 'refined_overall_quality_score' in successful_df.columns

        # Quality metric comparison
        if has_quality:
            print("\n🎯 Overall Quality Score:")
            for alg in algorithms:
                alg_data = successful_df[successful_df['algorithm'] == alg]['refined_overall_quality_score'].dropna()
                # n is the number of values the mean/std are actually based on.
                print(f"  {alg}: {alg_data.mean():.3f} ± {alg_data.std():.3f} (n={len(alg_data)})")

        # Generation time comparison
        if 'generation_time' in successful_df.columns:
            print("\n⏱️ Generation Time:")
            for alg in algorithms:
                alg_data = successful_df[successful_df['algorithm'] == alg]['generation_time'].dropna()
                print(f"  {alg}: {alg_data.mean():.2f}s ± {alg_data.std():.2f}s (n={len(alg_data)})")

        # Statistical significance test (if scipy available)
        try:
            from scipy.stats import ttest_ind
        except ImportError:
            print("\n⚠️ scipy not available for statistical tests")
        else:
            # The test is only run for exactly two algorithms.
            if len(algorithms) == 2 and has_quality:
                alg1_data = successful_df[successful_df['algorithm'] == algorithms[0]]['refined_overall_quality_score'].dropna()
                alg2_data = successful_df[successful_df['algorithm'] == algorithms[1]]['refined_overall_quality_score'].dropna()

                # Each group needs at least two observations for a variance estimate.
                if len(alg1_data) > 1 and len(alg2_data) > 1:
                    t_stat, p_value = ttest_ind(alg1_data, alg2_data)
                    print(f"\n📈 T-test between {algorithms[0]} and {algorithms[1]}:")
                    print(f"  t-statistic: {t_stat:.3f}")
                    print(f"  p-value: {p_value:.3f}")
                    if np.isnan(p_value):
                        # E.g. both groups are constant: the statistic is
                        # undefined, which is not the same as "no difference".
                        print("  Significant difference: undetermined (test statistic is undefined)")
                    else:
                        print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α = 0.05)")
    else:
        print("📊 Single algorithm tested - no comparison available")
else:
    print("❌ Insufficient data for statistical analysis")

## Load Recent Results (Alternative)

In [None]:
# Fallback to re-running the sweep: read the most recently modified
# parameter_sweep_*.csv from the raw data directory and preview it.
data_dir = Path('../data/raw')
csv_files = list(data_dir.glob('parameter_sweep_*.csv'))

if not csv_files:
    print("📁 No existing sweep results found")
    print("   Run the parameter sweep first or use the cells above to generate new data")
else:
    # "Most recent" is judged by filesystem modification time.
    latest_file = max(csv_files, key=lambda candidate: candidate.stat().st_mtime)
    print(f"📁 Loading results from: {latest_file.name}")

    df = pd.read_csv(latest_file)
    # The sweep_type column is optional; fall back to a placeholder without it.
    sweep_types = df['sweep_type'].unique() if 'sweep_type' in df.columns else 'N/A'
    print(f"   Total records: {len(df)}")
    print(f"   Sweep types: {sweep_types}")

    # Preview the first rows (display is provided by the IPython kernel).
    display(df.head())

## Conclusions

This notebook provides a framework for comparing the N=K and N=K+D algorithms using the refined quality metrics:

### Key Comparisons
1. **Word Distinctiveness**: How well each algorithm produces distinct words within categories
2. **Category Separation**: Spatial separation between different puzzle categories  
3. **Category Coherence**: Internal consistency and theme alignment
4. **Performance**: Generation time and success rates

### Next Steps
1. Implement the N=K+D algorithm variant in the generator
2. Run comprehensive comparisons with larger sample sizes
3. Validate findings with human evaluation
4. Optimize parameters based on results