# Parameter Analysis: Real TypeScript Puzzle Generation Results

Analysis of N=K vs N=K+D algorithm comparison using real TypeScript puzzle generation with 856,670-word semantic vector database.

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from IPython.display import display

# Set up plotting
plt.style.use('default')
sns.set_palette('husl')
%matplotlib inline

## Load Real TypeScript Results

In [None]:
# Load the real TypeScript parameter sweep results
data_file = Path('../data/raw/real_typescript_parameter_sweep.csv')

if data_file.exists():
    df = pd.read_csv(data_file)
    print(f"✅ Loaded {len(df)} records from real TypeScript generation")
    print(f"Columns: {list(df.columns)}")
    print(f"\nSweep types: {df['sweep_type'].unique()}")
    
    # Show successful vs failed
    success_rate = df['success'].mean() * 100
    print(f"Success rate: {success_rate:.1f}%")
    
    display(df.head())
else:
    print("❌ No real TypeScript results found. Run generate_parameter_sweep.py first.")

## Algorithm Comparison Analysis

In [None]:
# Filter successful algorithm comparison results
if 'df' in locals():
    alg_results = df[(df['sweep_type'] == 'algorithm_comparison') & (df['success'] == True)]
    
    if len(alg_results) > 0:
        print("📊 Algorithm Comparison Results:")
        print(f"Total successful: {len(alg_results)}")
        
        # Group by algorithm
        for algorithm in alg_results['algorithm'].unique():
            alg_data = alg_results[alg_results['algorithm'] == algorithm]
            
            print(f"\n🎯 {algorithm} Algorithm:")
            print(f"  Samples: {len(alg_data)}")
            print(f"  Avg Quality Score: {alg_data['quality_score'].mean():.3f} ± {alg_data['quality_score'].std():.3f}")
            print(f"  Avg Generation Time: {alg_data['generation_time'].mean():.2f}s ± {alg_data['generation_time'].std():.2f}s")
            
            # Refined quality metrics
            if 'refined_overall_quality_score' in alg_data.columns:
                refined_scores = alg_data['refined_overall_quality_score'].dropna()
                if len(refined_scores) > 0:
                    print(f"  Refined Quality Score: {refined_scores.mean():.3f} ± {refined_scores.std():.3f}")
                    
            # Show sample puzzle themes
            themes = []
            for i in range(1, 5):  # 4 categories
                theme_col = f'cat_{i}_theme'
                if theme_col in alg_data.columns:
                    sample_themes = alg_data[theme_col].dropna().tolist()
                    themes.extend(sample_themes[:2])  # First 2 themes per category
            
            if themes:
                print(f"  Sample themes: {', '.join(themes[:8])}")
    else:
        print("❌ No successful algorithm comparison results found")
else:
    print("❌ No data loaded")

## Quality Metrics Visualization

In [None]:
# Visualize quality metrics comparison
if 'alg_results' in locals() and len(alg_results) > 0:
    
    # Quality metrics columns
    quality_metrics = [
        'refined_intracategory_word_distinctiveness',
        'refined_intercategory_discoherence', 
        'refined_intracategory_coherence',
        'refined_difficulty_progression',
        'refined_overall_quality_score'
    ]
    
    available_metrics = [m for m in quality_metrics if m in alg_results.columns]
    
    if available_metrics:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()
        
        for i, metric in enumerate(available_metrics):
            if i < len(axes):
                # Box plot by algorithm
                data_to_plot = []
                labels = []
                
                for algorithm in alg_results['algorithm'].unique():
                    alg_data = alg_results[alg_results['algorithm'] == algorithm][metric].dropna()
                    if len(alg_data) > 0:
                        data_to_plot.append(alg_data)
                        labels.append(algorithm)
                
                if data_to_plot:
                    axes[i].boxplot(data_to_plot, labels=labels)
                    axes[i].set_title(metric.replace('refined_', '').replace('_', ' ').title())
                    axes[i].tick_params(axis='x', rotation=45)
        
        # Remove empty subplots
        for i in range(len(available_metrics), len(axes)):
            axes[i].remove()
        
        plt.suptitle('Quality Metrics Comparison: N=K vs N=K+D Algorithms')
        plt.tight_layout()
        plt.show()
    else:
        print("❌ No refined quality metrics found in results")
        
        # Fallback: basic quality score comparison
        if 'quality_score' in alg_results.columns:
            plt.figure(figsize=(10, 6))
            
            # Quality score comparison
            plt.subplot(1, 2, 1)
            data_to_plot = []
            labels = []
            
            for algorithm in alg_results['algorithm'].unique():
                alg_data = alg_results[alg_results['algorithm'] == algorithm]['quality_score']
                data_to_plot.append(alg_data)
                labels.append(algorithm)
            
            plt.boxplot(data_to_plot, labels=labels)
            plt.title('Basic Quality Score Comparison')
            plt.ylabel('Quality Score')
            
            # Generation time comparison
            plt.subplot(1, 2, 2)
            data_to_plot = []
            
            for algorithm in alg_results['algorithm'].unique():
                alg_data = alg_results[alg_results['algorithm'] == algorithm]['generation_time']
                data_to_plot.append(alg_data)
            
            plt.boxplot(data_to_plot, labels=labels)
            plt.title('Generation Time Comparison')
            plt.ylabel('Time (seconds)')
            
            plt.tight_layout()
            plt.show()
else:
    print("❌ No algorithm results to visualize")

## Statistical Analysis

In [None]:
# Statistical comparison between algorithms
if 'alg_results' in locals() and len(alg_results) > 0:
    
    algorithms = alg_results['algorithm'].unique()
    
    if len(algorithms) >= 2:
        print("📊 Statistical Analysis:")
        
        # Compare basic quality scores
        print("\n🎯 Basic Quality Score Analysis:")
        for alg in algorithms:
            alg_data = alg_results[alg_results['algorithm'] == alg]['quality_score']
            print(f"  {alg}: {alg_data.mean():.3f} ± {alg_data.std():.3f} (n={len(alg_data)})")
        
        # Compare refined quality scores if available
        if 'refined_overall_quality_score' in alg_results.columns:
            print("\n🎯 Refined Quality Score Analysis:")
            for alg in algorithms:
                alg_data = alg_results[alg_results['algorithm'] == alg]['refined_overall_quality_score'].dropna()
                if len(alg_data) > 0:
                    print(f"  {alg}: {alg_data.mean():.3f} ± {alg_data.std():.3f} (n={len(alg_data)})")
        
        # Generation time analysis
        print("\n⏱️ Generation Time Analysis:")
        for alg in algorithms:
            alg_data = alg_results[alg_results['algorithm'] == alg]['generation_time']
            print(f"  {alg}: {alg_data.mean():.2f}s ± {alg_data.std():.2f}s (n={len(alg_data)})")
        
        # T-test if scipy available and 2 algorithms
        try:
            from scipy.stats import ttest_ind
            
            if len(algorithms) == 2:
                alg1_quality = alg_results[alg_results['algorithm'] == algorithms[0]]['quality_score']
                alg2_quality = alg_results[alg_results['algorithm'] == algorithms[1]]['quality_score']
                
                if len(alg1_quality) > 1 and len(alg2_quality) > 1:
                    t_stat, p_value = ttest_ind(alg1_quality, alg2_quality)
                    print(f"\n📈 Statistical Test ({algorithms[0]} vs {algorithms[1]}):")
                    print(f"  t-statistic: {t_stat:.3f}")
                    print(f"  p-value: {p_value:.3f}")
                    print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α = 0.05)")
                    
                    # Effect size (Cohen's d)
                    pooled_std = np.sqrt(((len(alg1_quality) - 1) * alg1_quality.var() + 
                                         (len(alg2_quality) - 1) * alg2_quality.var()) / 
                                        (len(alg1_quality) + len(alg2_quality) - 2))
                    cohens_d = (alg1_quality.mean() - alg2_quality.mean()) / pooled_std
                    print(f"  Effect size (Cohen's d): {cohens_d:.3f}")
                    
                    if abs(cohens_d) < 0.2:
                        effect_interpretation = "negligible"
                    elif abs(cohens_d) < 0.5:
                        effect_interpretation = "small"
                    elif abs(cohens_d) < 0.8:
                        effect_interpretation = "medium"
                    else:
                        effect_interpretation = "large"
                    
                    print(f"  Effect interpretation: {effect_interpretation}")
        except ImportError:
            print("\n⚠️ scipy not available for statistical tests")
    else:
        print("📊 Need at least 2 algorithms for comparison")
else:
    print("❌ No data for statistical analysis")

## Sample Puzzle Analysis

In [None]:
# Analyze sample puzzles from each algorithm
if 'alg_results' in locals() and len(alg_results) > 0:
    
    print("🧩 Sample Puzzle Analysis:")
    
    for algorithm in alg_results['algorithm'].unique():
        alg_data = alg_results[alg_results['algorithm'] == algorithm]
        
        print(f"\n🎯 {algorithm} Algorithm Sample:")
        
        # Get best quality puzzle
        if 'quality_score' in alg_data.columns:
            best_puzzle = alg_data.loc[alg_data['quality_score'].idxmax()]
            
            print(f"  Best Quality Score: {best_puzzle['quality_score']:.3f}")
            print(f"  Generation Time: {best_puzzle['generation_time']:.2f}s")
            print(f"  Attempts: {best_puzzle['attempts']}")
            
            if 'refined_overall_quality_score' in best_puzzle and pd.notna(best_puzzle['refined_overall_quality_score']):
                print(f"  Refined Quality: {best_puzzle['refined_overall_quality_score']:.3f}")
            
            # Show categories
            print("  Categories:")
            for i in range(1, 5):
                theme_col = f'cat_{i}_theme'
                sim_col = f'cat_{i}_similarity'
                diff_col = f'cat_{i}_difficulty'
                
                if (theme_col in best_puzzle and pd.notna(best_puzzle[theme_col]) and
                    sim_col in best_puzzle and pd.notna(best_puzzle[sim_col])):
                    
                    theme = best_puzzle[theme_col]
                    similarity = best_puzzle[sim_col]
                    difficulty = best_puzzle[diff_col] if diff_col in best_puzzle and pd.notna(best_puzzle[diff_col]) else 'N/A'
                    
                    print(f"    {i}. {theme} (sim: {similarity:.3f}, diff: {difficulty})")
else:
    print("❌ No puzzle data to analyze")

## Conclusions

This analysis uses **real TypeScript puzzle generation** with the complete **856,670-word semantic vector database**, providing accurate insights into:

### Key Findings
1. **Algorithm Performance**: Direct comparison of N=K vs N=K+D with real semantic similarity
2. **Quality Metrics**: Sophisticated linear algebra-based assessment using actual word vectors
3. **Generation Characteristics**: Real timing and success rates with production-level complexity
4. **Semantic Relationships**: Authentic word relationships (e.g., cat ↔ dog: 0.211 similarity)

### Technical Achievement
- **Vector Integration**: Uses same 856k word database as production puzzle generation
- **Real Algorithms**: Actual N=K and N=K+D implementations with configurable parameters  
- **Quality Assessment**: Linear algebra metrics (Calinski-Harabasz, Davies-Bouldin, Silhouette analysis)
- **Production Validity**: Results directly applicable to live puzzle generation system

### Next Steps
1. **Parameter Optimization**: Fine-tune similarity thresholds and frequency parameters
2. **Extended Testing**: Larger sample sizes across different puzzle configurations
3. **Human Validation**: Compare algorithmic quality scores with human puzzle ratings
4. **Production Deployment**: Implement optimized parameters in live system