# Baseline Analysis: Current Themes Generation System

This notebook analyzes the current puzzle generation system using the refined quality metrics:
- intracategory_word_distinctiveness
- intercategory_discoherence
- intracategory_coherence
- difficulty_progression

## Setup

In [None]:
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
from pathlib import Path

# Add puzzle generation scripts to path
sys.path.append('../../puzzle-generation')
sys.path.append('../scripts')

# Set up plotting
plt.style.use('default')
sns.set_palette('husl')
%matplotlib inline

In [None]:
# Read the investigation settings (JSON) into `config`, then echo the
# headline fields so the run is self-describing.
config_path = '../config/investigation_config.json'
with open(config_path, 'r') as f:
    config = json.load(f)

print("Investigation Configuration:")
investigation = config['investigation']
print(f"Name: {investigation['name']}")
print(f"Version: {investigation['version']}")
metric_names = ', '.join(config['quality_assessment']['metrics'])
print(f"Quality Metrics: {metric_names}")

## Import Components

In [None]:
# Import puzzle generation components.
# NOTE(review): these modules are presumably resolved through the directories
# appended to sys.path in the setup cell - confirm they live there.
try:
    from HighQualityPuzzleGenerator import HighQualityPuzzleGenerator
    from FullVectorLoader import FullVectorLoader
    from WordFrequencyService import WordFrequencyService
    print("✅ Successfully imported puzzle generation components")
    
    # Import quality metrics
    from quality_metrics import QualityMetrics
    print("✅ Successfully imported quality metrics")
except ImportError as e:
    # A single handler covers both import groups. The failure is reported but
    # not re-raised, so any name whose import did not run stays undefined.
    print(f"❌ Failed to import components: {e}")
    print("Please ensure the puzzle-generation scripts are available")

## Initialize System

In [None]:
# Initialize puzzle generation system
print("🚀 Initializing puzzle generation system...")

# Initialize vector loader
vector_loader = FullVectorLoader()
load_result = await vector_loader.initialize()

if load_result['success']:
    print(f"✅ Vector loader initialized: {load_result['loadedWords']} words loaded")
    
    # Initialize puzzle generator
    generator = HighQualityPuzzleGenerator(vector_loader)
    print("✅ Puzzle generator initialized")
    
    # Initialize quality metrics with vector loader
    quality_calculator = QualityMetrics(vector_loader)
    print("✅ Quality metrics initialized with vector support")
else:
    print("❌ Failed to initialize vector loader")
    # Fallback to string-based metrics
    quality_calculator = QualityMetrics()
    print("⚠️ Using string-based quality metrics only")

## Generate Baseline Puzzles

In [None]:
# Generate baseline puzzles with current parameters
baseline_puzzles = []
baseline_metrics = []
baseline_quality_metrics = []

num_baseline_samples = 25  # Increased for better statistics
test_date = "2024-08-05"

print(f"🎯 Generating {num_baseline_samples} baseline puzzles with refined quality metrics...")

for i in range(num_baseline_samples):
    start_time = time.time()
    
    try:
        # Generate a single 4x4 puzzle for baseline
        if 'generator' in locals():
            result = await generator.generateSinglePuzzle(test_date, i+1, 4)
        else:
            print("⚠️ No generator available, using mock data")
            result = {'puzzle': None, 'qualityScore': 0, 'attempts': 0}
            
        generation_time = time.time() - start_time
        
        if result['puzzle']:
            puzzle = result['puzzle']
            baseline_puzzles.append(puzzle)
            
            # Calculate refined quality metrics
            refined_metrics = quality_calculator.calculate_all_metrics(puzzle)
            refined_metrics['puzzle_id'] = i + 1
            baseline_quality_metrics.append(refined_metrics)
            
            # Collect basic metrics
            metrics = {
                'puzzle_id': i+1,
                'generation_time': generation_time,
                'quality_score': result['qualityScore'],
                'attempts': result['attempts'],
                'success': True,
                'avg_similarity': puzzle['metadata']['avgSimilarity'],
                'num_categories': len(puzzle['categories']),
                'total_words': len(puzzle['words'])
            }
            
            # Add per-category metrics
            for j, category in enumerate(puzzle['categories']):
                metrics[f'cat_{j+1}_difficulty'] = category['difficulty']
                metrics[f'cat_{j+1}_similarity'] = category['similarity']
                metrics[f'cat_{j+1}_theme'] = category['themeWord']
                
            baseline_metrics.append(metrics)
            
            print(f"   ✅ Puzzle {i+1}: Quality {result['qualityScore']:.3f}, "
                  f"Refined Score {refined_metrics['overall_quality_score']:.3f}, "
                  f"Time {generation_time:.2f}s")
        else:
            baseline_metrics.append({
                'puzzle_id': i+1,
                'generation_time': generation_time,
                'success': False,
                'attempts': result['attempts']
            })
            print(f"   ❌ Puzzle {i+1}: Failed after {result['attempts']} attempts")
            
    except Exception as e:
        print(f"   ❌ Puzzle {i+1}: Error - {e}")
        baseline_metrics.append({
            'puzzle_id': i+1,
            'generation_time': time.time() - start_time,
            'success': False,
            'error': str(e)
        })

print(f"\n📊 Generated {len(baseline_puzzles)} successful puzzles out of {num_baseline_samples} attempts")

## Analyze Refined Quality Metrics

In [None]:
# Convert refined quality metrics to a DataFrame and print mean ± std for
# each metric, followed by the first few rows.
if baseline_quality_metrics:
    quality_df = pd.DataFrame(baseline_quality_metrics)

    print("📈 Refined Quality Metrics Summary:")

    metrics_to_analyze = [
        'intracategory_word_distinctiveness',
        'intercategory_discoherence',
        'intracategory_coherence',
        'difficulty_progression',
        'overall_quality_score',
    ]

    # Restrict to the metrics actually present, so that both the summary and
    # the preview below tolerate a missing column instead of raising KeyError.
    available_metrics = [m for m in metrics_to_analyze if m in quality_df.columns]

    for metric in available_metrics:
        mean_val = quality_df[metric].mean()
        std_val = quality_df[metric].std()
        print(f"{metric}: {mean_val:.3f} ± {std_val:.3f}")

    # Display first few rows
    print("\n📋 Sample Quality Metrics:")
    display(quality_df[available_metrics].head())
else:
    print("❌ No quality metrics calculated")

## Visualize Quality Metrics

In [None]:
# Plot refined quality metrics: one histogram per metric (with its mean
# marked), followed by a correlation heatmap across the metrics.
if baseline_quality_metrics:
    # Build the frame here so the cell does not depend on another cell having
    # been executed first (or on a stale frame from an earlier run).
    quality_df = pd.DataFrame(baseline_quality_metrics)

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    metrics_to_plot = [
        ('intracategory_word_distinctiveness', 'Word Distinctiveness\n(within categories)'),
        ('intercategory_discoherence', 'Category Separation\n(between categories)'),
        ('intracategory_coherence', 'Category Coherence\n(within categories)'),
        ('difficulty_progression', 'Difficulty Progression'),
        ('overall_quality_score', 'Overall Quality Score'),
    ]

    colors = ['skyblue', 'lightcoral', 'lightgreen', 'orange', 'purple']

    # axes.flat walks the grid row by row, matching the metric order above
    axes_flat = list(axes.flat)
    for ax, (metric, title), color in zip(axes_flat, metrics_to_plot, colors):
        if metric not in quality_df.columns:
            # Drop the panel rather than leaving an empty, unlabeled frame
            ax.remove()
            continue

        ax.hist(quality_df[metric], bins=10, alpha=0.7, color=color)
        ax.set_title(title)
        ax.set_xlabel('Score')
        ax.set_ylabel('Frequency')

        # Add mean line
        mean_val = quality_df[metric].mean()
        ax.axvline(mean_val, color='red', linestyle='--',
                   label=f'Mean: {mean_val:.3f}')
        ax.legend()

    # Remove grid cells that have no metric assigned
    for unused_ax in axes_flat[len(metrics_to_plot):]:
        unused_ax.remove()

    plt.tight_layout()
    plt.show()

    # Correlation matrix over the metrics that are actually present
    corr_metrics = [metric for metric, _ in metrics_to_plot
                    if metric in quality_df.columns]
    if len(corr_metrics) > 1:
        plt.figure(figsize=(10, 8))
        correlation_matrix = quality_df[corr_metrics].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=0.5)
        plt.title('Quality Metrics Correlation Matrix')
        plt.tight_layout()
        plt.show()
    else:
        print("⚠️ Fewer than two quality metrics available; skipping correlation matrix")
else:
    print("❌ No quality metrics to visualize")

## Compare Original vs Refined Quality Scores

In [None]:
# Compare original quality scores with refined metrics for the puzzles that
# were generated successfully.
if baseline_metrics and baseline_quality_metrics:
    basic_df = pd.DataFrame(baseline_metrics)
    # Build the refined frame here so the cell does not depend on another
    # cell having been executed first.
    quality_df = pd.DataFrame(baseline_quality_metrics)

    successful_basic = basic_df[basic_df['success'].astype(bool)]

    # Merge dataframes; rows pair up on puzzle_id
    comparison_df = successful_basic.merge(quality_df, on='puzzle_id')

    # Check the merged result rather than the inputs: two non-empty frames
    # with no puzzle_id in common would otherwise produce empty plots.
    if not comparison_df.empty:
        plt.figure(figsize=(12, 4))

        # Original vs Refined Quality Scores
        plt.subplot(1, 3, 1)
        plt.scatter(comparison_df['quality_score'], comparison_df['overall_quality_score'], alpha=0.7)
        plt.xlabel('Original Quality Score')
        plt.ylabel('Refined Quality Score')
        plt.title('Original vs Refined Quality')
        plt.plot([0, 1], [0, 1], 'r--', alpha=0.5)  # diagonal line

        # Quality Score Distributions
        plt.subplot(1, 3, 2)
        plt.hist(comparison_df['quality_score'], alpha=0.5, label='Original', bins=10)
        plt.hist(comparison_df['overall_quality_score'], alpha=0.5, label='Refined', bins=10)
        plt.xlabel('Quality Score')
        plt.ylabel('Frequency')
        plt.title('Quality Score Distributions')
        plt.legend()

        # Generation Time vs Quality
        plt.subplot(1, 3, 3)
        plt.scatter(comparison_df['generation_time'], comparison_df['overall_quality_score'], alpha=0.7)
        plt.xlabel('Generation Time (seconds)')
        plt.ylabel('Refined Quality Score')
        plt.title('Time vs Quality')

        plt.tight_layout()
        plt.show()

        # Calculate correlation
        correlation = comparison_df['quality_score'].corr(comparison_df['overall_quality_score'])
        print(f"\n📊 Correlation between original and refined quality scores: {correlation:.3f}")
    else:
        print("❌ Insufficient data for comparison")
else:
    print("❌ Missing data for comparison")

## Save Baseline Results

In [None]:
# Save baseline data for comparison.
# Every file written by this cell shares one timestamp so a run's outputs can
# be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path('../data/processed')
# parents=True: on a fresh checkout '../data' itself may not exist yet
output_dir.mkdir(parents=True, exist_ok=True)

# Save basic metrics
if baseline_metrics:
    basic_df = pd.DataFrame(baseline_metrics)
    basic_df.to_csv(output_dir / f'baseline_basic_metrics_{timestamp}.csv', index=False)
    print(f"✅ Saved basic metrics: baseline_basic_metrics_{timestamp}.csv")

# Save refined quality metrics
if baseline_quality_metrics:
    # Build the frame here so the cell does not depend on another cell having
    # been executed first.
    quality_df = pd.DataFrame(baseline_quality_metrics)
    quality_df.to_csv(output_dir / f'baseline_quality_metrics_{timestamp}.csv', index=False)
    print(f"✅ Saved quality metrics: baseline_quality_metrics_{timestamp}.csv")

# Save successful puzzles
if baseline_puzzles:
    with open(output_dir / f'baseline_puzzles_{timestamp}.json', 'w') as f:
        json.dump(baseline_puzzles, f, indent=2)
    print(f"✅ Saved puzzles: baseline_puzzles_{timestamp}.json")

# Save comprehensive summary
summary = {
    'timestamp': timestamp,
    'config': config,
    'total_attempts': len(baseline_metrics),
    'successful_puzzles': len(baseline_puzzles),
    'success_rate': len(baseline_puzzles) / len(baseline_metrics) if baseline_metrics else 0,
}

if baseline_quality_metrics:
    # Add refined quality metrics summary
    for metric in ['intracategory_word_distinctiveness', 'intercategory_discoherence',
                   'intracategory_coherence', 'difficulty_progression', 'overall_quality_score']:
        if metric in quality_df.columns:
            column = quality_df[metric]
            for stat_name, stat_value in (('avg', column.mean()), ('std', column.std())):
                # The std of a single sample is NaN, which json.dump would
                # write as the non-standard token NaN; store null instead.
                summary[f'{stat_name}_{metric}'] = (
                    None if pd.isna(stat_value) else float(stat_value)
                )

with open(output_dir / f'baseline_summary_{timestamp}.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"✅ Saved summary: baseline_summary_{timestamp}.json")
print(f"\n📁 All results saved to: {output_dir}")

## Conclusions

This baseline analysis establishes the current performance using refined quality metrics:

### Key Findings
1. **Word Distinctiveness**: Measures how distinct the words within each category are from one another
2. **Category Separation**: Measures inter-category discoherence using a metric inspired by the Calinski-Harabasz index
3. **Category Coherence**: Measures within-category compactness and theme alignment
4. **Difficulty Progression**: Validates systematic difficulty increase

### Next Steps
1. **Parameter Sweeps**: Test N=K vs N=K+D algorithms with these refined metrics
2. **Optimization**: Find parameter combinations that maximize quality scores
3. **Validation**: Compare results with human evaluation
4. **Implementation**: Deploy optimized parameters to production system