# Curriculum Learning Experiment: Progressive Scale Analysis

This notebook implements a progressive approach to curriculum learning research:
1. **Debug Study** (1K samples) - Pipeline validation
2. **Medium Pilot Study** (100K samples) - Effect detection
3. **Full Scientific Study** (1M samples) - Comprehensive analysis

## Key Features
- Robust BERTopic handling for large datasets
- Multi-GPU support with curriculum preservation
- Statistical significance testing at each scale
- Weights & Biases integration for experiment tracking

## 1. Setup and Configuration

In [1]:
# Core imports
import os
import sys
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import gc
warnings.filterwarnings('ignore')

# Set style for visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# NEW: Unified system imports
from unified_experiment import (
    UnifiedExperiment, 
    ExperimentMode,
    run_basic_experiment,
    run_enhanced_experiment,
    run_memory_efficient_experiment,
    run_fair_comparison_experiment
)
from config import (
    Config, 
    debug_config,
    pilot_config, 
    scientific_config,
    fair_comparison_config
)

# Enable high DPI displays
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
        
# Memory status
import psutil
print(f"\nSystem Memory: {psutil.virtual_memory().total / 1024**3:.1f} GB")
print(f"Available Memory: {psutil.virtual_memory().available / 1024**3:.1f} GB")

print(f"\nüî¨ Unified Experiment System loaded:")
print(f"  ‚úÖ Single interface for all experiment modes")
print(f"  ‚úÖ Enhanced metrics (top-5 accuracy, perplexity, confidence)")
print(f"  ‚úÖ Memory-efficient handling for large datasets")
print(f"  ‚úÖ Fair comparison mode for unbiased research")
print(f"  ‚úÖ Statistical analysis and convergence detection")
print(f"  ‚úÖ Improved W&B logging with organized structure")

# Configuration for different experiment types
EXPERIMENT_MODE = ExperimentMode.ENHANCED  # Can be: BASIC, ENHANCED, MEMORY_EFFICIENT, FAIR_COMPARISON
FAIR_COMPARISON_MODE = False  # Set to True for unbiased curriculum comparison

print(f"\n‚öôÔ∏è  Current settings:")
print(f"   Experiment mode: {EXPERIMENT_MODE.value}")
print(f"   Fair comparison: {'Enabled' if FAIR_COMPARISON_MODE else 'Disabled'}")

PyTorch version: 2.7.1+cu126
CUDA available: True
GPU count: 4
  GPU 0: NVIDIA A100-SXM4-80GB
  GPU 1: NVIDIA A100-SXM4-80GB
  GPU 2: NVIDIA A100-SXM4-80GB
  GPU 3: NVIDIA A100-SXM4-80GB

System Memory: 668.9 GB
Available Memory: 662.4 GB

üî¨ Unified Experiment System loaded:
  ‚úÖ Single interface for all experiment modes
  ‚úÖ Enhanced metrics (top-5 accuracy, perplexity, confidence)
  ‚úÖ Memory-efficient handling for large datasets
  ‚úÖ Fair comparison mode for unbiased research
  ‚úÖ Statistical analysis and convergence detection
  ‚úÖ Improved W&B logging with organized structure

‚öôÔ∏è  Current settings:
   Experiment mode: enhanced
   Fair comparison: Disabled


## 2. Debug Study - Pipeline Validation

Quick test to ensure all components work correctly.

In [None]:
# Run the debug-scale study through the unified experiment API and print a
# short health check of the results.
print("\nüîß Running Debug Study (Unified System)...\n")

# Fair comparison can be requested either through the boolean flag or by
# selecting ExperimentMode.FAIR_COMPARISON. Both are honoured here so the enum
# value does not silently fall through to Basic mode.
use_fair_comparison = (
    FAIR_COMPARISON_MODE or EXPERIMENT_MODE == ExperimentMode.FAIR_COMPARISON
)

# Build the configuration that matches the selected mode, then run it.
if use_fair_comparison:
    print("üéØ Using Fair Comparison Mode (no early stopping)")
    config = fair_comparison_config(scale="debug", use_wandb=False)
    debug_results = run_fair_comparison_experiment(config)
else:
    config = debug_config(use_wandb=False)
    if EXPERIMENT_MODE == ExperimentMode.ENHANCED:
        print("üî¨ Using Enhanced Mode (comprehensive metrics)")
        debug_results = run_enhanced_experiment(config)
    elif EXPERIMENT_MODE == ExperimentMode.MEMORY_EFFICIENT:
        print("üíæ Using Memory-Efficient Mode")
        debug_results = run_memory_efficient_experiment(config)
    else:
        print("üìä Using Basic Mode")
        debug_results = run_basic_experiment(config)

print("\n‚úÖ Debug study completed successfully!")

# Safe access to experiment ID with fallback: the summary may nest the ID under
# an inner 'experiment_summary' key or expose it at the top level.
experiment_id = "N/A"
if hasattr(debug_results, 'experiment_summary') and debug_results.experiment_summary:
    if 'experiment_summary' in debug_results.experiment_summary:
        experiment_id = debug_results.experiment_summary['experiment_summary'].get('experiment_id', 'N/A')
    elif 'experiment_id' in debug_results.experiment_summary:
        experiment_id = debug_results.experiment_summary['experiment_id']

print(f"Experiment ID: {experiment_id}")

# Detailed analysis is only produced for the enhanced / fair-comparison modes.
if EXPERIMENT_MODE == ExperimentMode.ENHANCED or use_fair_comparison:
    # Treated here as a mapping of strategy name -> final validation accuracy.
    final_scores = debug_results.statistical_analysis
    if final_scores:
        print(f"\nüìä Final Validation Accuracies:")
        for strategy, score in final_scores.items():
            print(f"   {strategy}: {score:.4f}")

        max_accuracy = max(final_scores.values())
        if max_accuracy < 0.1:
            print(f"\n‚ö†Ô∏è  WARNING: Maximum accuracy is {max_accuracy:.4f} (<10%), this seems low for MLM.")
            print(f"    This might indicate a training problem that needs investigation.")
        else:
            print(f"\n‚úÖ Accuracy looks reasonable (max: {max_accuracy:.4f})")

        # Convergence analysis
        if debug_results.convergence_analysis:
            converged_strategies = [s for s, info in debug_results.convergence_analysis.items()
                                    if info.get('converged', False)]
            if converged_strategies:
                print(f"\nüéØ Converged strategies: {', '.join(converged_strategies)}")
    else:
        # Previously this case printed the "Basic mode" notice, which was wrong
        # for an enhanced run that simply reported no scores.
        print("\nWARNING: no final validation scores were reported - skipping accuracy analysis")
elif EXPERIMENT_MODE != ExperimentMode.MEMORY_EFFICIENT:
    # The basic-mode notice now pairs with the mode check rather than with the
    # empty-scores check, so it is shown when basic mode actually ran.
    print("\nüìä Basic mode results - limited analysis available")

print(f"\n‚è±Ô∏è  Runtime: {debug_results.resource_usage.get('total_runtime_hours', 0):.2f} hours")

In [None]:
# Legacy debug smoke test, ported to the unified experiment API.
#
# The pre-unification version called `Experiment(debug_config)`. `Experiment`
# is never imported in this notebook and `debug_config` is a factory that must
# be called, so the cell raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): this repeats the debug run of the previous cell using the
# notebook-wide EXPERIMENT_MODE; delete the cell if the duplicate is unwanted.
print("\nüîß Running Debug Study...\n")
debug_experiment = UnifiedExperiment(debug_config(use_wandb=False), EXPERIMENT_MODE)
debug_results = debug_experiment.run()

print("\n‚úÖ Debug study completed successfully!")

# The summary may nest the ID under an inner 'experiment_summary' key or expose
# it at the top level; fall back to "N/A" when neither is present.
debug_summary = getattr(debug_results, 'experiment_summary', None) or {}
if 'experiment_summary' in debug_summary:
    debug_experiment_id = debug_summary['experiment_summary'].get('experiment_id', 'N/A')
else:
    debug_experiment_id = debug_summary.get('experiment_id', 'N/A')
print(f"Experiment ID: {debug_experiment_id}")

## 3. Medium Pilot Study - Effect Detection

This pilot study uses 100K samples to detect whether curriculum learning effects exist in our setup.

In [None]:
# Medium pilot configuration - unified system
#
# Builds `pilot_config` and picks `PILOT_MODE` for the pilot run.

# Fair comparison can be requested through the boolean flag or by selecting
# ExperimentMode.FAIR_COMPARISON; both are honoured so that the configuration
# and the execution mode always agree.
use_fair_comparison = (
    FAIR_COMPARISON_MODE or EXPERIMENT_MODE == ExperimentMode.FAIR_COMPARISON
)

# Settings shared by both configuration flavours, declared once so the two
# branches cannot drift apart.
pilot_strategies = [
    "random",  # Baseline
    "reading_level_easy_to_hard",
    "reading_level_hard_to_easy",
    "topic_sequential",
    "topic_largest_first",
    "hybrid_reading_topic",
]
pilot_common_settings = dict(
    scale="large",  # 100K samples
    model_size="bert-small",
    num_epochs=15,
    num_runs=3,
    batch_size=32,
    strategies=pilot_strategies,
    use_wandb=True,
)

if use_fair_comparison:
    print("üéØ Creating Fair Comparison Configuration for Pilot Study")
    pilot_config = fair_comparison_config(
        experiment_name="curriculum_pilot_fair_comparison",
        **pilot_common_settings,
    )
else:
    print("üî¨ Creating Enhanced Configuration for Pilot Study")
    pilot_config = Config(
        experiment_name="curriculum_pilot_enhanced",
        memory_efficient=True,  # Enable for large dataset
        eval_every_n_steps=500,
        use_early_stopping=True,
        early_stopping_patience=7,
        **pilot_common_settings,
    )

print("Pilot Study Configuration:")
pilot_config.print_summary()

# Show configuration type (only when the config exposes early-stopping fields)
if hasattr(pilot_config, 'use_early_stopping'):
    early_stopping_status = "Disabled (fair comparison)" if not pilot_config.use_early_stopping else f"Enabled (patience={pilot_config.early_stopping_patience})"
    print(f"\nExperiment mode: {'Fair Comparison' if use_fair_comparison else 'Enhanced'}")
    print(f"Early stopping: {early_stopping_status}")
    print(f"Memory efficient: {getattr(pilot_config, 'memory_efficient', False)}")

# Mode selection. Fair comparison takes precedence, as in the debug and
# scientific cells: a fair-comparison config must not be run in another mode.
if use_fair_comparison:
    PILOT_MODE = ExperimentMode.FAIR_COMPARISON
    print(f"Using fair comparison mode for pilot study")
elif EXPERIMENT_MODE == ExperimentMode.MEMORY_EFFICIENT:
    PILOT_MODE = ExperimentMode.MEMORY_EFFICIENT
    print(f"Using memory-efficient mode for pilot study")
else:
    PILOT_MODE = ExperimentMode.ENHANCED
    print(f"Using enhanced mode for pilot study")

In [None]:
# Run pilot experiment with unified system, then print a results digest:
# final accuracies relative to the random baseline, convergence status and
# (in memory-efficient mode) memory usage.
print(f"\nüöÄ Running Medium Pilot Study (Unified System - {PILOT_MODE.value} mode)...\n")
print("This will help us detect if curriculum effects exist before the full study.\n")

# Print mode-specific features
if PILOT_MODE == ExperimentMode.ENHANCED:
    print("Enhanced mode features:\n")
    print("  üî¨ Comprehensive MLM metrics (accuracy, top-5, perplexity, confidence)")
    print("  üìä Epoch-level summaries and statistical analysis")
    print("  üõë Early stopping to prevent overtraining")
    print("  üìà Convergence detection and effect size calculation")
    print("  üéØ Organized W&B logging with reduced noise")
elif PILOT_MODE == ExperimentMode.MEMORY_EFFICIENT:
    print("Memory-efficient mode features:\n")
    print("  üíæ Aggressive garbage collection and memory monitoring")
    print("  üîÑ GPU cache clearing between strategies")
    print("  üìä Resource usage tracking")
elif PILOT_MODE == ExperimentMode.FAIR_COMPARISON:
    print("Fair comparison mode features:\n")
    print("  ‚öñÔ∏è  No early stopping (all strategies train equal steps)")
    print("  üîÑ Fixed seeds for perfect reproducibility")
    print("  üìä Comprehensive logging for unbiased analysis")
    print("  üìà Statistical significance testing")

print()

# Clear memory before starting
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Run experiment with unified system. The experiment is constructed outside the
# `try` so the cleanup below never refers to an unbound name.
pilot_experiment = UnifiedExperiment(pilot_config, PILOT_MODE)
try:
    pilot_results = pilot_experiment.run()
finally:
    # Best-effort release of host/GPU memory, also when the run fails, so a
    # crashed study does not keep the memory held by the experiment object.
    del pilot_experiment
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n‚úÖ Pilot study completed!")

# Enhanced results display
print(f"\nüìä Pilot Study Results:")
print(f"   Strategies analyzed: {len(pilot_results.strategy_results)}")
print(f"   Runtime: {pilot_results.resource_usage.get('total_runtime_hours', 0):.2f} hours")

# Show final scores if available. `statistical_analysis` is treated here as a
# mapping of strategy name -> final validation accuracy.
if pilot_results.statistical_analysis:
    print(f"\nüéØ Final Validation Accuracies:")
    baseline_acc = pilot_results.statistical_analysis.get('random', 0)

    for strategy, score in pilot_results.statistical_analysis.items():
        if strategy == "random":
            print(f"   üìä {strategy}: {score:.4f} (baseline)")
        else:
            # Relative change vs. the random baseline; reported as 0 when no
            # positive baseline accuracy is available.
            improvement = ((score - baseline_acc) / max(baseline_acc, 1e-8)) * 100 if baseline_acc > 0 else 0
            print(f"   üìà {strategy}: {score:.4f} ({improvement:+.2f}% vs baseline)")

    # Quality assessment
    max_accuracy = max(pilot_results.statistical_analysis.values())
    if max_accuracy < 0.15:
        print(f"\n‚ö†Ô∏è  ATTENTION: Maximum accuracy is {max_accuracy:.4f} (<15%)")
        print(f"    Consider checking model configuration or training setup")
    elif max_accuracy > 0.25:
        print(f"\n‚úÖ Excellent accuracy achieved (max: {max_accuracy:.4f})!")
    else:
        print(f"\n‚úÖ Reasonable accuracy for MLM (max: {max_accuracy:.4f})")

# Convergence analysis for enhanced modes
if (PILOT_MODE in [ExperimentMode.ENHANCED, ExperimentMode.FAIR_COMPARISON] and
    pilot_results.convergence_analysis):
    print(f"\nüîç Convergence Analysis:")
    converged_strategies = []
    for strategy, convergence in pilot_results.convergence_analysis.items():
        if convergence.get('converged', False):
            print(f"   ‚úÖ {strategy}: Converged at epoch {convergence.get('convergence_epoch', 'N/A')}")
            converged_strategies.append(strategy)
        else:
            print(f"   ‚è≥ {strategy}: Still improving ({convergence.get('reason', 'Unknown')})")

    print(f"\n   Summary: {len(converged_strategies)}/{len(pilot_results.convergence_analysis)} strategies converged")

# Memory usage for memory-efficient mode
if PILOT_MODE == ExperimentMode.MEMORY_EFFICIENT:
    resource_summary = pilot_results.resource_usage
    if 'peak_memory_mb' in resource_summary:
        print(f"\nüíæ Memory Usage:")
        print(f"   Initial: {resource_summary.get('initial_memory_mb', 0):.1f} MB")
        print(f"   Peak: {resource_summary.get('peak_memory_mb', 0):.1f} MB")
        print(f"   Final: {resource_summary.get('final_memory_mb', 0):.1f} MB")

In [None]:
# Alternative pilot configuration with memory efficiency.
#
# This cell sits AFTER the pilot run. It used to rebind `pilot_config`, so the
# later diagnostics and the JSON export reported a configuration that was not
# the one actually trained. It is now bound to its own name; to run the pilot
# with it, assign it to `pilot_config` before executing the pilot run cell.
pilot_config_memory_efficient = Config(
    scale="large",  # 100K samples
    model_size="bert-small",
    num_epochs=15,  # Enough to see convergence differences
    num_runs=3,  # Fewer runs for pilot
    batch_size=32,  # Reduced from 64 to save memory
    strategies=[
        "random",  # Baseline
        "reading_level_easy_to_hard",
        "reading_level_hard_to_easy",
        "topic_sequential",
        "topic_largest_first",
        "hybrid_reading_topic"
    ],
    use_wandb=True,
    experiment_name="curriculum_pilot_study",
    memory_efficient=True,  # Enable memory-efficient mode
    num_workers=2,  # Reduce workers to save memory
    eval_every_n_steps=500  # Less frequent evaluation to save memory
)

print("Pilot Study Configuration (Memory-Efficient):")
pilot_config_memory_efficient.print_summary()
print(f"\nMemory-efficient mode: {pilot_config_memory_efficient.memory_efficient}")
# NOTE(review): `max_memory_gb` is not set anywhere in this notebook, so it is
# read defensively - confirm that Config defines it.
print(f"Max memory limit: {getattr(pilot_config_memory_efficient, 'max_memory_gb', 'N/A')} GB")

In [None]:
# Enhanced pilot results analysis: ranks strategies by final accuracy, compares
# the best one with the random baseline and summarises convergence.
#
# NOTE(review): this cell subscripts `pilot_results` like a dict, while the
# pilot run cell reads attributes (`.statistical_analysis`, ...). Confirm that
# the result object supports both access styles.
pilot_report = pilot_results['report']
enhanced_summary = pilot_results['enhanced_summary']

# Extract metrics from enhanced summary (more reliable than old approach)
final_scores = enhanced_summary['final_scores']
strategies = list(final_scores.keys())

# Assemble every column BEFORE sorting. Assigning plain lists to an already
# sorted frame is positional, which used to attach the convergence data to the
# wrong strategies.
pilot_columns = {
    'Strategy': strategies,
    'Final Accuracy': list(final_scores.values()),
}

# Add additional metrics if available
if 'convergence_analysis' in enhanced_summary:
    convergence_by_strategy = enhanced_summary['convergence_analysis']
    pilot_columns['Convergence Epoch'] = [
        convergence_by_strategy.get(name, {}).get('convergence_epoch', 'N/A')
        for name in strategies
    ]
    pilot_columns['Converged'] = [
        'Yes' if convergence_by_strategy.get(name, {}).get('converged', False) else 'No'
        for name in strategies
    ]

# Create enhanced comparison dataframe, best strategy first
pilot_df = pd.DataFrame(pilot_columns).sort_values('Final Accuracy', ascending=False)

print("üìä Enhanced Pilot Study Results Summary:")
print(pilot_df.to_string(index=False))

# Enhanced statistical analysis
random_accuracy = final_scores.get('random', 0)
best_accuracy = pilot_df.iloc[0]['Final Accuracy']
best_strategy = pilot_df.iloc[0]['Strategy']

if random_accuracy > 0:
    improvement = ((best_accuracy - random_accuracy) / random_accuracy) * 100
    print(f"\nüìà Statistical Analysis:")
    print(f"   Random baseline: {random_accuracy:.4f}")
    print(f"   Best strategy: {best_strategy}")
    print(f"   Best accuracy: {best_accuracy:.4f}")
    print(f"   Improvement over random: {improvement:.2f}%")

    # Effect size estimation (rough): the percentage improvement divided by 10.
    # This is a heuristic, not a standardised effect size computed from the
    # run-to-run variance.
    effect_size = abs(improvement) / 10  # Rough estimate
    if effect_size < 0.2:
        effect_interpretation = "Negligible effect"
    elif effect_size < 0.5:
        effect_interpretation = "Small effect"
    elif effect_size < 0.8:
        effect_interpretation = "Medium effect"
    else:
        effect_interpretation = "Large effect"

    print(f"   Effect size: {effect_size:.3f} ({effect_interpretation})")

    # Decision for full study
    if improvement > 5 and best_accuracy > 0.15:
        print(f"\n‚úÖ STRONG SIGNAL: Significant curriculum effects detected!")
        # Accuracy is a fraction, so it is shown with 4 decimals like everywhere
        # else in this cell (one decimal collapsed it to e.g. "0.2").
        print(f"   Both improvement ({improvement:.1f}%) and absolute accuracy ({best_accuracy:.4f}) are good.")
        print(f"   üìù Recommendation: Proceed to full scientific study.")
    elif improvement > 2:
        print(f"\n‚ö†Ô∏è  WEAK SIGNAL: Some curriculum effects detected.")
        print(f"   Improvement is modest ({improvement:.1f}%). Consider:")
        print(f"   ‚Ä¢ Running longer (more epochs)")
        print(f"   ‚Ä¢ Trying different hyperparameters")
        print(f"   ‚Ä¢ Using larger model")
    else:
        print(f"\n‚ùå NO CLEAR SIGNAL: Minimal curriculum effects detected.")
        print(f"   Consider investigating training setup before full study.")

else:
    print(f"\n‚ö†Ô∏è  Cannot perform statistical analysis - no random baseline found.")

# Quality assessment
print(f"\nüîç Training Quality Assessment:")
if best_accuracy < 0.15:
    print(f"   ‚ùå Low accuracy ({best_accuracy:.4f}) suggests training issues")
    print(f"      ‚Ä¢ Check learning rate (current: {pilot_config.learning_rate})")
    print(f"      ‚Ä¢ Consider more epochs (current: {pilot_config.num_epochs})")
    print(f"      ‚Ä¢ Verify data quality and model configuration")
elif best_accuracy > 0.3:
    print(f"   ‚úÖ Excellent accuracy ({best_accuracy:.4f}) - training working well!")
else:
    print(f"   ‚úÖ Reasonable accuracy ({best_accuracy:.4f}) - training seems healthy")

# Convergence insights (zero converged when no convergence data was reported)
converged_count = pilot_df['Converged'].value_counts().get('Yes', 0) if 'Converged' in pilot_df.columns else 0
total_strategies = len(strategies)

print(f"\nüéØ Convergence Insights:")
print(f"   Converged strategies: {converged_count}/{total_strategies}")
if converged_count == total_strategies:
    print(f"   ‚úÖ All strategies converged - good training stability")
elif converged_count > total_strategies // 2:
    print(f"   ‚ö° Most strategies converged - reasonable training")
else:
    print(f"   ‚ö†Ô∏è  Few strategies converged - may need more epochs or better LR")

# Specific curriculum insights: best strategy within each family. Families are
# matched on substrings of the strategy name; 'topic' names that also contain
# 'reading' are kept out of the topic family.
strategy_families = [
    ("\nüìö Reading-level Strategies:", [s for s in strategies if 'reading_level' in s]),
    ("\nüè∑Ô∏è  Topic-based Strategies:", [s for s in strategies if 'topic' in s and 'reading' not in s]),
    ("\nüîÑ Hybrid Strategies:", [s for s in strategies if 'hybrid' in s]),
]
for family_heading, family_members in strategy_families:
    if not family_members:
        continue
    # max() keeps the first member on ties, like np.argmax did.
    family_best = max(family_members, key=lambda name: final_scores[name])
    print(family_heading)
    print(f"   Best: {family_best} ({final_scores[family_best]:.4f})")

### 3.1 Pilot Study Analysis - Can We Detect Effects?

In [None]:
# Analyze pilot results (loss-based view): final loss and a simple convergence
# step per strategy, then the improvement of the best strategy over `random`.
#
# NOTE(review): this cell subscripts `pilot_results` like a dict, while the
# pilot run cell reads attributes - confirm the result object supports both.
# It also rebinds `pilot_df`, replacing the accuracy-based table of the
# previous analysis cell.
pilot_report = pilot_results['report']
pilot_run_results = pilot_results['results']

# Extract key metrics
strategies = list(pilot_run_results.keys())
final_losses = []
convergence_speeds = []

for strategy in strategies:
    strategy_losses = pilot_run_results[strategy]['losses']
    final_losses.append(strategy_losses[-1])

    # Find convergence point: first step whose loss drops below 50% of the
    # initial loss; the curve length is used when that never happens.
    threshold = strategy_losses[0] * 0.5
    convergence_step = next(
        (step for step, loss in enumerate(strategy_losses) if loss < threshold),
        len(strategy_losses),
    )
    convergence_speeds.append(convergence_step)

# Create comparison dataframe, lowest final loss first
pilot_df = pd.DataFrame({
    'Strategy': strategies,
    'Final Loss': final_losses,
    'Convergence Step': convergence_speeds
}).sort_values('Final Loss')

print("Pilot Study Results Summary:")
print(pilot_df.to_string(index=False))

# Statistical test
# NOTE(review): `stats` is imported but no test is run below; the decision is
# based on the relative loss improvement only.
from scipy import stats

print(f"\nBest strategy: {pilot_df.iloc[0]['Strategy']}")

# Guard the missing-baseline case (it used to raise IndexError), mirroring the
# accuracy-based analysis cell.
baseline_losses = pilot_df.loc[pilot_df['Strategy'] == 'random', 'Final Loss']
if baseline_losses.empty:
    print(f"\n‚ö†Ô∏è  Cannot perform statistical analysis - no random baseline found.")
else:
    random_loss = baseline_losses.iloc[0]
    best_loss = pilot_df.iloc[0]['Final Loss']
    improvement = (random_loss - best_loss) / random_loss * 100

    print(f"Improvement over random: {improvement:.1f}%")

    # Decision for full study
    if improvement > 5:
        print("\n‚úÖ Significant curriculum effects detected! Proceeding to full study is justified.")
    else:
        print("\n‚ö†Ô∏è Minimal curriculum effects detected. Consider adjusting approach.")

In [None]:
# Pilot figure: learning curves on the left, final loss per strategy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel - every 10th point of the first 1000 recorded points, for clarity.
curve_window = slice(None, 1000, 10)
for strategy in strategies:
    if strategy not in pilot_results['results']:
        continue
    result = pilot_results['results'][strategy]
    ax1.plot(
        result['steps'][curve_window],
        result['losses'][curve_window],
        label=strategy,
        linewidth=2,
    )

ax1.set_xlabel('Training Steps')
ax1.set_ylabel('Loss')
ax1.set_title('Pilot Study: Learning Curves')
# Legend is placed outside the axes so it does not cover the curves.
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# Right panel - final loss per strategy, in the row order of `pilot_df`.
ax2.bar(pilot_df['Strategy'], pilot_df['Final Loss'])
ax2.set_xlabel('Strategy')
ax2.set_ylabel('Final Loss')
ax2.set_title('Pilot Study: Final Performance')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Full Scientific Study - Comprehensive Analysis

Based on pilot results, we proceed with the full-scale study using 1M samples.

In [None]:
# Full scientific configuration with unified system
#
# Builds `scientific_config` and picks `SCIENTIFIC_MODE` for the 1M-sample run.

# Fair comparison can be requested through the boolean flag or by selecting
# ExperimentMode.FAIR_COMPARISON; both are honoured so the enum value is not
# silently ignored.
use_fair_comparison = (
    FAIR_COMPARISON_MODE or EXPERIMENT_MODE == ExperimentMode.FAIR_COMPARISON
)

# Strategy list and settings shared by both configuration flavours, declared
# once so the two branches cannot drift apart.
scientific_strategies = [
    # Core strategies
    "random",
    "reading_level_easy_to_hard",
    "reading_level_hard_to_easy",
    "reading_level_staged",

    # Topic-based strategies
    "topic_sequential",
    "topic_interleaved",
    "topic_largest_first",

    # Hybrid strategies
    "hybrid_reading_topic",
    "hybrid_topic_reading",

    # Epoch-interleaving strategies
    "reading_topic_by_epoch",
    "reading_levels_by_epoch",
    "all_strategies_by_epoch",
]
scientific_common_settings = dict(
    scale="extreme",  # 1M samples
    model_size="bert-small",
    num_epochs=50,
    num_runs=5,
    batch_size=256,
    strategies=scientific_strategies,
    use_wandb=True,
)

if use_fair_comparison:
    print("üéØ Creating Fair Comparison Configuration for Scientific Study")
    scientific_config = fair_comparison_config(
        experiment_name="curriculum_scientific_fair_comparison",
        **scientific_common_settings,
    )
    SCIENTIFIC_MODE = ExperimentMode.FAIR_COMPARISON
else:
    print("üî¨ Creating Enhanced Configuration for Scientific Study")
    scientific_config = Config(
        experiment_name="curriculum_scientific_enhanced",
        memory_efficient=True,  # Enable for 1M samples
        eval_every_n_steps=2000,
        **scientific_common_settings,
    )

    # Use memory-efficient mode for large scale regardless of global setting
    SCIENTIFIC_MODE = ExperimentMode.MEMORY_EFFICIENT

print("Full Scientific Study Configuration:")
scientific_config.print_summary()

print(f"\n‚öôÔ∏è  Scientific Study Settings:")
print(f"   Mode: {SCIENTIFIC_MODE.value}")
print(f"   Fair comparison: {'Yes' if use_fair_comparison else 'No'}")
print(f"   Memory efficient: {'Yes' if SCIENTIFIC_MODE == ExperimentMode.MEMORY_EFFICIENT else 'No'}")
# NOTE(review): this label is derived from the mode flag, not read from the
# config; the Config branch above never sets `use_early_stopping` - confirm
# that the default really enables it.
print(f"   Early stopping: {'Disabled' if use_fair_comparison else 'Enabled'}")
print(f"   BERTopic sampling: Will use 10K samples for topic discovery on 1M dataset")

In [None]:
# Pre-flight checklist for the multi-hour scientific run. Execution pauses on
# `input()` until Enter is pressed; interrupt the kernel to cancel.
preflight_lines = (
    "‚ö†Ô∏è  WARNING: The full scientific study will take several hours to complete.",
    "Make sure you have:",
    "  - Stable power and internet connection",
    "  - Sufficient disk space for checkpoints",
    "  - W&B configured for tracking",
    "\nThe experiment will use the new robust BERTopic implementation that handles 1M+ samples.",
    "\nPress Enter to continue or Ctrl+C to cancel...",
)
for preflight_line in preflight_lines:
    print(preflight_line)
input()

In [None]:
# Run full scientific experiment with unified system, then print a digest:
# performance ranking vs. the random baseline, training quality / convergence
# (enhanced and fair-comparison modes) and memory usage (memory-efficient mode).
print(f"\nüî¨ Running Full Scientific Study (Unified System - {SCIENTIFIC_MODE.value} mode)...\n")

# Clear memory before starting
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Run experiment with unified system. The experiment is constructed outside the
# `try` so the cleanup below never refers to an unbound name.
scientific_experiment = UnifiedExperiment(scientific_config, SCIENTIFIC_MODE)
try:
    scientific_results = scientific_experiment.run()
finally:
    # Best-effort release of host/GPU memory, also when the multi-hour run
    # fails, so a crashed study does not keep the memory held by the experiment.
    del scientific_experiment
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n‚úÖ Scientific study completed!")

# Results summary with unified structure
print(f"\nüìä Scientific Study Results:")
print(f"   Strategies analyzed: {len(scientific_results.strategy_results)}")
print(f"   Runtime: {scientific_results.resource_usage.get('total_runtime_hours', 0):.2f} hours")

# Display final scores. `statistical_analysis` is treated here as a mapping of
# strategy name -> final score.
if scientific_results.statistical_analysis:
    print(f"\nüéØ Final Performance Ranking:")
    # Sort strategies by performance, best first
    sorted_strategies = sorted(scientific_results.statistical_analysis.items(),
                               key=lambda x: x[1], reverse=True)

    baseline_score = scientific_results.statistical_analysis.get('random', 0)

    for i, (strategy, score) in enumerate(sorted_strategies, 1):
        if strategy == "random":
            print(f"   {i:2d}. {strategy:30s} {score:.4f} (baseline)")
        else:
            # Relative change vs. the random baseline; reported as 0 when no
            # positive baseline score is available.
            improvement = ((score - baseline_score) / max(baseline_score, 1e-8)) * 100 if baseline_score > 0 else 0
            print(f"   {i:2d}. {strategy:30s} {score:.4f} ({improvement:+.2f}%)")

# Quality and convergence analysis for enhanced modes
if SCIENTIFIC_MODE in [ExperimentMode.ENHANCED, ExperimentMode.FAIR_COMPARISON]:
    max_accuracy = max(scientific_results.statistical_analysis.values()) if scientific_results.statistical_analysis else 0

    print(f"\nüìà Training Quality:")
    if max_accuracy > 0.3:
        print(f"   ‚úÖ Excellent maximum accuracy: {max_accuracy:.4f}")
    elif max_accuracy > 0.2:
        print(f"   ‚úÖ Good maximum accuracy: {max_accuracy:.4f}")
    elif max_accuracy > 0.15:
        print(f"   ‚ö†Ô∏è  Moderate maximum accuracy: {max_accuracy:.4f}")
    else:
        print(f"   ‚ùå Low maximum accuracy: {max_accuracy:.4f} - investigate training")

    # Convergence analysis
    if scientific_results.convergence_analysis:
        converged_count = sum(1 for info in scientific_results.convergence_analysis.values()
                              if info.get('converged', False))
        total_strategies = len(scientific_results.convergence_analysis)

        print(f"\nüîç Convergence Analysis:")
        print(f"   Converged strategies: {converged_count}/{total_strategies}")

        if converged_count == total_strategies:
            print(f"   ‚úÖ All strategies converged - excellent training stability")
        elif converged_count > total_strategies * 0.7:
            print(f"   ‚úÖ Most strategies converged - good training stability")
        else:
            print(f"   ‚ö†Ô∏è  Few strategies converged - consider longer training")

# Memory analysis for memory-efficient mode
if SCIENTIFIC_MODE == ExperimentMode.MEMORY_EFFICIENT:
    resource_summary = scientific_results.resource_usage
    print(f"\nüíæ Memory Efficiency Analysis:")
    if 'peak_memory_mb' in resource_summary:
        peak_gb = resource_summary['peak_memory_mb'] / 1024  # MB -> GB
        print(f"   Peak memory usage: {peak_gb:.1f} GB")
        if peak_gb < 20:
            print(f"   ‚úÖ Excellent memory efficiency!")
        elif peak_gb < 40:
            print(f"   ‚úÖ Good memory usage")
        else:
            print(f"   ‚ö†Ô∏è  High memory usage - consider optimization")

    # Reaching this line means run() returned without raising.
    print(f"   No memory crashes detected - robust pipeline!")

print(f"\nüéâ Scientific study completed successfully!")

## 5. Comprehensive Results Analysis

In [None]:
# Build the per-strategy results table for the scientific study and compare
# every strategy with the random baseline.
#
# NOTE(review): `scientific_results` is subscripted like a dict here, while the
# run cell reads attributes - confirm the result object supports both.
report = scientific_results['report']
results = scientific_results['results']

# One row per strategy that recorded a loss curve.
# NOTE(review): 'Convergence Step' is simply the length of the loss curve, not
# a detected convergence point - confirm the intended meaning.
all_metrics = [
    {
        'Strategy': strategy,
        'Final Loss': data['losses'][-1],
        'Final Accuracy': data['accuracies'][-1],
        'Convergence Step': len(data['losses']),
        'Min Loss': min(data['losses']),
        'Max Accuracy': max(data['accuracies']),
    }
    for strategy, data in results.items()
    if data and 'losses' in data
]

# Lowest final loss first; later cells rely on this ordering.
results_df = pd.DataFrame(all_metrics).sort_values('Final Loss')

print("Scientific Study Results:")
print(results_df.to_string(index=False))

# ---- Statistical analysis -------------------------------------------------
section_rule = "=" * 60
print(f"\n{section_rule}")
print("STATISTICAL ANALYSIS")
print(section_rule)

# Row of the random baseline (raises IndexError when it is absent).
random_metrics = results_df[results_df['Strategy'] == 'random'].iloc[0]
baseline_loss = random_metrics['Final Loss']
baseline_accuracy = random_metrics['Final Accuracy']
print("\nRandom Baseline:")
print(f"  Final Loss: {baseline_loss:.4f}")
print(f"  Final Accuracy: {baseline_accuracy:.4f}")

# Relative change in percent: positive means lower loss / higher accuracy
# than the baseline.
print("\nImprovements over Random:")
curriculum_rows = results_df[results_df['Strategy'] != 'random']
for _, row in curriculum_rows.iterrows():
    loss_improvement = (baseline_loss - row['Final Loss']) / baseline_loss * 100
    acc_improvement = (row['Final Accuracy'] - baseline_accuracy) / baseline_accuracy * 100
    print(f"  {row['Strategy']:30s} Loss: {loss_improvement:+6.1f}%  Accuracy: {acc_improvement:+6.1f}%")

In [None]:
# Comprehensive visualizations: a 3x3 grid with learning curves (top row),
# final loss / curve length / category averages (middle row) and the area
# under each loss curve (bottom row).
fig = plt.figure(figsize=(20, 15))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Learning curves (all strategies)
ax1 = fig.add_subplot(gs[0, :])
for strategy, data in results.items():
    if data and 'losses' in data:
        steps = data['steps'][::100]  # Sample every 100 steps
        losses = data['losses'][::100]
        ax1.plot(steps, losses, label=strategy, linewidth=2, alpha=0.8)

ax1.set_xlabel('Training Steps')
ax1.set_ylabel('Loss')
ax1.set_title('Learning Curves - All Strategies', fontsize=16)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.grid(True, alpha=0.3)

# 2. Final performance comparison
ax2 = fig.add_subplot(gs[1, 0])
ax2.barh(results_df['Strategy'], results_df['Final Loss'])
ax2.set_xlabel('Final Loss')
ax2.set_title('Final Loss by Strategy')
ax2.invert_yaxis()  # keep the first (best) row of results_df at the top

# 3. Convergence speed
ax3 = fig.add_subplot(gs[1, 1])
ax3.barh(results_df['Strategy'], results_df['Convergence Step'])
ax3.set_xlabel('Convergence Step')
ax3.set_title('Convergence Speed')
ax3.invert_yaxis()

# 4. Strategy categories comparison
ax4 = fig.add_subplot(gs[1, 2])
strategy_categories = {
    'Random': ['random'],
    'Reading-based': ['reading_level_easy_to_hard', 'reading_level_hard_to_easy', 'reading_level_staged'],
    'Topic-based': ['topic_sequential', 'topic_interleaved', 'topic_largest_first'],
    'Hybrid': ['hybrid_reading_topic', 'hybrid_topic_reading'],
    'Epoch-interleaving': ['reading_topic_by_epoch', 'reading_levels_by_epoch', 'all_strategies_by_epoch']
}

# Mean final loss per category; categories without any result are skipped.
# The loop variable is NOT called `strategies`: that name is a notebook-level
# list used by the pilot cells and was being overwritten here.
category_performance = {}
for category, category_members in strategy_categories.items():
    category_losses = results_df[results_df['Strategy'].isin(category_members)]['Final Loss'].values
    if len(category_losses) > 0:
        category_performance[category] = category_losses.mean()

# Materialise the dict views so matplotlib receives plain sequences.
ax4.bar(list(category_performance.keys()), list(category_performance.values()))
ax4.set_xlabel('Strategy Category')
ax4.set_ylabel('Average Final Loss')
ax4.set_title('Performance by Category')
ax4.tick_params(axis='x', rotation=45)

# 5. Learning efficiency (area under curve)
ax5 = fig.add_subplot(gs[2, :])
# NumPy 2 renamed `trapz` to `trapezoid` and deprecated the old name; prefer
# the new one and fall back for older NumPy versions.
integrate_trapezoid = getattr(np, 'trapezoid', None) or np.trapz
learning_efficiency = []
for strategy, data in results.items():
    if data and 'losses' in data:
        # Calculate area under the loss curve (lower is better)
        auc = integrate_trapezoid(data['losses'], data['steps'])
        learning_efficiency.append({'Strategy': strategy, 'AUC': auc})

efficiency_df = pd.DataFrame(learning_efficiency).sort_values('AUC')
ax5.barh(efficiency_df['Strategy'], efficiency_df['AUC'])
ax5.set_xlabel('Area Under Loss Curve')
ax5.set_title('Learning Efficiency (Lower is Better)')
ax5.invert_yaxis()

plt.tight_layout()
plt.show()

## 6. Key Findings and Recommendations

In [None]:
# Final text report for the scientific study. It relies on `results_df` being
# sorted by ascending final loss, so the first row is the best strategy.
report_rule = "=" * 60
print(report_rule)
print("CURRICULUM LEARNING EXPERIMENT - FINAL REPORT")
print(report_rule)

# ---- Leaderboard: the five lowest final losses ----------------------------
print("\nüìä TOP PERFORMING STRATEGIES:")
for rank, (_, row) in enumerate(results_df.head(5).iterrows(), start=1):
    print(f"  {rank}. {row['Strategy']:30s} Loss: {row['Final Loss']:.4f}  Acc: {row['Final Accuracy']:.4f}")

# ---- Pairwise / group comparisons on final loss ---------------------------
print("\nüí° KEY INSIGHTS:")

# Direction of the reading-level curriculum (only when both variants ran).
strategy_column = results_df['Strategy']
easy_to_hard = results_df.loc[strategy_column == 'reading_level_easy_to_hard', 'Final Loss'].values
hard_to_easy = results_df.loc[strategy_column == 'reading_level_hard_to_easy', 'Final Loss'].values

if len(easy_to_hard) > 0 and len(hard_to_easy) > 0:
    # A tie is reported in favour of hard-to-easy.
    direction_message = (
        "  ‚úì Easy-to-hard curriculum outperforms hard-to-easy"
        if easy_to_hard[0] < hard_to_easy[0]
        else "  ‚úì Hard-to-easy curriculum outperforms easy-to-hard"
    )
    print(direction_message)

# Hybrids vs. two representative single-factor strategies (mean final loss).
# Nothing is printed when the hybrids do not win or either group is empty.
single_factor_names = ['reading_level_easy_to_hard', 'topic_sequential']
hybrid_avg = results_df.loc[strategy_column.str.contains('hybrid'), 'Final Loss'].mean()
single_avg = results_df.loc[strategy_column.isin(single_factor_names), 'Final Loss'].mean()

if hybrid_avg < single_avg:
    print("  ‚úì Hybrid strategies outperform single-factor approaches")

# Best epoch-interleaving strategy: first matching row of the sorted table.
epoch_strategies = results_df[strategy_column.str.contains('by_epoch')]
if not epoch_strategies.empty:
    best_epoch = epoch_strategies.iloc[0]
    print(f"  ‚úì Best epoch-interleaving: {best_epoch['Strategy']} (Loss: {best_epoch['Final Loss']:.4f})")

# ---- Recommendations ------------------------------------------------------
print("\nüéØ RECOMMENDATIONS:")
top_row = results_df.iloc[0]
best_strategy = top_row['Strategy']
# Relative loss reduction of the best strategy vs. the random baseline, in %.
improvement = (random_metrics['Final Loss'] - top_row['Final Loss']) / random_metrics['Final Loss'] * 100

print(f"  1. Use '{best_strategy}' for {improvement:.1f}% improvement over random")
print("  2. Consider ensemble of top 3 strategies for robustness")
print("  3. Topic modeling with BERTopic successfully scaled to 1M samples")

# Dataset insights: shown only when the report text mentions 'outlier'.
if 'outlier' in str(report):
    print("  4. Monitor outlier topic performance in production")

print("\n" + report_rule)

## 7. Export Results for Paper

In [None]:
# Save results for paper: configurations, result tables and experiment IDs are
# written to curriculum_learning_results.json in the working directory.
import json


def _experiment_id(results):
    """Return the experiment ID of a results object, or "N/A" if unavailable.

    Tries the `experiment_summary` attribute first, where the ID may be nested
    under an inner 'experiment_summary' key or sit at the top level. Falls back
    to dict-style `results['experiment_id']`. Never raises for a missing ID.
    """
    summary = getattr(results, 'experiment_summary', None)
    if summary:
        if 'experiment_summary' in summary:
            return summary['experiment_summary'].get('experiment_id', 'N/A')
        if 'experiment_id' in summary:
            return summary['experiment_id']
    try:
        return results['experiment_id']
    except (TypeError, KeyError, IndexError):
        return "N/A"


def _json_fallback(value):
    """Convert values `json` cannot encode: NumPy scalars become plain Python
    scalars, anything else is stored as its string form."""
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


paper_results = {
    'pilot_study': {
        'config': pilot_config.to_dict(),
        'results': pilot_df.to_dict('records')
    },
    'scientific_study': {
        'config': scientific_config.to_dict(),
        'results': results_df.to_dict('records'),
        'best_strategy': best_strategy,
        'improvement_over_random': improvement
    },
    # Looked up defensively: the unified result objects are read through
    # attributes elsewhere in this notebook, so plain subscripting could fail.
    'experiment_ids': {
        'debug': _experiment_id(debug_results),
        'pilot': _experiment_id(pilot_results),
        'scientific': _experiment_id(scientific_results)
    }
}

with open('curriculum_learning_results.json', 'w') as f:
    json.dump(paper_results, f, indent=2, default=_json_fallback)

print("Results saved to curriculum_learning_results.json")
print("\nüéâ Experiment complete! Check W&B for detailed tracking.")