## Summary

**Dataset Generation Complete! ✓**

The synthetic training dataset has been successfully generated with:
- ✓ Balanced class distribution (CDM, WDM, SIDM)
- ✓ Realistic noise modeling (Gaussian + Poisson)
- ✓ Diverse parameter ranges
- ✓ Efficient HDF5 storage format
- ✓ Train/validation/test splits

**Next Steps:**
1. Train the PINN model (`phase5b_train_pinn.ipynb`)
2. Evaluate performance (`phase5c_evaluate.ipynb`)

**To scale up:**
- Increase `N_SAMPLES` to 100,000+ for production training
- Add data augmentation (rotation, flipping, brightness)
- Generate multiple datasets with different noise levels

In [None]:
# Summarise the training split: per-class counts, per-parameter statistics,
# and a bar/pie view of the class balance.
if not Path(OUTPUT_FILE).exists():
    print("❌ Dataset file not found. Please generate the dataset first.")
else:
    # Read the full label and parameter arrays; the file is closed again
    # before any reporting or plotting happens.
    with h5py.File(OUTPUT_FILE, 'r') as f:
        train_labels = f['train/labels'][:]
        train_params = f['train/parameters'][:]

    class_names = ['CDM', 'WDM', 'SIDM']
    param_names = ['M_vir', 'r_s', 'beta_x', 'beta_y', 'H0']
    colors = ['blue', 'orange', 'green']

    print("="*70)
    print(" "*20 + "DATASET STATISTICS")
    print("="*70)

    # --- Class distribution -------------------------------------------------
    print("\nCLASS DISTRIBUTION (Training Set):")
    # Labels are compared against the integer codes 0, 1, 2, listed in the
    # same order as class_names.
    counts = [np.sum(train_labels == class_id) for class_id in range(3)]
    n_train = len(train_labels)
    for class_name, class_count in zip(class_names, counts):
        percentage = class_count / n_train * 100
        print(f"  {class_name}: {class_count:,} samples ({percentage:.1f}%)")

    # --- Parameter statistics -----------------------------------------------
    print("\nPARAMETER STATISTICS (Training Set):")
    print(f"\n{'Parameter':<12} {'Mean':<15} {'Std':<15} {'Min':<15} {'Max':<15}")
    print("-" * 72)

    # Column `col` of the parameter table is reported under param_names[col].
    for col, param_name in enumerate(param_names):
        column = train_params[:, col]
        mean, std = column.mean(), column.std()
        low, high = column.min(), column.max()
        print(f"{param_name:<12} {mean:<15.3e} {std:<15.3e} {low:<15.3e} {high:<15.3e}")

    print("\n" + "="*70)

    # --- Plots: absolute counts (left) and proportions (right) ---------------
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

    bars = bar_ax.bar(class_names, counts, color=colors, alpha=0.7, edgecolor='black')
    bar_ax.set_ylabel('Count', fontsize=12)
    bar_ax.set_title('Class Distribution', fontsize=13, fontweight='bold')
    bar_ax.grid(True, alpha=0.3, axis='y')

    # Write each count just above the top edge of its bar, centred horizontally.
    for rect, class_count in zip(bars, counts):
        x_centre = rect.get_x() + rect.get_width()/2.
        bar_ax.text(x_centre, rect.get_height(),
                    f'{class_count:,}',
                    ha='center', va='bottom', fontsize=11, fontweight='bold')

    pie_ax.pie(counts, labels=class_names, autopct='%1.1f%%', colors=colors,
               explode=(0.05, 0.05, 0.05), shadow=True, startangle=90,
               textprops={'fontsize': 12, 'fontweight': 'bold'})
    pie_ax.set_title('Class Proportion', fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()

## 6. Dataset Statistics

Analyze the class distribution and parameter statistics.

In [None]:
# Verify the generated HDF5 file: report its size, per-split array shapes and
# file-level metadata, then preview the first training images in a 3x3 grid.
dataset_path = Path(OUTPUT_FILE)
if dataset_path.exists():
    print(f"✓ HDF5 file found: {OUTPUT_FILE}")
    print(f"  File size: {dataset_path.stat().st_size / 1024**2:.2f} MB\n")

    # Open read-only and inspect
    with h5py.File(OUTPUT_FILE, 'r') as f:
        print("="*70)
        print(" "*20 + "DATASET STRUCTURE")
        print("="*70)

        print("\nSplits available:", list(f.keys()))

        # Only shapes are read here, so no image data is loaded into memory.
        for split in ['train', 'val', 'test']:
            print(f"\n{split.upper()} SET:")
            print(f"  Images shape:     {f[f'{split}/images'].shape}")
            print(f"  Parameters shape: {f[f'{split}/parameters'].shape}")
            print(f"  Labels shape:     {f[f'{split}/labels'].shape}")

        print("\n\nMETADATA:")
        for key, value in f.attrs.items():
            print(f"  {key}: {value}")

        print("\n" + "="*70)

        # Load the first (up to) nine samples, in stored order, for the preview.
        # The slice simply returns fewer rows when the split is smaller.
        train_images = f['train/images'][:9]
        train_params = f['train/parameters'][:9]
        train_labels = f['train/labels'][:9]

    # Visualize samples from dataset
    fig, axes = plt.subplots(3, 3, figsize=(12, 12))
    class_names = ['CDM', 'WDM', 'SIDM']
    n_shown = len(train_images)

    # axes.flat walks the grid row by row, matching axes[i // 3, i % 3].
    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        if i >= n_shown:
            # Fewer than nine samples available: leave the panel blank
            # instead of indexing past the end of the loaded arrays.
            continue

        im = ax.imshow(train_images[i], cmap='viridis', origin='lower')

        label = int(train_labels[i])
        params = train_params[i]

        # Title shows the class name plus parameter columns 0 and 4, which the
        # title format labels as M and H₀.
        title = f"{class_names[label]}\n"
        title += f"M={params[0]:.2e}, H₀={params[4]:.1f}"

        ax.set_title(title, fontsize=9)
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    plt.suptitle('Random Samples from Training Set', fontsize=14, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()

else:
    print(f"❌ File not found: {OUTPUT_FILE}")
    print("   Please run the generation step first.")

## 5. Verify Generated Dataset

Let's open the generated HDF5 file, print the shape of each split and the file metadata, and preview the first nine training images.

In [None]:
# Configuration
N_SAMPLES = 10000  # Start with 10K, increase to 100K for production
OUTPUT_FILE = '../data/processed/lens_training_data.h5'
GRID_SIZE = 64
# Split fractions are defined once so the banner below and the generator call
# cannot drift apart.
TRAIN_SPLIT, VAL_SPLIT, TEST_SPLIT = 0.7, 0.15, 0.15

print("="*70)
print(" "*15 + "DATASET GENERATION CONFIGURATION")
print("="*70)
print(f"Total samples:    {N_SAMPLES:,}")
print(f"Grid size:        {GRID_SIZE}×{GRID_SIZE}")
print(f"Output file:      {OUTPUT_FILE}")
print(f"Train/Val/Test:   {TRAIN_SPLIT:.0%}/{VAL_SPLIT:.0%}/{TEST_SPLIT:.0%}")
print("="*70)

# Confirm before generating. The answer is normalised so that stray whitespace,
# capitalisation or a short "y" does not silently cancel the run.
response = input("\nProceed with generation? (yes/no): ").strip().lower()

if response in ('yes', 'y'):
    print("\n🚀 Starting dataset generation...\n")

    # Make sure the target directory exists before the long-running step.
    # Harmless if generate_training_data also creates it (not visible here).
    Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)

    start_time = time.time()

    # Generate dataset; the fixed seed is passed through to the generator.
    split_info = generate_training_data(
        n_samples=N_SAMPLES,
        output_file=OUTPUT_FILE,
        grid_size=GRID_SIZE,
        train_split=TRAIN_SPLIT,
        val_split=VAL_SPLIT,
        test_split=TEST_SPLIT,
        seed=42
    )

    elapsed_time = time.time() - start_time
    # Guard against a zero reading from a coarse clock on a very fast run.
    samples_per_second = N_SAMPLES / elapsed_time if elapsed_time > 0 else float('inf')

    print("\n✓ Dataset generation complete!")
    print(f"  Total time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
    print(f"  Samples per second: {samples_per_second:.1f}")
    # split_info is indexed by split name and formatted as an integer count.
    print(f"\n  Train samples: {split_info['train']:,}")
    print(f"  Val samples:   {split_info['val']:,}")
    print(f"  Test samples:  {split_info['test']:,}")
else:
    print("\n❌ Generation cancelled")

## 4. Generate Full Dataset

Now let's generate the complete training dataset. 

**Configuration:**
- Start with 10,000 samples (increase to 100,000+ for production)
- 70% training, 15% validation, 15% test split
- Saved to HDF5 format

**Estimated time:**
- 10K samples: ~5-10 minutes
- 100K samples: ~1-2 hours

In [None]:
# Generate 300 noise-free samples, cycling through the DM types
# (100 per type when no draw fails), and histogram each parameter by type.
n_test_samples = 300
dm_names = ['CDM', 'WDM', 'SIDM']
test_params = []
test_labels = []
n_failed = 0
first_error = None

print(f"Generating {n_test_samples} samples for parameter analysis...")
for i in range(n_test_samples):
    if i % 50 == 0:
        print(f"  {i}/{n_test_samples}...")

    dm_type = dm_names[i % len(dm_names)]
    try:
        _, params, label = generate_single_sample(dm_type, grid_size=64, add_noise_flag=False)
    except Exception as exc:
        # Best effort: one failed draw must not abort the batch, but failures
        # are counted and reported rather than silently dropped. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        n_failed += 1
        if first_error is None:
            first_error = exc
    else:
        test_params.append(params)
        test_labels.append(label)

test_params = np.array(test_params)
test_labels = np.array(test_labels)

print(f"✓ Generated {len(test_params)} samples")
if n_failed:
    print(f"⚠ {n_failed} of {n_test_samples} samples failed and were skipped "
          f"(first error: {first_error!r})")

if test_params.size == 0:
    # Nothing to plot; indexing columns of an empty array would raise.
    print("❌ No samples were generated; skipping parameter plots.")
else:
    # Plot parameter distributions: one panel per parameter column.
    param_names = ['M_vir (M☉)', 'r_s (kpc)', 'β_x (arcsec)', 'β_y (arcsec)', 'H₀ (km/s/Mpc)']
    colors = ['blue', 'orange', 'green']

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    for i, param_label in enumerate(param_names):
        ax = axes[i]
        column = test_params[:, i]

        # Overlay one histogram per DM type; labels are the integer codes
        # 0, 1, 2 in the same order as dm_names.
        for label_idx, (color, name) in enumerate(zip(colors, dm_names)):
            mask = test_labels == label_idx
            if np.sum(mask) > 0:
                ax.hist(column[mask], bins=20, alpha=0.6, color=color, label=name, edgecolor='black')

        ax.set_xlabel(param_label, fontsize=11)
        ax.set_ylabel('Count', fontsize=11)
        ax.set_title(f'{param_label}\nRange: [{column.min():.2e}, {column.max():.2e}]', fontsize=10)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)

    # Remove the unused panel(s) of the 2x3 grid.
    for unused_ax in axes[len(param_names):]:
        unused_ax.remove()

    plt.suptitle('Parameter Distributions by Dark Matter Type', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 3. Visualize Parameter Distributions

Let's generate a small batch (300 noise-free samples, cycling through the three dark matter types) to visualize the parameter distributions.

In [None]:
# Generate and display a few noisy sample images for each DM type:
# one grid row per type, one column per sample.
dm_types = ['CDM', 'WDM', 'SIDM']
n_samples_per_type = 3

# The grid shape follows the two settings above. squeeze=False keeps `axes`
# two-dimensional even when either setting is reduced to 1.
fig, axes = plt.subplots(len(dm_types), n_samples_per_type, figsize=(12, 12), squeeze=False)

print("Generating sample images...")
for i, dm_type in enumerate(dm_types):
    for j in range(n_samples_per_type):
        print(f"  {dm_type} sample {j+1}/{n_samples_per_type}...", end=" ")
        ax = axes[i, j]
        try:
            image, params, label = generate_single_sample(dm_type, grid_size=64, add_noise_flag=True)

            im = ax.imshow(image, cmap='viridis', origin='lower')
            # params[0] is shown as the mass, matching the title format below.
            ax.set_title(f'{dm_type} (label={label})\nM={params[0]:.2e} M☉', fontsize=10)
            ax.axis('off')
            plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
            print("✓")
        except Exception as e:
            # Best effort: report the failure and keep filling the grid.
            # The failed panel is blanked instead of left as an empty frame.
            ax.axis('off')
            print(f"✗ Error: {e}")

plt.suptitle('Sample Convergence Maps by Dark Matter Type', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

print("\n✓ Sample visualization complete")

## 2. Visualize Sample Images

Before generating the full dataset, let's visualize a few samples from each dark matter type to verify the data generation pipeline.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import sys
import time
from pathlib import Path

sys.path.append('..')
from src.ml import generate_training_data
from src.ml.generate_dataset import generate_single_sample

# Notebook-wide matplotlib defaults: default figure size and base font size.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

print("✓ All modules imported successfully")

## 1. Import Required Libraries

# Phase 5a: Generate Training Dataset for PINN

This notebook generates a large-scale synthetic dataset for training the Physics-Informed Neural Network (PINN) to:
1. Infer lens parameters (M_vir, r_s, source position, H₀)
2. Classify dark matter model type (CDM, WDM, SIDM)

## Dataset Specification

- **Size**: 10,000 samples (configurable up to 100,000+)
- **Image resolution**: 64×64 pixels
- **Dark matter types**: 33% CDM, 33% WDM, 34% SIDM
- **Noise**: Realistic Gaussian + Poisson noise
- **Split**: 70% train, 15% validation, 15% test
- **Format**: HDF5 for efficient storage and loading