## Summary

**Evaluation Complete! ✓**

The PINN model has been comprehensively evaluated on the test set:

**Key Results:**
- ✓ Classification accuracy computed
- ✓ Confusion matrix visualized  
- ✓ Parameter errors analyzed (MAE, RMSE, MAPE)
- ✓ Calibration curves generated
- ✓ Scatter plots created
- ✓ Performance compared against targets

**Outputs Generated:**
- Comprehensive evaluation metrics
- Multiple publication-quality visualizations
- Error analysis by dark matter type
- Best/worst prediction examples

**Next Steps:**
1. If the targets are met: deploy the model for inference
2. If not: apply enhancements (data augmentation, hyperparameter tuning, more training data)
3. Add ML unit tests for robustness
4. Integrate TensorBoard for training monitoring
5. Document results for publication

In [None]:
# Compare the achieved test-set metrics against the project targets and print
# a pass/fail table, a per-class and per-parameter breakdown, and a verdict.
# Assumes the accuracy and MAPE values in `results['metrics']` are already
# expressed in percent (they are compared to 90.0 / 5.0) — TODO confirm
# against evaluate_model.
metrics = results['metrics']
classification_acc = metrics['classification']['overall_accuracy']
avg_mape = metrics['parameters']['overall_mape']

# Targets. Both comparisons are inclusive: accuracy >= 90, MAPE <= 5.
target_acc = 90.0
target_error = 5.0

# Evaluate each target exactly once so the table and the final verdict
# cannot disagree. A NaN metric compares False and is treated as not met.
acc_ok = bool(classification_acc >= target_acc)
mape_ok = bool(avg_mape <= target_error)
status_acc = "✓ PASS" if acc_ok else "✗ FAIL"
status_error = "✓ PASS" if mape_ok else "✗ FAIL"

row_fmt = "{:<40s} {:<15s} {:<15s} {:<10s}"

print("\n" + "="*70)
print(" "*15 + "PERFORMANCE vs TARGET METRICS")
print("="*70)
print("\n" + row_fmt.format('Metric', 'Target', 'Achieved', 'Status'))
print("-"*70)

# Classification accuracy
print(row_fmt.format(
    'Classification Accuracy (%)',
    f'>= {target_acc:.1f}',
    f'{classification_acc:.2f}',
    status_acc,
))

# Parameter error. The target label is "<=" to match the inclusive check
# above (it used to read "<", which contradicted a PASS at exactly 5.0).
print(row_fmt.format(
    'Parameter Error (MAPE %)',
    f'<= {target_error:.1f}',
    f'{avg_mape:.2f}',
    status_error,
))

# Per-class accuracy. The container is indexed by class position rather than
# iterated, because it may be a list, an array, or a dict keyed by class index.
print("\nPer-Class Accuracy:")
for dm_type, class_name in enumerate(class_names):
    class_acc = metrics['classification']['per_class_accuracy'][dm_type]
    status = "✓" if class_acc >= target_acc else "✗"
    print(f"  {status} {class_name}: {class_acc:.2f}%")

# Per-parameter MAPE (indexed by parameter position for the same reason)
print("\nPer-Parameter MAPE:")
for i, param_name in enumerate(param_names):
    param_mape = metrics['parameters']['per_parameter_mape'][i]
    status = "✓" if param_mape <= target_error else "✗"
    print(f"  {status} {param_name}: {param_mape:.2f}%")

print("="*70)

# Final summary, driven by the same booleans as the table above
if acc_ok and mape_ok:
    print("\n🎉 SUCCESS! All target metrics achieved!")
    print("   The PINN model is ready for deployment.")
else:
    print("\n⚠ Some targets not met. Consider:")
    if not acc_ok:
        print("   - Increase training data")
        print("   - Adjust classification loss weight")
    if not mape_ok:
        print("   - Increase physics loss weight (λ)")
        print("   - Add more augmentation")

## 10. Performance vs Target Metrics

In [None]:
# Rank test samples by a combined error score and show the 6 best and the
# 6 worst predictions next to their input images.
#
# param_errors is the per-sample absolute error averaged over the parameter
# axis. It is NOT rescaled per parameter here, so if the parameters are stored
# in physical units the largest-magnitude one dominates the score.
# NOTE(review): confirm whether pred_params/true_params are already normalized.
pred_classes = results['predictions']['pred_classes']
class_probs = results['predictions']['class_probs']

param_errors = np.mean(np.abs(pred_params - true_params), axis=1)
class_errors = (pred_classes != true_classes).astype(float)
# A misclassified sample is penalized by twice the mean parameter error.
combined_error = param_errors + class_errors * np.mean(param_errors) * 2

# Sort once; ascending order puts the best samples first and the worst last.
error_order = np.argsort(combined_error)
best_indices = error_order[:6]
worst_indices = error_order[-6:]

# Load original images for visualization.
# Assumes 'test/images' is stored in the same order as the predictions
# (the test loader is built with shuffle=False) — TODO confirm.
import h5py
with h5py.File(DATA_FILE, 'r') as f:
    test_images = f['test/images'][:]


def _plot_prediction_grid(indices, heading, correct_color):
    """Show up to six samples in a 2x3 grid, each titled with its prediction.

    Args:
        indices: Test-set sample indices to display, in display order.
        heading: Figure-level title.
        correct_color: Title color for correctly classified samples;
            misclassified samples are always titled in red.
    """
    fig, axes = plt.subplots(2, 3, figsize=(14, 9))

    for ax, idx in zip(axes.ravel(), indices):
        ax.imshow(test_images[idx], cmap='viridis', origin='lower')

        predicted = pred_classes[idx]
        # Derive the mark from the actual labels: a sample can have a low
        # combined error and still be misclassified.
        is_correct = bool(true_classes[idx] == predicted)
        mark = "✓" if is_correct else "✗"
        confidence = class_probs[idx, predicted] * 100
        mae = param_errors[idx]

        title = f"{mark} {class_names[predicted]} ({confidence:.0f}%) | MAE: {mae:.3f}\n"
        title += f"True: {class_names[true_classes[idx]]} | M_vir: {true_params[idx,0]:.2e}"

        ax.set_title(
            title,
            fontsize=9,
            color=correct_color if is_correct else 'red',
            fontweight='bold',
        )
        ax.axis('off')

    plt.suptitle(heading, fontsize=13, fontweight='bold')
    plt.tight_layout()
    plt.show()


# Plot best predictions
_plot_prediction_grid(best_indices, 'Best Predictions (Lowest Error)', 'green')

# Plot worst predictions (correct class but poor parameters shows in orange)
_plot_prediction_grid(worst_indices, 'Worst Predictions (Highest Error)', 'orange')

## 9. Best and Worst Predictions

In [None]:
# Break the absolute parameter errors down by the true dark-matter class:
# one bar chart per class, followed by a one-line summary per class.
pred_params = results['predictions']['pred_params']
true_params = results['predictions']['true_params']
true_classes = results['predictions']['true_classes']

bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
tick_labels = ['M_vir', 'r_s', 'β_x', 'β_y', 'H₀']
x = np.arange(5)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Keep each class's error matrix and sample count for the printout below.
per_class = []
for dm_type, ax in enumerate(axes):
    mask = true_classes == dm_type

    # Absolute error of every parameter for the samples of this class
    errors = np.abs(pred_params[mask] - true_params[mask])
    per_class.append((errors, mask.sum()))

    # Column-wise mean gives the MAE of each parameter
    ax.bar(x, np.mean(errors, axis=0), color=bar_colors)
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, rotation=45)
    ax.set_ylabel('Mean Absolute Error', fontsize=11)
    ax.set_title(f'{class_names[dm_type]} - Parameter Errors', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Parameter Prediction Errors by Dark Matter Type', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Print statistics: MAE over all parameters and the sample count per class
print("\nError Statistics by Dark Matter Type:")
print("="*70)
for dm_type, (errors, n_samples) in enumerate(per_class):
    mae_overall = np.mean(errors)
    print(f"{class_names[dm_type]:5s} - Overall MAE: {mae_overall:.4f} | Samples: {n_samples}")

## 8. Error Analysis by Dark Matter Type

In [None]:
# Draw the parameter scatter plots with the project helper. Predicted values
# are passed first, then the true values, then the axis labels.
predictions = results['predictions']
fig = plot_parameter_scatter(
    predictions['pred_params'],
    predictions['true_params'],
    param_names,
)
plt.show()

## 7. Parameter Scatter Plots

In [None]:
# Draw the calibration curves with the project helper, from the predicted
# class probabilities and the true class labels.
predictions = results['predictions']
fig = plot_calibration_curve(
    predictions['class_probs'],
    predictions['true_classes'],
    class_names,
)
plt.show()

## 6. Calibration Curves

In [None]:
# Display labels (with units) for the five regressed parameters, in the
# column order used when indexing the per-parameter metrics.
param_names = [
    'M_vir [M☉]',
    'r_s [kpc]',
    'β_x [arcsec]',
    'β_y [arcsec]',
    'H₀ [km/s/Mpc]',
]

# Plot the parameter errors with the project helper, which also receives the
# precomputed parameter metrics.
predictions = results['predictions']
fig = plot_parameter_errors(
    predictions['pred_params'],
    predictions['true_params'],
    param_names,
    results['metrics']['parameters'],
)
plt.show()

## 5. Parameter Prediction Errors

In [None]:
# Class labels, in class-index order.
class_names = ['CDM', 'WDM', 'SIDM']

confusion = results['metrics']['classification']['confusion_matrix']

# Show the confusion matrix twice: raw counts first, then normalized.
for normalize in (False, True):
    fig = plot_confusion_matrix(confusion, class_names, normalize=normalize)
    plt.show()

## 4. Confusion Matrix

In [None]:
# Run the full evaluation on the test split. `results` holds the metrics
# and, because return_predictions=True is requested, the predictions that
# the other cells read.
print("Evaluating model on test set...")
results = evaluate_model(model, test_loader, device, return_predictions=True)

# Banner followed by the project's formatted metric summary
rule = "=" * 70
print("\n".join(["", rule, " " * 20 + "EVALUATION RESULTS", rule]))
print_evaluation_summary(results['metrics'])

## 3. Run Complete Evaluation

In [None]:
# Restore the trained network from its checkpoint and switch it to eval mode.
# NOTE(review): torch.load unpickles the file — only load checkpoints that
# come from a trusted source.
MODEL_PATH = '../models/best_pinn_model.pth'
model = PhysicsInformedNN(input_size=64, dropout_rate=0.2)
checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# The checkpoint stores a zero-based epoch index, hence the +1 for display.
print(f"✓ Model loaded from: {MODEL_PATH}")
print(f"  Trained for {checkpoint['epoch']+1} epochs")
print(f"  Best validation loss: {checkpoint['val_loss']:.4f}")

# Build the test split and an unshuffled, single-process loader over it.
DATA_FILE = '../data/processed/lens_training_data.h5'
test_dataset = LensDataset(DATA_FILE, split='test')
loader_options = dict(batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, **loader_options)

print(f"\n✓ Test dataset loaded: {len(test_dataset)} samples")

## 2. Load Model and Test Data

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import sys

sys.path.append('..')
from src.ml import PhysicsInformedNN, evaluate_model, compute_metrics
from src.ml.evaluate import (
    plot_confusion_matrix, plot_parameter_errors,
    plot_calibration_curve, plot_parameter_scatter,
    print_evaluation_summary
)
from src.ml.generate_dataset import LensDataset

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# Default figure size for every plot in this notebook
plt.rcParams.update({'figure.figsize': (12, 8)})
print("✓ All modules imported")

## 1. Import Libraries

# Phase 5c: Comprehensive Model Evaluation

This notebook performs comprehensive evaluation of the trained PINN model on the test set:

**Evaluation Metrics:**
- Classification: Overall accuracy, per-class accuracy, confusion matrix
- Regression: MAE, RMSE, MAPE for each parameter
- Calibration: Probability calibration curves
- Visualization: Prediction scatter plots, error distributions

**Expected Performance:**
- Classification accuracy: ≥90%
- Parameter error (overall MAPE): ≤5%

Let's see how our physics-informed model performs!