# 📊 Enhanced Catalog Comparison

This notebook performs comprehensive comparison between reference earthquake catalogs and detected events from the QuakeFlow pipeline.

## Features
- **Reference Catalog Download**: Automatic download from USGS, JMA, NCEDC, etc.
- **Event Matching**: Sophisticated spatiotemporal matching algorithms
- **Performance Metrics**: Precision, recall, F1-score, and detection rates
- **Comprehensive Visualization**: Maps, residual plots, and performance charts
- **Detailed Reporting**: HTML reports with interpretation and recommendations

## Required Inputs
- Configuration file (config/config.json)
- Detected events catalog (gamma/gamma_catalog.csv)
- Station information (stations/stations.json)

## Outputs
- Reference catalog (comparison/reference_catalog.csv)
- Standardized detected catalog (comparison/detected_catalog.csv)
- Event matches (comparison/event_matches.csv)
- Performance metrics (comparison/performance_metrics.json)
- Visualization plots (comparison/figures/)
- HTML comparison report (comparison/comparison_report.html)

In [None]:
# Enhanced catalog comparison setup
import os
import sys
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Add common utilities to path
sys.path.append('../../examples')

# Import QuakeFlow enhanced utilities
from common import (
    notebook_setup, 
    notebook_finalize, 
    CatalogComparison,
    ElyraUtils
)

# Startup banner: report that the notebook started, when, and from which directory
startup_banner = (
    "📊 Enhanced Catalog Comparison Notebook Initialized",
    f"Timestamp: {datetime.now().isoformat()}",
    f"Working directory: {os.getcwd()}",
)
for banner_line in startup_banner:
    print(banner_line)

In [None]:
# Bootstrap the notebook: load configuration, detect the runtime, build the comparator
try:
    # notebook_setup() returns (config, workflow, parallel_config)
    config, workflow, parallel_config = notebook_setup()

    region_name = config.region.upper()
    print(f"\n🌍 Region Configuration Loaded: {region_name}")
    geographic_bounds = config.get_geographic_bounds()
    print(f"Geographic Bounds: {geographic_bounds}")
    print(f"Parallel Configuration: {parallel_config}")

    # Ask ElyraUtils which orchestration environment (if any) we are running under
    is_elyra = ElyraUtils.is_elyra_environment()
    is_kubeflow = ElyraUtils.is_kubeflow_environment()

    print(
        "\n🔧 Environment Detection:\n"
        f"  Elyra Environment: {is_elyra}\n"
        f"  Kubeflow Environment: {is_kubeflow}"
    )

    # Comparator object used by the later cells
    comparison = CatalogComparison(config)
    print("\n📊 Catalog Comparison Initialized")
    print(f"  Matching Criteria: {comparison.matching_criteria}")

except Exception as setup_error:
    # Report the failure, then re-raise so the notebook run stops here
    print(f"❌ Setup failed: {setup_error}")
    raise

In [None]:
# Configure comparison parameters from environment variables.
# `os.environ.get(NAME) or default` treats a variable that is set but blank the
# same as an unset one, so an exported-but-empty parameter falls back to its
# default instead of reaching float('') or os.makedirs('').
REFERENCE_SOURCE = (os.environ.get('REFERENCE_SOURCE') or 'usgs').lower()
MIN_MAGNITUDE = float(os.environ.get('MIN_MAGNITUDE') or '2.0')
MAX_MAGNITUDE = float(os.environ.get('MAX_MAGNITUDE') or '9.0')
# None means "not specified"; it is shown as 'Auto' in the summary below
START_TIME = os.environ.get('START_TIME') or None
END_TIME = os.environ.get('END_TIME') or None

# Override matching criteria if specified
TIME_WINDOW = float(os.environ.get('TIME_WINDOW') or '30.0')
DISTANCE_THRESHOLD = float(os.environ.get('DISTANCE_THRESHOLD') or '10.0')
MAG_DIFF_THRESHOLD = float(os.environ.get('MAG_DIFF_THRESHOLD') or '1.0')

# Update comparison criteria in place on the comparator created during setup
comparison.matching_criteria.update({
    'time_window': TIME_WINDOW,
    'distance_threshold': DISTANCE_THRESHOLD,
    'magnitude_diff_threshold': MAG_DIFF_THRESHOLD,
})

print(f"\n⚙️ Comparison Configuration:")
print(f"  Reference Source: {REFERENCE_SOURCE.upper()}")
print(f"  Magnitude Range: {MIN_MAGNITUDE} - {MAX_MAGNITUDE}")
print(f"  Time Range: {START_TIME or 'Auto'} to {END_TIME or 'Auto'}")
print(f"  Updated Matching Criteria: {comparison.matching_criteria}")

# Define output directories (a blank OUTPUT_DIR falls back to 'comparison')
output_base = os.environ.get('OUTPUT_DIR') or 'comparison'
os.makedirs(output_base, exist_ok=True)
os.makedirs(f"{output_base}/figures", exist_ok=True)

print(f"\n📁 Output Directory: {output_base}")

In [None]:
# Download reference catalog from specified source
print(f"\n🌐 Downloading Reference Catalog from {REFERENCE_SOURCE.upper()}...")

try:
    # Fill in whichever end of the time range is missing (None or blank) from the
    # detected catalog, padded by one hour on each side.
    if not START_TIME or not END_TIME:
        detected_catalog_path = 'gamma/gamma_catalog.csv'
        if os.path.exists(detected_catalog_path):
            temp_detected = pd.read_csv(detected_catalog_path)
            if 'time' in temp_detected.columns:
                # Drop missing timestamps first: on an empty or all-missing column
                # min()/max() are NaT, and NaT.isoformat() is the literal 'NaT',
                # which would then be sent as the time range.
                temp_times = pd.to_datetime(temp_detected['time']).dropna()
                if temp_times.empty:
                    print("  Detected catalog has no usable timestamps; time range not auto-detected")
                else:
                    auto_start = temp_times.min() - timedelta(hours=1)
                    auto_end = temp_times.max() + timedelta(hours=1)
                    START_TIME = START_TIME or auto_start.isoformat()
                    END_TIME = END_TIME or auto_end.isoformat()
                    print(f"  Auto-detected time range from detected catalog:")
                    print(f"    Start: {START_TIME}")
                    print(f"    End: {END_TIME}")

    # A blank bound that could not be auto-filled is passed on as None
    START_TIME = START_TIME or None
    END_TIME = END_TIME or None

    # Download reference catalog
    reference_catalog = comparison.download_reference_catalog(
        source=REFERENCE_SOURCE,
        start_time=START_TIME,
        end_time=END_TIME,
        min_magnitude=MIN_MAGNITUDE,
        max_magnitude=MAX_MAGNITUDE
    )

    if len(reference_catalog) == 0:
        print(f"⚠️ Warning: No reference events downloaded from {REFERENCE_SOURCE.upper()}")
        print("   This may be due to:")
        print("   - No events in the specified time/magnitude/location range")
        print("   - Network connectivity issues")
        print("   - Service unavailability")
    else:
        print(f"✅ Successfully downloaded {len(reference_catalog)} reference events")

        # Display summary statistics
        print(f"\n📈 Reference Catalog Summary:")
        print(f"  Time Range: {reference_catalog['time'].min()} to {reference_catalog['time'].max()}")
        if 'magnitude' in reference_catalog.columns and not reference_catalog['magnitude'].isna().all():
            mag_stats = reference_catalog['magnitude'].describe()
            print(f"  Magnitude Range: {mag_stats['min']:.1f} - {mag_stats['max']:.1f} (mean: {mag_stats['mean']:.1f})")
        # Guarded like magnitude: a missing column here would raise inside this
        # try block and the handler below would replace the downloaded catalog
        # with an empty one.
        if 'depth_km' in reference_catalog.columns:
            print(f"  Depth Range: {reference_catalog['depth_km'].min():.1f} - {reference_catalog['depth_km'].max():.1f} km")

        # Save reference catalog
        reference_catalog_path = f"{output_base}/reference_catalog.csv"
        reference_catalog.to_csv(reference_catalog_path, index=False)
        print(f"\n💾 Reference catalog saved: {reference_catalog_path}")

except Exception as e:
    # Best-effort: any failure above falls back to an empty catalog with the
    # column layout listed here, so the following cells can still run.
    print(f"❌ Error downloading reference catalog: {e}")
    print("   Creating empty reference catalog for demonstration")
    reference_catalog = pd.DataFrame(columns=[
        'event_id', 'time', 'latitude', 'longitude', 'depth_km', 
        'magnitude', 'magnitude_type', 'source'
    ])
    
print(f"\n📊 Reference catalog contains {len(reference_catalog)} events")

In [None]:
# Load detected events catalog from GaMMA output
print(f"\n📂 Loading Detected Events Catalog...")

try:
    # Candidate locations, tried in order; the first path that exists is loaded
    possible_paths = [
        'gamma/gamma_catalog.csv',
        'gamma/gamma_catalog_filtered.csv',
        '../gamma/gamma_catalog.csv',
        'events/events_gamma.csv'
    ]
    
    detected_catalog = None
    used_path = None
    
    for path in possible_paths:
        if os.path.exists(path):
            detected_catalog = comparison.load_detected_catalog(path)
            used_path = path
            break
    
    if detected_catalog is None or len(detected_catalog) == 0:
        print(f"⚠️ Warning: No detected events found in standard locations")
        print(f"   Searched paths: {possible_paths}")
        print(f"   Creating mock detected catalog for demonstration")
        
        # Create mock detected catalog for demonstration.
        # NOTE: values are drawn from np.random and no seed is set in this cell,
        # so the mock events differ between runs.
        bounds = config.get_geographic_bounds()
        n_mock = 10
        
        mock_data = {
            'event_id': [f'mock_{i:03d}' for i in range(n_mock)],
            # One event per hour. A timedelta frequency is used instead of the
            # 'H' string alias, which recent pandas releases deprecate.
            'time': pd.date_range(START_TIME or '2024-01-01', periods=n_mock, freq=timedelta(hours=1)),
            'latitude': np.random.uniform(bounds['minlatitude'], bounds['maxlatitude'], n_mock),
            'longitude': np.random.uniform(bounds['minlongitude'], bounds['maxlongitude'], n_mock),
            'depth_km': np.random.uniform(0, 30, n_mock),
            'magnitude': np.random.uniform(2.0, 5.0, n_mock),
            'source': ['DETECTED'] * n_mock
        }
        
        detected_catalog = pd.DataFrame(mock_data)
        used_path = 'mock_catalog'
        
        print(f"   Created {len(detected_catalog)} mock detected events")
    else:
        print(f"✅ Successfully loaded {len(detected_catalog)} detected events from {used_path}")
    
    # Display detected catalog summary
    print(f"\n📈 Detected Catalog Summary:")
    print(f"  Source File: {used_path}")
    print(f"  Event Count: {len(detected_catalog)}")
    
    if len(detected_catalog) > 0:
        print(f"  Time Range: {detected_catalog['time'].min()} to {detected_catalog['time'].max()}")
        if 'magnitude' in detected_catalog.columns and not detected_catalog['magnitude'].isna().all():
            mag_stats = detected_catalog['magnitude'].describe()
            print(f"  Magnitude Range: {mag_stats['min']:.1f} - {mag_stats['max']:.1f} (mean: {mag_stats['mean']:.1f})")
        # Guarded like magnitude so a catalog without depths still gets saved
        if 'depth_km' in detected_catalog.columns:
            print(f"  Depth Range: {detected_catalog['depth_km'].min():.1f} - {detected_catalog['depth_km'].max():.1f} km")
        
        # Save standardized detected catalog
        detected_catalog_path = f"{output_base}/detected_catalog.csv"
        detected_catalog.to_csv(detected_catalog_path, index=False)
        print(f"\n💾 Detected catalog saved: {detected_catalog_path}")

except Exception as e:
    # Unlike the reference download, a failure here is fatal: report and re-raise
    print(f"❌ Error loading detected catalog: {e}")
    raise

In [None]:
# Perform comprehensive catalog comparison
print(f"\n🔍 Performing Catalog Comparison...")

try:
    if len(reference_catalog) == 0 and len(detected_catalog) == 0:
        print("⚠️ Warning: Both reference and detected catalogs are empty")
        print("   Cannot perform meaningful comparison")
        
        metrics_path = f"{output_base}/performance_metrics.json"

        # Create minimal results for pipeline completion
        comparison_results = {
            'summary': {
                # Same key names the success branch below and the finalize
                # cell read from the summary
                'total_reference_events': 0,
                'total_detected_events': 0,
                'matched_events': 0,
                # Earlier key names, kept for any consumer that still reads them
                'total_reference': 0,
                'total_detected': 0,
                'matched_pairs': 0,
                'f1_score': 0.0,
                'precision': 0.0,
                'recall': 0.0
            },
            'output_files': {
                'metrics': metrics_path,
                'report': f"{output_base}/comparison_report.html"
            }
        }
        
        # Save minimal metrics so the metrics artifact exists even without data
        minimal_metrics = {
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0,
            'detection_rate': 0.0,
            'false_alarm_rate': 0.0,
            'note': 'No data available for comparison'
        }
        
        with open(metrics_path, 'w', encoding='utf-8') as f:
            json.dump(minimal_metrics, f, indent=2)
            
        print(f"   Saved minimal metrics to handle empty catalogs")
        
    else:
        # Run full comparison workflow
        comparison_results = comparison.run_full_comparison(
            reference_catalog=reference_catalog,
            detected_catalog=detected_catalog,
            reference_source=REFERENCE_SOURCE,
            output_dir=output_base
        )
        
        summary = comparison_results['summary']

        print(f"\n✅ Catalog Comparison Completed Successfully!")
        print(f"\n📊 Performance Summary:")
        print(f"  F1 Score: {summary['f1_score']:.3f}")
        print(f"  Precision: {summary['precision']:.3f}")
        print(f"  Recall: {summary['recall']:.3f}")
        
        print(f"\n📈 Event Statistics:")
        print(f"  Reference Events: {summary['total_reference_events']}")
        print(f"  Detected Events: {summary['total_detected_events']}")
        print(f"  Matched Events: {summary['matched_events']}")
        
        # Display output files (only those that were actually written)
        print(f"\n📁 Generated Output Files:")
        for file_type, file_path in comparison_results['output_files'].items():
            if file_path and os.path.exists(file_path):
                print(f"  {file_type}: {file_path}")

except Exception as e:
    # Print the full traceback for debugging, then re-raise to stop the run
    print(f"❌ Error during catalog comparison: {e}")
    import traceback
    traceback.print_exc()
    raise

In [None]:
# Generate Kubeflow UI metadata and finalize notebook
print(f"\n🎯 Finalizing Catalog Comparison Results...")

try:
    # Prepare results for finalization
    results = {
        'comparison_completed': True,
        'reference_source': REFERENCE_SOURCE,
        'reference_events_count': len(reference_catalog),
        'detected_events_count': len(detected_catalog),
        'matching_criteria': comparison.matching_criteria
    }
    
    # Add performance metrics if comparison was performed
    if 'summary' in comparison_results:
        summary = comparison_results['summary']
        results.update({
            'matched_events': summary.get('matched_events', 0),
            'f1_score': summary.get('f1_score', 0.0),
            'precision': summary.get('precision', 0.0),
            'recall': summary.get('recall', 0.0)
        })
    
    # Candidate output artifacts: the four core files first, then the optional
    # ones. Only files that exist on disk are reported.
    candidate_artifacts = [
        f"{output_base}/reference_catalog.csv",
        f"{output_base}/detected_catalog.csv", 
        f"{output_base}/performance_metrics.json",
        f"{output_base}/comparison_report.html",
        f"{output_base}/event_matches.csv",
        f"{output_base}/unmatched_reference.csv",
        f"{output_base}/unmatched_detected.csv",
        f"{output_base}/figures/event_locations_comparison.png",
        f"{output_base}/figures/performance_metrics.png",
        f"{output_base}/figures/parameter_residuals.png",
        f"{output_base}/figures/magnitude_performance.png"
    ]
    existing_artifacts = [a for a in candidate_artifacts if os.path.exists(a)]
    
    print(f"Results summary: {results}")
    print(f"Artifacts ({len(existing_artifacts)}): {existing_artifacts}")
    
    # Finalize notebook with enhanced metadata
    metadata = notebook_finalize('catalog_comparison', results, existing_artifacts)
    
    print(f"\n✅ Catalog Comparison Notebook Completed Successfully!")
    
    if 'summary' in comparison_results:
        summary = comparison_results['summary']
        print(f"\n🎊 Final Performance Summary:")
        print(f"   📊 F1 Score: {summary.get('f1_score', 0.0):.3f}")
        print(f"   🎯 Precision: {summary.get('precision', 0.0):.3f}")
        print(f"   📈 Recall: {summary.get('recall', 0.0):.3f}")
        print(f"   📁 Total Artifacts: {len(existing_artifacts)}")
    
    print(f"\n📖 View the detailed comparison report: {output_base}/comparison_report.html")

except Exception as e:
    # Deliberately not re-raised: the fallback below still tries to emit
    # metadata so the pipeline can continue.
    print(f"❌ Error finalizing notebook: {e}")
    import traceback
    traceback.print_exc()
    
    # Ensure we have basic artifacts for pipeline continuation
    basic_results = {'comparison_completed': False, 'error': str(e)}
    basic_artifacts = [f for f in [f"{output_base}/performance_metrics.json"] if os.path.exists(f)]
    
    try:
        metadata = notebook_finalize('catalog_comparison_error', basic_results, basic_artifacts)
        print(f"Generated error metadata for pipeline continuation")
    except Exception as finalize_error:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and report why the fallback failed.
        print(f"Failed to generate error metadata: {finalize_error}")