# CUDA Acceleration in SpecKit

This notebook demonstrates CUDA-accelerated spectral analysis capabilities in SpecKit.

CUDA acceleration provides significant speedups for large-scale spectral analysis problems, particularly when processing many segments in parallel.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from speckit import SpectrumAnalyzer, compute_spectrum
from speckit.core import _CUDA_ENABLED, _NUMBA_ENABLED

# Report which optional accelerated backends speckit says are enabled,
# one line per backend.
print(
    f"CUDA Available: {_CUDA_ENABLED}",
    f"Numba Available: {_NUMBA_ENABLED}",
    sep="\n",
)

## Generate Large Test Dataset

We'll use a large dataset to demonstrate the performance benefits of CUDA acceleration.

In [None]:
# Benchmark input: seeded standard-normal samples, so every rerun of the
# notebook feeds the same data to each backend.
fs = 100.0
N = 10_000_000  # 10 million samples

np.random.seed(42)
x = np.random.standard_normal(N)

# Summarise the dataset; the size figure assumes 8 bytes per sample (float64).
print(
    f"Generated {N:,} samples at {fs} Hz sampling rate",
    f"Data size: {N * 8 / 1e6:.1f} MB (float64)",
    sep="\n",
)

## Compare Backend Performance

Let's compare the performance of different backends (NumPy, Numba, CUDA) on the same dataset.

In [None]:
# Assemble the list of backends to benchmark. The accelerated backends are
# optional, so each is included only when speckit reports it as enabled;
# NumPy is the baseline and is always benchmarked.
backends = []
if _NUMBA_ENABLED:
    backends.append('numba')
if _CUDA_ENABLED:
    backends.append('cuda')
backends.append('numpy')  # Always available

times = {}    # backend name -> wall-clock seconds spent in compute()
results = {}  # backend name -> object returned by compute()

for backend in backends:
    # The leading "\n" must stay an escape sequence: a single-quoted string
    # literal cannot contain a raw line break.
    print(f"\nBenchmarking {backend.upper()} backend...")
    analyzer = SpectrumAnalyzer(
        data=x,
        fs=fs,
        backend=backend,
        Jdes=1000,
        Kdes=100,
        order=1,
        verbose=False,
    )

    # Only compute() is timed; analyzer construction is outside the timed region.
    # NOTE(review): this is a single cold run per backend, so the first call to
    # an accelerated backend may include one-off JIT/initialisation cost --
    # confirm, and consider a warm-up call if steady-state timings are wanted.
    t0 = time.perf_counter()
    result = analyzer.compute()
    elapsed = time.perf_counter() - t0

    times[backend] = elapsed
    results[backend] = result
    print(f"  Completed in {elapsed:.3f} seconds")

## Visualize Speedup

Compare the speedup achieved by each backend relative to NumPy.

In [None]:
# Bar chart of each backend's speedup relative to the NumPy baseline
# (speedup = NumPy time / backend time, so NumPy itself is 1.00x).
fig, ax = plt.subplots(figsize=(8, 5))
backends_list = list(times.keys())
speedups = [times['numpy'] / times[b] for b in backends_list]
colors = ['royalblue' if b == 'cuda' else 'tomato' if b == 'numba' else 'gray' for b in backends_list]

bars = ax.bar(backends_list, speedups, color=colors, alpha=0.7)
ax.set_ylabel('Speedup vs NumPy', fontsize=12)
ax.set_title('CUDA Acceleration Speedup', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar, speedup in zip(bars, speedups):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{speedup:.2f}x',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

# The leading "\n" must stay an escape sequence: a single-quoted string
# literal cannot contain a raw line break.
print("\nSpeedup Summary:")
for backend, speedup in zip(backends_list, speedups):
    print(f"  {backend.upper()}: {speedup:.2f}x faster than NumPy")

## Accuracy Comparison

Verify that the CUDA backend reproduces the Numba (CPU) results to within floating-point tolerance.

In [None]:
# Overlay the ASD estimates of the accelerated backends on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

# Only backends that were actually benchmarked have an entry in `results`.
plotted = [name for name in ('numba', 'cuda') if name in results]
for backend in plotted:
    result = results[backend]
    result.plot(which='asd', ax=ax, label=f'{backend.upper()} backend', alpha=0.7)

ax.set_xlim(0.1, fs/2)
ax.set_ylabel(r'Amplitude Spectral Density (units/$\sqrt{\mathrm{Hz}}$)', fontsize=12)
ax.set_xlabel('Frequency (Hz)', fontsize=12)
if plotted:
    # Avoid matplotlib's "no labelled artists" warning when nothing was drawn.
    ax.legend(fontsize=11)
ax.grid(alpha=0.3)
ax.set_title('Spectrum Comparison: CUDA vs Numba', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Numerical comparison
if 'numba' in results and 'cuda' in results:
    numba_result = results['numba']
    cuda_result = results['cuda']

    numba_f = np.asarray(numba_result.f)
    cuda_f = np.asarray(cuda_result.f)
    numba_asd = np.asarray(numba_result.asd)
    cuda_asd = np.asarray(cuda_result.asd)

    # Element-wise comparison needs equal shapes; np.allclose raises on
    # arrays that cannot be broadcast together, so check this up front.
    same_shape = numba_f.shape == cuda_f.shape and numba_asd.shape == cuda_asd.shape

    # Compare frequencies (should match exactly)
    freq_match = bool(same_shape and np.allclose(numba_f, cuda_f, rtol=1e-12))
    print(f"Frequency grids match: {freq_match}")

    if not same_shape:
        asd_match = False
        print(f"ASD values match: {asd_match}")
        print("Cannot compare element-wise: the backends returned arrays of different shapes")
    else:
        # Compare spectra (should match within numerical precision)
        asd_match = np.allclose(numba_asd, cuda_asd, rtol=1e-9, atol=1e-12)
        print(f"ASD values match: {asd_match}")

        # Report the differences whether or not the tolerance check passed:
        # the numbers are most useful precisely when the backends disagree.
        abs_diff = np.abs(numba_asd - cuda_asd)
        reference = np.abs(numba_asd)
        # Relative difference per frequency bin, with Numba as the reference;
        # bins where the reference is exactly zero are reported as 0.
        rel = np.divide(
            abs_diff,
            reference,
            out=np.zeros(abs_diff.shape, dtype=float),
            where=reference > 0,
        )
        max_diff = np.max(abs_diff)
        rel_diff = np.max(rel)
        print(f"Maximum absolute difference: {max_diff:.2e}")
        print(f"Maximum relative difference: {rel_diff:.2e}")
else:
    print("Skipping numerical comparison: both the Numba and CUDA backends are required.")

## Best Practices and Recommendations

### When to Use CUDA

- **Large datasets**: CUDA excels with datasets > 1 million samples
- **Many segments**: Best performance when K > 1000 segments
- **Batch processing**: Ideal for processing multiple datasets or frequencies
- **Production workloads**: When consistent high performance is needed

### When CPU is Sufficient

- **Small datasets**: For N < 100,000 samples, GPU overhead (host-to-device transfers and kernel launches) may dominate, so the CPU backends are typically just as fast
- **Interactive work**: For exploratory analysis, CPU latency may be acceptable
- **Development/debugging**: Easier to debug CPU code

### Memory Management

- CUDA automatically manages GPU memory
- Large datasets are transferred to GPU only during computation
- Memory is freed after computation completes

### Backend Selection

- Use `backend='auto'` for automatic selection (recommended)
- Explicitly set `backend='cuda'` to force GPU usage
- Set `backend='numpy'` or `backend='numba'` to force CPU usage