# FiftyOne Audio Embedding Visualization System

This notebook generates AudioMAE embeddings from audio data and visualizes them in FiftyOne with:
- Similarity search
- Geographic visualization (lat/long mapping)
- UMAP/t-SNE embedding space visualization
- Metadata filtering and queries

Designed for ESC-50 proof of concept but extensible to any IQ data (audio, RF signals).

## Section 1: Setup & Configuration

In [1]:
# Cell 1: Imports, environment check, FiftyOne installation verification

import sys
import subprocess
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# Check and install FiftyOne if needed
try:
    import fiftyone as fo
    import fiftyone.brain as fob
    print(f"FiftyOne version: {fo.__version__}")
except ImportError:
    print("Installing FiftyOne...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fiftyone", "-q"])
    import fiftyone as fo
    import fiftyone.brain as fob
    print(f"FiftyOne installed successfully. Version: {fo.__version__}")

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print("\nEnvironment ready!")

FiftyOne version: 1.11.0
Using device: cpu

Environment ready!


In [2]:
!ls

audiomaepp.ipynb  data_loaders.py		research.md
audiomae.py	  embeddings_utils.py		spec.md
checkpoints	  fiftyone_visualization.ipynb	test_improvements.py
CLAUDE.md	  plan.md
data		  __pycache__


In [3]:
# Cell 2: Configuration class with all parameters

class Config:
    """Configuration for FiftyOne embedding visualization pipeline."""
    
    # Model Configuration
    checkpoint_path = Path("checkpoints/encoder_only(4).pt")  # UPDATE THIS!
    device = device
    
    # Data Paths
    data_root = Path("data/ESC-50-master")
    audio_dir = data_root / "audio"
    metadata_csv = data_root / "meta" / "esc50.csv"
    spectrogram_dir = Path("data/imgs/full")  # Use precomputed spectrograms for speed
    
    # Processing Configuration
    batch_size = 4
    num_workers = 0  # Set to 0 for Jupyter notebooks to avoid multiprocessing issues
    use_cache = True
    cache_dir = Path("data/embeddings/esc50_audiomae")
    
    # FiftyOne Configuration
    dataset_name = "esc50_audiomae"
    persistent = True
    embedding_field = "audiomae_embedding"
    
    # Visualization Configuration
    compute_umap = True
    compute_tsne = True
    add_synthetic_geo = True  # Generate synthetic lat/long for PoC
    
    # Model Architecture (must match checkpoint)
    img_size = 256
    patch_size = 16
    embed_dim = 768
    encoder_depth = 12
    encoder_heads = 12
    decoder_embed_dim = 512
    decoder_depth = 8
    decoder_heads = 16
    use_macaron = True
    use_swiglu = True
    use_rope = True

config = Config()

# Display configuration
print("Configuration:")
print(f"  Checkpoint: {config.checkpoint_path}")
print(f"  Data root: {config.data_root}")
print(f"  Cache dir: {config.cache_dir}")
print(f"  FiftyOne dataset: {config.dataset_name}")
print(f"  Batch size: {config.batch_size}")
print(f"  Device: {config.device}")

Configuration:
  Checkpoint: checkpoints/encoder_only(4).pt
  Data root: data/ESC-50-master
  Cache dir: data/embeddings/esc50_audiomae
  FiftyOne dataset: esc50_audiomae
  Batch size: 4
  Device: cpu


In [4]:
# Cell 3: Helper utilities (progress tracking, error handling, file validation)

class ProgressTracker:
    """Track and display pipeline progress."""
    
    def __init__(self):
        self.steps = {}
        
    def start(self, step_name):
        """Mark a step as started."""
        self.steps[step_name] = {"status": "running", "start_time": pd.Timestamp.now()}
        print(f"\n{'='*60}")
        print(f"Starting: {step_name}")
        print(f"{'='*60}")
        
    def complete(self, step_name):
        """Mark a step as completed."""
        if step_name in self.steps:
            elapsed = pd.Timestamp.now() - self.steps[step_name]["start_time"]
            self.steps[step_name]["status"] = "completed"
            self.steps[step_name]["elapsed"] = elapsed
            print(f"\nCompleted: {step_name} (Elapsed: {elapsed.total_seconds():.2f}s)")
            
    def summary(self):
        """Display summary of all steps."""
        print(f"\n{'='*60}")
        print("Pipeline Summary")
        print(f"{'='*60}")
        for step, info in self.steps.items():
            status = info["status"]
            elapsed = info.get("elapsed", "N/A")
            if elapsed != "N/A":
                elapsed = f"{elapsed.total_seconds():.2f}s"
            print(f"  {step}: {status} ({elapsed})")

def validate_paths(config):
    """Validate that all required paths exist."""
    errors = []
    
    if not config.data_root.exists():
        errors.append(f"Data root not found: {config.data_root}")
    
    if not config.audio_dir.exists():
        errors.append(f"Audio directory not found: {config.audio_dir}")
        
    if not config.metadata_csv.exists():
        errors.append(f"Metadata CSV not found: {config.metadata_csv}")
    
    if not config.checkpoint_path.exists():
        errors.append(f"Checkpoint not found: {config.checkpoint_path}")
        errors.append("  Please update config.checkpoint_path to point to your trained model.")
    
    if errors:
        print("\nValidation Errors:")
        for error in errors:
            print(f"  - {error}")
        return False
    
    print("\nPath validation: All paths exist!")
    return True

# Initialize progress tracker
tracker = ProgressTracker()

# Validate paths
if validate_paths(config):
    print("\nReady to proceed!")
else:
    print("\nPlease fix the errors above before continuing.")


Path validation: All paths exist!

Ready to proceed!


## Section 2: Data Loading Pipeline

In [5]:
# Cell 4: Discover audio files, load ESC-50 metadata CSV

tracker.start("Data Discovery")

# Load metadata from CSV
metadata_df = pd.read_csv(config.metadata_csv)
print(f"Loaded metadata: {len(metadata_df)} rows")
print(f"\nMetadata columns: {list(metadata_df.columns)}")
print(f"\nFirst few rows:")
print(metadata_df.head())

# Verify audio files exist
audio_files = sorted(config.audio_dir.glob("*.wav"))
print(f"\nFound {len(audio_files)} audio files")

# Add full file paths to metadata
metadata_df['filepath'] = metadata_df['filename'].apply(lambda x: str(config.audio_dir / x))

# Verify all files exist
missing_files = []
for idx, row in metadata_df.iterrows():
    if not Path(row['filepath']).exists():
        missing_files.append(row['filename'])

if missing_files:
    print(f"\nWarning: {len(missing_files)} files missing from metadata")
    print(f"First few missing: {missing_files[:5]}")
else:
    print(f"\nAll {len(metadata_df)} audio files verified!")

tracker.complete("Data Discovery")


Starting: Data Discovery
Loaded metadata: 2000 rows

Metadata columns: ['filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take']

First few rows:
            filename  fold  target        category  esc10  src_file take
0   1-100032-A-0.wav     1       0             dog   True    100032    A
1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A
2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A
3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B
4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A

Found 2000 audio files

All 2000 audio files verified!

Completed: Data Discovery (Elapsed: 0.51s)


In [6]:
# Cell 5: Enrich metadata, add synthetic lat/long for PoC

tracker.start("Metadata Enrichment")

def generate_synthetic_geo(category, seed=42):
    """Generate synthetic lat/long clustered by category.
    
    This creates geographic patterns where similar sound categories
    are clustered in specific regions for visualization demo.
    """
    np.random.seed(seed + hash(category) % 1000)
    
    # Define category clusters (rough geographic regions)
    category_centers = {
        'Animals': (40.7128, -74.0060),      # New York area
        'Natural soundscapes and water sounds': (47.6062, -122.3321),  # Seattle area
        'Human, non-speech sounds': (51.5074, -0.1278),  # London area
        'Interior/domestic sounds': (35.6762, 139.6503),  # Tokyo area
        'Exterior/urban noises': (34.0522, -118.2437)  # Los Angeles area
    }
    
    # Get center for this category
    center_lat, center_lon = category_centers.get(category, (0.0, 0.0))
    
    # Add random offset (within ~50km radius)
    lat_offset = np.random.normal(0, 0.5)
    lon_offset = np.random.normal(0, 0.5)
    
    return center_lat + lat_offset, center_lon + lon_offset

if config.add_synthetic_geo:
    # Generate synthetic coordinates
    coords = metadata_df['category'].apply(
        lambda cat: generate_synthetic_geo(cat)
    )
    metadata_df['latitude'] = coords.apply(lambda x: x[0])
    metadata_df['longitude'] = coords.apply(lambda x: x[1])
    
    print("Added synthetic geographic coordinates:")
    print(f"  Latitude range: [{metadata_df['latitude'].min():.2f}, {metadata_df['latitude'].max():.2f}]")
    print(f"  Longitude range: [{metadata_df['longitude'].min():.2f}, {metadata_df['longitude'].max():.2f}]")
    print("\nCoordinates clustered by category for meaningful visualization.")
else:
    print("Skipping synthetic geo generation (config.add_synthetic_geo = False)")

tracker.complete("Metadata Enrichment")


Starting: Metadata Enrichment
Added synthetic geographic coordinates:
  Latitude range: [-1.63, 1.58]
  Longitude range: [-1.05, 1.09]

Coordinates clustered by category for meaningful visualization.

Completed: Metadata Enrichment (Elapsed: 0.02s)


In [7]:
# Cell 6: Display dataset statistics

tracker.start("Dataset Statistics")

print("ESC-50 Dataset Statistics:")
print(f"  Total samples: {len(metadata_df)}")
print(f"  Number of classes: {metadata_df['target'].nunique()}")
print(f"  Number of categories: {metadata_df['category'].nunique()}")
print(f"  Number of folds: {metadata_df['fold'].nunique()}")

print("\nCategory distribution:")
category_counts = metadata_df['category'].value_counts()
for category, count in category_counts.items():
    print(f"  {category}: {count} samples")

print("\nClass distribution (top 10):")
class_counts = metadata_df['target'].value_counts().head(10)
for target, count in class_counts.items():
    print(f"  Class {target}: {count} samples")

print("\nFold distribution:")
fold_counts = metadata_df['fold'].value_counts().sort_index()
for fold, count in fold_counts.items():
    print(f"  Fold {fold}: {count} samples")

tracker.complete("Dataset Statistics")


Starting: Dataset Statistics
ESC-50 Dataset Statistics:
  Total samples: 2000
  Number of classes: 50
  Number of categories: 50
  Number of folds: 5

Category distribution:
  dog: 40 samples
  chirping_birds: 40 samples
  vacuum_cleaner: 40 samples
  thunderstorm: 40 samples
  door_wood_knock: 40 samples
  can_opening: 40 samples
  crow: 40 samples
  clapping: 40 samples
  fireworks: 40 samples
  chainsaw: 40 samples
  airplane: 40 samples
  mouse_click: 40 samples
  pouring_water: 40 samples
  train: 40 samples
  sheep: 40 samples
  water_drops: 40 samples
  church_bells: 40 samples
  clock_alarm: 40 samples
  keyboard_typing: 40 samples
  wind: 40 samples
  footsteps: 40 samples
  frog: 40 samples
  cow: 40 samples
  brushing_teeth: 40 samples
  car_horn: 40 samples
  crackling_fire: 40 samples
  helicopter: 40 samples
  drinking_sipping: 40 samples
  rain: 40 samples
  insects: 40 samples
  laughing: 40 samples
  hen: 40 samples
  engine: 40 samples
  breathing: 40 samples
  cryin

## Section 3: Embedding Generation

In [8]:
# Cell 7: Load AudioMAE checkpoint from user path

tracker.start("Model Loading")

# Import AudioMAE model and CheckpointLoader
from audiomae import AudioMAEPlusPlus, Config as AudioMAEConfig
from embeddings_utils import CheckpointLoader

# Create model config matching checkpoint
model_config = AudioMAEConfig()
model_config.img_size = config.img_size
model_config.patch_size = config.patch_size
model_config.embed_dim = config.embed_dim
model_config.encoder_depth = config.encoder_depth
model_config.encoder_heads = config.encoder_heads
model_config.decoder_embed_dim = config.decoder_embed_dim
model_config.decoder_depth = config.decoder_depth
model_config.decoder_heads = config.decoder_heads
model_config.use_macaron = config.use_macaron
model_config.use_swiglu = config.use_swiglu
model_config.use_rope = config.use_rope

# Create model
model = AudioMAEPlusPlus(model_config)
print(f"Created AudioMAE++ model")

# Load checkpoint using CheckpointLoader (handles encoder-only automatically)
try:
    loader = CheckpointLoader(config.checkpoint_path, device=config.device)
    model, checkpoint_info = loader.load(model)
    
    print(f"\nCheckpoint info: {checkpoint_info}")
    print(f"Model loaded successfully to {config.device}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")
    
except Exception as e:
    print(f"Error loading checkpoint: {e}")
    print("\nPlease verify:")
    print(f"  1. Checkpoint path is correct: {config.checkpoint_path}")
    print(f"  2. Model config matches checkpoint (embed_dim={config.embed_dim}, etc.)")
    raise

tracker.complete("Model Loading")


Starting: Model Loading
Created AudioMAE++ model
[OK] Loaded full model checkpoint

Checkpoint info: {'epoch': 'unknown', 'loss': 'unknown', 'has_optimizer': False}
Model loaded successfully to cpu
Model parameters: 221.87M

Completed: Model Loading (Elapsed: 10.05s)


In [9]:
# Cell 8: Define EmbeddingGenerator class

class SpectrogramDataset(Dataset):
    """Dataset for loading precomputed spectrograms."""
    
    def __init__(self, metadata_df, spectrogram_dir):
        self.metadata = metadata_df.reset_index(drop=True)
        self.spectrogram_dir = Path(spectrogram_dir)
        
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, idx):
        row = self.metadata.iloc[idx]
        
        # Get spectrogram filename (same as audio filename but .npy)
        audio_filename = row['filename']
        spec_filename = audio_filename.replace('.wav', '.npy')
        spec_path = self.spectrogram_dir / spec_filename
        
        # Load spectrogram
        try:
            spectrogram = np.load(spec_path)
            spectrogram = torch.from_numpy(spectrogram).float()
        except FileNotFoundError:
            print(f"Warning: Spectrogram not found: {spec_path}")
            # Return zeros if file not found
            spectrogram = torch.zeros(3, 224, 224, dtype=torch.float32)
        
        return spectrogram, idx

class EmbeddingGenerator:
    """Generate embeddings from AudioMAE model with batch processing."""
    
    def __init__(self, model, device, batch_size=32):
        self.model = model
        self.device = device
        self.batch_size = batch_size
        
    def extract_cls_embedding(self, spectrograms):
        """Extract CLS token embedding from AudioMAE encoder.
        
        Args:
            spectrograms: Tensor of shape [B, 3, 224, 224]
            
        Returns:
            embeddings: Tensor of shape [B, 768]
        """
        self.model.eval()
        with torch.no_grad():
            # Forward encoder with NO masking (mask_ratio=0.0)
            latent, _, _ = self.model.forward_encoder(spectrograms, mask_ratio=0.0)
            # CLS token is at position 0: latent[:, 0, :]
            cls_embedding = latent[:, 0, :].cpu().numpy()  # Shape: [B, 768]
        return cls_embedding
    
    def generate_embeddings(self, dataset, desc="Generating embeddings"):
        """Generate embeddings for entire dataset.
        
        Args:
            dataset: PyTorch Dataset returning (spectrogram, index)
            desc: Description for progress bar
            
        Returns:
            embeddings: numpy array of shape [N, 768]
            indices: numpy array of indices
        """
        dataloader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=0,  # Avoid multiprocessing in notebooks
            pin_memory=True if self.device == "cuda" else False
        )
        
        all_embeddings = []
        all_indices = []
        
        for spectrograms, indices in tqdm(dataloader, desc=desc):
            spectrograms = spectrograms.to(self.device)
            
            # Extract embeddings
            embeddings = self.extract_cls_embedding(spectrograms)
            
            all_embeddings.append(embeddings)
            all_indices.append(indices.numpy())
        
        # Concatenate all batches
        embeddings = np.vstack(all_embeddings)
        indices = np.concatenate(all_indices)
        
        return embeddings, indices

print("EmbeddingGenerator class defined successfully!")

EmbeddingGenerator class defined successfully!


In [10]:
# Cell 9: Execute embedding generation on all 2000 samples

tracker.start("Embedding Generation")

# Check if embeddings are cached
cache_file = config.cache_dir / "embeddings.npy"
metadata_cache_file = config.cache_dir / "metadata.json"

if config.use_cache and cache_file.exists():
    print(f"Loading cached embeddings from {cache_file}")
    embeddings = np.load(cache_file)
    print(f"Loaded embeddings shape: {embeddings.shape}")
    
    # Load metadata to verify
    if metadata_cache_file.exists():
        with open(metadata_cache_file, 'r') as f:
            cache_metadata = json.load(f)
        print(f"Cache metadata: {cache_metadata}")
    
else:
    print("Generating embeddings from scratch...")
    
    # Create dataset
    dataset = SpectrogramDataset(metadata_df, config.spectrogram_dir)
    print(f"Created dataset with {len(dataset)} samples")
    
    # Create embedding generator
    generator = EmbeddingGenerator(
        model=model,
        device=config.device,
        batch_size=config.batch_size
    )
    
    # Generate embeddings
    embeddings, indices = generator.generate_embeddings(
        dataset,
        desc="Extracting embeddings"
    )
    
    print(f"\nGenerated embeddings shape: {embeddings.shape}")
    print(f"Expected shape: ({len(metadata_df)}, {config.embed_dim})")
    
    # Verify no NaNs
    nan_count = np.isnan(embeddings).sum()
    if nan_count > 0:
        print(f"\nWarning: Found {nan_count} NaN values in embeddings!")
    else:
        print("\nNo NaN values detected - embeddings are valid!")

# Add embeddings to metadata dataframe for easy access
metadata_df['embedding_idx'] = range(len(metadata_df))

tracker.complete("Embedding Generation")


Starting: Embedding Generation
Loading cached embeddings from data/embeddings/esc50_audiomae/embeddings.npy
Loaded embeddings shape: (2000, 768)
Cache metadata: {'shape': [2000, 768], 'num_samples': 2000, 'embedding_dim': 768, 'checkpoint': 'encoder_only.pt', 'dataset': 'esc50_audiomae', 'timestamp': '2025-12-29T20:10:35.046453'}

Completed: Embedding Generation (Elapsed: 0.13s)


In [11]:
# Cell 10: Cache embeddings as .npy for reuse

if config.use_cache and not cache_file.exists():
    tracker.start("Caching Embeddings")
    
    # Create cache directory
    config.cache_dir.mkdir(parents=True, exist_ok=True)
    
    # Save embeddings
    np.save(cache_file, embeddings)
    print(f"Saved embeddings to {cache_file}")
    print(f"File size: {cache_file.stat().st_size / 1e6:.2f} MB")
    
    # Save metadata
    cache_metadata = {
        "shape": list(embeddings.shape),
        "num_samples": len(metadata_df),
        "embedding_dim": config.embed_dim,
        "checkpoint": str(config.checkpoint_path),
        "dataset": config.dataset_name,
        "timestamp": pd.Timestamp.now().isoformat()
    }
    
    with open(metadata_cache_file, 'w') as f:
        json.dump(cache_metadata, f, indent=2)
    
    print(f"Saved metadata to {metadata_cache_file}")
    
    # Save config for reproducibility
    config_cache_file = config.cache_dir / "config.json"
    config_dict = {
        "img_size": config.img_size,
        "patch_size": config.patch_size,
        "embed_dim": config.embed_dim,
        "encoder_depth": config.encoder_depth,
        "use_macaron": config.use_macaron,
        "use_swiglu": config.use_swiglu,
        "use_rope": config.use_rope,
    }
    
    with open(config_cache_file, 'w') as f:
        json.dump(config_dict, f, indent=2)
    
    print(f"Saved config to {config_cache_file}")
    
    tracker.complete("Caching Embeddings")
    
else:
    print("Embeddings already cached or caching disabled.")

Embeddings already cached or caching disabled.


## Section 4: FiftyOne Integration

In [12]:
# Cell 11: Create FiftyOne dataset, add samples with file paths

tracker.start("FiftyOne Dataset Creation")

# Delete existing dataset if it exists
if config.dataset_name in fo.list_datasets():
    print(f"Deleting existing dataset: {config.dataset_name}")
    fo.delete_dataset(config.dataset_name)

# Create new dataset
dataset = fo.Dataset(
    name=config.dataset_name,
    persistent=config.persistent
)

print(f"Created FiftyOne dataset: {config.dataset_name}")
print(f"Persistent: {config.persistent}")

# Add samples to dataset
print(f"\nAdding {len(metadata_df)} samples to dataset...")
print("Using PNG spectrograms as primary media for visual exploration")

# Define paths for PNG spectrograms
png_dir = Path("data/imgs/pre")

samples = []
missing_pngs = []

for idx, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Creating samples"):
    # Get PNG spectrogram path (primary media)
    audio_filename = row['filename']
    png_filename = audio_filename.replace('.wav', '.png')
    png_path = png_dir / png_filename
    
    # Check if PNG exists
    if not png_path.exists():
        missing_pngs.append(png_filename)
        continue
    
    # Create sample with PNG as primary filepath
    sample = fo.Sample(filepath=str(png_path))
    
    # Add audio filepath as separate field for reference
    sample["audio_filepath"] = row['filepath']
    
    samples.append(sample)

dataset.add_samples(samples)

print(f"\nDataset created with {len(dataset)} samples")
if missing_pngs:
    print(f"Warning: {len(missing_pngs)} PNG files missing (samples skipped)")
    print(f"First few missing: {missing_pngs[:5]}")

print(f"\nPrimary media: PNG spectrograms from {png_dir}")
print(f"Audio files accessible via 'audio_filepath' field")

tracker.complete("FiftyOne Dataset Creation")


Starting: FiftyOne Dataset Creation


Deleting existing dataset: esc50_audiomae
Created FiftyOne dataset: esc50_audiomae
Persistent: True

Adding 2000 samples to dataset...
Using PNG spectrograms as primary media for visual exploration


Creating samples: 100%|██████████| 2000/2000 [00:00<00:00, 4280.11it/s]


 100% |███████████████| 2000/2000 [410.2ms elapsed, 0s remaining, 4.9K samples/s]     

Dataset created with 2000 samples

Primary media: PNG spectrograms from data/imgs/pre
Audio files accessible via 'audio_filepath' field

Completed: FiftyOne Dataset Creation (Elapsed: 9.50s)


In [13]:
# Cell 12: Add metadata fields (labels, fold, category, GeoLocation)

tracker.start("Adding Metadata Fields")

print("Adding metadata fields to FiftyOne samples...")

for sample, (idx, row) in tqdm(
    zip(dataset, metadata_df.iterrows()),
    total=len(dataset),
    desc="Adding metadata"
):
    # Add label (class)
    sample["label"] = fo.Classification(label=str(row['target']))
    
    # Add category
    sample["category"] = row['category']
    
    # Add fold (for cross-validation)
    sample["fold"] = int(row['fold'])
    
    # Add ESC-10 indicator
    sample["esc10"] = bool(row['esc10'])
    
    # Add filename
    sample["filename"] = row['filename']
    
    # Add geographic location (if available)
    if config.add_synthetic_geo and 'latitude' in row and 'longitude' in row:
        # Note: FiftyOne expects [longitude, latitude] order!
        sample["location"] = fo.GeoLocation(
            point=[row['longitude'], row['latitude']]
        )
    
    sample.save()

print(f"\nAdded metadata to all {len(dataset)} samples")
print(f"\nDataset fields: {list(dataset.get_field_schema().keys())}")

tracker.complete("Adding Metadata Fields")


Starting: Adding Metadata Fields
Adding metadata fields to FiftyOne samples...


Adding metadata: 100%|██████████| 2000/2000 [00:04<00:00, 468.09it/s]


Added metadata to all 2000 samples

Dataset fields: ['id', 'filepath', 'tags', 'metadata', 'created_at', 'last_modified_at', 'audio_filepath', 'label', 'category', 'fold', 'esc10', 'filename', 'location']

Completed: Adding Metadata Fields (Elapsed: 4.28s)





In [14]:
# Cell 13: Attach 768-dim embeddings to each sample

tracker.start("Attaching Embeddings")

print(f"Attaching {config.embed_dim}-dimensional embeddings...")

for sample, embedding in tqdm(
    zip(dataset, embeddings),
    total=len(dataset),
    desc="Attaching embeddings"
):
    # Convert numpy array to list for FiftyOne storage
    sample[config.embedding_field] = embedding.tolist()
    sample.save()

print(f"\nAttached embeddings to all {len(dataset)} samples")
print(f"Embedding field name: {config.embedding_field}")
print(f"Embedding dimension: {len(embeddings[0])}")

# Verify embeddings
sample = dataset.first()
print(f"\nExample embedding shape: {len(sample[config.embedding_field])}")
print(f"First 10 values: {sample[config.embedding_field][:10]}")

tracker.complete("Attaching Embeddings")


Starting: Attaching Embeddings
Attaching 768-dimensional embeddings...


Attaching embeddings: 100%|██████████| 2000/2000 [00:09<00:00, 215.46it/s]



Attached embeddings to all 2000 samples
Embedding field name: audiomae_embedding
Embedding dimension: 768

Example embedding shape: 768
First 10 values: [-0.08253182470798492, 0.7688832879066467, -0.1343298703432083, -0.589703381061554, 0.7069021463394165, -0.6623245477676392, -0.3486599624156952, -0.19210349023342133, -0.336894154548645, 0.714085042476654]

Completed: Attaching Embeddings (Elapsed: 9.30s)


In [15]:
# Cell 14: Compute similarity index

tracker.start("Computing Similarity Index")

print("Computing similarity index for nearest neighbor search...")

# Compute similarity index using sklearn backend
fob.compute_similarity(
    dataset,
    embeddings=config.embedding_field,
    brain_key="audiomae_similarity",
    backend="sklearn",  # Fast, in-memory backend
)

print(f"\nSimilarity index computed successfully!")
print(f"Brain key: audiomae_similarity")
print(f"Backend: sklearn")

# Test similarity search
print("\nTesting similarity search...")
query_sample = dataset.first()
similar_view = dataset.sort_by_similarity(
    query_sample.id,  # Use sample ID, not the sample object
    k=10,
    brain_key="audiomae_similarity"
)

print(f"Found {len(similar_view)} similar samples")
print("\nTop 5 most similar samples:")
for i, sample in enumerate(similar_view[:5]):
    print(f"  {i+1}. {sample.filename} (Category: {sample.category})")

tracker.complete("Computing Similarity Index")


Starting: Computing Similarity Index
Computing similarity index for nearest neighbor search...

Similarity index computed successfully!
Brain key: audiomae_similarity
Backend: sklearn

Testing similarity search...
Found 10 similar samples

Top 5 most similar samples:
  1. 1-100032-A-0.png (Category: dog)
  2. 3-148932-A-34.png (Category: can_opening)
  3. 5-252248-A-34.png (Category: can_opening)
  4. 5-204604-A-24.png (Category: coughing)
  5. 3-147342-A-34.png (Category: can_opening)

Completed: Computing Similarity Index (Elapsed: 1.26s)


In [16]:
# Cell 15: Compute UMAP and t-SNE visualizations

tracker.start("Computing Visualizations")

# UMAP visualization (recommended - faster and better preservation)
if config.compute_umap:
    print("Computing UMAP visualization...")
    fob.compute_visualization(
        dataset,
        embeddings=config.embedding_field,
        method="umap",
        num_dims=2,
        brain_key="umap_viz",
        n_neighbors=15,
        min_dist=0.1,
        seed=42
    )
    print("UMAP visualization computed successfully!")

# t-SNE visualization (alternative - slower, good for local structure)
if config.compute_tsne:
    print("\nComputing t-SNE visualization...")
    print("(This may take a few minutes for 2000 samples)")
    fob.compute_visualization(
        dataset,
        embeddings=config.embedding_field,
        method="tsne",
        num_dims=2,
        brain_key="tsne_viz",
        perplexity=100,
        seed=42
    )
    print("t-SNE visualization computed successfully!")

print("\nAll visualizations ready!")

tracker.complete("Computing Visualizations")


Starting: Computing Visualizations
Computing UMAP visualization...
Generating visualization...
UMAP(n_jobs=1, random_state=42, verbose=True)
Fri Jan  2 16:05:14 2026 Construct fuzzy simplicial set
Fri Jan  2 16:05:18 2026 Finding Nearest Neighbors
Fri Jan  2 16:05:24 2026 Finished Nearest Neighbor Search
Fri Jan  2 16:05:27 2026 Construct embedding


Epochs completed:   5%| ▌          26/500 [00:00]

	completed  0  /  500 epochs


Epochs completed:  18%| █▊         90/500 [00:01]

	completed  50  /  500 epochs
	completed  100  /  500 epochs


Epochs completed:  37%| ███▋       184/500 [00:01]

	completed  150  /  500 epochs


Epochs completed:  45%| ████▌      226/500 [00:01]

	completed  200  /  500 epochs


Epochs completed:  58%| █████▊     288/500 [00:02]

	completed  250  /  500 epochs


Epochs completed:  66%| ██████▌    330/500 [00:02]

	completed  300  /  500 epochs


Epochs completed:  75%| ███████▍   373/500 [00:02]

	completed  350  /  500 epochs


Epochs completed:  88%| ████████▊  438/500 [00:02]

	completed  400  /  500 epochs


Epochs completed:  96%| █████████▌ 478/500 [00:03]

	completed  450  /  500 epochs


Epochs completed: 100%| ██████████ 500/500 [00:03]


Fri Jan  2 16:05:30 2026 Finished embedding
UMAP visualization computed successfully!

Computing t-SNE visualization...
(This may take a few minutes for 2000 samples)
Generating visualization...
[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.001s...
[t-SNE] Computed neighbors for 2000 samples in 0.242s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 0.893845
[t-SNE] Computed conditional probabilities in 0.158s
[t-SNE] Iteration 50: error = 58.0708084, gradient norm = 0.0056193 (50 iterations in 0.683s)
[t-SNE] Iteration 100: error = 57.5564270, gradient norm = 0.0002185 (50 iterations in 0.505s)
[t-SNE] Iteration 150: error = 57.5538216, gradient norm = 0.0024789 (50 iterations in 0.664s)
[t-SNE] Iteration 200: error = 57.5374451, gradient norm = 0.0005048 (50 iterations in 0.509s)
[t-SNE] Iteration 250: error = 57.5514221, gradient norm = 0.0001396 (

In [17]:
# Cell 15.5: Verify Mapbox Configuration

import json
from pathlib import Path

# Check FiftyOne app config for Mapbox token
app_config_path = Path.home() / ".fiftyone" / "app_config.json"

if app_config_path.exists():
    with open(app_config_path, 'r') as f:
        app_config = json.load(f)
    
    mapbox_token = app_config.get("plugins", {}).get("map", {}).get("mapboxAccessToken")
    
    if mapbox_token:
        print("✓ Mapbox token configured in FiftyOne app_config.json")
        print(f"  Token: {mapbox_token[:20]}...")
        print(f"  Config file: {app_config_path}")
        print("\n✓ Map view should be available in FiftyOne App!")
    else:
        print("❌ Mapbox token not found in app_config.json")
        print(f"\nTo enable map visualization:")
        print(f"  1. Get a free token: https://account.mapbox.com/access-tokens/")
        print(f"  2. Add to {app_config_path}")
        print(f'     {{"plugins": {{"map": {{"mapboxAccessToken": "YOUR_TOKEN"}}}}}}')
else:
    print(f"⚠ FiftyOne app config not found at {app_config_path}")
    print("\nConfig will be created automatically when needed.")
    print("To manually configure Mapbox, see:")
    print("https://docs.voxel51.com/user_guide/config.html#configuring-a-mapbox-token")

✓ Mapbox token configured in FiftyOne app_config.json
  Token: pk.eyJ1IjoibG91aXNhY...
  Config file: /home/louis/.fiftyone/app_config.json

✓ Map view should be available in FiftyOne App!


In [18]:
# Cell 16: Launch FiftyOne App, display dataset

tracker.start("Launching FiftyOne App")

print("Launching FiftyOne App...")
print("\nThe app will open in your browser.")
print("\nAvailable features:")
print("  - Browse audio samples with metadata")
print("  - Search by similarity (select a sample, then 'Sort by similarity')")
print("  - View geographic distribution (map view)")
print("  - Explore UMAP/t-SNE embeddings (Embeddings panel)")
print("  - Filter by category, fold, esc10, etc.")
print("\nPress Ctrl+C in the terminal to stop the app when done.")

# Launch the app (auto=False to avoid notebook display issues)
session = fo.launch_app(dataset, auto=False)

print(f"\nDataset: {config.dataset_name}")
print(f"Total samples: {len(dataset)}")
print(f"\nSession active at: http://localhost:{session.server_port}")
print(f"Open this URL in your browser to view the FiftyOne App")

tracker.complete("Launching FiftyOne App")


Starting: Launching FiftyOne App
Launching FiftyOne App...

The app will open in your browser.

Available features:
  - Browse audio samples with metadata
  - Search by similarity (select a sample, then 'Sort by similarity')
  - View geographic distribution (map view)
  - Explore UMAP/t-SNE embeddings (Embeddings panel)
  - Filter by category, fold, esc10, etc.

Press Ctrl+C in the terminal to stop the app when done.

Could not connect session, trying again in 10 seconds

Session launched. Run `session.show()` to open the App in a cell output.

Dataset: esc50_audiomae
Total samples: 2000

Session active at: http://localhost:5151
Open this URL in your browser to view the FiftyOne App

Completed: Launching FiftyOne App (Elapsed: 21.35s)


In [19]:
# Cell 17: Demo queries (similarity search, geo filtering, metadata queries)

print("Demo Queries\n" + "="*60)

# 1. Filter by category
print("\n1. Filtering by category: 'Animals'")
animals_view = dataset.match({"category": "Animals"})
print(f"   Found {len(animals_view)} animal sounds")

# 2. Filter by fold (for cross-validation)
print("\n2. Filtering by fold: fold 1")
fold1_view = dataset.match({"fold": 1})
print(f"   Found {len(fold1_view)} samples in fold 1")

# 3. ESC-10 subset
print("\n3. Filtering ESC-10 subset")
esc10_view = dataset.match({"esc10": True})
print(f"   Found {len(esc10_view)} samples in ESC-10 subset")

# 4. Similarity search example
print("\n4. Similarity search: Find sounds similar to first sample")
query_sample = dataset.first()
print(f"   Query sample: {query_sample.filename}")
print(f"   Category: {query_sample.category}")

similar_view = dataset.sort_by_similarity(
    query_sample.id,  # Use sample ID, not the sample object
    k=10,
    brain_key="audiomae_similarity"
)

print(f"\n   Top 5 most similar samples:")
for i, sample in enumerate(similar_view[:5]):
    print(f"     {i+1}. {sample.filename}")
    print(f"        Category: {sample.category}")
    print(f"        Label: {sample.label.label}")

# 5. Combined filter: Animals in fold 1
print("\n5. Combined filter: Animals in fold 1")
combined_view = dataset.match({
    "category": "Animals",
    "fold": 1
})
print(f"   Found {len(combined_view)} samples")

# 6. Geographic filtering example (if synthetic geo was added)
if config.add_synthetic_geo:
    print("\n6. Geographic filtering: Samples near New York")
    # Note: This is just a demo - real geographic queries would use FiftyOne's geo operators
    ny_region = dataset.match({"category": "Animals"})  # Animals cluster near NY
    print(f"   Samples in 'Animals' category (clustered near NY): {len(ny_region)}")

print("\n" + "="*60)
print("Demo queries complete!")
print("\nTry these in the FiftyOne App:")
print("  - Click a sample, then use 'Sort by similarity'")
print("  - Use the Embeddings panel to explore UMAP/t-SNE")
print("  - Use the map view to see geographic distribution")
print("  - Use filters in the sidebar to explore subsets")

Demo Queries

1. Filtering by category: 'Animals'
   Found 0 animal sounds

2. Filtering by fold: fold 1
   Found 400 samples in fold 1

3. Filtering ESC-10 subset
   Found 400 samples in ESC-10 subset

4. Similarity search: Find sounds similar to first sample
   Query sample: 1-100032-A-0.png
   Category: dog

   Top 5 most similar samples:
     1. 1-100032-A-0.png
        Category: dog
        Label: 0
     2. 3-148932-A-34.png
        Category: can_opening
        Label: 34
     3. 5-252248-A-34.png
        Category: can_opening
        Label: 34
     4. 5-204604-A-24.png
        Category: coughing
        Label: 24
     5. 3-147342-A-34.png
        Category: can_opening
        Label: 34

5. Combined filter: Animals in fold 1
   Found 0 samples

6. Geographic filtering: Samples near New York
   Samples in 'Animals' category (clustered near NY): 0

Demo queries complete!

Try these in the FiftyOne App:
  - Click a sample, then use 'Sort by similarity'
  - Use the Embeddings panel to

In [20]:
# Cell 18: Export options (save dataset, export visualizations)

print("Export Options\n" + "="*60)

print("\nThe FiftyOne dataset is already persistent and saved to disk.")
print(f"Dataset name: {config.dataset_name}")

# Show how to reload dataset in future sessions
print("\nTo reload this dataset in a future session:")
print(f"  import fiftyone as fo")
print(f"  dataset = fo.load_dataset('{config.dataset_name}')")
print(f"  session = fo.launch_app(dataset)")

# Export embeddings
print("\nEmbeddings are cached at:")
print(f"  {config.cache_dir / 'embeddings.npy'}")

# Export metadata
print("\nTo export metadata to CSV:")
export_csv = config.cache_dir / "dataset_with_embeddings.csv"
print(f"  metadata_df.to_csv('{export_csv}', index=False)")

# Show dataset info
print("\nDataset Information:")
print(dataset.stats())

# Summary
tracker.summary()

print("\n" + "="*60)
print("FiftyOne Embedding Visualization System Ready!")
print("="*60)
print("\nNext steps:")
print("  1. Explore the dataset in the FiftyOne App")
print("  2. Test similarity search on different audio types")
print("  3. Analyze UMAP/t-SNE clustering patterns")
print("  4. Filter and query by metadata fields")
print("  5. Extend to custom audio datasets using the same pipeline")

Export Options

The FiftyOne dataset is already persistent and saved to disk.
Dataset name: esc50_audiomae

To reload this dataset in a future session:
  import fiftyone as fo
  dataset = fo.load_dataset('esc50_audiomae')
  session = fo.launch_app(dataset)

Embeddings are cached at:
  data/embeddings/esc50_audiomae/embeddings.npy

To export metadata to CSV:
  metadata_df.to_csv('data/embeddings/esc50_audiomae/dataset_with_embeddings.csv', index=False)

Dataset Information:
{'samples_count': 2000, 'samples_bytes': 20932698, 'samples_size': '20.0MB', 'total_bytes': 20932698, 'total_size': '20.0MB'}

Pipeline Summary
  Data Discovery: completed (0.51s)
  Metadata Enrichment: completed (0.02s)
  Dataset Statistics: completed (0.01s)
  Model Loading: completed (10.05s)
  Embedding Generation: completed (0.13s)
  FiftyOne Dataset Creation: completed (9.50s)
  Adding Metadata Fields: completed (4.28s)
  Attaching Embeddings: completed (9.30s)
  Computing Similarity Index: completed (1.26s)
  