<a href="https://colab.research.google.com/github/mercadoerik1031/snn-sound-localization/blob/new_approach/snn_sound_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SNN Sound Localization**



---



In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/... paths used in
# the config cell below can be read and written.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Pip Installs

In [None]:
! pip install snntorch --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [None]:
import pandas as pd
import os
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from snntorch import spikegen
import gc
from concurrent.futures import ThreadPoolExecutor
import time

# Config

In [None]:
# Central settings read by the preprocessing, encoding and loading cells.
config = dict(
    # --- Locations on the mounted Google Drive (Google Colab paths) ---
    metadata_path="/content/drive/My Drive/Colab Notebooks/Masters Project/metadata.parquet",
    ambisonics_path="/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/ambisonics_sample",
    noise_path="/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/noise_ambisonics_sample",
    output_path="/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/preprocessed_samples",

    # --- Preprocessing runtime ---
    device="cuda" if torch.cuda.is_available() else "cpu",  # default device for tensors
    batch_size_pre=32,          # files grouped into one saved preprocessing batch
    sr=16000,                   # sample rate passed to librosa.load
    parallel_processing=True,   # choose the thread-pool path over the sequential loop
    max_workers=22,             # thread-pool size for parallel preprocessing

    # --- Spike encoding ---
    time_based_encoding=True,   # True: time_based_encoding, False: rate_based_encoding
    num_steps=20,               # default num_steps of the encoders
    max_rate=10,                # default max_rate of rate_based_encoding
    noise=True,                 # add the noise recording to the ambisonic recording
)

# Filter Data

In [None]:
def _index_files_by_sample_id(directory):
    """Map the integer sample id encoded in each file name to the file's full path.

    Only regular files whose name up to the first "." consists solely of ASCII
    digits are indexed, so stray entries such as ".DS_Store" are ignored instead
    of being read as sample id 0.
    """
    indexed = {}
    for name in sorted(os.listdir(directory)):
        stem = name.split(".")[0]
        path = os.path.join(directory, name)
        if stem.isascii() and stem.isdigit() and os.path.isfile(path):
            indexed[int(stem)] = path
    return indexed


def filter_data(metadata_path=config["metadata_path"], ambisonics_path=config["ambisonics_path"], noise_path=config["noise_path"]):
    """Load the metadata and pair every row with its audio files.

    The three returned objects are aligned by position: element ``i`` of each
    list belongs to row ``i`` of the returned frame. Downstream code pairs them
    by position, so returning files in ``os.listdir`` order (which is arbitrary)
    would attach labels to the wrong recordings.

    Args:
        metadata_path: Parquet file holding a ``sample_id`` column.
        ambisonics_path: Directory of ambisonic recordings named by sample id.
        noise_path: Directory of noise recordings.

    Returns:
        tuple: ``(filtered_metadata, ambisonic_files, noise_files)`` where
        ``filtered_metadata`` is an independent copy of the rows that have an
        ambisonic file, ``ambisonic_files`` holds the matching full paths, and
        ``noise_files`` holds the matching noise path or ``None`` when a sample
        has no noise file.
    """
    metadata = pd.read_parquet(metadata_path, engine="pyarrow")

    ambisonic_by_id = _index_files_by_sample_id(ambisonics_path)
    noise_by_id = _index_files_by_sample_id(noise_path)

    # .copy() so callers can add columns without pandas' SettingWithCopyWarning.
    has_audio = metadata["sample_id"].isin(list(ambisonic_by_id))
    filtered_metadata = metadata[has_audio].copy()

    # Order the files by the metadata rows so positions line up one-to-one.
    ordered_ids = filtered_metadata["sample_id"].tolist()
    ambisonic_files = [ambisonic_by_id[sample_id] for sample_id in ordered_ids]
    noise_files = [noise_by_id.get(sample_id) for sample_id in ordered_ids]

    if ordered_ids and not any(noise_files):
        # No noise file name matched a sample id, so the noise directory does
        # not follow the sample-id naming. Keep the previous positional pairing,
        # sorted so that it is at least deterministic.
        noise_files = sorted(
            os.path.join(noise_path, name)
            for name in os.listdir(noise_path)
            if os.path.isfile(os.path.join(noise_path, name))
        )

    return filtered_metadata, ambisonic_files, noise_files


# Preprocess Functions

## Normalize

In [None]:
def normalize(audio_data, device=config["device"]):
    """Min-max scale a tensor into the range [0, 1].

    The minimum and maximum are taken over the whole tensor (all dimensions at
    once), not per channel.

    Args:
        audio_data: ``torch.Tensor`` to scale.
        device: Device the tensor is moved to before scaling.

    Returns:
        torch.Tensor: ``(x - min) / (max - min)`` on ``device``. A constant
        input, whose range is zero, yields all zeros instead of NaN.
    """
    audio_data = audio_data.to(device)
    lowest = audio_data.min()
    value_range = audio_data.max() - lowest
    # A constant signal (e.g. pure silence) has zero range; dividing by it
    # would turn every element into NaN. Dividing by 1 instead gives zeros.
    value_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)
    return (audio_data - lowest) / value_range


## Rate Based Encoding

In [None]:
def rate_based_encoding(audio_data, max_rate=config["max_rate"], num_steps=config["num_steps"], device=config["device"]):
    """Encode audio as a rate-coded spike train with ``snntorch.spikegen.rate``.

    The audio is min-max normalised, multiplied by ``max_rate`` and handed to
    ``spikegen.rate``.

    Args:
        audio_data: Tensor or array-like of audio samples.
        max_rate: Factor applied to the normalised signal.
        num_steps: Number of time steps requested from ``spikegen.rate``.
        device: Device on which the encoding is computed.

    Returns:
        The spike train returned by ``spikegen.rate``.

    Raises:
        ValueError: If ``audio_data`` is ``None``.
    """
    if audio_data is None:
        raise ValueError("Input data is None.")

    # Accepts tensors and array-likes alike; converts to float32 on `device`.
    audio_data = torch.as_tensor(audio_data, dtype=torch.float32, device=device)

    # Forward `device`: normalize() would otherwise move the data to its own
    # default device and ignore the one requested by the caller.
    normalized_data = normalize(audio_data, device=device)

    # NOTE(review): snntorch appears to treat these values as per-step spike
    # probabilities and clip them to [0, 1]; with max_rate > 1 most samples
    # would saturate. Confirm that scaling by max_rate is intended.
    spike_rates = normalized_data * max_rate

    return spikegen.rate(spike_rates, num_steps=num_steps)



## Time Based Encoding

In [None]:
def time_based_encoding(audio_data, num_steps=config["num_steps"], device=config["device"], verbose=False):
    """Binarise audio: 1 where the normalised signal exceeds 0.5, else 0.

    Despite the name, no latency coding is applied; the result is a plain
    threshold of the min-max normalised signal and has the same shape as the
    input.

    Args:
        audio_data: Tensor or array-like of audio samples.
        num_steps: Currently unused; kept so existing calls keep working.
        device: Device on which the encoding is computed.
        verbose: When True, print min/max/mean of every entry along the first
            dimension of the normalised signal. Off by default because the
            output is produced for every file and floods the notebook.

    Returns:
        torch.Tensor: Integer tensor of zeros and ones.

    Raises:
        ValueError: If ``audio_data`` is ``None``.
    """
    if audio_data is None:
        raise ValueError("Input data is None.")

    # Accepts tensors and array-likes alike; converts to float32 on `device`.
    audio_data = torch.as_tensor(audio_data, dtype=torch.float32, device=device)

    # Forward `device` so normalize() does not fall back to its own default.
    normalized_data = normalize(audio_data, device=device)

    if verbose:
        # Iterate over whatever channels exist instead of assuming four, and
        # emit one line per channel so output from threads stays readable.
        for channel_index, channel in enumerate(normalized_data):
            print(
                f"norm[{channel_index}] min {channel.min()} "
                f"max {channel.max()} mean {channel.mean()}"
            )

    return torch.where(normalized_data > 0.5, 1, 0)


## Preprocess Function

In [None]:
def _load_fixed_length(path, sr, duration, length, device):
    """Load an audio file and zero-pad or trim its last axis to ``length`` samples."""
    samples, _ = librosa.load(path, sr=sr, mono=False, duration=duration)
    audio = torch.tensor(samples, device=device)

    # Index the last axis: a mono file is loaded as a 1-D array, for which
    # shape[1] does not exist.
    missing = length - audio.shape[-1]
    if missing > 0:
        audio = torch.nn.functional.pad(audio, (0, missing))

    # Trim as well as pad so every clip has exactly the same length and the
    # per-batch torch.stack cannot fail on a clip that came out longer.
    return audio[..., :length]


def preprocess(ambisonic_file, noise_file, duration, device=config["device"], sr=config["sr"]):
    """Load one recording, optionally add noise, and encode it as spikes.

    Reads ``config["noise"]`` to decide whether the noise recording is added
    and ``config["time_based_encoding"]`` to choose the encoder.

    Args:
        ambisonic_file: Path of the ambisonic recording.
        noise_file: Path of the noise recording; a falsy value skips the noise.
        duration: Clip length in seconds; audio is padded or trimmed to
            ``duration * sr`` samples.
        device: Device on which tensors are created and encoded.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        The spike train produced by the selected encoder.
    """
    length = int(round(duration * sr))

    combined_audio = _load_fixed_length(ambisonic_file, sr, duration, length, device)

    # Combine noise (optional)
    if config["noise"] and noise_file:
        combined_audio = combined_audio + _load_fixed_length(noise_file, sr, duration, length, device)

    # Forward `device` so the encoder works where the audio already lives.
    encode = time_based_encoding if config["time_based_encoding"] else rate_based_encoding
    return encode(combined_audio, device=device)



## Process & Save Batches

In [None]:
def split_data(metadata, valid_size=0.2, random_state=42):
    """Return a copy of ``metadata`` with a 'set' column: train, validation or test.

    Rows whose ``split`` is 'train' are divided into train and validation with
    ``train_test_split``; every other row is marked 'test'.

    Args:
        metadata: DataFrame with a ``split`` column.
        valid_size: Share of the train rows moved to validation.
        random_state: Seed handed to ``train_test_split``.

    Returns:
        pandas.DataFrame: A new frame; the argument itself is left untouched.
    """
    # Work on a copy: the input is often a filtered slice of a larger frame,
    # and writing to it mutates the caller's data and triggers pandas'
    # SettingWithCopyWarning.
    metadata = metadata.copy()
    train_indices = metadata[metadata['split'] == 'train'].index

    metadata['set'] = 'test'  # Initialize all as test

    if len(train_indices) >= 2:
        train_idx, valid_idx = train_test_split(train_indices, test_size=valid_size, random_state=random_state)
        metadata.loc[valid_idx, 'set'] = 'validation'  # Mark validation
    else:
        # Too few train rows to carve out a validation part.
        train_idx = train_indices

    if len(train_idx):
        metadata.loc[train_idx, 'set'] = 'train'  # Mark train
    return metadata

In [None]:
def process_batch(batch_ambisonic_files, batch_noise_files, batch_metadata, output_path, duration, sr, batch_id):
    """Encode one batch of recordings and save the spikes and their labels.

    Writes ``processed_batch_{batch_id}.pt`` (the stacked spike trains) and
    ``labels_batch_{batch_id}.csv`` (sample_id, split, azimuth, elevation) into
    ``output_path``.

    Args:
        batch_ambisonic_files: Paths of the ambisonic recordings.
        batch_noise_files: Paths of the noise recordings, paired by position.
            May be shorter than the ambisonic list or ``None``; missing entries
            are processed without noise.
        batch_metadata: DataFrame with one row per ambisonic file, same order.
        output_path: Directory the two files are written to.
        duration: Clip length in seconds, forwarded to ``preprocess``.
        sr: Sample rate, forwarded to ``preprocess``.
        batch_id: Number used in the output file names.

    Raises:
        ValueError: If the number of ambisonic files and metadata rows differ.
    """
    ambisonic_files = list(batch_ambisonic_files)
    noise_files = list(batch_noise_files) if batch_noise_files is not None else []

    # zip() would silently drop the surplus entries and the saved labels would
    # no longer describe the saved audio, so fail loudly instead.
    if len(ambisonic_files) != len(batch_metadata):
        raise ValueError(
            f"Batch {batch_id}: {len(ambisonic_files)} audio files but "
            f"{len(batch_metadata)} metadata rows."
        )

    if not ambisonic_files:
        # torch.stack cannot stack an empty list.
        print(f"Batch {batch_id} is empty; nothing saved.")
        return

    # A short noise list must not shorten the batch; pad with "no noise".
    noise_files += [None] * (len(ambisonic_files) - len(noise_files))

    label_columns = ['sample_id', 'split', 'speech/azimuth', 'speech/elevation']
    label_rows = batch_metadata[label_columns].itertuples(index=False, name=None)

    processed_data = []
    labels = []
    for ambisonic_file, noise_file, (sample_id, split, azimuth, elevation) in zip(ambisonic_files, noise_files, label_rows):
        spike_trains = preprocess(ambisonic_file, noise_file, duration, sr=sr)
        processed_data.append(spike_trains.cpu())
        labels.append({
            'sample_id': sample_id,
            'split': split,
            'azimuth': azimuth,
            'elevation': elevation
        })

    # Save processed data and labels
    os.makedirs(output_path, exist_ok=True)
    batch_data_filename = f'processed_batch_{batch_id}.pt'
    batch_labels_filename = f'labels_batch_{batch_id}.csv'
    torch.save(torch.stack(processed_data), os.path.join(output_path, batch_data_filename))
    pd.DataFrame(labels).to_csv(os.path.join(output_path, batch_labels_filename), index=False)

    print(f"Batch {batch_id} processed and saved.")





In [None]:
def parallel_process_batches(metadata, ambisonic_files, noise_files, duration, batch_size=config["batch_size_pre"], output_path=config["output_path"], sr=config["sr"], max_workers=config["max_workers"]):
    """Run ``process_batch`` over consecutive slices of the inputs on a thread pool.

    The file lists and the metadata are cut into slices of ``batch_size``
    items; slice number ``k`` is submitted with batch id ``k``. The call
    returns once every job has finished, and an exception raised by a job is
    re-raised here.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Queue one job per slice.
        pending = [
            pool.submit(
                process_batch,
                ambisonic_files[start:start + batch_size],
                noise_files[start:start + batch_size],
                metadata.iloc[start:start + batch_size],
                output_path,
                duration,
                sr,
                start // batch_size,
            )
            for start in range(0, len(ambisonic_files), batch_size)
        ]

        # Block until each job is done, in submission order.
        for job in pending:
            job.result()

    print("All batches processed and saved in parallel.")

# DataLoaders

In [None]:
class SoundLocalizationDataset(Dataset):
    """Serve (spike tensor, [azimuth, elevation]) pairs from saved batch files.

    ``labels_df`` needs the columns ``batch_id``, ``azimuth`` and ``elevation``.
    Its index labels must be the row numbers of the full, unfiltered label
    table (so do not ``reset_index`` after selecting a subset): the position
    of a sample inside its batch file is derived from that label.

    Args:
        labels_df: Label rows served by this dataset (may be a subset).
        base_path: Directory containing ``processed_batch_{batch_id}.pt``.
        samples_per_file: Number of samples in every full batch file.
    """

    def __init__(self, labels_df, base_path, samples_per_file=32):
        if samples_per_file <= 0:
            raise ValueError("samples_per_file must be positive.")
        self.labels_df = labels_df
        self.base_path = base_path
        self.samples_per_file = samples_per_file

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        # Read batch_id from its own column. Going through the row Series
        # would turn it into a float ("3.0" in the file name) whenever every
        # column of the frame is numeric.
        batch_id = int(self.labels_df['batch_id'].iloc[idx])

        # Load the corresponding batch data
        data_path = f'{self.base_path}/processed_batch_{batch_id}.pt'
        batch_data = torch.load(data_path)

        # `idx` is a position inside this (possibly filtered) subset, so it
        # says nothing about where the sample sits in its batch file. The
        # original row number, kept in the index label, does.
        global_idx = int(self.labels_df.index[idx])
        local_idx = global_idx - batch_id * self.samples_per_file
        if not 0 <= local_idx < len(batch_data):
            raise IndexError(
                f"Row {global_idx} maps to position {local_idx} of batch {batch_id}, "
                f"which holds {len(batch_data)} samples."
            )

        # Get the sample and its label
        sample = batch_data[local_idx]
        label = self.labels_df.iloc[idx][['azimuth', 'elevation']].values.astype('float32')

        return sample, label


In [None]:
batch_size = 32  # DataLoader batch size used by the loading cell further down

# Filter and Split Data
filtered_metadata, ambisonic_files, noise_files = filter_data()

# Clip length in whole seconds: mean duration plus one standard deviation.
durations = filtered_metadata["audio_info/duration"]
duration_std = durations.std()
if np.isnan(duration_std):
    # A single recording has no standard deviation; int() cannot convert NaN.
    duration_std = 0.0
duration = int(np.round(durations.mean() + duration_std))

metadata_with_splits = split_data(filtered_metadata)

# Process Batches
if config.get("parallel_processing", False):
    start_time = time.time()
    parallel_process_batches(metadata_with_splits, ambisonic_files, noise_files, duration)
    end_time = time.time()
    print(f"parallel_process took: {int((end_time - start_time) // 60)} minutes {int((end_time - start_time) % 60)} seconds")
else:
    # Chunk by the preprocessing batch size, exactly like the parallel path,
    # so both branches write identically sized batch files.
    pre_batch_size = config["batch_size_pre"]
    for i in range(0, len(ambisonic_files), pre_batch_size):
        process_batch(ambisonic_files[i:i+pre_batch_size], noise_files[i:i+pre_batch_size], metadata_with_splits.iloc[i:i+pre_batch_size], config["output_path"], duration, config["sr"], i // pre_batch_size)




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
norm[2] max 0.7276105284690857
norm[2] min 0.24680058658123016
norm[2] mean 0.47515007853507996
norm[2] max 0.8014057278633118
norm[2] mean 0.5423075556755066
norm[0] min 0.034253112971782684
norm[0] max 0.8419836759567261
norm[0] min 0.35319405794143677norm[0] mean 0.5023785829544067
norm[0] min 0.09654361009597778

norm[0] max 0.9002628922462463
norm[0] max 1.0
norm[0] mean 0.4728451371192932
norm[0] mean 0.6132599711418152
tensor([0.5279, 0.5279, 0.5279,  ..., 0.5279, 0.5279, 0.5279], device='cuda:0')
norm[1] min 0.11089812964200974
tensor([0.5423, 0.5423, 0.5423,  ..., 0.5423, 0.5423, 0.5423], device='cuda:0')
norm[1] max 1.0
tensor([0.4730, 0.4730, 0.4730,  ..., 0.4730, 0.4730, 0.4730], device='cuda:0')
norm[3] min 0.0
norm[1] min 0.0
norm[3] max 1.0
norm[1] max 0.9083058834075928
tensor([0.4751, 0.4751, 0.4751,  ..., 0.4751, 0.4751, 0.4751], device='cuda:0')norm[1] mean 0.5250954627990723

norm[1] mean 0.47304818034

In [None]:
# Directory the preprocessing step wrote its batch files to
data_path = config["output_path"]

# Discover the label files instead of hard-coding how many batches exist.
label_prefix, label_suffix = "labels_batch_", ".csv"
label_batch_ids = sorted(
    int(name[len(label_prefix):-len(label_suffix)])
    for name in os.listdir(data_path)
    if name.startswith(label_prefix)
    and name.endswith(label_suffix)
    and name[len(label_prefix):-len(label_suffix)].isdigit()
)
if not label_batch_ids:
    raise FileNotFoundError(f"No labels_batch_*.csv files found in {data_path}")

# Load and concatenate label files; tag every row with the batch file it came from
label_file_paths = [os.path.join(data_path, f"{label_prefix}{i}{label_suffix}") for i in label_batch_ids]
labels_df = pd.concat(
    [pd.read_csv(path).assign(batch_id=i) for i, path in zip(label_batch_ids, label_file_paths)],
    ignore_index=True,
)

# Split the data
metadata = split_data(labels_df)

# Create datasets and data loaders
train_dataset = SoundLocalizationDataset(metadata[metadata['set'] == 'train'], data_path)
valid_dataset = SoundLocalizationDataset(metadata[metadata['set'] == 'validation'], data_path)
test_dataset = SoundLocalizationDataset(metadata[metadata['set'] == 'test'], data_path)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Access the first batch of training data
first_batch_data, first_batch_labels = next(iter(train_loader))

In [None]:
# Shape of the first training batch; the leading dimension is the DataLoader batch size.
first_batch_data.shape

torch.Size([32, 4, 256000])

In [None]:
# Count zeros and non-zeros in channel 3 of the first sample. Vectorised:
# a Python loop calling .item() on every element is needlessly slow.
channel_values = first_batch_data[0, 3, :]
count_0 = int((channel_values == 0).sum())
count_1 = channel_values.numel() - count_0
print(f"0s: {count_0}, 1s: {count_1}")


0s: 244005, 1s: 11995
