<a href="https://colab.research.google.com/github/mercadoerik1031/snn-sound-localization/blob/main/snn_sound_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**SNN Sounnd Localization**



---



# Pip Installs

In [1]:
! pip install snntorch brian2 brian2hears --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for brian2hears (setup.py) ... [?25l[?25hdone


# Imports

In [2]:
import pandas as pd
import os
import librosa
import torch
from snntorch import spikegen
from brian2 import *
from brian2hears import *

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Config

In [4]:
config = {
    # Google Colab Path
    "metadata_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/metadata.parquet",
    "ambisonics_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/ambisonics_sample",
    "noise_ambisonics_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/noise_ambisonics_sample",

    # Local Path
    # "metadata_path": r"C:\Users\merca\OneDrive\Documents\MyFiles\Code\Masters_Project\spatial_librispeech_sample\metadata.parquet",
    # "ambisonics_path": r"c:\Users\merca\OneDrive\Documents\MyFiles\Code\masters_project\spatial_librispeech_sample\ambisonics_sample",

    "time_based_encoding": True,
    "num_steps": 20,
    "max_rate": 10,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "noise": True,
}

# Filter Data

In [5]:
def filter_data(metadata_path=config["metadata_path"], ambisonics_path=config["ambisonics_path"], noise_path=config["noise_ambisonics_path"]):
    metadata = pd.read_parquet(metadata_path, engine="pyarrow")
    ambisonics_files = [f for f in os.listdir(ambisonics_path) if os.path.isfile(os.path.join(ambisonics_path, f))]
    noise_files = [f for f in os.listdir(noise_path) if os.path.isfile(os.path.join(noise_path, f))]


    sample_ids = []

    for file_name in ambisonics_files:
        number, _ = file_name.split(".")
        number.lstrip("0")

        if not number:
            number = 0

        sample_ids.append(int(number))

    filtered_metadata = metadata[metadata["sample_id"].isin(sample_ids)]

    return filtered_metadata, ambisonics_files, noise_files


# Preprocess Audio

## Cochlear Filter

In [6]:
def cochlear_filter(channel_data, sr):
    # Create a mono sound object from the channel data
    sound = Sound(channel_data, samplerate=sr*Hz)

    # Set up the cochlear model
    num_channels = 32  # Number of channels in the filter bank
    cf = erbspace(20*Hz, 20*kHz, num_channels)  # Center frequencies
    gammatone = Gammatone(sound, cf)

    # Process the sound with the cochlear filter
    filtered_sound = gammatone.process()

    # Convert the filtered signal to a numpy array
    filtered_data = filtered_sound.T  # Transpose to get the correct shape
    return filtered_data

## Normalize

In [7]:
def normalize(data, device=config["device"]):

  if isinstance(data, np.ndarray):
        data = torch.from_numpy(data).float()

  # Move data to the specified device (GPU or CPU)
  data = data.to(device)

  return (data - data.min()) / (data.max() - data.min())

## Rate Based Encoding

In [8]:
def rate_based_encoding(data, max_rate=config["max_rate"], num_steps=config["num_steps"], device=config["device"]):
    if data is None:
      raise ValueError("Input data is None.")

    data = torch.from_numpy(data).float().to(device)

    normalized_data = normalize(data, device)

    spike_rates = normalized_data * max_rate

    spike_train = spikegen.rate(spike_rates, num_steps= num_steps)

    return spike_train

## Time Based Encoding

In [9]:
def time_based_encoding(data, num_steps=config["device"], device=config["device"]):
    if data is None:
      raise ValueError("Input data is None.")

    data_tensor = torch.from_numpy(data).float()

    normalized_data = normalize(data_tensor)

    spike_times = torch.where(normalized_data > 0.5, 1, 0)

    spike_trains = spikegen.latency(spike_times, num_steps=num_steps)

    return spike_trains



## Preprocess Function

In [10]:
def preprocess_audio(ambisonic_filepath, noise_filepath ,max_duration):
    """
    W: Omnidirectional
    X: Front - Back
    Y: Left - Right
    Z: Top - Bottom
    """
    audio, sr = librosa.load(ambisonic_filepath, sr=None, mono=False)
    print(f"Original shape: {audio.shape}, Sampling rate: {sr}")

    max_length = int(max_duration * sr)
    print(f"Max length in samples: {max_length}")

    padded_audio = librosa.util.fix_length(data=audio, size=max_length)


    if config["noise"] and noise_filepath:
      noise_audio, _ = librosa.load(noise_filepath, sr=sr, mono=False)
      padded_noise_audio = librosa.util.fix_length(data=noise_audio, size=max_length)

      combined_audio = padded_audio + padded_noise_audio
      print(f"Speech and Noise have been combined")
    else:
      combined_audio = padded_audio
      print(f"Speech and Noise have NOT been combined")
      print(f"Padded shape: {combined_audio.shape}")

    # Process each channel separately
    processed_W = cochlear_filter(combined_audio[0], sr)
    processed_X = cochlear_filter(combined_audio[1], sr)
    processed_Y = cochlear_filter(combined_audio[2], sr)
    processed_Z = cochlear_filter(combined_audio[3], sr)
    print(f"processed_W.shape: {processed_W.shape}")

    if config["time_based_encoding"]:
        print(f"Using Time Based Encoding")
        spike_trains_W = time_based_encoding(processed_W, config["num_steps"])
        spike_trains_X = time_based_encoding(processed_X, config["num_steps"])
        spike_trains_Y = time_based_encoding(processed_Y, config["num_steps"])
        spike_trains_Z = time_based_encoding(processed_Z, config["num_steps"])
    else:
        print(f"Using Rate Based Encoding")
        spike_trains_W = rate_based_encoding(processed_W, config['max_rate'], config['num_steps'])
        spike_trains_X = rate_based_encoding(processed_X, config['max_rate'], config['num_steps'])
        spike_trains_Y = rate_based_encoding(processed_Y, config['max_rate'], config['num_steps'])
        spike_trains_Z = rate_based_encoding(processed_Z, config['max_rate'], config['num_steps'])

    print(f"spikes_W: {spike_trains_W.shape}")
    print(f"spikes_X: {spike_trains_X.shape}")
    print(f"spikes_Y: {spike_trains_Y.shape}")
    print(f"spikes_Z: {spike_trains_Z.shape}")

    return spike_trains_W, spike_trains_X, spike_trains_Y, spike_trains_Z





# Run

In [11]:
filtered_metadata, ambisonics_files, noise_files = filter_data()
max_duration = filtered_metadata["audio_info/duration"].max()

all_spike_trains_W = []
all_spike_trains_X = []
all_spike_trains_Y = []
all_spike_trains_Z = []

for ambisonic_file, noise_file in zip(ambisonics_files[:10], noise_files[:10]):
  # Creat File Paths
  ambisonic_file = os.path.join(config["ambisonics_path"], ambisonic_file)
  noise_file = os.path.join(config["noise_ambisonics_path"], noise_file)

  # Process Each of The Files
  processed_data = preprocess_audio(ambisonic_file, noise_file, max_duration)
  if processed_data is None:
    raise ValueError("processed_data is None. Check preprocess_audio Function")

  spike_train_W, spike_train_X, spike_train_Y, spike_train_Z = processed_data

  # Store Processed Data
  all_spike_trains_W.append(spike_train_W)
  all_spike_trains_X.append(spike_train_X)
  all_spike_trains_Y.append(spike_train_Y)
  all_spike_trains_Z.append(spike_train_Z)

# Print Shape
print(f"all_spike_trains_W.shape: {all_spike_trains_W.shape}")


Original shape: (4, 251579), Sampling rate: 16000
Max length in samples: 524310
Speech and Noise have been combined
processed_W.shape: (32, 524310)
Using Time Based Encoding
spikes_W: torch.Size([20, 32, 524310])
spikes_X: torch.Size([20, 32, 524310])
spikes_Y: torch.Size([20, 32, 524310])
spikes_Z: torch.Size([20, 32, 524310])
Original shape: (4, 244241), Sampling rate: 16000
Max length in samples: 524310
Speech and Noise have been combined
processed_W.shape: (32, 524310)
Using Time Based Encoding
spikes_W: torch.Size([20, 32, 524310])
spikes_X: torch.Size([20, 32, 524310])
spikes_Y: torch.Size([20, 32, 524310])
spikes_Z: torch.Size([20, 32, 524310])
Original shape: (4, 327322), Sampling rate: 16000
Max length in samples: 524310
Speech and Noise have been combined
processed_W.shape: (32, 524310)
Using Time Based Encoding


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.25 GiB. GPU 0 has a total capacty of 15.77 GiB of which 138.38 MiB is free. Process 39535 has 15.63 GiB memory in use. Of the allocated memory 14.16 GiB is allocated by PyTorch, and 1.12 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF