<a href="https://colab.research.google.com/github/mercadoerik1031/snn-sound-localization/blob/main/snn_sound_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**SNN Sounnd Localization**



---



# Pip Installs

In [2]:
! pip install snntorch brian2 brian2hears --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for brian2hears (setup.py) ... [?25l[?25hdone


# Imports

In [3]:
import pandas as pd
import os
import librosa
import torch
from snntorch import spikegen
from brian2 import *
from brian2hears import *

In [7]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Config

In [8]:
config = {
    # Google Colab Path
    "metadata_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/metadata.parquet",
    "ambisonics_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/ambisonics_sample",
    "noise_ambisonics_path": "/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/noise_ambisonics_sample",

    # Local Path
    # "metadata_path": r"C:\Users\merca\OneDrive\Documents\MyFiles\Code\Masters_Project\spatial_librispeech_sample\metadata.parquet",
    # "ambisonics_path": r"c:\Users\merca\OneDrive\Documents\MyFiles\Code\masters_project\spatial_librispeech_sample\ambisonics_sample",

    "time_based_encoding": True,
    "num_steps": 10,
    "max_rate": 10,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "noise": True,
}

# Filter Data

In [9]:
def filter_data(metadata_path=config["metadata_path"], ambisonics_path=config["ambisonics_path"], noise_path=config["noise_ambisonics_path"]):

    metadata = pd.read_parquet(metadata_path, engine="pyarrow")
    ambisonics_files = [f for f in os.listdir(ambisonics_path) if os.path.isfile(os.path.join(ambisonics_path, f))]
    noise_files = [f for f in os.listdir(noise_path) if os.path.isfile(os.path.join(noise_path, f))]

    sample_ids = []

    for file_name in ambisonics_files:
        number, _ = file_name.split(".")
        number.lstrip("0")

        if not number:
            number = 0

        sample_ids.append(int(number))

    filtered_metadata = metadata[metadata["sample_id"].isin(sample_ids)]

    return filtered_metadata, ambisonics_files, noise_files


In [10]:
filtered_metadata, ambisonics_files, noise_files = filter_data()

In [11]:

print(f"Max: {filtered_metadata['audio_info/duration'].max()}")
print(f"Mean: {filtered_metadata['audio_info/duration'].mean()}")
print(f"Min: {filtered_metadata['audio_info/duration'].min()}")
print(f"Median: {filtered_metadata['audio_info/duration'].median()}")
print(f"STD: {filtered_metadata['audio_info/duration'].std()}")


Max: 32.769375
Mean: 10.773610714285715
Min: 0.71825
Median: 12.628875
STD: 4.748215274235851


# Preprocess Audio

## Cochlear Filter

In [6]:
def cochlear_filter(audio_data, sr):
    num_channels = audio_data.shape[0]  # Number of channels in the audio data
    processed_channels = []

    for channel in range(num_channels):
        sound = Sound(audio_data[channel], samplerate=sr*Hz)
        cf = erbspace(20*Hz, 20*kHz, 32)  # Center frequencies for 32 channels
        gammatone = Gammatone(sound, cf)
        filtered_sound = gammatone.process()
        filtered_data = filtered_sound.T  # Transpose to get the correct shape
        processed_channels.append(filtered_data)

    # Combine the processed data from all channels
    combined_processed_data = np.stack(processed_channels, axis=0)
    return combined_processed_data


In [7]:
# def cochlear_filter(audio_data, st):
#   return audio_data

## Normalize

In [8]:
def normalize(data, device=config["device"]):

  if isinstance(data, np.ndarray):
        data = torch.from_numpy(data).float()

  # Move data to the specified device (GPU or CPU)
  data = data.to(device)

  return (data - data.min()) / (data.max() - data.min())

## Rate Based Encoding

In [9]:
def rate_based_encoding(data, max_rate=config["max_rate"], num_steps=config["num_steps"], device=config["device"]):
    if data is None:
      raise ValueError("Input data is None.")

    data = torch.from_numpy(data).float().to(device)

    normalized_data = normalize(data, device)

    spike_rates = normalized_data * max_rate

    spike_train = spikegen.rate(spike_rates, num_steps=num_steps)

    return spike_train

## Time Based Encoding

In [10]:
def time_based_encoding(data, num_steps=config["device"], device=config["device"]):
    if data is None:
      raise ValueError("Input data is None.")

    data_tensor = torch.from_numpy(data).float()

    normalized_data = normalize(data_tensor)

    spike_times = torch.where(normalized_data > 0.5, 1, 0)

    spike_train = spikegen.latency(spike_times, num_steps=num_steps, bypass=True)

    print(f"spike_train.shape: {spike_train.shape}")

    return spike_train



## Preprocess Function

In [11]:
def preprocess_audio(ambisonic_filepath, noise_filepath, start_time, end_time, sr):
    duration = end_time - start_time
    max_length = int(duration * sr)

    # Load ambisonic audio
    audio, _ = librosa.load(ambisonic_filepath, sr=sr, mono=False, offset=start_time, duration=duration)
    audio = audio if audio.ndim == 2 else np.expand_dims(audio, axis=0)  # Ensure audio is 2D
    padded_audio = librosa.util.fix_length(audio, size=max_length, axis=1)

    if config["noise"] and noise_filepath:
        # Load noise audio
        noise_audio, _ = librosa.load(noise_filepath, sr=sr, mono=False, offset=start_time, duration=duration)
        noise_audio = noise_audio if noise_audio.ndim == 2 else np.expand_dims(noise_audio, axis=0)  # Ensure noise audio is 2D
        padded_noise_audio = librosa.util.fix_length(noise_audio, size=max_length, axis=1)

        # Ensure the channel counts are equal
        min_channels = min(padded_audio.shape[0], padded_noise_audio.shape[0])
        combined_audio = padded_audio[:min_channels] + padded_noise_audio[:min_channels]
    else:
        combined_audio = padded_audio

    processed_audio = cochlear_filter(combined_audio, sr)

    # Encoding
    if config["time_based_encoding"]:
        spike_trains = time_based_encoding(processed_audio, config["num_steps"])
    else:
        spike_trains = rate_based_encoding(processed_audio, config['max_rate'], config["num_steps"])

    return spike_trains


## Blocks

In [12]:
def preprocess_audio_in_blocks(block_duration, sr):
    ambisonics_path = config["ambisonics_path"]
    noise_ambisonics_path = config["noise_ambisonics_path"]

    ambisonics_files = sorted([f for f in os.listdir(ambisonics_path) if os.path.isfile(os.path.join(ambisonics_path, f))])
    noise_files = sorted([f for f in os.listdir(noise_ambisonics_path) if os.path.isfile(os.path.join(noise_ambisonics_path, f))])

    all_processed_data = []

    for ambisonic_file, noise_file in zip(ambisonics_files, noise_files):
      print(f"ambisonic_file: {ambisonic_file}. noise_file: {noise_file}")
        ambisonic_filepath = os.path.join(ambisonics_path, ambisonic_file)
        noise_filepath = os.path.join(noise_ambisonics_path, noise_file)

        # Get the duration of the audio file (assuming both ambisonic and noise files have the same duration)
        total_duration = librosa.get_duration(path=ambisonic_filepath, sr=sr)
        total_blocks = int(total_duration // block_duration) + (total_duration % block_duration > 0)

        for block_idx in range(total_blocks):
            start_time = block_idx * block_duration
            end_time = min((block_idx + 1) * block_duration, total_duration)

            processed_block = preprocess_audio(ambisonic_filepath, noise_filepath, start_time, end_time, sr)
            all_processed_data.append(processed_block)

    final_preprocessed_data = torch.stack(all_processed_data)

    return final_preprocessed_data


# Run

In [13]:
block_duration = 5.0  # Duration of each block in seconds
sample_rate = 16000  # Sample rate for audio files

preprocessed_data = preprocess_audio_in_blocks(
    block_duration=block_duration,
    sr=sample_rate
)

spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 71352])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 69368])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 56428])
spike_train.shape: torch.Size([10, 4, 32, 27578])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 78180])
spike_train.shape: torch.Size([10, 4, 32, 38574])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 62973])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 80000])
spike_train.shape: torch.Size([10, 4, 32, 62757])


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 59.06 MiB is free. Process 2088 has 14.69 GiB memory in use. Of the allocated memory 13.70 GiB is allocated by PyTorch, and 897.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# _, ambisonics_files, noise_files = filter_data()
# batch_size = 10  # Adjust this based on your memory capacity
# total_files = len(ambisonics_files)
# batches = total_files // batch_size
# num_steps = 10
# num_channels = 4
# sr = 16_000
# duration = 5
# length_samples = duration * sr
# num_cochlear_filters = 32

# for batch_idx in range(batches):
#     start_idx = batch_idx * batch_size
#     end_idx = start_idx + batch_size

#     # Adjust the tensor size for the current batch
#     current_batch_size = min(batch_size, total_files - start_idx)
#     all_processed_data_batch = torch.empty(current_batch_size, num_steps, num_channels, num_cochlear_filters, length_samples, device=config["device"])
#     # all_processed_data_batch = torch.empty(current_batch_size, num_steps, num_channels, length_samples, device=config["device"])

#     for idx, (ambisonic_file, noise_file) in enumerate(zip(ambisonics_files[start_idx:end_idx], noise_files[start_idx:end_idx])):
#         # Create file paths
#         ambisonic_file = os.path.join(config["ambisonics_path"], ambisonic_file)
#         noise_file = os.path.join(config["noise_ambisonics_path"], noise_file)

#         # Process each file
#         processed_data = preprocess_audio(ambisonic_file, noise_file, duration)
#         if processed_data is None:
#             raise ValueError("processed_data is None. Check preprocess_audio Function")

#         # Store the processed data in the preallocated tensor for the batch
#         all_processed_data_batch[idx] = processed_data

#     # Now all_processed_data_batch contains the processed data for the current batch
#     # Use this data for training your model
#     # train_your_model(all_processed_data_batch)

#     # After training, clear the memory
#     del all_processed_data_batch
#     torch.cuda.empty_cache()

# # If there are remaining files that were not processed (due to uneven division), process them here
# # This step is optional and only needed if total_files is not a multiple of batch_size
# remaining_files = total_files % batch_size
# if remaining_files > 0:
#     start_idx = batches * batch_size
#     end_idx = total_files
#     all_processed_data_remaining = torch.empty(remaining_files, num_steps, num_channels, num_cochlear_filters, length_samples, device=config["device"])
#     # all_processed_data_remaining = torch.empty(remaining_files, num_steps, num_channels, length_samples, device=config["device"])

#     # for idx, (ambisonic_file, noise_file) in enumerate(zip(ambisonics_files[start_idx:end_idx], noise_files[start_idx:end_idx])):
#     #     # ... (process each file and store in all_processed_data_remaining)

#     # # Use all_processed_data_remaining for training, then discard it
#     # # ...

#     del all_processed_data_remaining
#     torch.cuda.empty_cache()

# # Note: Replace 'train_your_model()' with the actual function that trains your model

In [None]:
# filtered_metadata, ambisonics_files, noise_files = filter_data()

# # Parameters
# num_files = len(ambisonics_files)  # Number of files to process
# num_steps = config["num_steps"]
# num_channels = 4  # Number of channels in ambisonic audio
# num_cochlear_filters = 32
# sr = 16000  # Sampling rate
# duration = 5  # Maximum duration in seconds
# length_samples = duration * sr

# # Preallocate tensor to store all processed data
# # all_processed_data = torch.empty(num_files, num_steps, num_channels, num_cochlear_filters, length_samples, device=config["device"])
# all_processed_data = torch.empty(num_files, num_steps, num_channels, length_samples, device=config["device"])

# for idx, (ambisonic_file, noise_file) in enumerate(zip(ambisonics_files, noise_files)):
#     # Create file paths
#     ambisonic_file = os.path.join(config["ambisonics_path"], ambisonic_file)
#     noise_file = os.path.join(config["noise_ambisonics_path"], noise_file)

#     # Process each of the files
#     processed_data = preprocess_audio(ambisonic_file, noise_file, duration)
#     if processed_data is None:
#         raise ValueError("processed_data is None. Check preprocess_audio Function")

#     # Store the processed data in the preallocated tensor
#     all_processed_data[idx] = processed_data

# # Print Shape
# print(f"all_processed_data.shape: {all_processed_data.shape}")
