<a href="https://colab.research.google.com/github/mercadoerik1031/snn-sound-localization/blob/main/snn_sound_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SNN Sound Localization**



---



# Pip Installs

In [1]:
! pip install snntorch brian2 brian2hears --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.0/109.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for brian2hears (setup.py) ... [?25l[?25hdone


# Imports

In [2]:
import pandas as pd
import os
import librosa
import torch
from snntorch import spikegen
from brian2 import *
from brian2hears import *

In [3]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# The Colab paths in the config cell point into this mount.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Config

In [4]:
# Global settings shared by every cell in the notebook.
config = dict(
    # --- Data locations on the mounted Google Drive (Colab) ---
    metadata_path="/content/drive/My Drive/Colab Notebooks/Masters Project/metadata.parquet",
    ambisonics_path="/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/ambisonics_sample",
    noise_ambisonics_path="/content/drive/My Drive/Colab Notebooks/Masters Project/spatial_librispeech_sample/noise_ambisonics_sample",

    # --- Data locations for a local (Windows) checkout; swap in when not on Colab ---
    # metadata_path=r"C:\Users\merca\OneDrive\Documents\MyFiles\Code\Masters_Project\spatial_librispeech_sample\metadata.parquet",
    # ambisonics_path=r"c:\Users\merca\OneDrive\Documents\MyFiles\Code\masters_project\spatial_librispeech_sample\ambisonics_sample",

    # --- Spike encoding ---
    time_based_encoding=True,  # True: time_based_encoding is used; False: rate_based_encoding
    num_steps=20,              # number of time steps passed to the spike generators
    max_rate=10,               # scale factor applied to normalized data in rate encoding

    # --- Runtime ---
    device="cuda" if torch.cuda.is_available() else "cpu",  # prefer the GPU when one is visible
    noise=True,                # mix the noise recording into the speech recording
)

# Filter Data

In [5]:
def filter_data(metadata_path=config["metadata_path"], ambisonics_path=config["ambisonics_path"], noise_path=config["noise_ambisonics_path"]):
    """Load the metadata table and keep only rows whose audio sample is on disk.

    Args:
        metadata_path: Path to the metadata parquet file. It must contain a
            ``sample_id`` column.
        ambisonics_path: Directory with the ambisonic audio files. The part of
            each file name before the extension is parsed as an integer
            sample id (leading zeros are allowed).
        noise_path: Directory with the noise ambisonic audio files.

    Returns:
        A tuple ``(filtered_metadata, ambisonics_files, noise_files)``:
        the metadata rows whose ``sample_id`` matches a file found in
        ``ambisonics_path``, and the file names (not full paths) found in
        each directory, sorted by name.
    """
    metadata = pd.read_parquet(metadata_path, engine="pyarrow")

    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # deterministic and lets callers pair same-named speech/noise files by
    # position.
    ambisonics_files = sorted(
        f for f in os.listdir(ambisonics_path)
        if os.path.isfile(os.path.join(ambisonics_path, f))
    )
    noise_files = sorted(
        f for f in os.listdir(noise_path)
        if os.path.isfile(os.path.join(noise_path, f))
    )

    sample_ids = []
    for file_name in ambisonics_files:
        # splitext copes with names that contain no dot or several dots,
        # where a bare split(".") would fail to unpack.
        stem, _ = os.path.splitext(file_name)
        try:
            # int() already ignores leading zeros ("000123" -> 123).
            sample_ids.append(int(stem))
        except ValueError:
            # Not a numerically named sample (e.g. a hidden/system file), so
            # it cannot correspond to any sample_id. Skip it rather than
            # inventing an id for it.
            continue

    filtered_metadata = metadata[metadata["sample_id"].isin(sample_ids)]

    return filtered_metadata, ambisonics_files, noise_files


# Preprocess Audio

## Cochlear Filter

In [6]:
def cochlear_filter(audio_data, sr):
    """Pass each audio channel through a 32-band gammatone filterbank.

    Args:
        audio_data: Array indexed by channel along axis 0; each
            ``audio_data[i]`` is handed to ``Sound`` as one channel's samples.
        sr: Sampling rate in Hz (plain number; the unit is attached here).

    Returns:
        The per-channel filterbank outputs stacked along a new leading axis,
        one entry per input channel.
    """
    def _filter_one(samples):
        # 32 centre frequencies spaced on the ERB scale between 20 Hz and 20 kHz.
        # NOTE(review): the upper bound is fixed at 20 kHz regardless of ``sr``;
        # confirm this is intended when sr / 2 is below 20 kHz.
        center_freqs = erbspace(20*Hz, 20*kHz, 32)
        bank = Gammatone(Sound(samples, samplerate=sr*Hz), center_freqs)
        # Transposed as in the original pipeline, presumably from
        # (samples, filters) to (filters, samples) (TODO confirm).
        return bank.process().T

    per_channel = [_filter_one(audio_data[idx]) for idx in range(audio_data.shape[0])]

    # Stack so axis 0 indexes the original audio channel.
    return np.stack(per_channel, axis=0)


## Normalize

In [8]:
def normalize(data, device=config["device"]):
    """Min-max scale ``data`` into the range [0, 1] on the given device.

    Args:
        data: A NumPy array or a torch tensor. NumPy input is converted to a
            float32 tensor first.
        device: Torch device the data is moved to before scaling.

    Returns:
        A tensor on ``device`` with the same shape as ``data`` in which the
        global minimum maps to 0 and the global maximum maps to 1. If every
        element is equal (max == min), a tensor of zeros is returned instead
        of the NaNs a plain division by zero would produce.
    """
    if isinstance(data, np.ndarray):
        data = torch.from_numpy(data).float()

    # Move data to the specified device (GPU or CPU)
    data = data.to(device)

    low = data.min()
    span = data.max() - low

    # Constant input gives span == 0. Dividing by 1 instead yields zeros
    # (the numerator is all zeros) and avoids 0 / 0 -> NaN.
    safe_span = torch.where(span == 0, torch.ones_like(span), span)

    return (data - low) / safe_span

## Rate Based Encoding

In [9]:
def rate_based_encoding(data, max_rate=config["max_rate"], num_steps=config["num_steps"], device=config["device"]):
    """Encode a NumPy array as a rate-coded spike train.

    The data is min-max normalized, multiplied by ``max_rate`` and passed to
    ``spikegen.rate``.

    Args:
        data: NumPy array to encode.
        max_rate: Factor the normalized data is multiplied by.
        num_steps: Number of time steps requested from ``spikegen.rate``.
        device: Torch device used for the computation.

    Returns:
        The spike train returned by ``spikegen.rate``.

    Raises:
        ValueError: If ``data`` is None.
    """
    if data is None:
        raise ValueError("Input data is None.")

    as_tensor = torch.from_numpy(data).float().to(device)

    # NOTE(review): after scaling, values range up to ``max_rate``; confirm
    # that spikegen.rate treats inputs above 1 the way this pipeline expects.
    scaled = normalize(as_tensor, device) * max_rate

    return spikegen.rate(scaled, num_steps=num_steps)

## Time Based Encoding

In [10]:
def time_based_encoding(data, num_steps=config["num_steps"], device=config["device"]):
    """Encode a NumPy array as a latency-coded spike train.

    The data is min-max normalized, thresholded at 0.5 into a 0/1 mask and
    passed to ``spikegen.latency``.

    Args:
        data: NumPy array to encode.
        num_steps: Number of time steps requested from ``spikegen.latency``.
            Defaults to ``config["num_steps"]``.
        device: Torch device the normalization (and therefore the encoding)
            runs on.

    Returns:
        The spike train returned by ``spikegen.latency``.

    Raises:
        ValueError: If ``data`` is None.
    """
    if data is None:
        raise ValueError("Input data is None.")

    data_tensor = torch.from_numpy(data).float()

    # Pass the requested device through so the argument is actually honoured
    # instead of always falling back to normalize()'s default.
    normalized_data = normalize(data_tensor, device)

    # Binarize: 1 where the normalized value exceeds 0.5, otherwise 0.
    # NOTE(review): this yields an integer tensor; confirm spikegen.latency
    # accepts it as-is in the installed snntorch/torch versions.
    binary_mask = torch.where(normalized_data > 0.5, 1, 0)

    spike_train = spikegen.latency(binary_mask, num_steps=num_steps)

    print(f"spike_train.shape: {spike_train.shape}")

    return spike_train



## Preprocess Function

In [11]:
def preprocess_audio(ambisonic_filepath, noise_filepath, duration):
    """Load one recording, optionally add noise, filter it and spike-encode it.

    Steps: load the audio at its native sampling rate, fix its length to
    ``duration`` seconds, optionally add the equally length-fixed noise
    recording, run the cochlear filterbank, then apply the encoding selected
    by ``config["time_based_encoding"]``.

    Args:
        ambisonic_filepath: Path of the speech (ambisonic) audio file.
        noise_filepath: Path of the noise audio file. It is only used when
            ``config["noise"]`` is truthy and this path is non-empty.
        duration: Target clip length in seconds.

    Returns:
        The spike train produced by ``time_based_encoding`` or
        ``rate_based_encoding``.
    """
    # sr=None keeps the file's own sampling rate; mono=False keeps all channels.
    audio, sr = librosa.load(ambisonic_filepath, sr=None, mono=False)
    print(f"Original shape: {audio.shape}, Sampling rate: {sr}")

    max_length = int(duration * sr)
    print(f"Max length in samples: {max_length}")

    combined_audio = librosa.util.fix_length(data=audio, size=max_length)

    if not (config["noise"] and noise_filepath):
        print("Speech and Noise have NOT been combined")
        print(f"Padded shape: {combined_audio.shape}")
    else:
        # Load the noise at the speech sampling rate so both signals align.
        noise_audio, _ = librosa.load(noise_filepath, sr=sr, mono=False)
        fixed_noise = librosa.util.fix_length(data=noise_audio, size=max_length)
        combined_audio = combined_audio + fixed_noise
        print("Speech and Noise have been combined")

    filtered = cochlear_filter(combined_audio, sr)

    if config["time_based_encoding"]:
        return time_based_encoding(filtered, config["num_steps"])

    return rate_based_encoding(filtered, config['max_rate'], config['num_steps'])





# Run

In [None]:
# Driver cell: encode every (speech, noise) file pair and collect the spike
# trains into one preallocated tensor.
filtered_metadata, ambisonics_files, noise_files = filter_data()

# Parameters
max_duration = 5  # Clip length in seconds that preprocess_audio fixes every file to
num_files = len(ambisonics_files)  # Number of files to process
num_steps = config["num_steps"]
num_channels = 4  # Number of channels in ambisonic audio
num_cochlear_filters = 32
sr = 16000  # Sampling rate
duration = max_duration  # Maximum duration in seconds (kept for code that refers to `duration`)
length_samples = duration * sr

# Preallocate tensor to store all processed data, laid out as
# (file, time step, audio channel, cochlear filter, sample).
# NOTE(review): this assumes every file is sampled at `sr`; preprocess_audio
# loads files at their native rate, so confirm the dataset is 16 kHz.
all_processed_data = torch.empty(num_files, num_steps, num_channels, num_cochlear_filters, length_samples, device=config["device"])

# zip stops at the shorter list; if there are fewer noise files than speech
# files, the remaining rows of the torch.empty tensor stay uninitialised.
for idx, (ambisonic_file, noise_file) in enumerate(zip(ambisonics_files, noise_files)):
    # Create file paths
    ambisonic_file = os.path.join(config["ambisonics_path"], ambisonic_file)
    noise_file = os.path.join(config["noise_ambisonics_path"], noise_file)

    # Process each of the files
    processed_data = preprocess_audio(ambisonic_file, noise_file, max_duration)
    if processed_data is None:
        raise ValueError("processed_data is None. Check preprocess_audio Function")

    # Store the processed data in the preallocated tensor
    all_processed_data[idx] = processed_data

# Print Shape
print(f"all_processed_data.shape: {all_processed_data.shape}")
