In [3]:
import pandas as pd
import os
import librosa
import torch

# Config

In [4]:
# Central configuration for the notebook.
# Both paths are relative and are resolved against the working directory the
# notebook is run from, so the notebook does not depend on one user's machine.
config = {
    # Parquet file holding the per-sample metadata table.
    "metadata_path": "spatial_librispeech_sample/metadata.parquet",
    # Folder whose files are listed and loaded as audio.
    "ambisonics_path": "spatial_librispeech_sample/ambisonics_sample",

    # True -> preprocess_audio uses time_based_encoding,
    # False -> it uses rate_based_encoding.
    "time_based_encoding": True,
    # Number of time steps handed to the spike generators.
    "num_steps": 100,
    # Factor the normalized signal is multiplied by in rate_based_encoding.
    "max_rate": 100,
}

# Filter Data

In [5]:
# Load the per-sample metadata table.
metadata_path = config["metadata_path"]
metadata = pd.read_parquet(metadata_path, engine="pyarrow")

# Collect the regular files in the ambisonics folder (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# positional access such as ambisonics_files[0] reproducible across runs.
ambisonics_path = config["ambisonics_path"]
ambisonics_files = sorted(
    entry
    for entry in os.listdir(ambisonics_path)
    if os.path.isfile(os.path.join(ambisonics_path, entry))
)


## Strip 0s from filenames

In [6]:
# Derive integer sample IDs from the audio filenames and keep only the
# metadata rows whose sample_id has a matching audio file.
sample_ids = []

for file_name in ambisonics_files:
    # splitext copes with names that contain several dots or none at all,
    # where unpacking file_name.split(".") into two values would raise.
    stem, _ = os.path.splitext(file_name)

    # Skip entries whose stem is not a plain decimal number (for example
    # hidden or system files) instead of crashing or recording a bogus id 0.
    if not stem.isdecimal():
        continue

    # int() already ignores leading zeros, so no manual stripping is needed.
    sample_ids.append(int(stem))

filtered_metadata = metadata[metadata["sample_id"].isin(sample_ids)]
filtered_metadata.shape

(735, 47)

# Preprocess Audio

## Cochlear Filter

In [None]:
def cochlear_filter(channel_data, sr):
    """Placeholder for the cochlear filtering stage (not implemented yet).

    Args:
        channel_data: Samples of a single audio channel.
        sr: Sampling rate of ``channel_data``; currently unused.

    Returns:
        ``channel_data`` unchanged, so that callers receive audio rather than
        ``None`` until a real filter is written.
    """
    # TODO: implement the cochlear filter and return the processed data.
    return channel_data

## Normalize

In [28]:
def normalize(data):
    """Min-max scale ``data`` so its smallest value maps to 0 and its largest to 1.

    Args:
        data: Object exposing ``.min()`` / ``.max()`` and elementwise
            arithmetic (the notebook passes NumPy arrays and torch tensors).

    Returns:
        ``(data - min) / (max - min)``. When every element is equal the range
        is zero; in that case an all-zero result is returned instead of the
        NaNs that a 0/0 division would produce.
    """
    lowest = data.min()
    span = data.max() - lowest
    shifted = data - lowest

    if span == 0:
        # Constant input: `shifted` is already all zeros; multiplying by 0.0
        # keeps the result floating point like the regular branch.
        return shifted * 0.0

    return shifted / span

## Rate Based Encoding

In [None]:
def rate_based_encoding(data, max_rate=100, num_steps=100):
    """Convert a signal into a rate-coded spike train.

    The signal is converted to a float tensor, min-max normalized with
    ``normalize``, multiplied by ``max_rate`` and handed to ``spikegen.rate``.

    Args:
        data: Signal to encode (NumPy array or anything ``torch.as_tensor``
            accepts).
        max_rate: Factor applied to the normalized signal.
        num_steps: Number of time steps requested from ``spikegen.rate``.

    Returns:
        The value returned by ``spikegen.rate``.
    """
    # NOTE(review): `spikegen` is not imported in the visible cells; presumably
    # `from snntorch import spikegen` is intended - add it to the import cell.
    # The tensor (not the raw NumPy array) must flow through the pipeline.
    data_tensor = torch.as_tensor(data, dtype=torch.float32)

    normalized_data = normalize(data_tensor)

    # NOTE(review): if spikegen.rate interprets its input as a per-step spike
    # probability, values above 1 would saturate - confirm the scaling by
    # max_rate is what is wanted.
    spike_rates = normalized_data * max_rate

    spike_train = spikegen.rate(spike_rates, num_steps=num_steps)

    return spike_train

## Time Based Encoding

In [None]:
def time_based_encoding(data, num_steps=100):
    """Convert a signal into a latency-coded spike train.

    The signal is converted to a float tensor, min-max normalized with
    ``normalize``, binarized at 0.5 and handed to ``spikegen.latency``.

    Args:
        data: Signal to encode (NumPy array or anything ``torch.as_tensor``
            accepts).
        num_steps: Number of time steps requested from ``spikegen.latency``.

    Returns:
        The value returned by ``spikegen.latency``.
    """
    # NOTE(review): `spikegen` is not imported in the visible cells; presumably
    # `from snntorch import spikegen` is intended - add it to the import cell.
    # torch.where needs a tensor condition, so normalize the tensor rather
    # than the raw NumPy array.
    data_tensor = torch.as_tensor(data, dtype=torch.float32)

    normalized_data = normalize(data_tensor)

    # 1 where the normalized sample exceeds 0.5, otherwise 0.
    # NOTE(review): binarizing before latency coding leaves only two distinct
    # input values - confirm this is intended.
    spike_times = torch.where(normalized_data > 0.5, 1, 0)

    spike_trains = spikegen.latency(spike_times, num_steps=num_steps)

    return spike_trains
    
    

In [None]:
def preprocess_audio(filepath, max_duration):
    """Load a multi-channel recording and spike-encode its first four channels.

    The channels are taken in the order W, X, Y, Z:
        W: Omnidirectional
        X: Front - Back
        Y: Left - Right
        Z: Top - Bottom

    Each channel is padded or truncated to ``max_duration`` seconds, passed
    through ``cochlear_filter`` and encoded with ``time_based_encoding`` when
    ``config["time_based_encoding"]`` is truthy, otherwise with
    ``rate_based_encoding``.

    Args:
        filepath: Path of the audio file to load.
        max_duration: Target length in seconds; multiplied by the file's
            sampling rate to obtain the target length in samples.

    Returns:
        Tuple ``(spike_trains_W, spike_trains_X, spike_trains_Y, spike_trains_Z)``.

    Raises:
        ValueError: If the loaded audio does not have at least four channels.
    """
    # sr=None keeps the file's own sampling rate; mono=False keeps the
    # channels separate instead of down-mixing them.
    audio, sr = librosa.load(filepath, sr=None, mono=False)

    # A mono file loads as a 1-D array, for which indexing would yield single
    # samples rather than channels - fail early with a clear message.
    if audio.ndim != 2 or audio.shape[0] < 4:
        raise ValueError(
            f"Expected at least 4 channels (W, X, Y, Z) in {filepath!r}, "
            f"got audio with shape {audio.shape}"
        )

    max_length = int(max_duration * sr)
    padded_audio = librosa.util.fix_length(data=audio, size=max_length)

    # Pick the encoder once instead of repeating the branch per channel.
    if config["time_based_encoding"]:
        def encode(channel):
            return time_based_encoding(channel, config["num_steps"])
    else:
        def encode(channel):
            return rate_based_encoding(channel, config["max_rate"], config["num_steps"])

    spike_trains_W, spike_trains_X, spike_trains_Y, spike_trains_Z = (
        encode(cochlear_filter(channel, sr)) for channel in padded_audio[:4]
    )

    return spike_trains_W, spike_trains_X, spike_trains_Y, spike_trains_Z



    

In [None]:
# Encode the first listed recording; the target length is the largest
# "audio_info/duration" value found in the filtered metadata.
spikes_W, spikes_X, spikes_Y, spikes_Z = preprocess_audio(
    os.path.join(ambisonics_path, ambisonics_files[0]),
    filtered_metadata["audio_info/duration"].max(),
)