This notebook generates binaural sound examples for the common slopes amplitudes interpolation problem. 
First, the sound field in octave bands is generated from the amplitudes learned by the DNN in each octave band. An ambisonics RIR is then
reconstructed from the learned amplitudes using white-noise shaping.

Simultaneously, an HRTF dataset is loaded and converted to the ambisonics domain. The ambisonics RIRs are first rotated according to the head orientation and then convolved with the spherical-harmonic (SH) representation of the HRTFs. The result is then convolved with the input mono signal to obtain the binauralised output.

In [None]:
import torch
import numpy as np
from pathlib import Path
import soundfile as sf
import pyfar as pf
import librosa
import pickle
import IPython
from loguru import logger
from copy import deepcopy
import matplotlib.pyplot as plt
import os
os.chdir('../..')

from spatial_sampling.inference import get_ambisonic_rirs
from spatial_sampling.dataloader import SpatialThreeRoomDataset, SpatialRoomDataset
from diff_gfdn.utils import ms_to_samps

from src.sofa_parser import HRIRSOFAReader, SRIRSOFAWriter, save_to_sofa
from src.sound_examples import binaural_dynamic_rendering, add_direct_and_early_path

In [None]:
# Base directory (relative to the working directory set in the import cell) for the
# rendered binaural WAV files and the animation outputs of this notebook.
out_path = 'output/spatial_sampling/sound_examples'
# Base directory for the audio assets written by this notebook (dry stimulus, single IR excerpts).
# Note the trailing slash: the f-strings below add another '/', which pathlib collapses.
audio_path = 'audio/sound_examples/'

### Create a trajectory of a listener moving across the space

In [None]:
# The listener trajectory is made of two straight segments. For each segment we build
#   * an (num_pos, 3) array of receiver positions with columns (x, y, z), z fixed at 1.5
#   * an (num_pos, 2) array of head orientations with columns (azimuth, elevation) in radians
# and finally stack both segments into rec_pos_list / head_orientation_list.

# --- segment 1: along x axis between three rooms ---
start_pos_x, start_pos_y = 0.5, 3.5
end_pos_x, end_pos_y = 9, 3.5
num_pos = 50

linear_trajectory_x = np.linspace(start_pos_x, end_pos_x, num_pos)
linear_trajectory_y = np.linspace(start_pos_y, end_pos_y, num_pos)
linear_trajectory_z = np.full(num_pos, 1.5)
# azimuth sweeps from 200 deg to 30 deg, elevation stays at 0
head_orientation_az = np.deg2rad(np.linspace(200, 30, num_pos))
head_orientation_el = np.zeros(num_pos)

rec_pos_list = np.column_stack(
    (linear_trajectory_x, linear_trajectory_y, linear_trajectory_z))
orientation_list = np.column_stack((head_orientation_az, head_orientation_el))

# --- segment 2: along y-axis between rooms 2 and 3 ---
# NOTE(review): this segment starts at x = 9.1 while segment 1 ends at x = 9 —
# confirm that the small jump between the segments is intended.
start_pos_x, start_pos_y = 9.1, 3.5
end_pos_x, end_pos_y = 9.0, 12.0
num_pos = 68

linear_trajectory_x = np.linspace(start_pos_x, end_pos_x, num_pos)
linear_trajectory_y = np.linspace(start_pos_y, end_pos_y, num_pos)
linear_trajectory_z = np.full(num_pos, 1.5)
# azimuth sweeps from 30 deg to 150 deg, elevation stays at 0
head_orientation_az = np.deg2rad(np.linspace(30, 150, num_pos))
head_orientation_el = np.zeros(num_pos)

# append segment 2 below segment 1 (row-wise)
rec_pos_list = np.concatenate(
    (rec_pos_list,
     np.column_stack((linear_trajectory_x, linear_trajectory_y, linear_trajectory_z))),
    axis=0)
head_orientation_list = np.concatenate(
    (orientation_list,
     np.column_stack((head_orientation_az, head_orientation_el))),
    axis=0)

### Get the true room dataset with its corresponding ambisonics RIR

In [None]:
# Pickle with the reference room data and the directory holding the spatial-sampling
# configs; both are resolved to absolute paths from the current working directory.
# config_path is only consumed later, when the trained model is used for inference.
room_data_pkl_path = Path('resources/Georg_3room_FDTD/srirs_spatial.pkl').resolve()
config_path = Path('data/config/spatial_sampling/').resolve()

# get the original dataset
true_cs_room_data = SpatialThreeRoomDataset(room_data_pkl_path)

# Export the reference dataset as a SOFA file. A deep copy is handed over so that
# true_cs_room_data itself stays untouched (presumably save_to_sofa may modify its
# argument — TODO confirm).
save_path = Path('resources/SOFA files/true_ambi_srirs.sofa').resolve()
save_to_sofa(deepcopy(true_cs_room_data), save_path)


### Get the mono, dry stimulus and resample it

In [None]:
# Dry mono stimulus taken from the pyfar example files:
# 'drums' selects the drum loop, any other value selects the speech sample.
sig_type = 'speech'

if sig_type == 'drums':
    speech_data = pf.signals.files.drums()
else:
    speech_data = pf.signals.files.speech()

# sample rate of the stimulus and of the room dataset
fs = speech_data.sampling_rate
new_fs = int(true_cs_room_data.sample_rate)

# drop singleton dimensions of the time data
speech = np.squeeze(speech_data.time)

# resample only when the stimulus rate differs from the dataset rate
if fs != new_fs:
    speech = librosa.resample(speech, orig_sr=fs, target_sr=new_fs)

# add some silence at the end (ms_to_samps(500, new_fs) zero samples)
silence = np.zeros(ms_to_samps(500, new_fs))
speech_app = np.concatenate((speech, silence))

# store the padded stimulus and offer it for playback;
# the Audio object must stay the last expression of the cell to be rendered
save_path = Path(f'{audio_path}/stimulus/{sig_type}.wav').resolve()
sf.write(save_path, speech_app, new_fs)
IPython.display.Audio(save_path)

### Load the HRTF dataset

In [None]:
# HRIRSOFAReader is already imported in the first cell; the import is repeated here.
from src.sofa_parser import HRIRSOFAReader

# HRIR set in SOFA format; the path is relative to the current working directory.
# NOTE(review): the file name indicates 48 kHz data — confirm this matches
# true_cs_room_data.sample_rate, which the rest of the notebook uses for rendering.
hrtf_path = Path('resources/HRTF/48kHz/KEMAR_Knowl_EarSim_SmallEars_FreeFieldComp_48kHz.sofa')
hrtf_reader = HRIRSOFAReader(hrtf_path)

### Create a sound examples object for the reference RIRs

In [None]:
# binaural_dynamic_rendering is already imported in the first cell; the import is repeated here.
from src.sound_examples import binaural_dynamic_rendering

# Forwarded as the update_ms keyword to binaural_dynamic_rendering in the rendering cells.
update_ms = 250 # should be a factor of 1 s
# Base path (without extension) for the animation outputs; later cells append suffixes
# such as '_moving_listener.mp4' or '_reference_<sig_type>' to it.
ani_save_path = Path(f'{out_path}/treble_data_binaural').resolve()

In [None]:
# Renderer for the reference dataset. It receives the dataset, the listener trajectory
# (receiver positions and head orientations), the padded dry stimulus and the HRIR reader.
dynamic_renderer = binaural_dynamic_rendering(
    true_cs_room_data,
    rec_pos_list,
    head_orientation_list,
    speech_app,
    hrtf_reader,
    update_ms=update_ms,
)

# Create the animation of the moving listener under the ani_save_path base name.
dynamic_renderer.animate_moving_listener(ani_save_path)

# Write the renderer's extended stimulus as WAV at the dataset sample rate.
save_path = Path(f'{out_path}/extended_stimulus_{sig_type}.wav').resolve()
sf.write(
    save_path,
    dynamic_renderer.extended_stimulus,
    int(true_cs_room_data.sample_rate),
)

In [None]:
# Save the subset of receiver positions used for the listening examples as SOFA files.
# The subset is a copy of the reference dataset whose RIRs are restricted to the
# renderer's rec_idxs and whose receiver positions are replaced by the trajectory.
subset_true_cs_room_data = deepcopy(true_cs_room_data)
subset_rirs = true_cs_room_data.rirs[dynamic_renderer.rec_idxs, ...]
subset_true_cs_room_data.update_rirs(subset_rirs)
subset_true_cs_room_data.update_receiver_pos(rec_pos_list)

# NOTE(review): both files receive exactly the same data — the '_late_' file is not
# reduced to the late part anywhere in this cell. Confirm whether that is intended.
for sofa_file in ('resources/SOFA files/true_ambi_srirs_trajectory.sofa',
                  'resources/SOFA files/true_ambi_srirs_late_trajectory.sofa'):
    save_path = Path(sofa_file).resolve()
    # a fresh deep copy is passed for every export
    save_to_sofa(deepcopy(subset_true_cs_room_data), save_path)

In [None]:
# cross-fading convolution with the reference set of RIRs
ref_output = dynamic_renderer.binaural_filter_overlap_add()

# normalise to -24 dB LUFS and store the binaural reference rendering
ref_output_norm = dynamic_renderer.normalise_loudness(ref_output, true_cs_room_data.sample_rate, db_lufs=-24)
save_path = Path(f'{out_path}/binaural_reference_moving_listener_{sig_type}.wav').resolve()
sf.write(save_path, ref_output_norm, int(true_cs_room_data.sample_rate))
# A bare Audio(...) expression is only rendered when it is the last statement of a cell.
# More statements follow here, so the player has to be displayed explicitly.
IPython.display.display(IPython.display.Audio(save_path))

# combine the listener animation with the rendered audio; the renderer is not needed
# afterwards and is deleted before the next dataset is rendered
dynamic_renderer.combine_animation_and_sound(f'{ani_save_path}_moving_listener.mp4', f'{save_path}', f'{ani_save_path}_reference_{sig_type}')
del dynamic_renderer

### Get the room dataset using the common slopes directional amplitudes

In [None]:
# Re-execute spatial_sampling.inference and re-bind get_ambisonic_rirs to the reloaded module.
from importlib import reload
import spatial_sampling
reload(spatial_sampling.inference)
from spatial_sampling.inference import get_ambisonic_rirs

output_pkl_path = Path('output/spatial_sampling/grid_rir_treble_cs_ambi_rirs.pkl').resolve()

if output_pkl_path.exists():
    # Reuse the cached result of an earlier run.
    # NOTE(review): the cache is used regardless of the current rec_pos_list — delete the
    # file after changing the trajectory. Only unpickle files produced by this project.
    with output_pkl_path.open("rb") as f:
        cs_room_data = pickle.load(f)
else:
    # No cache yet: compute the ambisonic RIRs at the trajectory positions without the
    # trained model (use_trained_model=False); output_pkl_path is handed to the function.
    cs_room_data = get_ambisonic_rirs(
        rec_pos_list,
        true_cs_room_data,
        use_trained_model=False,
        output_pkl_path=output_pkl_path,
    )

# export the dataset as obtained above (file name marks it as the late part)
save_path = Path('resources/SOFA files/cs_predicted_ambi_srirs_late.sofa').resolve()
save_to_sofa(deepcopy(cs_room_data), save_path)

# replace the RIRs by the output of add_direct_and_early_path and export again
full_rirs = add_direct_and_early_path(true_cs_room_data, cs_room_data)
cs_room_data.update_rirs(full_rirs)

save_path = Path('resources/SOFA files/cs_predicted_ambi_srirs.sofa').resolve()
save_to_sofa(deepcopy(cs_room_data), save_path)

### Create sound example with CS SRIRs

In [None]:
# Renderer for the common-slopes (CS) dataset, using the same trajectory, stimulus and
# HRIR reader as for the reference rendering.
dynamic_renderer = binaural_dynamic_rendering(
    cs_room_data,
    rec_pos_list,
    head_orientation_list,
    speech_app,
    hrtf_reader,
    update_ms=update_ms,
)

# cross-fading convolution with the CS set of RIRs
cs_output = dynamic_renderer.binaural_filter_overlap_add()

# normalise to -24 dB LUFS and store the binaural CS rendering
cs_output_norm = dynamic_renderer.normalise_loudness(cs_output, cs_room_data.sample_rate, db_lufs=-24)
save_path = Path(f'{out_path}/binaural_cs_moving_listener_{sig_type}.wav').resolve()
sf.write(save_path, cs_output_norm, int(cs_room_data.sample_rate))
# A bare Audio(...) expression is only rendered when it is the last statement of a cell.
# More statements follow here, so the player has to be displayed explicitly.
IPython.display.display(IPython.display.Audio(save_path))

# combine the existing listener animation with the rendered audio, then drop the renderer
dynamic_renderer.combine_animation_and_sound(f'{ani_save_path}_moving_listener.mp4', f'{save_path}', f'{ani_save_path}_cs_{sig_type}')
del dynamic_renderer

### Get the room dataset with the predicted amplitudes from the DNN with its corresponding ambisonics RIR

In [None]:
# Re-execute spatial_sampling.inference and re-bind get_ambisonic_rirs to the reloaded module.
from importlib import reload
import spatial_sampling
reload(spatial_sampling.inference)
from spatial_sampling.inference import get_ambisonic_rirs

# grid resolution passed to the inference (grid_resolution_m) and encoded in the file names
grid_res = 0.9
output_pkl_path = Path(f'output/spatial_sampling/grid_rir_treble_mlp_ambi_rirs_grid_res={grid_res:.1f}.pkl').resolve()

if output_pkl_path.exists():
    # Reuse the cached result of an earlier run.
    # NOTE(review): the cache is used regardless of the current rec_pos_list — delete the
    # file after changing the trajectory. Only unpickle files produced by this project.
    with output_pkl_path.open("rb") as f:
        pred_cs_room_data = pickle.load(f)
else:
    # get predicted output from the trained models
    pred_cs_room_data = get_ambisonic_rirs(
        rec_pos_list,
        true_cs_room_data,
        use_trained_model=True,
        config_path=config_path,
        grid_resolution_m=grid_res,
        output_pkl_path=output_pkl_path,
    )

# export the dataset as obtained above (file name marks it as the late part)
save_path = Path(f'resources/SOFA files/mlp_predicted_ambi_srirs_late_grid_spacing={grid_res:.1f}m.sofa').resolve()
save_to_sofa(deepcopy(pred_cs_room_data), save_path)

# replace the RIRs by the output of add_direct_and_early_path and export again
full_rirs = add_direct_and_early_path(true_cs_room_data, pred_cs_room_data)
pred_cs_room_data.update_rirs(full_rirs)

save_path = Path(f'resources/SOFA files/mlp_predicted_ambi_srirs_grid_spacing={grid_res:.1f}m.sofa').resolve()
save_to_sofa(deepcopy(pred_cs_room_data), save_path)

#### Plot an RIR for sanity check

In [None]:
# Compare one channel of the CS and MLP RIRs at a single receiver position.
# The plotted excerpt runs from ms_to_samps(50, sample_rate) up to 2 * sample_rate samples.
start_idx = ms_to_samps(50, true_cs_room_data.sample_rate)
end_idx = int(2*true_cs_room_data.sample_rate)
pos_num = 18
chan_num = 2

# Each curve carries its own label so the legend always matches what is actually plotted
# (a fixed three-entry label list did not match the two plotted curves).
plt.plot(cs_room_data.rirs[pos_num, chan_num, start_idx:end_idx], label='CS pred')
plt.plot(pred_cs_room_data.rirs[pos_num, chan_num, start_idx:end_idx], label='MLP pred')
# plt.plot(true_cs_room_data.rirs[pos_num, chan_num, start_idx:end_idx], label='Reference')
plt.legend()

# Export the full-length IRs of the same position/channel for listening.
# The sample rate is cast to int, as in every other sf.write call of this notebook.
save_path = Path(f'{audio_path}/reference_ambi/reference_ir_pos={pos_num}_chan={chan_num}.wav').resolve()
sf.write(save_path, true_cs_room_data.rirs[pos_num, chan_num, :], int(true_cs_room_data.sample_rate))

save_path = Path(f'{audio_path}/reference_ambi/cs_ir_pos={pos_num}_chan={chan_num}.wav').resolve()
sf.write(save_path, cs_room_data.rirs[pos_num, chan_num, :], int(true_cs_room_data.sample_rate))

save_path = Path(f'{audio_path}/reference_ambi/pred_cs_ir_pos={pos_num}_chan={chan_num}.wav').resolve()
sf.write(save_path, pred_cs_room_data.rirs[pos_num, chan_num, :], int(true_cs_room_data.sample_rate))

### Create sound example with MLP predicted SRIRs

In [None]:
# Renderer for the MLP-predicted dataset, using the same trajectory, stimulus and
# HRIR reader as for the reference rendering.
dynamic_renderer = binaural_dynamic_rendering(
    pred_cs_room_data,
    rec_pos_list,
    head_orientation_list,
    speech_app,
    hrtf_reader,
    update_ms=update_ms,
)

# cross-fading convolution with the MLP-predicted set of RIRs
pred_output = dynamic_renderer.binaural_filter_overlap_add()

# normalise to -24 dB LUFS and store the binaural MLP rendering
pred_output_norm = dynamic_renderer.normalise_loudness(pred_output, pred_cs_room_data.sample_rate, db_lufs=-24)
save_path = Path(f'{out_path}/binaural_mlp_grid_res={grid_res:.1f}_moving_listener_{sig_type}.wav').resolve()
sf.write(save_path, pred_output_norm, int(pred_cs_room_data.sample_rate))
# A bare Audio(...) expression is only rendered when it is the last statement of a cell.
# More statements follow here, so the player has to be displayed explicitly.
IPython.display.display(IPython.display.Audio(save_path))

# combine the existing listener animation with the rendered audio
dynamic_renderer.combine_animation_and_sound(f'{ani_save_path}_moving_listener.mp4', f'{save_path}',
                                             f'{ani_save_path}_mlp_grid_res={grid_res:.1f}_{sig_type}')