# Additive noise datasets

In [1]:
from sbb_project import consts
from sbb_project.augmentation import augmentations

In [42]:
# Import the data augmentation component from ASR collection
from nemo.collections.asr.parts.preprocessing import perturb, segment
import glob
import IPython.display as ipd
import torch
import librosa
import json
import random

### Types of perturbations

In [3]:
# Display the registry that maps each perturbation name to its NeMo perturbation class.
perturb.perturbation_types

{'speed': nemo.collections.asr.parts.preprocessing.perturb.SpeedPerturbation,
 'time_stretch': nemo.collections.asr.parts.preprocessing.perturb.TimeStretchPerturbation,
 'gain': nemo.collections.asr.parts.preprocessing.perturb.GainPerturbation,
 'impulse': nemo.collections.asr.parts.preprocessing.perturb.ImpulsePerturbation,
 'shift': nemo.collections.asr.parts.preprocessing.perturb.ShiftPerturbation,
 'noise': nemo.collections.asr.parts.preprocessing.perturb.NoisePerturbation,
 'white_noise': nemo.collections.asr.parts.preprocessing.perturb.WhiteNoisePerturbation,
 'rir_noise_aug': nemo.collections.asr.parts.preprocessing.perturb.RirAndNoisePerturbation,
 'transcode_aug': nemo.collections.asr.parts.preprocessing.perturb.TranscodePerturbation,
 'random_segment': nemo.collections.asr.parts.preprocessing.perturb.RandomSegmentPerturbation}

### Obtain all audio files and define loader

In [4]:
# Build the manifest path of each split from the shared file-name template.
train_dataset, test_dataset, val_dataset = (
    consts.MANIFEST_DIR.joinpath(consts.MANIFEST_FILE.format(split_name))
    for split_name in ("train", "test", "val")
)

In [5]:
# Peek at the first line of the training manifest: a JSON record with
# "audio_filepath", "text" and "duration" keys.
!head -n 1 {train_dataset}

{"audio_filepath": "/home/user/code/sbb_asr/data/sbb_exchange/all_samples/audios/9dae9654-d72f-4b0c-9212-f2dc8e58f1ad.wav", "text": "rangierfahrt von eins ins gleis eins drei antworten", "duration": 12.0}


In [6]:
# Collect every .wav file of the SBB audio exchange directory.
# glob.glob yields paths in a filesystem-dependent order, so sort them: the cells
# below index audio_files[0] and must pick the same clip on every run and machine.
audio_files = sorted(glob.glob(str(consts.SBB_DATA_EXCHANGE_AUDIO.joinpath('*.wav'))))

In [7]:
# Collect every .wav file of the noise directory.
# Sorted because glob.glob's ordering depends on the filesystem; a stable order keeps
# everything derived from this list reproducible across runs and machines.
noise_files = sorted(glob.glob(str(consts.NOISE_DIR.joinpath('*.wav'))))

In [8]:
def load_audio(filepath, sr) -> segment.AudioSegment:
    """Read an audio file into a NeMo ``AudioSegment``.

    Args:
        filepath: Path of the audio file to load.
        sr: Sample rate forwarded as ``target_sr`` to ``AudioSegment.from_file``.

    Returns:
        The ``AudioSegment`` created from the file.
    """
    return segment.AudioSegment.from_file(filepath, target_sr=sr)

In [9]:
# Load the first clip at 16 kHz and render an inline player for the unmodified audio.
sample_segment = load_audio(audio_files[0], sr=16000)
ipd.Audio(sample_segment.samples, rate=16000)

## White Noise

White noise perturbation is performed in the following steps:
1) Randomly sample the amplitude of the noise from a uniformly distributed range (defined in dB)
2) Sample Gaussian noise (mean = 0, std = 1) with the same length as the audio signal
3) Scale this Gaussian noise by the amplitude (in dB scale)
4) Add this noise vector to the original sample

In [10]:
# Load a fresh copy of the clip for this experiment; perturb() below is called for its
# effect on the segment it receives (its return value is not used).
sample_segment_whitenoise = load_audio(audio_files[0], sr=16000)

In [11]:
# White-noise perturbation whose noise level is drawn between min_level and max_level
# (in dB, per the step description above).
white_noise = perturb.WhiteNoisePerturbation(min_level=-80, max_level=-40)

In [12]:
# Apply the white noise to the segment; the next cell plays back the same segment object.
white_noise.perturb(sample_segment_whitenoise)

In [13]:
# Listen to the clip after white-noise perturbation.
ipd.Audio(sample_segment_whitenoise.samples, rate=16000)

## Real World Noise Perturbation

Noise perturbation is performed in the following steps:
1) Randomly sample the amplitude scale of the noise sample from a uniformly distributed range (defined in dB)
2) Randomly choose an audio clip from the set of noise audio samples available
3) Compute the gain (in dB) required for the noise clip as compared to the original sample and scale the noise by this factor
4) If the noise snippet is of shorter duration than the original audio, then randomly select an index in time from the original sample, where the noise snippet will be added
5) If instead the noise snippet is longer than the original audio, then randomly select a subsegment of the noise snippet and add that subsegment over the full length of the original audio

In [18]:
# Path of the manifest file that lists the noise clips.
noise_manifest = consts.MANIFEST_DIR.joinpath(consts.NOISE_MANIFEST_FILE)

In [41]:
# Write the noise manifest for the collected noise files.
# NOTE(review): `write_noise_manifest` is neither defined nor imported in any visible
# cell, so this fails with NameError on a fresh kernel (Restart & Run All). Import it
# explicitly from the module that provides it — TODO confirm where it lives.
write_noise_manifest(noise_files, noise_manifest)

      1.9842437e-02  1.2551837e-02] as keyword args. From version 0.10 passing these as positional arguments will result in an error
      duration = librosa.get_duration(x, sr=_sr)
    


Wrote 1 segments for filename /home/user/code/sbb_asr/data/manifests/noise.json


      2.3476014e-02  1.6482541e-02] as keyword args. From version 0.10 passing these as positional arguments will result in an error
      duration = librosa.get_duration(x, sr=_sr)
    


Wrote 1 segments for filename /home/user/code/sbb_asr/data/manifests/noise.json


     -7.2379392e-03 -6.2738485e-03] as keyword args. From version 0.10 passing these as positional arguments will result in an error
      duration = librosa.get_duration(x, sr=_sr)
    


Wrote 1 segments for filename /home/user/code/sbb_asr/data/manifests/noise.json


     -1.6572963e-02 -1.3357342e-02] as keyword args. From version 0.10 passing these as positional arguments will result in an error
      duration = librosa.get_duration(x, sr=_sr)
    


Wrote 1 segments for filename /home/user/code/sbb_asr/data/manifests/noise.json
Finished preparing manifest !


In [43]:
# Fixed-seed random generator, passed to the noise perturbation below so that its
# random choices are repeatable.
rng = random.Random(0)

In [85]:
# Load a fresh copy of the clip to receive the real-world noise.
sample_segment_realnoise = load_audio(audio_files[0], sr=16000)

In [86]:
# Real-world noise perturbation fed from the noise manifest.
# min_snr_db and max_snr_db are equal, so the SNR is pinned to -2 dB rather than a range.
noise = perturb.NoisePerturbation(
    manifest_path=str(noise_manifest),
    min_snr_db=-2,
    max_snr_db=-2,
    max_gain_db=10.0,
    rng=rng,
)

[NeMo I 2022-12-07 14:07:25 collections:194] Dataset loaded with 4 files totalling 0.03 hours
[NeMo I 2022-12-07 14:07:25 collections:195] 0 files were filtered totalling 0.00 hours


In [87]:
# Mix the noise into the segment; the next cell plays back the same segment object.
noise.perturb(sample_segment_realnoise)

In [88]:
# Listen to the clip after real-world noise perturbation.
ipd.Audio(sample_segment_realnoise.samples, rate=16000)

## Speed perturbation

Speed perturbation changes the speed of the speech but does not preserve the pitch of the sound.

In [126]:
# Load a fresh copy of the clip for the speed experiment.
sample_segment_speed = load_audio(audio_files[0], sr=16000)

In [127]:
# Speed perturbation at 16 kHz with a speed rate between 1.2 and 1.8.
# NOTE(review): num_rates=-1 presumably samples the rate from the continuous range
# instead of a fixed grid of rates — confirm against the NeMo documentation.
resample_type = 'scipy'
speed = perturb.SpeedPerturbation(
    16000,
    resample_type,
    min_speed_rate=1.2,
    max_speed_rate=1.8,
    num_rates=-1,
)

In [128]:
# Apply the speed change to the segment; the next cell plays back the same segment object.
speed.perturb(sample_segment_speed)

In [129]:
# Listen to the clip after speed perturbation.
ipd.Audio(sample_segment_speed.samples, rate=16000)

## Time stretch perturbation

Time stretch perturbation changes the speed of the speech while preserving the pitch of the sound. Try a few random augmentations to hear how the pitch stays nearly the same as the duration of the audio changes.

In [110]:
# Load a fresh copy of the clip for the time-stretch experiment.
sample_segment_timestretch = load_audio(audio_files[0], sr=16000)

In [114]:
# Time-stretch perturbation with a speed rate between 0.8 and 1.5.
time_stretch = perturb.TimeStretchPerturbation(
    min_speed_rate=0.8,
    max_speed_rate=1.5,
    num_rates=3,
)

In [115]:
# Apply the time stretch to the segment; the next cell plays back the same segment object.
time_stretch.perturb(sample_segment_timestretch)

In [116]:
# Listen to the clip after time-stretch perturbation.
ipd.Audio(sample_segment_timestretch.samples, rate=16000)