In [1]:
import random

import librosa
import matplotlib.pyplot as plt
import torch
import torchaudio
import torchaudio.functional as F
from IPython.display import Audio

In [2]:
import os
# List the current working directory; the audio file loaded below is
# addressed relative to it.
os.listdir()

['test_audio.wav', 'SpeechAug.md', 'sa.ipynb', 'asset']

In [3]:
# Path is relative to the notebook's working directory.
audio_path = './test_audio.wav'
# NOTE(review): librosa.load is called without sr=..., so `sampling_rate` is
# librosa's default target rate (22050 Hz per the librosa docs), not
# necessarily the file's native rate. Pass sr=None to keep the native rate.
data, sampling_rate = librosa.load(audio_path)

In [19]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np

# Waveform-level augmentation pipeline built from audiomentations transforms.
# NOTE(review): p is presumably each transform's probability of being
# applied on a given call — confirm in the audiomentations docs.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    # min_rate == max_rate, so the stretch factor is fixed at 1.5.
    TimeStretch(min_rate=1.5, max_rate=1.5, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    # Shift(p=0.5),
])

# Augment/transform/perturb the audio data
augmented_samples = augment(samples=data, sample_rate=sampling_rate)

In [20]:
import IPython.display as ipd
# Play the augmented samples at the rate the clip was loaded with.
ipd.Audio(augmented_samples, rate=sampling_rate)

## Time stretching
Time stretching in audio processing is a technique used to alter the duration of an audio signal without changing its pitch.

The `librosa.effects.time_stretch` function is a powerful tool in the librosa library for time-stretching audio signals in Python.
Its stretch factor is `rate`: if `rate > 1`, the signal is sped up; if `rate < 1`, the signal is slowed down.


In [4]:
import IPython.display as ipd
# rate=1.25 is greater than 1, so the clip is sped up; playback keeps the
# original sampling rate.
data_fast = librosa.effects.time_stretch(data, rate=1.25)
ipd.Audio(data_fast, rate=sampling_rate)



The librosa library in Python provides robust tools for audio analysis and manipulation, including pitch shifting. With librosa, you can easily adjust the pitch of an audio signal using functions like librosa.effects.pitch_shift.

`n_steps`: the number of steps by which to shift the pitch of the audio signal (one step is a semitone with the default `bins_per_octave=12`).

- Positive values raise the pitch.
- Negative values lower the pitch.

In [6]:

# Shift up by 4 steps (a major third, assuming bins_per_octave is left at
# 12 semitones per octave).
y_third = librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=4)
ipd.Audio(y_third, rate=sampling_rate)

In [23]:
# Unmodified clip, for comparison with the processed versions.
ipd.Audio(data, rate=sampling_rate)

In [25]:
from audiomentations import TimeStretch

# Fixed-factor time stretch: min_rate == max_rate pins the rate at 0.6
# (slower than the original) and p=1.0 requests it on every call.
transform = TimeStretch(
    min_rate=0.6,
    max_rate=0.6,
    leave_length_unchanged=False,
    p=1.0
)

# Use the rate `data` was loaded with instead of a hard-coded value, so the
# transform and the playback below agree on the sample rate.
augmented_sound = transform(data, sample_rate=sampling_rate)
ipd.Audio(augmented_sound, rate=sampling_rate)

In [None]:


# Both examples operate on the clip loaded at the top of the notebook
# (`data`, `sampling_rate`) so the cell runs on a fresh kernel.

# Shift down by a tritone (six steps if bins_per_octave is 12)
y_tritone = librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=-6)

# Shift up by 3 quarter-tones
y_three_qt = librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=3,
                                         bins_per_octave=24)

# Add noise

Spectrogram Augmentation:

    SpecAugment operates in the frequency domain by applying various augmentations to the spectrogram of an audio signal. This method is based on the observation that perturbing the spectrogram can help the ASR models generalize better to variations in real-world data.

Three Main Augmentations:

    Time Masking: Randomly masks certain time segments of the spectrogram. This simulates the effect of missing or corrupted speech information over time.
    Frequency Masking: Randomly masks certain frequency bands of the spectrogram. This helps the model become more resilient to variations in frequency content.
    Time Warping: Applies slight distortions to the time axis of the spectrogram. This augmentation simulates variations in speech rate and tempo.

Benefits:

    Improves Generalization: By introducing variability into the training data, SpecAugment helps models generalize better and perform more robustly in diverse conditions.
    Simplicity and Effectiveness: SpecAugment is relatively simple to implement and does not require additional data, making it an attractive option for improving ASR performance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def apply_time_mask(spectrogram, mask_size):
    """
    Apply time masking to the spectrogram.

    Zeroes ``mask_size`` consecutive columns (axis 1) starting at a uniformly
    random offset. The array is modified in place and also returned, so pass
    a copy if the original must be kept.

    Args:
        spectrogram: array whose axis 1 is the time axis.
        mask_size: number of time frames to zero. Values <= 0 leave the
            spectrogram untouched.

    Returns:
        The same array object, with the masked frames set to 0.

    Raises:
        ValueError: if ``mask_size`` exceeds the number of time frames.
    """
    num_frames = spectrogram.shape[1]
    if mask_size <= 0:
        # A negative size would otherwise turn into a wrap-around slice.
        return spectrogram
    if mask_size > num_frames:
        raise ValueError(
            f"mask_size ({mask_size}) exceeds the number of time frames ({num_frames})"
        )
    # randint's upper bound is exclusive: the +1 makes the last valid start
    # reachable and keeps the range non-empty when mask_size == num_frames.
    mask_start = np.random.randint(0, num_frames - mask_size + 1)
    spectrogram[:, mask_start:mask_start + mask_size] = 0
    return spectrogram

def apply_freq_mask(spectrogram, mask_size):
    """
    Apply frequency masking to the spectrogram.

    Zeroes ``mask_size`` consecutive rows (axis 0) starting at a uniformly
    random offset. The array is modified in place and also returned, so pass
    a copy if the original must be kept.

    Args:
        spectrogram: array whose axis 0 is the frequency axis.
        mask_size: number of frequency bins to zero. Values <= 0 leave the
            spectrogram untouched.

    Returns:
        The same array object, with the masked bins set to 0.

    Raises:
        ValueError: if ``mask_size`` exceeds the number of frequency bins.
    """
    num_freqs = spectrogram.shape[0]
    if mask_size <= 0:
        # A negative size would otherwise turn into a wrap-around slice.
        return spectrogram
    if mask_size > num_freqs:
        raise ValueError(
            f"mask_size ({mask_size}) exceeds the number of frequency bins ({num_freqs})"
        )
    # randint's upper bound is exclusive: the +1 makes the last valid start
    # reachable and keeps the range non-empty when mask_size == num_freqs.
    mask_start = np.random.randint(0, num_freqs - mask_size + 1)
    spectrogram[mask_start:mask_start + mask_size, :] = 0
    return spectrogram

# Generate a random spectrogram for demonstration
spectrogram = np.random.rand(80, 100)  # 80 frequency bins, 100 time frames

# Apply time and frequency masking together. Both helpers write into the
# array they are given, so only the first call needs a private copy.
masked_spectrogram = apply_time_mask(spectrogram.copy(), mask_size=10)
masked_spectrogram = apply_freq_mask(masked_spectrogram, mask_size=5)

# One (title, image) pair per panel. Each masked panel draws its own fresh
# random mask on a copy, so the original stays intact for the first panel.
panels = [
    ('Original Spectrogram', spectrogram),
    ('Time Masked Spectrogram', apply_time_mask(spectrogram.copy(), mask_size=10)),
    ('Frequency Masked Spectrogram', apply_freq_mask(spectrogram.copy(), mask_size=5)),
]

# Plot the original and masked spectrograms side by side
fig, axes = plt.subplots(1, len(panels), figsize=(12, 6))
for ax, (title, image) in zip(axes, panels):
    ax.set_title(title)
    im = ax.imshow(image, aspect='auto', origin='lower')
    fig.colorbar(im, ax=ax)

fig.tight_layout()
plt.show()


In [None]:
def time_warp(spec, W=5):
    """Warp a spectrogram with ``sparse_image_warp`` using one control point.

    A source control point is placed on the middle row and a destination
    point is offset from it by a random integer; both are handed to
    ``sparse_image_warp`` together with ``spec``.

    :param spec: tensor indexed as ``spec[0][row]``; ``shape[1]`` is used as
        the row count and ``shape[2]`` as the length of each row
        (presumably (1, n_mels, T) — TODO confirm against callers)
    :param int W: bound on the random offset, and the margin kept from both
        ends of the row when picking the random position
    :returns: first output of ``sparse_image_warp`` with dimension 3 squeezed

    NOTE(review): ``sparse_image_warp`` is not defined or imported in the
    visible cells; it must be provided before this function can run.
    """
    num_rows = spec.shape[1]
    spec_len = spec.shape[2]

    # Middle row of the spectrogram.
    y = num_rows // 2
    horizontal_line_at_ctr = spec[0][y]
    # assert len(horizontal_line_at_ctr) == spec_len

    # NOTE(review): this indexes the centre row, so point_to_warp holds the
    # spectrogram *value* at a random position, not the position itself, yet
    # it is used below as the second coordinate of the control point. This
    # looks unintended — confirm against sparse_image_warp's point format.
    # randrange raises ValueError when spec_len <= 2 * W (empty range).
    point_to_warp = horizontal_line_at_ctr[random.randrange(W, spec_len-W)]
    # assert isinstance(point_to_warp, torch.Tensor)

    # Integer offset drawn uniformly from [-W, W): W itself is excluded.
    dist_to_warp = random.randrange(-W, W)
    src_pts = torch.tensor([[[y, point_to_warp]]])
    dest_pts = torch.tensor([[[y, point_to_warp + dist_to_warp]]])
    # dense_flows is returned by the helper but not used here.
    warped_spectro, dense_flows = sparse_image_warp(spec, src_pts, dest_pts)
    return warped_spectro.squeeze(3)


def spec_augment(mel_spectrogram, time_warping_para=80, frequency_masking_para=27,
                 time_masking_para=100, frequency_mask_num=1, time_mask_num=1):
    """Spec augmentation Calculation Function.

    'SpecAugment' has 3 steps for audio data augmentation: the first step is
    time warping (delegated to ``time_warp``), the second is frequency
    masking and the last is time masking. The masks are written into the
    warped spectrogram returned by ``time_warp``.

    # Arguments:
      mel_spectrogram: mel spectrogram to warp and mask; axis 1 is treated
        as the mel-bin axis and axis 2 as the time axis.
      time_warping_para(int): Augmentation parameter, "time warp parameter W".
        Default = 80.
      frequency_masking_para(float): Augmentation parameter, "frequency mask
        parameter F". Each mask width is drawn from [0, F). Default = 27.
      time_masking_para(float): Augmentation parameter, "time mask parameter T".
        Each mask width is drawn from [0, T). Default = 100.
      frequency_mask_num(int): number of frequency masking lines, "m_F".
        Default = 1.
      time_mask_num(int): number of time masking lines, "m_T".
        Default = 1.
    # Returns
      The warped and masked mel spectrogram.
    """
    v = mel_spectrogram.shape[1]
    tau = mel_spectrogram.shape[2]

    # Step 1 : Time warping
    warped_mel_spectrogram = time_warp(mel_spectrogram, W=time_warping_para)

    # Step 2 : Frequency masking
    for _ in range(frequency_mask_num):
        # Clamp to the number of mel bins so randint never receives a
        # negative upper bound when the drawn width exceeds the spectrogram.
        f = min(int(np.random.uniform(low=0.0, high=frequency_masking_para)), v)
        f0 = random.randint(0, v - f)
        warped_mel_spectrogram[:, f0:f0 + f, :] = 0

    # Step 3 : Time masking
    for _ in range(time_mask_num):
        # Same clamp along the time axis.
        t = min(int(np.random.uniform(low=0.0, high=time_masking_para)), tau)
        t0 = random.randint(0, tau - t)
        warped_mel_spectrogram[:, :, t0:t0 + t] = 0

    return warped_mel_spectrogram

In [11]:

def freq_mask(spec, F=30, num_masks=1, pad_value=0):
    """Frequency masking

    Works on a clone, so ``spec`` itself is never modified.

    :param torch.Tensor spec: input tensor with shape (dim, T)
    :param int F: exclusive upper bound on the width drawn for each mask
        (clamped to ``dim``); because the band end is drawn a second time,
        the rows actually masked span at most ``F - 2``
    :param int num_masks: number of masks
    :param float pad_value: value written into the masked rows
    :returns: tensor with the same shape as ``spec`` with up to
        ``num_masks`` row bands set to ``pad_value``
    """
    cloned = spec.unsqueeze(0).clone()
    num_mel_channels = cloned.shape[1]

    # A band can never be wider than the spectrogram; clamping also keeps
    # the randrange calls below from ever seeing an empty range.
    max_width = min(F, num_mel_channels)
    if max_width <= 0:
        return cloned.squeeze(0)

    for _ in range(num_masks):
        f = random.randrange(0, max_width)
        if f == 0:
            # Zero-width draw: nothing to mask, but later masks still apply.
            continue
        f_zero = random.randrange(0, num_mel_channels - f)

        # The end of the band is drawn again inside [f_zero, f_zero + f).
        mask_end = random.randrange(f_zero, f_zero + f)
        cloned[0][f_zero:mask_end] = pad_value

    return cloned.squeeze(0)


def time_mask(spec, T=40, num_masks=1, p=0.2, pad_value=0):
    """Time masking

    Works on a clone, so ``spec`` itself is never modified.

    :param torch.Tensor spec: input tensor with shape (dim, T)
    :param int T: exclusive upper bound on the width drawn for each mask,
        further limited by ``p`` and the spectrogram length
    :param int num_masks: number of masks; values <= 0 return an unmasked
        copy
    :param float p: fraction of the time axis shared between all masks;
        each mask's width bound is capped at ``len * p / num_masks``
    :param float pad_value: value written into the masked frames
    :returns: tensor with the same shape as ``spec`` with up to
        ``num_masks`` time bands set to ``pad_value``
    """
    cloned = spec.unsqueeze(0).clone()
    len_spectro = cloned.shape[2]
    if num_masks <= 0:
        # Also avoids dividing by zero in the budget computation below.
        return cloned.squeeze(0)

    max_width = min(T, int(len_spectro * p / num_masks), len_spectro)
    if max_width <= 0:
        # Spectrogram too short for the requested budget: nothing to mask.
        return cloned.squeeze(0)

    for _ in range(num_masks):
        t = random.randrange(0, max_width)
        if t == 0:
            # Zero-width draw: nothing to mask, but later masks still apply.
            continue
        t_zero = random.randrange(0, len_spectro - t)

        # The end of the band is drawn again inside [t_zero, t_zero + t).
        mask_end = random.randrange(t_zero, t_zero + t)
        cloned[0][:, t_zero:mask_end] = pad_value
    return cloned.squeeze(0)


In [19]:
# download_asset lives in torchaudio.utils; import it here so the cell runs
# on a fresh kernel.
from torchaudio.utils import download_asset

# Fetch the tutorial sample clip; the returned value is a local file path.
SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
metadata = torchaudio.info(SAMPLE_WAV)
# NOTE: this rebinds the notebook-wide `sampling_rate`, which earlier cells
# set from librosa.load.
sampling_rate = metadata.sample_rate
ipd.Audio(SAMPLE_WAV, rate=sampling_rate)

In [20]:
# Re-reads the same file metadata that the previous cell already stored.
metadata = torchaudio.info(SAMPLE_WAV)

In [17]:
# NOTE(review): in the visible cells `waveform` is first assigned further
# down (torchaudio.load), so a top-to-bottom run on a fresh kernel would
# raise NameError here.
waveform

tensor([[-7.3242e-04, -7.6294e-04, -6.4087e-04,  ...,  7.3242e-04,
          2.1362e-04,  6.1035e-05]])

In [18]:
# Display the local path returned by download_asset.
SAMPLE_WAV

'/home/levi/.cache/torch/hub/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav'

In [21]:
# NOTE(review): normalize=True presumably yields float samples scaled to
# [-1, 1] — confirm in the torchaudio.load docs.
waveform, sample_rate = torchaudio.load(SAMPLE_WAV, normalize=True)
# NOTE: rebinds `transform` and `spectrogram`, which earlier cells used for
# the audiomentations TimeStretch and the random NumPy demo array.
transform = torchaudio.transforms.Spectrogram(n_fft=800)
spectrogram = transform(waveform)
spectrogram

tensor([[[7.8038e+00, 6.9181e-01, 4.5300e+00,  ..., 1.6729e+00,
          7.2812e+00, 1.4755e+00],
         [5.7427e+00, 5.6298e-01, 2.8078e+00,  ..., 2.0042e+00,
          2.5289e+00, 1.4977e-01],
         [3.0530e+00, 6.8753e-02, 5.6608e-01,  ..., 1.3401e-01,
          1.0927e-01, 7.2176e-02],
         ...,
         [2.9123e-08, 3.0743e-06, 3.3997e-06,  ..., 3.6706e-06,
          1.1105e-05, 1.1012e-07],
         [2.6830e-05, 7.7786e-06, 3.1466e-05,  ..., 1.6776e-06,
          8.3443e-06, 2.8491e-06],
         [9.2090e-05, 9.6572e-06, 5.4176e-05,  ..., 1.1435e-05,
          1.1881e-05, 1.7578e-07]]])