In [None]:
import torch
import torchaudio
import matplotlib.pyplot as plt
from IPython.display import Audio
import librosa
import os

In [None]:
# Open the LibriSpeech "train-clean-100" subset rooted at ./corpus.
# NOTE(review): no `download` argument is passed, so this presumably expects the
# corpus to be present on disk already — confirm before a fresh run.
libri= torchaudio.datasets.LIBRISPEECH('./corpus', url='train-clean-100')

In [None]:
# Peek at the raw record returned for the first utterance.
libri[0]

In [None]:
# A record unpacks into six fields; keep the audio, its sample rate and the
# transcript, and discard the three trailing fields.
waveform, sample_rate, transcript, _, _, _ = libri[0]

In [None]:
# Inline audio player for the utterance, played back at its own sample rate.
Audio(waveform.numpy(), rate=sample_rate)

In [None]:
# Metadata record for the same index (its first field is used as a file path below).
libri.get_metadata(0)

In [None]:
# Load the same utterance straight from disk: the first metadata field is joined
# onto ./corpus/LibriSpeech/ to form the audio file path.
waveform2, sample_rate2= torchaudio.load(os.path.join('./corpus/LibriSpeech/',libri.get_metadata(0)[0]))

In [None]:
# Confirm that loading through the dataset and loading the file directly agree.
# A bare `waveform == waveform2` only displays an elementwise boolean tensor, which
# does not give a single verdict; collapse the comparison to one bool and check the
# sample rate as well.
sample_rate == sample_rate2 and torch.equal(waveform, waveform2)

In [None]:
def plot_waveform(waveform, sample_rate):
    """Draw one amplitude-versus-time panel per channel of ``waveform``.

    Args:
        waveform: Tensor whose ``.numpy()`` view unpacks as
            ``(num_channels, num_frames)``.
        sample_rate: Samples per second; frame indices are divided by it to
            build the time axis.
    """
    samples = waveform.numpy()
    channel_count, frame_count = samples.shape
    seconds = torch.arange(0, frame_count) / sample_rate

    fig, panels = plt.subplots(channel_count, 1)
    # With a single row the returned axes object is not indexable, so wrap it
    # to let the loop below treat mono and multi-channel input uniformly.
    panels = [panels] if channel_count == 1 else panels
    label_channels = channel_count > 1

    for idx, channel in enumerate(samples):
        panel = panels[idx]
        panel.plot(seconds, channel, linewidth=1)
        panel.grid(True)
        if label_channels:
            panel.set_ylabel(f"Channel {idx+1}")

    fig.suptitle("waveform")

In [None]:
# Plot the first utterance loaded above, one panel per channel.
plot_waveform(waveform, sample_rate)

In [None]:
import torchaudio.transforms as T


class MyPipeline(torch.nn.Module):
    """Turn a raw waveform into a mel-scale power spectrogram.

    Stages, in order: resample -> power spectrogram (STFT) -> mel filterbank.

    Args:
        input_freq: Sample rate (Hz) of the incoming waveform.
        resample_freq: Sample rate (Hz) the waveform is resampled to before
            the STFT; also the rate the mel filterbank is built for.
        n_fft: FFT size. The spectrogram has ``n_fft // 2 + 1`` frequency bins.
        hop_length: Number of samples between successive STFT frames.
        n_mel: Number of mel filterbanks in the output.
        stretch_factor: Accepted but not used by this pipeline (no
            augmentation stage is applied here).
    """

    def __init__(
        self,
        input_freq=16000,
        resample_freq=16000,
        n_fft=400,
        hop_length=160,
        n_mel=80,
        stretch_factor=0.8,
    ):
        super().__init__()
        self.resample = T.Resample(orig_freq=input_freq, new_freq=resample_freq)

        # hop_length must be forwarded explicitly: when omitted, Spectrogram
        # falls back to its own default hop and the constructor argument
        # would be silently ignored.
        self.spec = T.Spectrogram(n_fft=n_fft, hop_length=hop_length, power=2)
        self.mel_scale = T.MelScale(
            n_mels=n_mel, sample_rate=resample_freq, n_stft=n_fft // 2 + 1)

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """Return the mel-scale power spectrogram of ``waveform``."""
        # Resample the input
        resampled = self.resample(waveform)

        # Convert to power spectrogram
        spec = self.spec(resampled)

        # Convert to mel-scale
        mel = self.mel_scale(spec)

        return mel

In [None]:
# Build the feature extractor with its default constructor arguments.
pipeline = MyPipeline()

# Run the first utterance's waveform through it to obtain mel features.
features = pipeline(waveform)


In [None]:
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
    """Show a power spectrogram as an image on a decibel scale.

    Args:
        specgram: Power spectrogram; it is converted with
            ``librosa.power_to_db`` before being displayed.
        title: Optional axes title; nothing is set when ``None``.
        ylabel: Label for the vertical axis.
        ax: Axes to draw on. When ``None`` a new single-axes figure is created.
    """
    target = plt.subplots(1, 1)[1] if ax is None else ax

    if title is not None:
        target.set_title(title)
    target.set_ylabel(ylabel)

    decibels = librosa.power_to_db(specgram)
    # origin="lower" places the first row (lowest bin index) at the bottom.
    target.imshow(
        decibels,
        origin="lower",
        aspect="auto",
        interpolation="nearest",
    )


In [None]:
# Inspect the mel features computed by `pipeline` above.
features

In [None]:
# Dimensions of the feature tensor.
features.shape

In [None]:
# Plot the first entry along the leading dimension
# (presumably the channel axis — TODO confirm).
plot_spectrogram(features[0])

In [None]:
class MyPipeline2(torch.nn.Module):
    """Waveform -> augmented mel-scale power spectrogram.

    Stages, in order: resample -> complex STFT -> time stretch -> power ->
    frequency masking -> time masking -> mel filterbank.

    Args:
        input_freq: Sample rate (Hz) of the incoming waveform.
        resample_freq: Sample rate (Hz) the waveform is resampled to before
            the STFT; also the rate the mel filterbank is built for.
        n_fft: FFT size. The spectrogram has ``n_fft // 2 + 1`` frequency bins.
        hop_length: Number of samples between successive STFT frames; shared
            by the STFT and the time-stretch stage so their phase bookkeeping
            agrees.
        n_mel: Number of mel filterbanks in the output.
        stretch_factor: Fixed speed-up rate for the time-stretch stage
            (values below 1 slow the audio down and yield more frames).
    """

    def __init__(
        self,
        input_freq=16000,
        resample_freq=16000,
        n_fft=400,
        hop_length=160,
        n_mel=80,
        stretch_factor=0.8,
    ):
        super().__init__()
        n_freq = n_fft // 2 + 1

        self.resample = T.Resample(orig_freq=input_freq, new_freq=resample_freq)

        # power=None keeps the complex STFT: the phase vocoder behind
        # TimeStretch needs phase information, which a power spectrogram
        # has already discarded.
        self.spec = T.Spectrogram(n_fft=n_fft, hop_length=hop_length, power=None)

        # TimeStretch's positional parameters are (hop_length, n_freq,
        # fixed_rate), so the rate has to be passed as `fixed_rate`; passing
        # it positionally would set the hop length instead.
        self.time_stretch = T.TimeStretch(
            hop_length=hop_length, n_freq=n_freq, fixed_rate=stretch_factor)

        # Masking operates on the real-valued power spectrogram.
        self.spec_aug = torch.nn.Sequential(
            T.FrequencyMasking(freq_mask_param=80),
            T.TimeMasking(time_mask_param=80),
        )
        self.mel_scale = T.MelScale(
            n_mels=n_mel, sample_rate=resample_freq, n_stft=n_freq)

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """Return the augmented mel-scale power spectrogram of ``waveform``."""
        # Resample the input
        resampled = self.resample(waveform)

        # Complex spectrogram, stretched in time by the fixed rate
        stretched = self.time_stretch(self.spec(resampled))

        # Complex -> power (|X|^2), matching what a power=2 spectrogram yields
        power_spec = stretched.abs().pow(2)

        # Apply frequency / time masking
        masked = self.spec_aug(power_spec)

        # Convert to mel-scale
        mel = self.mel_scale(masked)

        return mel

In [None]:
# Build the augmenting pipeline (the variant with a `spec_aug` stage) with its
# default constructor arguments.
pipeline2 = MyPipeline2()

# Run the same waveform through it.
features2 = pipeline2(waveform)

In [None]:
# Dimensions of the augmented feature tensor.
features2.shape

In [None]:
# Plot the first entry along the leading dimension
# (presumably the channel axis — TODO confirm).
plot_spectrogram(features2[0])