In [1]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 so that long cell
// outputs (many audio players and images) are not collapsed into a scroll box.
// NOTE(review): relies on the `IPython` JavaScript global, which is presumably
// only exposed by the classic Notebook front end - confirm for JupyterLab.
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [2]:
import cv2
import random
import torch
import torchaudio

import IPython.display as ipd

from pathlib import Path

# Only the validation split is fetched for this inspection pass, because it is
# smaller than the other splits.
raw_root = '../data/raw'
Path(raw_root).mkdir(parents=True, exist_ok=True)  # create the target folder if missing
val_set = torchaudio.datasets.SPEECHCOMMANDS(
    raw_root,
    subset="validation",
    download=True,
)

# Playback + associated STFT spectrogram for some samples

In [None]:
# Playback controls followed by associated spectrogram.
# NOTE: MFCC on 16 kHz audio can give empty mel filterbanks (all-zero values);
# downsampling the audio to 8 kHz avoids this.

# Re-run this cell for new samples.
def inspect_samples(dataset, n_samples, transform, is_random=True):
    """Show an audio player and a spectrogram image for several dataset items.

    For each selected item the function prints the minimum and maximum of the
    transformed signal, then displays a playback widget followed by the
    transformed signal encoded as a JPEG image.

    Args:
        dataset: Indexable dataset whose items unpack to
            ``(waveform, sample_rate, label, speaker_id, utterance_id)``.
        n_samples: Number of items to display.
        transform: Callable applied to the waveform to obtain the
            spectrogram-like tensor that is rendered.
        is_random: When True each item index is drawn at random (repeats are
            possible); when False items ``0 .. n_samples - 1`` are used.

    Returns:
        None. Output is produced through ``print`` and ``ipd.display``.
    """
    total = len(dataset)
    for position in range(n_samples):
        # Either a uniformly drawn index or the running position.
        pick = int(random.random() * total) if is_random else position
        waveform, sample_rate, _label, _speaker, _utterance = dataset[pick]
        player = ipd.Audio(waveform, rate=sample_rate)

        # Move the leading axis of the transform output to the end so the
        # array is laid out channel-last for the image encoder.
        spec = torch.moveaxis(transform(waveform), 0, -1)
        print(f"Spec - min {spec.min():.2f} max {spec.max():.2f}")

        # NOTE(review): the values are passed to the JPEG encoder without any
        # scaling - presumably OpenCV converts them to 8-bit and clips anything
        # outside 0..255; confirm, and consider normalising before encoding.
        pixels = spec.detach().cpu().numpy()
        _ok, encoded = cv2.imencode('.jpg', pixels)
        picture = ipd.Image(data=encoded)

        ipd.display(player)
        ipd.display(picture)

# STFT spectrogram transform using a 256-point FFT; every other Spectrogram
# argument is left at its torchaudio default.
# NOTE: `transform` is a notebook-level name that later cells rebind and read.
transform = torchaudio.transforms.Spectrogram(n_fft=256)
# Play back and render 10 randomly drawn clips from the validation split.
inspect_samples(dataset=val_set, n_samples=10, transform=transform, is_random=True)

In [None]:
# Observe that some audio samples are not the same length.
# That is OK: we can still resize the spectrograms
# to a common size. The frequency dimension stays the same,
# but the time dimension is stretched.

# The result is equivalent to the audio being slightly sped up or slowed down.
# Doing this can be considered a form of augmentation.

# Playback + associated MFCC spectrogram for some samples

In [None]:
# MFCC transform configured with n_mfcc=10; every other MFCC argument is left
# at its torchaudio default.
# NOTE: this rebinds the notebook-level name `transform` that other cells read.
transform = torchaudio.transforms.MFCC(n_mfcc=10)
# Play back and render 10 randomly drawn clips from the validation split.
inspect_samples(dataset=val_set, n_samples=10, transform=transform, is_random=True)

In [None]:
# NOTE: setting n_mfcc too high can result in all-zero coefficients. This is a
# consequence of the Nyquist sampling theorem; more detail here:
# https://stackoverflow.com/a/56930624/1897312
# Two options: 
# 1 - reduce n_mfcc if you want to keep sample rate the same.
# 2 - downsample the audio.

# Display the UMAP manifold for random samples from the validation set.

In [None]:
import umap
import random
import numpy as np
from torchvision.transforms import Resize

def compute_manifold(dataset, n_samples, transform, is_random=False):
    """Fit a supervised UMAP embedding on resized spectrogram features.

    Each selected item is transformed, resized with ``Resize((32, 32))``,
    flattened into one feature row, and paired with an integer code for its
    label. The rows and codes are then passed to ``umap.UMAP().fit``.

    Args:
        dataset: Indexable dataset whose items unpack to
            ``(waveform, sample_rate, label, speaker_id, utterance_id)``.
        n_samples: Number of items to embed.
        transform: Callable mapping a waveform to a spectrogram-like tensor.
        is_random: When True each item index is drawn at random (repeats are
            possible); when False items ``0 .. n_samples - 1`` are used.

    Returns:
        tuple: ``(manifold, labels)`` where ``manifold`` is the fitted
        ``umap.UMAP`` object and ``labels`` is a NumPy array holding the label
        of every embedded item, in the order the items were processed.
    """
    ds_length = len(dataset)
    resize = Resize((32, 32))  # loop-invariant, so build it once

    # Label -> integer code, assigned in first-seen order. A dict keeps each
    # code fixed once assigned; indexing into list(set) does not, because a
    # set's iteration order can change whenever another label is inserted.
    label_codes = {}
    features = []
    codes = []
    labels = []
    for idx in range(n_samples):
        _idx = int(random.random() * ds_length) if is_random else idx
        waveform, _sample_rate, label, _speaker_id, _utterance_id = dataset[_idx]
        spec = torch.moveaxis(transform(waveform), 0, -1)
        # NOTE(review): after moveaxis the tensor is channel-last, while
        # torchvision's Resize scales the last two axes - confirm that this is
        # the intended layout for the 32x32 resize.
        features.append(resize(spec).flatten().detach().cpu().numpy())
        codes.append(label_codes.setdefault(label, len(label_codes)))
        labels.append(label)

    X = np.stack(features, axis=0)
    y = np.array(codes)
    # Supervised fit: the integer label codes guide the embedding. The fitted
    # object already holds the training embedding, so no separate transform
    # pass over X is needed.
    manifold = umap.UMAP().fit(X, y)
    # Return the label names (the spoken words) rather than the integer codes.
    return manifold, np.array(labels)
    
# STFT spectrogram transform using a 256-point FFT; every other Spectrogram
# argument is left at its torchaudio default.
# NOTE: `transform`, `manifold` and `y` are notebook-level names read by later cells.
transform = torchaudio.transforms.Spectrogram(n_fft=256)    
# Embed the first 1000 validation items in dataset order (is_random=False).
manifold, y = compute_manifold(val_set, n_samples=1000, transform=transform, is_random=False)

# Ordered samples UMAP projection

In [None]:
import umap.plot
# Plot the fitted embedding, passing the spoken-word labels in `y` as the point
# labels. Reads the notebook-level `manifold` and `y`, so it shows whichever
# compute_manifold call ran most recently.
umap.plot.points(manifold, labels=y, theme="fire")

In [None]:
# We can see that there is a clear separation between the first few ordered classes
# for a majority of the samples. This shows that the spectrograms contain good features
# that can be used as input features to a classifier model.

# The plan is to use this to fine-tune / train an image-based CNN.

# Random samples UMAP projection

In [None]:
import umap.plot

# Build the STFT transform in this cell instead of reading the notebook-level
# `transform`: other cells rebind that name (one of them to MFCC), so relying
# on it would make this plot depend on which cell happened to run last.
transform = torchaudio.transforms.Spectrogram(n_fft=256)
# Embed 1000 validation items drawn at random (is_random=True) and plot the
# embedding with the spoken-word labels as point labels.
manifold, y = compute_manifold(val_set, n_samples=1000, transform=transform, is_random=True)
umap.plot.points(manifold, labels=y, theme="fire")

In [None]:
# Need more samples to see the pattern in all of the data. 
# But there are too many samples to do this cleanly in a notebook.