In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm
from pprint import pprint

# Dataset Downloading

In [19]:
train_dataset = torchaudio.datasets.SPEECHCOMMANDS(root='./', url='speech_commands_v0.01', download=True, subset='training')
valid_dataset = torchaudio.datasets.SPEECHCOMMANDS(root='./', url='speech_commands_v0.01', download=True, subset='validation')
test_dataset = torchaudio.datasets.SPEECHCOMMANDS(root='./', url='speech_commands_v0.01', download=True, subset='testing')

In [20]:
print(len(train_dataset), len(valid_dataset), len(test_dataset), )

51088 6798 6835


# Data Processing

In [21]:
char_map_str = """
 a 0
 b 1
 c 2
 d 3
 e 4
 f 5
 g 6
 h 7
 i 8
 j 9
 k 10
 l 11
 m 12
 n 13
 o 14
 p 15
 q 16
 r 17
 s 18
 t 19
 u 20
 v 21
 w 22
 x 23
 y 24
 z 25
 """
 
class TextTransform:
    """ Maps characters to their indices, and vice versa """
    def __init__(self):
        self.char_map = {}
        self.index_map = {}
        for line in char_map_str.strip().split('\n'):
            ch, index = line.split()
            self.char_map[ch] = int(index)
            self.index_map[int(index)] = ch

    def text_to_int(self, text: list[str]):
        """ Use a character map and convert text to an integer sequence """
        int_sequence = []
        for c in text:
            ind = self.char_map[c]
            int_sequence.append(ind)
        return int_sequence

    def int_to_text(self, labels: list[int]):
        """ Use a character map and convert integer labels to an text sequence """
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string)


# TODO: SpecAugment (masking augmentations)
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    # torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    # torchaudio.transforms.TimeMasking(time_mask_param=35)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram()

text_transform = TextTransform()

In [22]:
# testing the code above
word_start = "yes"
index = text_transform.text_to_int(word_start)
word_recovered = text_transform.int_to_text(index)

print(word_start, "-->", index, "-->", word_recovered)

yes --> [24, 4, 18] --> yes


Функция __data_processing__ будет позже вызвана в __collate_fn__ дата лоадеров.

Формат данных в датасете: tuple (wave, sample_rate, utterance (label), speaker id, utterance number)

In [23]:
sample = train_dataset.__getitem__(n=2)
sample

(tensor([[-0.0025, -0.0021, -0.0017,  ..., -0.0030, -0.0033, -0.0031]]),
 16000,
 'bed',
 '004ae714',
 1)

In [24]:
def data_processing(data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _) in data:
        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        # labels are lists of integer character ids
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        # input_lengths, label_lengths are used in loss function
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

In [25]:
# testing
data_processing((sample,))

(tensor([[[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           [1.3406e-02, 1.1594e-02, 1.6736e-02,  ..., 1.7414e-02,
            1.6212e-02, 2.0798e-02],
           [7.2180e-02, 6.2425e-02, 9.0109e-02,  ..., 9.3760e-02,
            8.7287e-02, 1.1198e-01],
           ...,
           [5.7900e-06, 2.8763e-06, 3.3100e-06,  ..., 2.9408e-06,
            2.1657e-06, 3.0014e-06],
           [9.8653e-06, 2.7193e-06, 1.2482e-06,  ..., 6.7003e-07,
            5.9030e-07, 1.4782e-06],
           [9.3147e-06, 2.3657e-07, 1.7895e-07,  ..., 2.5175e-07,
            8.9278e-08, 2.5031e-06]]]]),
 tensor([[1., 4., 3.]]),
 [37],
 [3])

# Building a Model