In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

import matplotlib.pyplot as plt
import IPython.display as ipd
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

In [2]:
# Class vocabulary of the Speech Commands task. The position of a word in this
# list is the integer class index used as the training target, so the order
# must not change.
labels = [
    "backward", "bed", "bird",
    "cat",
    "dog", "down",
    "eight",
    "five", "follow", "forward", "four",
    "go",
    "happy", "house",
    "learn", "left",
    "marvin",
    "nine", "no",
    "off", "on", "one",
    "right",
    "seven", "sheila", "six", "stop",
    "three", "tree", "two",
    "up",
    "visual",
    "wow",
    "yes",
    "zero",
]

In [3]:
class SpeechCommands(SPEECHCOMMANDS):
    """Speech Commands dataset restricted to one named split.

    The parent dataset is created in the current directory with
    ``download=True``. Afterwards ``self._walker`` is narrowed according to
    ``subset``:

    * ``"train"``    - every file not listed in ``validation_list.txt`` or
      ``testing_list.txt``.
    * ``"validate"`` - the files listed in ``validation_list.txt``.
    * ``"test"``     - the files listed in ``testing_list.txt``.
    * anything else (including ``None``) - ``self._walker`` is left as the
      parent class built it.
    """

    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

        def read_split(list_name):
            """Return the normalised paths named in a split file under ``self._path``."""
            root = self._path
            with open(os.path.join(root, list_name)) as handle:
                return [
                    os.path.normpath(os.path.join(root, entry.strip()))
                    for entry in handle
                ]

        if subset == "train":
            # Training data is whatever remains after removing both held-out lists.
            held_out = set(
                read_split("validation_list.txt") + read_split("testing_list.txt")
            )
            self._walker = [path for path in self._walker if path not in held_out]
        elif subset == "validate":
            self._walker = read_split("validation_list.txt")
        elif subset == "test":
            self._walker = read_split("testing_list.txt")

In [4]:
def label_to_index(word):
    """Return the position of ``word`` in the module-level ``labels`` list as a tensor.

    ``list.index`` raises ``ValueError`` when ``word`` is not in ``labels``.
    """
    position = labels.index(word)
    return torch.tensor(position)

def index_to_label(index):
    """Return the entry of the module-level ``labels`` list at position ``index``."""
    word = labels[index]
    return word

In [5]:
class SpeechDataModule(LightningDataModule):
    """Lightning data module serving the ``SpeechCommands`` train/validate/test splits.

    Args:
        batch_size: Number of clips per batch for every loader.
        labels: Ordered list of class names; a clip's target is the index of
            its label in this list.
        transform: Optional callable applied to the padded batch inside
            ``pad_sequence``. When ``None`` the padded waveforms are returned
            unchanged.
        num_workers: Worker processes used by each ``DataLoader``.
        pin_memory: Forwarded to each ``DataLoader``.
    """

    def __init__(self, batch_size, labels=labels, transform=None, num_workers=4, pin_memory=True):
        super().__init__()
        self.batch_size = batch_size
        self.labels = labels
        self.transform = transform
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` (``"fit"``, ``"test"`` or ``None`` for both)."""
        if stage == "fit" or stage is None:
            self.train_set = SpeechCommands("train")
            self.valid_set = SpeechCommands("validate")

        if stage == "test" or stage is None:
            self.test_set = SpeechCommands("test")

    def _make_loader(self, dataset, training):
        """Build a ``DataLoader``; only the training loader shuffles and drops the last partial batch."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          shuffle=training,
                          drop_last=training,
                          collate_fn=self.collate_fn,
                          pin_memory=self.pin_memory,
                          num_workers=self.num_workers
                         )

    def train_dataloader(self):
        """Return the shuffled training loader (last partial batch dropped)."""
        return self._make_loader(self.train_set, training=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order, every sample kept)."""
        return self._make_loader(self.valid_set, training=False)

    def test_dataloader(self):
        """Return the test loader (fixed order, every sample kept)."""
        return self._make_loader(self.test_set, training=False)

    def pad_sequence(self, batch):
        """Zero-pad a list of waveforms to a common length and apply ``self.transform``.

        Each item is transposed, the list is padded along the first axis of the
        transposed items, and the result is permuted back so the padded axis is
        last. Assumes each item is a 2-D ``(channel, time)`` tensor - TODO
        confirm against the dataset.
        """
        batch = [item.t() for item in batch]
        batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
        batch = batch.permute(0, 2, 1)
        # ``transform`` defaults to None; calling it unconditionally raised a
        # TypeError for a module built without a transform.
        if self.transform is not None:
            batch = self.transform(batch)
        return batch

    def label_to_index(self, word):
        """Return the index of ``word`` in ``self.labels`` as a tensor."""
        return torch.tensor(self.labels.index(word))

    def collate_fn(self, batch):
        """Turn a list of dataset items into ``(padded_waveforms, target_indices)``.

        Every item is unpacked as a 5-tuple; only the first element (waveform)
        and the third element (label) are used.
        """
        tensors, targets = [], []

        for waveform, _, label, _, _ in batch:
            tensors.append(waveform)
            targets.append(self.label_to_index(label))

        tensors = self.pad_sequence(tensors)
        targets = torch.stack(targets)

        return tensors, targets

In [6]:
# Stand-alone copies of the train and test splits, created outside the data module.
# Each SpeechCommands(...) call constructs the parent dataset with download=True.
train_set = SpeechCommands("train")
test_set = SpeechCommands("test")

In [7]:
class VoiceRecognizerNN(nn.Module):
    """Four-stage 1-D convolutional classifier for raw waveforms.

    Every stage is ``Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4)``. The first
    convolution uses a kernel of 80 with the configurable ``stride``; the other
    three use a kernel of 3. Stages three and four use ``2 * n_channel``
    channels. The remaining time axis is average-pooled to length one and a
    linear layer maps the channels to ``n_output`` classes.

    Args:
        n_input: Number of input channels.
        n_output: Number of output classes.
        stride: Stride of the first convolution.
        n_channel: Channel width of the first two stages.
    """

    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        wide = 2 * n_channel

        # Stage 1: long strided kernel applied directly to the waveform.
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)

        # Stage 2: same width, short kernel.
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)

        # Stage 3: channel count doubles.
        self.conv3 = nn.Conv1d(n_channel, wide, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(wide)
        self.pool3 = nn.MaxPool1d(4)

        # Stage 4: keeps the doubled width.
        self.conv4 = nn.Conv1d(wide, wide, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(wide)
        self.pool4 = nn.MaxPool1d(4)

        # Classifier head over the pooled channel vector.
        self.fc1 = nn.Linear(wide, n_output)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 1, n_output)``.

        ``x`` is fed straight into ``conv1``, so it must be laid out as
        ``(batch, n_input, time)``.
        """
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        # Collapse whatever time steps remain into one, then move channels last
        # so the linear layer acts on them.
        pooled = F.avg_pool1d(x, x.shape[-1]).permute(0, 2, 1)
        return F.log_softmax(self.fc1(pooled), dim=2)

In [8]:
class VoiceRecognizer(LightningModule):
    """Lightning wrapper that trains and evaluates a ``VoiceRecognizerNN``.

    Args:
        model_hparams: Mapping with the keys ``"n_input"``, ``"n_output"``,
            ``"stride"`` and ``"n_channel"`` forwarded to ``VoiceRecognizerNN``.
        optimizer_hparams: Mapping with the keys ``"learning_rate"``,
            ``"weight_decay"``, ``"gamma"`` and ``"step_size"`` read by
            ``configure_optimizers``.
    """

    def __init__(self, model_hparams, optimizer_hparams):
        super().__init__()
        self.optimizer_hparams = optimizer_hparams
        self.model = VoiceRecognizerNN(model_hparams["n_input"],
                        model_hparams["n_output"],
                        model_hparams["stride"],
                        model_hparams["n_channel"]
                       )

    def forward(self, x):
        """Return the wrapped network's log-probabilities, shape ``(batch, 1, n_output)``."""
        # VoiceRecognizerNN.forward already ends with log_softmax(dim=2);
        # normalising a second time here was redundant work.
        return self.model(x)

    def _log_probs(self, data):
        """Run the network and drop the length-1 middle axis -> ``(batch, n_output)``."""
        # squeeze(1) rather than squeeze(): a bare squeeze() would also remove
        # the batch axis when a batch holds a single sample, which breaks both
        # nll_loss and argmax(dim=1).
        return self(data).squeeze(1)

    @staticmethod
    def _accuracy(output, target):
        """Fraction of samples in the batch whose arg-max class equals the target."""
        preds = torch.argmax(output, dim=1)
        return (target == preds).float().mean()

    def training_step(self, batch, batch_idx):
        """Compute and log the negative log-likelihood loss for one training batch."""
        data, target = batch
        output = self._log_probs(data)
        loss = F.nll_loss(output, target)
        self.log("tr_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log loss and accuracy for one validation batch and return the loss."""
        data, target = batch
        output = self._log_probs(data)
        loss = F.nll_loss(output, target)
        acc = self._accuracy(output, target)

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the accuracy for one test batch."""
        data, target = batch
        output = self._log_probs(data)
        acc = self._accuracy(output, target)
        self.log("test_acc", acc, prog_bar=True)
        return acc

    def configure_optimizers(self):
        """Return an AdamW optimizer and a StepLR scheduler stepped once per epoch."""
        params = self.optimizer_hparams
        optimizer = torch.optim.AdamW(self.parameters(),
                                      lr=params["learning_rate"],
                                      weight_decay=params["weight_decay"]
                                     )

        step_lr = torch.optim.lr_scheduler.StepLR(optimizer,
                                                  gamma=params["gamma"],
                                                  step_size=params["step_size"]
                                                 )
        scheduler = {"scheduler": step_lr, "interval": "epoch", "frequency": 1}
        return [optimizer], [scheduler]

In [9]:
batch_size = 512
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# `device` is a torch.device, not a str: `device == "cuda"` is never true, so
# the CUDA settings below were unreachable. Compare the device *type* instead.
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

cuda


In [10]:
# NOTE(review): with max_epochs = 0 the recorded run of `trainer.fit` stops
# immediately ("`max_epochs=0` reached"); raise this to actually train.
max_epochs = 0

# Architecture settings forwarded to VoiceRecognizerNN.
model_hparams = dict(
    n_input=1,
    n_output=35,
    stride=16,
    n_channel=32,
)

# Optimizer / scheduler settings. VoiceRecognizer.configure_optimizers reads
# "learning_rate", "weight_decay", "gamma" and "step_size"; it does not read
# "patience".
optimizer_hparams = dict(
    learning_rate=0.01,
    weight_decay=0.0001,
    gamma=0.5,
    step_size=10,
    patience=5,
)

# Stop once "val_loss" has not improved for 5 consecutive checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.00,
    patience=5,
    verbose=False,
    mode="min",
)

# Keep the 5 best checkpoints ranked by "val_loss".
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    dirpath='./speech/models/',
    filename='spcm_{epoch:03d}',
    save_top_k=5,
)

model = VoiceRecognizer(model_hparams, optimizer_hparams)


In [11]:
# Build the Lightning trainer:
#   * runs on the GPU when the selected device string starts with "cuda",
#     otherwise on the CPU;
#   * precision=16 requests Automatic Mixed Precision (AMP);
#   * early stopping and checkpointing are attached as callbacks.
trainer = Trainer(
    max_epochs=max_epochs,
    accelerator="gpu" if str(device).startswith("cuda") else "cpu",
    precision=16,
    callbacks=[early_stop_callback, checkpoint_callback],
)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
# Resampler handed to the data module: converts audio from 16 kHz to 8 kHz.
orig_freq, new_freq = 16000, 8000
transform = torchaudio.transforms.Resample(
    orig_freq=orig_freq,
    new_freq=new_freq,
)

In [13]:
# Data module that batches clips with `batch_size` and applies `transform` to each padded batch.
dm = SpeechDataModule(batch_size=batch_size, transform=transform)

In [14]:
# Fit `model` using the loaders provided by the data module `dm`.
trainer.fit(model, dm)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type              | Params
--------------------------------------------
0 | model | VoiceRecognizerNN | 26.9 K
--------------------------------------------
26.9 K    Trainable params
0         Non-trainable params
26.9 K    Total params
0.054     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=0` reached.


In [15]:
# Record a short clip from the default input device and save it as a WAV file.
# The imports stay in this cell so the rest of the notebook still runs on
# machines without PortAudio installed.
import pyaudio
import wave

chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2
fs = 44100  # Record at 44100 samples per second
seconds = 3
filename = "output.wav"

p = pyaudio.PyAudio()  # Create an interface to PortAudio

frames = []  # Initialize array to store frames

try:
    # Look up the sample width before the PortAudio interface is torn down.
    sample_width = p.get_sample_size(sample_format)

    print('Recording')

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)
    try:
        # Store data in chunks for 3 seconds
        for i in range(0, int(fs / chunk * seconds)):
            data = stream.read(chunk)
            frames.append(data)
    finally:
        # Stop and close the stream even if reading failed part-way
        stream.stop_stream()
        stream.close()
finally:
    # Terminate the PortAudio interface whether or not recording succeeded
    p.terminate()

print('Finished recording')

# Save the recorded data as a WAV file; the context manager closes the file
# even if writing raises.
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))

Could not import the PyAudio C module 'pyaudio._portaudio'.


ImportError: libportaudio.so.2: cannot open shared object file: No such file or directory