# Playground
- a "battlefield" - a place to experiment and try new things.

Load dataset, augment data, print samples, etc.

In [8]:
# Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from utils.preprocessing import Preprocessing
import numpy as np


In [9]:
# Import custom preprocessing library
prep = Preprocessing()
train_set = prep.download(split='train', download=False)



In [10]:
# Print random audio sample
prep.print_raw_sample(np.random.randint(len(train_set)), train_set)

+------ Printing info for sample [22126] ------+
- Waveform is a Tensor of size [1, 229440], type=torch.float32
- Sample rate: 16000
- Transcript: TALKED ABOUT BUT NOT IN A HARMFUL WAY AND ...
- Speaker id: 7067
- Chapter id: 76047
- Utterance id: 27
+---------------------------------------+


In [11]:
# Create data loader
train_loader = DataLoader(dataset=train_set,
                          batch_size=16,
                          shuffle=True,
                          collate_fn=lambda x: prep.preprocess(x, "train"))



In [12]:
# Inspect batch of data
prep.print_loader_info(train_loader)

+------ Dataloader length: 28539 ------+
# Batches: 1784
Spectogram shape: [16, 1, 128, 1349]
Label shape: [16, 274]
Mel length (length of each spectogram): [603, 549, 493, 602, 519, 598] ...
Idx length (length of each label): [274, 177, 225, 215, 211, 210] ...
+------------------------------------+


In [13]:
torch.cuda.is_available()

False

Import custom models and neural layers, and test them on sample data.

In [6]:
from models.deep_speech_2 import MyLayerNorm, MySkipCNN, MyBiGRU, MyDeepSpeech
from torch.nn import Conv2d, Linear

#layer_norm = MyLayerNorm(n_bins=128)
cnn = Conv2d(1, 32, 3, stride=2, padding=1)
skip_cnn = MySkipCNN(32, 32, kernel=3, stride=1, drop_rate=0.5, n_bins=64)
fc = Linear(2048, 512)

bigru_1 = MyBiGRU(512, 512, 0.5, True)
bigru_2 = MyBiGRU(1024, 512, 0.5, True)

deep_speech = MyDeepSpeech(3, 5, 512, 29, 128)

In [7]:
# Check shapes layer by layer
for idx, data in enumerate(train_loader):
    
    # batch of data to play with
    spec = data[0]

    #### layer norm ####
    # print(f"Shape before: {spec.shape}")
    # out = layer_norm(spec)
    # print(f"Shape after: {out.shape}")
    
    # Skip cnn
    print(f"Shape before: {spec.shape}") # [16, 1, 128, 1315]
    out = cnn(spec)
    out = skip_cnn(out)

    sizes = out.shape
    print(f"Shape after CNN and SkipCNN: {sizes}") # [16, 32, 64, 658]
    
    out = out.view(sizes[0], sizes[1]*sizes[2], sizes[3])
    print(f"Shape after view: {out.shape}") # [16, 2048, 658]

    out = out.transpose(1, 2)
    print(f"Shape after tranpose: {out.shape}") # [16, 658, 2048]

    out = fc(out)
    print(f"Shape after linear layer: {out.shape}") # [16, 648, 512]

    out = bigru_1(out)
    print(f"Shape after bigru 1: {out.shape}")  # [16, 648, 1024]
    
    out = bigru_2(out)
    print(f"Shape after bigru 2: {out.shape}")  # [16, 648, 1024]

    out = Linear(1024, 512)(out)
    out = nn.GELU()(out)
    out = nn.Dropout(0.5)(out)
    out = nn.Linear(512, 27)(out)
    print(f"Shape after final linear classifier: {out.shape}") # [16, 639, 27]
    break

Shape before: torch.Size([16, 1, 128, 1265])
Shape after CNN and SkipCNN: torch.Size([16, 32, 64, 633])
Shape after view: torch.Size([16, 2048, 633])
Shape after tranpose: torch.Size([16, 633, 2048])
Shape after linear layer: torch.Size([16, 633, 512])
Shape after bigru 1: torch.Size([16, 633, 1024])
Shape after bigru 2: torch.Size([16, 633, 1024])
Shape after final linear classifier: torch.Size([16, 633, 27])


In [8]:
# Check deep speech model
for idx, data in enumerate(train_loader):
    spec = data[0]

    print(f"Input shape: {spec.shape}")
    out = deep_speech(spec)
    print(f"Output shape: {out.shape}")
    break

Input shape: torch.Size([16, 1, 128, 1305])
Deep: torch.Size([16, 32, 64, 653])
Deep: torch.Size([16, 32, 64, 653])
Deep: torch.Size([16, 653, 512])
After RNN block: torch.Size([16, 653, 1024])
Final: torch.Size([16, 653, 29])
Output shape: torch.Size([16, 653, 29])


In [4]:
! nvidia-smi

Fri Mar 29 10:28:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.70                 Driver Version: 537.70       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   26C    P0              23W / 105W |      0MiB /  6144MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
import torch
torch.zeros(1).cuda()

AssertionError: Torch not compiled with CUDA enabled

In [7]:
import torch
import sys

print(sys.version)
print(torch.version.cuda)

3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:42:31) [MSC v.1937 64 bit (AMD64)]
None


Wandb - dummy script to log accuracy and loss

In [3]:
1305 // 2

652

In [9]:
! pip install wandb



In [10]:
! wandb login 33a194fab1b28225adfa0561f9a0dcbae4adff8b

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\marco\.netrc


In [11]:
import wandb
import random

# Init
wandb.init(
    project="asr_librispeech",

    # hyperparams to track
    config= {
        "lr": 0.001,
        "model": "dummy_number_1",
        "dataset": "db_fool",
        "epochs": 10,
    }
)

# Train
epochs = 100
offset = random.random() / 5
acc = 0
for epoch in range(2, epochs):

    # compute dummy accuracy and loss
    acc = 1 - 2 ** -epoch - random.random() / epoch - offset
    #acc += 0.4
    #acc = acc % 1
    loss = 2 ** -epoch + random.random() / epoch + offset
    
    # LOG!
    wandb.log({
        "acc": acc,
        "loss": loss
    })

# Stop all the thing (put this line only in notebooks)
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33miu4hry[0m ([33mfantastic_4[0m). Use [1m`wandb login --relogin`[0m to force relogin


0,1
acc,▁▇▇█▇█▇█████████████████████████████████
loss,█▃▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
acc,0.86983
loss,0.12401


## Metrics

In [43]:
from utils.metrics import word_errors, wer, cer

In [42]:
word_errors("harry potter", "harry osborne")

(1.0, 2)

Filter operator - removes any empty strings resulting from consecutive white spaces.

In [19]:
reference = "harry potter is a  wizard"
print('ORIGINAL')
print(reference.split(' '))
print()

print('WITHOUT FILTER')
print(" ".join(reference.split(' ')))
print()

print('WITH FILTER')
print(" ".join(filter(None, reference.split(' '))))
print()

print('COLLAPSE MODE')
print("".join(filter(None, reference.split(' '))))

ORIGINAL
['harry', 'potter', 'is', 'a', '', 'wizard']

WITHOUT FILTER
harry potter is a  wizard

WITH FILTER
harry potter is a wizard

COLLAPSE MODE
harrypotterisawizard


In [44]:
wer("harry potter is a magician", "harry potter is a wizard") # 1/5 = 0.2 

0.2

In [47]:
wer("Peter Parker is Spider", "peter park is spider") # 1 errors over 4 -> 0.25

0.25

Difference between CER and WER
- CER: works at char level. Basically, it is a huge edit distance between ref and hyp. May be a good idea to test our models.
- WER: computes the edit distance at word level. In some cases, we get drastic chages after deleting one space - see below.

In [48]:
print("CER:", cer("spider man", "spiderman")) # 1/10 = 0.1 GOOD
print("WER", wer("spider man", "spiderman")) # 2/2 = 1.0 BAD

CER: 0.1
WER 1.0


CTC Loss. So basically, once we have the model predictions we need to take the log of the softmax. Why? Because it is written on the doc. Also, the shape is a bit silly - `[time, batch, classes]`. Targets must be [batch, length]

In [49]:
import torch
torch.full(size=(16,), fill_value=50, dtype=torch.long)


tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])

Playing around with Bidirectional GRU blocks.

In [3]:
import torch
from models.blocks import BiGRU

seq_len, emb_size = 650, 512
hid_size = 32
input_size = (16, seq_len, emb_size)

gru = BiGRU(emb_size, hid_size, 0.1, False)

x = torch.randn(input_size)

print(f"Before: {x.shape}")

x = gru(x)

print(f"After: {x.shape}")

Before: torch.Size([16, 650, 512])
After: torch.Size([16, 650, 64])


In [2]:
from models.deep_speech import DeepSpeech
import torch

x = torch.randn((16, 1, 128, 1300))

dp = DeepSpeech(3, 2, 512, 29, 128)

out = dp(x)
out.shape

torch.Size([16, 650, 29])

In [3]:
len("abcdefghijklmnopqrstuvwxyz")

26

Playing aroung with attention layers.

In [9]:
import torch
import torch.nn as nn

In [8]:
input_shape = (1, 650, 512)
features = torch.randn((input_shape))

multi_head_attention = nn.MultiheadAttention(
    embed_dim=input_shape[2],
    num_heads=4,
    batch_first=True)

attn_output, att_weights = multi_head_attention(features, features, features)
print(att_weights.shape)

torch.Size([1, 650, 650])


In [10]:
input_shape = (1, 650, 512)
features = torch.randn((input_shape))

layer_norm = nn.LayerNorm(512)
out = layer_norm(features)
out.shape

torch.Size([1, 650, 512])