In [1]:
# %pip install pytorch_lightning
# %pip install transformers
# %pip install torchmetrics
# %pip install soundfile
# %pip install librosa
# %pip install ipywidgets

In [2]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
# import math
import pytorch_lightning as pl
# import torchaudio
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau



In [3]:
torch.cuda.is_available()

True

In [4]:
from local_dataset import AudioEmotionsDataset 
# import TQDMProgressBar
from pytorch_lightning.callbacks import TQDMProgressBar

In [5]:
BATCH_SIZE = 32
dataset = AudioEmotionsDataset("data/audio-emotions", batch_size=BATCH_SIZE)

train = dataset.train_dataloader
test = dataset.test_dataloader


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-03-12 05:03:58.844962: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-12 05:03:58.886470: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
m = nn.MaxPool1d(99, stride = 50)

m2 = nn.MaxPool1d(3, stride=2)

pad = nn.ConstantPad1d((0, 2), 0)

c = nn.Conv1d(kernel_size=1, in_channels=1024, out_channels=7)

s = nn.Softmax(dim=1)

for batch in train:
    X, y = batch
    print(X.shape)
    X2 = m(X)

    X2 = pad(X2)
    print(X2.shape)

    X3 = m2(X2)
    print(X3.shape)

    X3 = X3.view(X3.shape[0], X3.shape[1], 1)
    X4 = c(X3).squeeze()
    print(X4.shape)

    X5 = s(X4)
    print(X5.shape)

    print(f"{X5}")

    print(f"{torch.argmax(X5, dim=1)}")
    break

torch.Size([32, 1, 112202])
torch.Size([32, 1, 2245])
torch.Size([32, 1, 1122])


RuntimeError: shape '[32, 1, 1]' is invalid for input of size 35904

In [None]:
class Reshape(nn.Module):
    def __init__(self, shape):
        super(Reshape, self).__init__()
        self.shape = shape

    def forward(self, x):
        return x.view((x.shape[0], *self.shape))
    
    def __call__(self, x):
        return self.forward(x)
    
class Squeeze(nn.Module):
    def __init__(self):
        super(Squeeze, self).__init__()

    def forward(self, x):
        return x.squeeze()
    
    def __call__(self, x):
        return self.forward(x)

class Print(nn.Module):
    def __init__(self, name: int=0):
        super(Print, self).__init__()
        self.name = name

    def forward(self, x):
        # print(f"{self.name:2d}: {x.shape}")
        return x
    
    def __call__(self, x):
        return self.forward(x)
    


class SpeechEmotionRecognitionModel(pl.LightningModule):
    def __init__(self, input_size, num_classes, dim_feedforward=2048, dim_model=1024, nhead=8, num_encoder_layers=6, num_decoder_layers=6, lr=1e-2, dropout=0.1):
        super(SpeechEmotionRecognitionModel, self).__init__()
        self.lr = lr

        encoder_layers = nn.TransformerEncoderLayer(dim_model, nhead, dim_feedforward, dropout)
        # self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        # self.encoder = nn.Linear(input_size, dim_model)
        # self.decoder = nn.Linear(dim_model, num_classes)

        self.layers = nn.Sequential(
            # max-pool 1
            # nn.MaxPool1d(99, stride = 50),

            # Print(1),

            # # pad to keep shape nice to work with
            # nn.ConstantPad1d((0, 2), 0),

            # Print(2),

            # # max-pool 2
            # nn.MaxPool1d(3, stride=2),

            # Print(3),

            # # transformer encoder
            # nn.TransformerEncoder(encoder_layers, num_encoder_layers),

            # reshape
            # Reshape((1024, 1)),

            # Print(4),
            nn.Linear(input_size, 1024),

            nn.Linear(1024, 1024),

            nn.Linear(1024, 1024),

            nn.Linear(1024, 7),

            # # output CNN
            # nn.Conv1d(kernel_size=1, in_channels=1024, out_channels=7),

            # Print(5),

            # Squeeze(),

            # Print(6),

            # softmax
            nn.Softmax(dim=1)
            # nn.Sigmoid()
        )
        

        # initialize the metrics
        self.loss_function = nn.CrossEntropyLoss()
        self.precision = torchmetrics.Precision(task='multiclass', num_classes=num_classes, average="macro")
        self.recall = torchmetrics.Recall(task='multiclass', num_classes=num_classes, average="macro")
        self.F1 = torchmetrics.F1Score(task='multiclass', num_classes=num_classes, average="macro")

    def forward(self, src):
        # src = self.encoder(src)
        # src = src.unsqueeze(1)  # Add batch dimension
        # output = self.transformer_encoder(src)
        # output = output.squeeze(1)  # Remove the batch dimension
        # output = self.decoder(output)
        # return output

        output = self.layers(src)
        return output

    def training_step(self, batch, batch_idx):
        src, tgt = batch[0], batch[1]
        output = self(src)

        print(f"{output.shape = }")
        print(f"{tgt.shape = }")
        loss = self.loss_function(output, tgt)
        self.log('cross entropy loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        src, tgt = batch
        output = self(src)
        loss = self.loss_function(output, tgt)
        self.log('cross entropy loss', loss, on_epoch=True, prog_bar=True)
        # print(f"LOSS: {loss}")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = {
            'scheduler': ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True),
            'monitor': 'cross entropy loss_epoch',  # Name of the metric to monitor
            'interval': 'epoch',
            'frequency': 1,
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    # function for evaluating the quality of output and target
    def evaluation(self, output, target, loss):
        # Calculate and log metrics
        # output = output.squeeze()

        precision = self.precision(output, target)
        recall = self.recall(output, target)
        f1 = self.F1(output, target)
        # print('precision:{}'.format(self.precision(output, target)))
        # self.log('recall:{}'.format(self.recall(output, target)))
        # self.log('f1:{}'.format(self.F1(output, target)))

        print(f"CE:        {loss}")
        print(f"PRECISION: {precision}")
        print(f"RECALL:    {recall}")
        print(f"F1:        {f1}")

        self.log('precision', precision)
        self.log('recall', recall)
        self.log('f1', f1)

In [None]:
model = SpeechEmotionRecognitionModel(input_size=dataset.feature_count, num_classes=dataset.class_count)

# for p in model.parameters():
#     if p.dim() > 1:
#         nn.init.xavier_uniform_(p)

trainer = pl.Trainer(default_root_dir='checkpoints',callbacks=[TQDMProgressBar(refresh_rate=10)], accelerator="auto", max_epochs=50, min_epochs=10, log_every_n_steps=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_dataloaders=train, val_dataloaders=test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                | Params
------------------------------------------------------
0 | layers        | Sequential          | 106 M 
1 | loss_function | CrossEntropyLoss    | 0     
2 | precision     | MulticlassPrecision | 0     
3 | recall        | MulticlassRecall    | 0     
4 | F1            | MulticlassF1Score   | 0     
------------------------------------------------------
106 M     Trainable params
0         Non-trainable params
106 M     Total params
427.860   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32,

Validation: |          | 0/? [00:00<?, ?it/s]

output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32,

Validation: |          | 0/? [00:00<?, ?it/s]

output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32, 7])
output.shape = torch.Size([32, 7])
tgt.shape = torch.Size([32,

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
for batch in test:
    X, y = batch
    X = X.cuda(0)
    y = y.cuda(0)
    print(f"X: {X.shape}")
    print(f"y: {y.shape}")
    output = model(X.cuda(0))
    # print out put device
    print(f"output: {X.device}")
    model.evaluation(output, y, model.loss_function(output, y))

    print(f"\n\nACTUAL")
    print(y)

    # print the predictions
    print(f"\n\nPredictions: {torch.argmax(output, dim=1)}")
    # print sums of predictions
    print(f"Predictions sum: {torch.sum(output, dim=1)}")
    print(output)
    break

X: torch.Size([32, 102400])
y: torch.Size([32, 7])
output: cuda:0
CE:        2.009172201156616
PRECISION: 0.5078125
RECALL:    0.5078125
F1:        0.5078125


ACTUAL
tensor([[0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.