In [1]:
# %pip install pytorch_lightning

In [11]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import math
import pytorch_lightning as pl
import torchaudio
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [3]:
# Display whether PyTorch can see a CUDA device in this kernel (the value is shown as the cell output).
torch.cuda.is_available()

False

In [4]:
class SpeechEmotionRecognitionModel(pl.LightningModule):
    """Transformer-encoder classifier producing emotion-class logits.

    Pipeline: ``Linear(input_size -> dim_model)`` -> ``nn.TransformerEncoder``
    -> ``Linear(dim_model -> num_classes)``. Training uses cross-entropy loss,
    Adam, and a ``ReduceLROnPlateau`` scheduler.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of target classes (size of the logit vector).
        dim_feedforward: Hidden width of each encoder layer's feed-forward part.
        dim_model: Embedding width used inside the transformer encoder.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Accepted for backward compatibility; not used
            (the model has no transformer decoder).
        lr: Initial learning rate for Adam.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_size, num_classes, dim_feedforward=2048, dim_model=1024, nhead=8, num_encoder_layers=6, num_decoder_layers=6, lr=1e-2, dropout=0.1):
        super().__init__()
        self.lr = lr

        # Module creation order is kept as-is so parameter order and default
        # random initialisation stay unchanged.
        encoder_layers = nn.TransformerEncoderLayer(dim_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.encoder = nn.Linear(input_size, dim_model)    # input projection
        self.decoder = nn.Linear(dim_model, num_classes)   # classification head

        self.loss_function = nn.CrossEntropyLoss()

        # Macro-averaged multiclass metrics, used by `evaluation`.
        self.precision = torchmetrics.Precision(task='multiclass', num_classes=num_classes, average="macro")
        self.recall = torchmetrics.Recall(task='multiclass', num_classes=num_classes, average="macro")
        self.F1 = torchmetrics.F1Score(task='multiclass', num_classes=num_classes, average="macro")

    def forward(self, src):
        """Project `src`, run it through the transformer encoder, and return class logits.

        Assumes `src` is ``(batch, input_size)`` -- TODO confirm against the dataset.
        """
        src = self.encoder(src)
        # Insert a singleton axis at position 1.
        # NOTE(review): the encoder layer is built without `batch_first`, so it
        # reads its input as (seq, batch, feature). If `src` is (batch, dim_model)
        # at this point, axis 0 (the samples of the mini-batch) is treated as the
        # sequence axis and samples attend to each other. Left unchanged here
        # because fixing it changes the model's function -- confirm the intent.
        src = src.unsqueeze(1)
        output = self.transformer_encoder(src)
        output = output.squeeze(1)  # drop the singleton axis again
        output = self.decoder(output)
        return output

    def _compute_loss(self, output, tgt):
        """Cross-entropy loss that accepts either target format of `nn.CrossEntropyLoss`.

        Targets with the same shape as `output` are treated as per-class
        probabilities (e.g. one-hot rows) and cast to float; any other target
        is treated as class indices and cast to long. Shared by the training
        and validation steps so both handle targets identically.
        """
        if tgt.shape == output.shape:
            tgt = tgt.float()
        else:
            tgt = tgt.long()
        return self.loss_function(output, tgt)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch; returns the loss tensor."""
        src, tgt = batch[0], batch[1]
        output = self(src)
        loss = self._compute_loss(output, tgt)
        # Logged per step and per epoch; Lightning then exposes the suffixed keys
        # 'cross entropy loss_step' and 'cross entropy loss_epoch'.
        self.log('cross entropy loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch (nothing is returned)."""
        src, tgt = batch[0], batch[1]
        output = self(src)
        loss = self._compute_loss(output, tgt)
        # NOTE(review): this uses the same base key as the training loss; consider
        # a distinct name (e.g. a 'val' prefix) if anything monitors the
        # unsuffixed key -- kept as-is to avoid renaming logged metrics.
        self.log('cross entropy loss', loss, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Return Adam plus a ReduceLROnPlateau scheduler in Lightning's dict format."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = {
            'scheduler': ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True),
            # Epoch-level key produced by the logging call in `training_step`.
            'monitor': 'cross entropy loss_epoch',
            'interval': 'epoch',
            'frequency': 1,
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    # function for evaluating the quality of output and target
    def evaluation(self, output, target, loss):
        """Log macro precision/recall/F1 for `output` vs `target` and print the loss.

        Must be called from inside a Lightning step/hook, because it uses `self.log`.

        Args:
            output: Model predictions (logits or probabilities per class).
            target: Ground truth, either class indices or one-hot / probability
                rows with the same shape as `output`.
            loss: Loss value to print alongside the metrics.
        """
        # NOTE(review): squeeze() removes every size-1 axis, including a batch
        # axis of size 1 -- kept for backward compatibility.
        output = output.squeeze()

        # The multiclass metrics expect class-index targets; reduce one-hot rows.
        if target.dim() > 1 and target.shape == output.shape and output.is_floating_point():
            target = target.argmax(dim=-1)

        # `self.log` takes (name, value); the metric value must not be baked into the key.
        self.log('precision', self.precision(output, target))
        self.log('recall', self.recall(output, target))
        self.log('f1', self.F1(output, target))

        # Report the loss
        print("cross entropy loss:{}".format(loss))

In [5]:
from local_dataset import AudioEmotionsDataset 
# import TQDMProgressBar
from pytorch_lightning.callbacks import TQDMProgressBar

In [6]:
# Mini-batch size handed to the dataset wrapper.
BATCH_SIZE = 32

# AudioEmotionsDataset is imported from the project-local `local_dataset` module.
dataset = AudioEmotionsDataset("data/audio-emotions", batch_size=BATCH_SIZE)

# Both loaders are read as attributes of the dataset object (they are not called here).
train, test = dataset.train_dataloader, dataset.test_dataloader


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

max_len = 114220


In [12]:
# Build the classifier; feature and class counts are taken from the dataset object.
model = SpeechEmotionRecognitionModel(
    input_size=dataset.feature_count,
    num_classes=dataset.class_count,
)

# Xavier-uniform re-initialisation of every parameter with more than one
# dimension; 1-D parameters keep the initialisation they were created with.
# NOTE(review): no random seed is set in the visible cells, so this
# initialisation presumably differs between runs -- confirm whether that is intended.
for p in filter(lambda tensor: tensor.dim() > 1, model.parameters()):
    nn.init.xavier_uniform_(p)

# Progress bar refreshes every 10 batches; training runs for 10 to 50 epochs.
trainer_callbacks = [TQDMProgressBar(refresh_rate=10)]
trainer = pl.Trainer(
    default_root_dir='checkpoints',
    callbacks=trainer_callbacks,
    accelerator="auto",
    max_epochs=50,
    min_epochs=10,
    log_every_n_steps=1,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Start training; the `test` loader is passed as the validation loader.
trainer.fit(
    model,
    train_dataloaders=train,
    val_dataloaders=test,
)


  | Name                | Type                | Params
------------------------------------------------------------
0 | transformer_encoder | TransformerEncoder  | 50.4 M
1 | encoder             | Linear              | 116 M 
2 | decoder             | Linear              | 7.2 K 
3 | loss_function       | CrossEntropyLoss    | 0     
4 | precision           | MulticlassPrecision | 0     
5 | recall              | MulticlassRecall    | 0     
6 | F1                  | MulticlassF1Score   | 0     
------------------------------------------------------------
167 M     Trainable params
0         Non-trainable params
167 M     Total params
669.475   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/amittaijoel/miniconda3/envs/ling-project/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/Users/amittaijoel/miniconda3/envs/ling-project/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
/Users/amittaijoel/miniconda3/envs/ling-project/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

/Users/amittaijoel/miniconda3/envs/ling-project/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
