In [40]:
import os
import pytorch_lightning as pl
import torch 
from torch import nn
from torch import utils

from torchaudio import datasets, transforms
from torchvision import models

from transformers import T5Tokenizer, T5ForConditionalGeneration



In [9]:
dataset = datasets.LIBRISPEECH(
                            root="./",
                            url="dev-clean",
                            folder_in_archive="LibriSpeech",
                            download=False
                            )

In [8]:
encoder = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1).features
decoder = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
transform = transforms.MelSpectrogram(16000, n_fft=800)

In [70]:
class TokenizedDataset(utils.data.Dataset):
    def __init__(self, raw_dataset, tokenizer):
        self.raw_dataset = raw_dataset
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.raw_dataset)
    
    def __getitem__(self, idx):
        data = self.raw_dataset[idx]
        x = data[0]
        y = data[2]

        y = self.tokenizer.encode_plus(
                                        text=y,  # the sentence to be encoded
                                        add_special_tokens=True,  # Add [CLS] and [SEP]
                                        max_length=512,  # maximum length of a sentence
                                        padding="max_length",  # Add [PAD]s
                                        return_tensors='pt',  # ask the function to return PyTorch tensors
                                        truncation=True,
                                    )
        return x,y["input_ids"], y["attention_mask"]



In [78]:
mydataset = TokenizedDataset(dataset, tokenizer)

train_set, val_set, test_set = utils.data.random_split(mydataset, [0.6, 0.2, 0.2])

train_loader = utils.data.DataLoader(train_set, batch_size=2)
val_loader = utils.data.DataLoader(val_set, batch_size=2)
test_set = utils.data.DataLoader(test_set, batch_size=2)


In [80]:
class EncoderDecoder(pl.LightningModule):
    def __init__(self, encoder, decoder, transform, tokenizer):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.transform = transform
        self.tokenizer = tokenizer

        random_input = torch.rand((1,90000))
        random_spectrogram = self.transform(random_input)
        random_spectrogram = random_spectrogram.repeat(1,3,1,1)
        random_extracted_features = self.encoder(random_spectrogram)
        self.n_filters = random_extracted_features.shape[1]


    def forward(self, *args):

        x = args[0]
        out = self.transform(x)
        out = out.repeat(1, 3, 1, 1)
        out = self.encoder(out)
        out = out.permute(0, 2, 3, 1)
        out = out.reshape(-1, int(out.shape[1] * out.shape[2] * out.shape[3]/ self.decoder.config.d_model), self.decoder.config.d_model )

        if len(args) > 1:
            y = args[1]
            mask = args[2]
            out = self.decoder(inputs_embeds=out, labels=y,return_dict=True, decoder_attention_mask=mask)
        else:
            out = self.decoder.generate(inputs_embeds=out,
                                        max_length=1024,
                                        min_length=0)
        
        return out

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        out  = self.forward(x, y)
        loss = out.loss
        self.log("train_loss")
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=3e-4)
        return optimizer


      




        

In [73]:
def translate_encoded_ids(encoded_ids_list, tokenizer):
    phrases = []
    for encoded_ids in encoded_ids_list:
        decoded_ids = tokenizer.decode(encoded_ids, skip_special_tokens=True)
        phrases.append(decoded_ids)
    return phrases


x = dataset[0][0]
y = dataset[0][2]

tz = tokenizer.encode_plus(
    text=y,  # the sentence to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length=512,  # maximum length of a sentence
    padding="max_length",  # Add [PAD]s
    return_tensors='pt',  # ask the function to return PyTorch tensors
    truncation=True,
)
y = tz["input_ids"]
mask = tz["attention_mask"]
print(y.shape)

model = EncoderDecoder(encoder=encoder,
                       decoder=decoder,
                       transform=transform,
                       tokenizer=tokenizer)

model(x,y, mask).loss


torch.Size([1, 512])


tensor(3.8561, grad_fn=<NllLossBackward0>)

In [81]:
model = EncoderDecoder(encoder=encoder,
                       decoder=decoder,
                       transform=transform,
                       tokenizer=tokenizer)
trainer = pl.Trainer()
trainer.fit(model, train_loader, val_loader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
Missing logger folder: /Users/pdcos/Documents/Estudos/Mestrado/IA048/Projeto Final/Código/IA048_Projeto_Final/lightning_logs

  | Name      | Type                       | Params
---------------------------------------------------------
0 | encoder   | Sequential                 | 4.0 M 
1 | decoder   | T5ForConditionalGeneration | 60.5 M
2 | transform | MelSpectrogram             | 0     
---------------------------------------------------------
64.5 M    Trainable params
0         Non-trainable params
64.5 M    Total params
258.057   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

RuntimeError: stack expects each tensor to be equal size, but got [1, 69920] at entry 0 and [1, 134160] at entry 1