In [1]:
import torch
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader, random_split


In [2]:
# Télécharger et charger le dataset LibriSpeech

train_dataset = LIBRISPEECH(root="./data", url="train-clean-100", download=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def pad_or_trim(audio, target_length=16000*30):  # 30 seconds at 16kHz
    if audio.shape[1] > target_length:
        # Trim the audio to the target length
        return audio[:, :target_length]
    elif audio.shape[1] < target_length:
        # Pad the audio with zeros (silence) if it's shorter than the target length
        padding_size = target_length - audio.shape[1]
        padding = torch.zeros((audio.shape[0], padding_size)).to(DEVICE)
        return torch.cat([audio, padding], dim=1)
    else:
        return audio

In [4]:
import torchaudio.transforms as T
from tqdm import tqdm

class LibriSpeech(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.
    """
    def __init__(self, split="train-clean-100", device=DEVICE):
        self.dataset = LIBRISPEECH(root="./data", url=split, download=True)
        self.device = device
        self.mel_transform = T.MelSpectrogram(sample_rate=16000).to(self.device)
        self.characters = None
        char_to_index = None

    def get_characters(self):
        characters = set()
        for _, _, text, _, _, _ in tqdm(train_dataset.dataset):
            characters.update(text)
        self.characters = sorted(list(characters))
        self.characters.append('')
        self.char_to_index = {char: index for index, char in enumerate(characters)}


    def text_to_sequence(self, text):
        return [self.char_to_index[char] for char in text]
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        assert self.characters is not None
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000
        mel = pad_or_trim(self.mel_transform(audio.to(self.device))).to(self.device)
        return (mel, self.text_to_sequence(text))

In [None]:
# Division du dataset en train et validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Chargement par batchs
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [None]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_hidden_layers):
        super(MLP, self).__init__()
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_size)])
        self.hidden_layers.extend([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        for hidden_layer in self.hidden_layers:
            x = self.relu(hidden_layer(x))
        x = self.output_layer(x)
        return x

In [None]:
def greedy_decode_batch(output, blank_label, collapse_repeated=True):
    """
    output: Tensor of shape (T, N, C) - Output from the model (log probabilities)
    blank_label: The label used for the blank symbol in CTC Loss
    collapse_repeated: Whether to collapse repeated characters or not
    """
    # Choisissez le caractère le plus probable à chaque étape pour chaque séquence dans le batch
    _, max_indices = output.max(dim=2)
    max_indices = max_indices.transpose(0, 1)  # Transformer en (N, T) pour faciliter le traitement du batch

    decoded_batch = []
    for sequence in max_indices:
        decoded_sequence = []
        prev = None
        for idx in sequence:
            if idx != prev or not collapse_repeated:
                if idx != blank_label:
                    decoded_sequence.append(idx)
            prev = idx
        decoded_batch.append(decoded_sequence)

    return decoded_batch

In [None]:
import numpy as np
def train(model, train_loader, criterion, optimizer, epoch):
    train_losses = []
    val_losses = []
    for i in range(tqdm(epoch)):
        train_loss = []
        val_loss = []
        for inputs, targets in enumerate(train_loader):    
            target_lengths = torch.tensor([len(t) for t in targets])

            input_lengths = torch.full((inputs.size(0),), inputs.size(2), dtype=torch.long)  # Supposant que la taille est [batch, features, seq_len]

            # Forward pass
            outputs = model(inputs)
            outputs = outputs.log_softmax(2).permute(1, 0, 2)  # La sortie doit être [seq_len, batch, num_classes]

            loss = criterion(outputs, targets, input_lengths, target_lengths)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        
        
        for inputs, targets in enumerate(val_loader):
            with torch.no_grad():
                target_lengths = torch.tensor([len(t) for t in targets])
                input_lengths = torch.full((inputs.size(0),), inputs.size(2), dtype=torch.long)  # Supposant que la taille est [batch, features, seq_len]

                # Forward pass
                outputs = model(inputs)
                outputs = outputs.log_softmax(2).permute(1, 0, 2)  # La sortie doit être [seq_len, batch, num_classes]

                loss = criterion(outputs, targets, input_lengths, target_lengths)
                val_loss.append(loss.item())
        
        tloss = np.mean(train_loss)
        vloss = np.mean(val_loss)
        print(f'Epoch {i}/{epoch}, Train loss : {tloss}, Validation Loss: {vloss}')
        train_losses.append(tloss)
        val_losses.append(vloss)

    return train_losses, val_losses

In [None]:
for el in train_loader:
    print(el.shape)
    break

RuntimeError: Tensors must have same number of dimensions: got 3 and 2

In [None]:
model = MLP(input_size, hidden_size, num_classes).to(train_dataset.device)
criterion = nn.CTCLoss(blank=num_classes - 1)  # Le dernier caractère est considéré comme 'blank'
optimizer = optim.Adam(model.parameters(), lr=0.001)