# Downloading code and dataset

In [1]:
# Fetch the YVector repository into ./YVector (relative to the current working directory).
!git clone https://github.com/nikoryagin/YVector.git

Cloning into 'YVector'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 2), reused 10 (delta 2), pack-reused 0[K
Unpacking objects: 100% (10/10), done.


In [2]:
# Work from inside the cloned repository; later cells use paths relative to it
# (e.g. root='dataset') and import `yvector` (presumably a module shipped in the repo).
%cd YVector
# NOTE(review): directory creation is disabled here, yet later cells read ./dataset — confirm where it is created.
#!mkdir dataset

/content/YVector


In [None]:
# Download the LibriSpeech train-clean-100 archive (~5.9 GB); -c resumes an interrupted download.
!wget -c https://www.openslr.org/resources/12/train-clean-100.tar.gz
# Unpack into ./dataset so that dataset/LibriSpeech/train-clean-100 exists for the cells below.
!mkdir -p dataset
!tar -xzf train-clean-100.tar.gz -C dataset

--2022-04-03 15:15:17--  https://www.openslr.org/resources/12/train-clean-100.tar.gz
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://us.openslr.org/resources/12/train-clean-100.tar.gz [following]
--2022-04-03 15:15:17--  http://us.openslr.org/resources/12/train-clean-100.tar.gz
Resolving us.openslr.org (us.openslr.org)... 46.101.158.64
Connecting to us.openslr.org (us.openslr.org)|46.101.158.64|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6387309499 (5.9G) [application/x-gzip]
Saving to: ‘train-clean-100.tar.gz’


2022-04-03 15:20:14 (20.6 MB/s) - ‘train-clean-100.tar.gz’ saved [6387309499/6387309499]



# Preparing dataset

In [None]:
from pathlib import Path


def build_label_map(root):
    """Map the numeric directory names under ``root`` to contiguous class indices.

    Args:
        root: Directory whose immediate sub-directories are named with integer
            ids (here: one directory per entry of ``train-clean-100``).

    Returns:
        dict[int, int]: ``{original_id: class_index}`` with indices ``0..N-1``
        assigned in ascending order of ``original_id``.  A missing or empty
        ``root`` yields an empty dict.
    """
    # Directory listing order is filesystem-dependent; sorting numerically makes
    # the id -> index assignment identical on every run and every machine, which
    # is required for checkpoints to stay consistent with their labels.
    # Non-directories and non-numeric names are skipped instead of crashing int().
    original_ids = sorted(
        int(entry.name)
        for entry in Path(root).glob('*')
        if entry.is_dir() and entry.name.isdecimal()
    )
    return {original_id: index for index, original_id in enumerate(original_ids)}


# Lookup used by the collate function to turn dataset ids into training labels.
wrong_to_correct = build_label_map('/content/YVector/dataset/LibriSpeech/train-clean-100')

In [None]:
import random
import torch

class RandomClip:
    """Cut a randomly positioned window of fixed length out of a waveform.

    The transform operates on a mutable sequence ``item`` whose first element
    is the waveform tensor; that element is replaced by the clipped waveform
    with two leading singleton dimensions added, and ``item`` is returned.
    """

    def __init__(self, clip_length):
        # The length may arrive as a float (e.g. rate * seconds); keep whole samples only.
        self.clip_length = int(clip_length)

    def __call__(self, item):
        wave = item[0].squeeze()

        # Too-short recordings get half a clip of zeros on each side before cropping.
        if wave.shape[0] < self.clip_length:
            half = self.clip_length // 2
            wave = torch.nn.functional.pad(wave, (half, half))

        # randint is inclusive on both ends, so the window always fits.
        start = random.randint(0, wave.shape[0] - self.clip_length)
        stop = start + self.clip_length

        item[0] = wave[start:stop][None, None]
        return item

class Normalize:
    """Peak-normalize a waveform so its largest absolute sample is ~1.

    The transform operates on a mutable sequence ``item`` whose first element
    is the waveform tensor; that element is replaced by the scaled waveform
    with two leading singleton dimensions added, and ``item`` is returned.

    Args:
        eps: Small constant added to the peak so silent (all-zero) input does
            not divide by zero.
    """

    def __init__(self, eps=0.000001):
        self.eps = eps

    def __call__(self, item):
        wave = item[0].squeeze()
        # Scale by the peak *magnitude*.  Dividing by the signed maximum would
        # flip the sign of all-negative clips, leave clips with a dominant
        # negative peak outside [-1, 1], and could divide by ~0.
        peak = wave.abs().max()
        item[0] = (wave / (peak + self.eps))[None, None]
        return item


def collate_fn(data, clip_seconds=3.9, sample_rate=16000):
    """Turn a list of dataset samples into one batch of clipped, normalized audio.

    Each sample is a sequence whose element 0 is the waveform and whose
    element 3 is the id looked up in ``wrong_to_correct`` to obtain the
    training label (for torchaudio's LIBRISPEECH that is presumably the
    speaker id — confirm against the dataset documentation).

    Args:
        data: List of samples as produced by the dataset.  It is not modified.
        clip_seconds: Length of the random crop taken from every waveform.
        sample_rate: Sampling rate used to convert ``clip_seconds`` to samples.

    Returns:
        tuple: ``(batch_wave, batch_labels)`` where ``batch_wave`` stacks the
        per-sample ``(1, 1, clip)`` tensors along dim 0 and ``batch_labels`` is
        a 1-D integer tensor with one label per sample.
    """
    # The transforms are stateless apart from their configuration, so build them once.
    clip = RandomClip(sample_rate * clip_seconds)
    normalize = Normalize()

    waves = []
    labels = []
    for sample in data:
        # Work on a list copy: the transforms assign to item[0].
        item = normalize(clip(list(sample)))
        waves.append(item[0])
        labels.append(wrong_to_correct[sample[3]])

    # One concatenation at the end instead of re-copying a growing batch per sample.
    return torch.cat(waves, dim=0), torch.tensor(labels)


In [None]:
import torchaudio
# Dataset rooted at ./dataset; nothing is downloaded here, the files must already be on disk.
libri_train = torchaudio.datasets.LIBRISPEECH(root='dataset', download=False)

# Batching options: shuffled mini-batches of 32 samples, assembled by
# collate_fn in 4 worker processes and returned in pinned memory.
loader_options = dict(
    batch_size=32,
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
    pin_memory=True,
)
dataloader = torch.utils.data.DataLoader(libri_train, **loader_options)

  cpuset_checked))


# Setting up training process

In [3]:
from yvector import YVectorModel

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# from https://github.com/Leethony/Additive-Margin-Softmax-Loss-Pytorch
class AdMSoftmaxLoss(nn.Module):
    """Additive-margin softmax (AM-Softmax) classification loss.

    Holds a bias-free linear layer whose rows act as class prototypes.  Both
    the inputs and the prototypes are L2-normalized, so the layer output is a
    cosine similarity; the margin ``m`` is subtracted from the target-class
    cosine and everything is scaled by ``s`` before a softmax cross-entropy.

    Args:
        in_features: Size of each input embedding.
        out_features: Number of classes.
        s: Scale applied to the cosine logits.
        m: Additive margin subtracted from the target-class cosine.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.4):
        '''
        AM Softmax Loss
        '''
        super(AdMSoftmaxLoss, self).__init__()
        self.s = s
        self.m = m
        self.in_features = in_features
        self.out_features = out_features
        self.fc = nn.Linear(in_features, out_features, bias=False)

    def forward(self, x, labels):
        '''
        Compute the mean AM-Softmax loss of a batch.

        x: tensor of shape (N, in_features)
        labels: integer tensor of shape (N,) with values in [0, out_features)

        Returns a scalar tensor.
        '''
        assert len(x) == len(labels)
        assert torch.min(labels) >= 0
        assert torch.max(labels) < self.out_features

        labels = labels.long()

        # Normalize the class prototypes inside the graph.  Rebinding a loop
        # variable (``W = F.normalize(W)``) leaves self.fc.weight untouched, so
        # the normalized weights have to be used explicitly in the projection.
        weight = F.normalize(self.fc.weight, dim=1)
        x = F.normalize(x, dim=1)
        cosine = F.linear(x, weight)

        # Subtract the margin only at each sample's target class.
        margin = F.one_hot(labels, num_classes=self.out_features).to(cosine.dtype) * self.m
        logits = self.s * (cosine - margin)

        # -mean(log(exp(z_y) / sum_j exp(z_j))) is exactly cross-entropy over
        # the margin-adjusted logits; this form is vectorized and numerically stable.
        return F.cross_entropy(logits, labels)

In [None]:
# Loss configuration: input embedding size and number of classes.
# NOTE(review): in_features is expected to match the model's output width — confirm.
in_features, out_features = 512, 251

criterion = AdMSoftmaxLoss(in_features, out_features, s=30.0, m=0.35)
criterion = criterion.to('cuda')

In [3]:
# Build the network with its default configuration, then place it on the GPU.
model = YVectorModel()
model = model.to('cuda')

In [None]:
# The loss owns a trainable class-weight matrix (criterion.fc); it must be
# optimized together with the network, otherwise it stays at its random init.
optimizer = torch.optim.SGD(
    list(model.parameters()) + list(criterion.parameters()),
    lr=0.01,
    momentum=0.9,
)

In [None]:
# Mount Google Drive at /content/drive; the training loop writes its checkpoints below this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Training

In [None]:
import os

n_epoch = 60
min_loss = 2.9      # a checkpoint is written only when the running average drops below this
log_every = 100     # number of batches averaged per report
checkpoint_dir = '/content/drive/MyDrive/vk'
os.makedirs(checkpoint_dir, exist_ok=True)  # torch.save does not create missing directories

model.train()
for i in range(n_epoch):
    # Start at 0 so every report really averages `log_every` batches
    # (starting at 1 made the first window 99 batches divided by 100).
    j = 0
    cum_loss = 0.0
    for X, y in dataloader:
        optimizer.zero_grad()
        X, y = X.to('cuda'), y.to('cuda')
        embeds = model(X)
        loss = criterion(embeds, y)
        loss.backward()
        optimizer.step()

        # Track the loss as a Python float: no CUDA tensors are kept alive,
        # printed, or pickled into the checkpoint.
        batch_loss = loss.item()
        j += 1
        cum_loss += batch_loss
        if j % log_every == 0:
            avg_loss = cum_loss / log_every
            if avg_loss < min_loss:
                min_loss = avg_loss
                # One file per epoch; a better average later in the same epoch overwrites it.
                torch.save({
                    'epoch': i,
                    'model_state_dict': model.state_dict(),
                    # The loss holds the class-weight matrix, which is needed to resume training.
                    'criterion_state_dict': criterion.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': batch_loss,
                }, os.path.join(checkpoint_dir, 'vk{}.pth'.format(i)))
            print(avg_loss)
            cum_loss = 0.0