In [42]:
# Turn off the Jedi backend of the IPython tab completer.
%config Completer.use_jedi = False
# Load the autoreload extension and use mode 2 so that imported modules are
# re-imported before each cell runs; edits to project modules such as
# src.dataset then take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [43]:
from IPython.display import display, HTML, Video
# Inject a CSS rule that forces elements with the `.container` class to 90%
# width, so wide tables and widgets get more horizontal room.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [44]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader

import sentencepiece

In [45]:
import os
import glob
import json
import regex

import tqdm.notebook as tqdm

import numpy as np
import pandas as pd

from ipywidgets import GridBox, Audio, HBox, VBox, Box, Label, Layout

import matplotlib.pyplot as plt
import matplotlib_inline

%matplotlib inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [46]:
# Root directory that holds every corpus. The original location stays the
# default, so existing runs are unaffected; set the ASR_DATASETS_DIR environment
# variable to run the notebook on another machine without editing this cell.
base_path = os.environ.get('ASR_DATASETS_DIR', '/home/jupyter/mnt/datasets')

# Per-corpus directories derived from the root. libri_speech_base_path is passed
# to get_libri_speech_dataset; golos_base_path is only referenced by the
# currently commented-out Golos cells.
libri_speech_base_path = os.path.join(base_path, 'LibriSpeech_ds')
golos_base_path = os.path.join(base_path, 'golos')

In [47]:
from src.dataset import get_libri_speech_dataset, get_golos_dataset

In [48]:
def sample_dataset(dataset, n=4):
    """Build a two-column widget preview of the first ``n`` records.

    Each record becomes a ``VBox`` holding a ``Label`` with the text
    ``'<position>, <text>, <duration>'`` (duration with one decimal) above an
    ``Audio`` player created from the record's file. Records alternate between
    the two columns: even positions on the left, odd positions on the right.

    Records are taken by position rather than by index label, so a pandas
    object whose index is not ``0..len-1`` (for example after filtering or
    concatenation) is previewed correctly, and an ``n`` larger than the
    dataset simply shows every available record instead of raising.

    Args:
        dataset: Object that returns a column when indexed with ``'text'``,
            ``'duration'`` and ``'audio_path'``; pandas columns and plain
            sequences are both accepted.
        n: Maximum number of records to show. Values below one give an
            empty preview.

    Returns:
        An ``HBox`` containing the left and right ``VBox`` columns.
    """
    # range(n) in the original yielded nothing for negative n; keep that
    # behaviour, because a negative slice bound would mean "all but the last".
    limit = max(n, 0)

    def leading(column):
        values = dataset[column]
        # pandas objects expose .iloc for strictly positional slicing; plain
        # sequences are already positional.
        head = values.iloc[:limit] if hasattr(values, 'iloc') else values[:limit]
        return list(head)

    records = zip(leading('text'), leading('duration'), leading('audio_path'))
    cells = [
        VBox([
            Label('{0:d}, {1}, {2:.1f}'.format(position, text, duration)),
            Audio.from_file(audio_path, autoplay=False, loop=False),
        ])
        for position, (text, duration, audio_path) in enumerate(records)
    ]

    return HBox([VBox(cells[0::2]), VBox(cells[1::2])])

In [29]:
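# !git push https://github.com/nakhodnov17/conformer.git  # SECURITY: a personal access token was embedded in this URL and has been redacted; revoke that token and authenticate through a credential helper or an environment variable instead.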

To https://github.com/nakhodnov17/conformer.git
   1c5b78a..2f4cf01  Nazarov -> Nazarov


In [49]:
# Load the three LibriSpeech splits in dev, test, train order. Every call reads
# from the same corpus directory and differs only in the split name.
libri_speech_dev, libri_speech_test, libri_speech_train = (
    get_libri_speech_dataset(libri_speech_base_path, split=split_name)
    for split_name in ('dev', 'test', 'train')
)

# Report the number of records in each split, one line per split, same order.
print(
    *(
        'Loaded {0:d} objects'.format(len(split_frame['audio_path']))
        for split_frame in (libri_speech_dev, libri_speech_test, libri_speech_train)
    ),
    sep='\n',
)

Loaded 1400 objects
Loaded 1352 objects
Loaded 54472 objects


In [50]:
# Display the test-split table; the recorded output below has audio_path, text
# and duration columns.
libri_speech_test

Unnamed: 0,audio_path,text,duration
0,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,для вас души моей царицы красавицы для вас одн...,11.35
1,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,примите ж вы мой труд игривый,2.10
2,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,ничьих не требуя похвал счастлив уж я надеждой...,11.29
3,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,златая цепь на дубе том и днем и ночью кот уче...,6.65
4,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,идет направо песнь заводит налево сказку говорит,4.26
...,...,...,...
1347,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,зачем от гор и мимо башен летит орел тяжел и с...,6.12
1348,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,зачем арапа своего младая любит дездемона как ...,4.86
1349,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,затем что ветру и орлу и сердцу девы нет закона,3.83
1350,/home/jupyter/mnt/datasets/LibriSpeech_ds/test...,гордись таков и ты поэт и для тебя условий нет,3.56


In [39]:
# Preview the leading dev-split utterances (label plus audio player) using the
# default n=4.
sample_dataset(libri_speech_dev)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, воистину еврейки молодой мне дорого душевное спас…

In [51]:
# Preview the leading test-split utterances (label plus audio player) using the
# default n=4.
sample_dataset(libri_speech_test)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, для вас души моей царицы красавицы для вас одних …

In [52]:
# Preview the leading train-split utterances (label plus audio player) using
# the default n=4.
sample_dataset(libri_speech_train)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, он сделал это так неловко что задел образок моего…

In [53]:
# golos_train = get_golos_dataset(golos_base_path, split='train')
# golos_test_crowd = get_golos_dataset(golos_base_path, split='test/crowd')
# golos_test_farfield = get_golos_dataset(golos_base_path, split='test/farfield')

# print('Loaded {0:d} objects'.format(len(golos_train['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_crowd['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_farfield['audio_path'])))

In [54]:
# sample_dataset(golos_train)

In [55]:
# sample_dataset(golos_test_crowd)

In [56]:
# sample_dataset(golos_test_farfield)

In [57]:
# Training tables keyed by '<corpus>/<split>'. Only LibriSpeech is active; the
# Golos entry stays disabled together with the Golos loading cells.
train_datasets = {}
train_datasets['libri_speech/train'] = libri_speech_train
# train_datasets['golos/train'] = golos_train

# Evaluation tables, same key convention. Insertion order is the iteration
# order used by the cells that build per-split datasets and loaders.
test_datasets = {}
# test_datasets['golos/test/crowd'] = golos_test_crowd
# test_datasets['golos/test/farfield'] = golos_test_farfield
test_datasets['libri_speech/dev'] = libri_speech_dev
test_datasets['libri_speech/test'] = libri_speech_test

# Create tokenizer

In [58]:
# # Save text from all train datasets to file
# ### YOUR CODE HERE
# with open('texts.txt', 'w', encoding='utf-8') as f:
#     for i, text in libri_speech_train.iterrows():
#         f.write(text['text'] + '\n')

In [59]:
# # Train sentencepiece tokenizer
# sentencepiece.SentencePieceTrainer.train(
#     input='texts.txt', model_prefix='tokenizer', vocab_size=128, model_type='unigram',
#     bos_id=-1, eos_id=-1, character_coverage=1.0
# )

In [60]:
# Load the SentencePiece model from 'tokenizer.model' in the working directory.
# The training cells above are commented out (they used model_prefix='tokenizer'),
# so this file must already exist before the cell is run.
sp_tokenizer = sentencepiece.SentencePieceProcessor(model_file='tokenizer.model')

In [61]:
# Smoke-test the tokenizer: encode a one-sentence batch into token ids.
# NOTE(review): in the recorded output the comma maps to id 0 — presumably the
# unknown-token id, i.e. punctuation is outside the vocabulary; confirm.
sp_tokenizer.encode_as_ids(['привет, как тебя зовут'])

[[89, 7, 62, 0, 87, 1, 47, 25, 11, 1, 17, 15, 7, 12, 9]]

In [62]:
# Encode the same sentence into subword pieces instead of ids, to inspect how
# the text is segmented.
sp_tokenizer.encode_as_pieces(['привет, как тебя зовут'])

[['▁при',
  'в',
  'ет',
  ',',
  '▁как',
  '▁',
  'те',
  'б',
  'я',
  '▁',
  'з',
  'о',
  'в',
  'у',
  'т']]

# Create Dataset

In [96]:
from src.dataset import AudioDataset, collate_fn

In [97]:
# ds_train.data.iloc[27055]

In [98]:
# ds_train.data['audio_path'].head().tolist()

In [99]:
# ds_train.data['audio_path'].iloc[0]

In [100]:
# Duration bounds shared by every AudioDataset built in this cell; they are
# forwarded unchanged as the min_duration / max_duration keyword arguments.
duration_bounds = dict(min_duration=1.36, max_duration=10.96)

# Single training dataset built from all training tables concatenated together.
ds_train = AudioDataset(pd.concat(train_datasets.values()), sp_tokenizer, **duration_bounds)

# One dataset per named evaluation split.
ds_test_dict = {
    split_name: AudioDataset(split_frame, sp_tokenizer, **duration_bounds)
    for split_name, split_frame in test_datasets.items()
}
# One dataset per named training split (not merged), under the same keys as
# train_datasets.
ds_train_dict = {
    split_name: AudioDataset(split_frame, sp_tokenizer, **duration_bounds)
    for split_name, split_frame in train_datasets.items()
}

In [101]:
# Inspect the first training item. The recorded output is a 6-tuple: audio path,
# waveform tensor, an integer (21760 here — presumably the number of audio
# samples; confirm), transcript text, token-id tensor, and an integer equal to
# the number of token ids (14 here).
ds_train[0]

('/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/11365/shortstories_11_garshin_0260.wav',
 tensor([-3.0518e-05,  0.0000e+00,  9.1553e-05,  ..., -1.6235e-02,
         -2.4017e-02, -3.4882e-02]),
 21760,
 'быть может еще успею',
 tensor([ 75,  56,  90,  61,   9,   1,   2, 126,   2,  51,   8,  34,   2,  28]),
 14)

# Create Dataloader

In [102]:
batch_size = 8
num_workers = 0

# Options common to every loader in this cell; only the dataset and the
# shuffle flag differ between them.
loader_options = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=False,
    collate_fn=collate_fn,
)

# Loader over the merged training dataset; the only one that shuffles.
dl_train = DataLoader(ds_train, shuffle=True, **loader_options)

# Fixed-order loaders, one per named evaluation split.
dl_test_dict = {
    split_name: DataLoader(split_dataset, shuffle=False, **loader_options)
    for split_name, split_dataset in ds_test_dict.items()
}
# Fixed-order loaders over the individual training splits.
dl_train_dict = {
    split_name: DataLoader(split_dataset, shuffle=False, **loader_options)
    for split_name, split_dataset in ds_train_dict.items()
}

In [103]:
# Display the training DataLoader object itself; this shows only its repr and
# does not fetch a batch.
dl_train

<torch.utils.data.dataloader.DataLoader at 0x7fb0d9005f10>

In [104]:
# Fetch one collated batch from the shuffled training loader. The recorded
# output is a dict whose first keys are 'audio_path' and 'audio'.
next(iter(dl_train))

{'audio_path': ['/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8169/14105/obyknovennayaistoriya_11_goncharov_0171.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/11365/shortstories_02_garshin_0030.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8169/13240/obryv_058_goncharov_0029.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/13587/15349/vekhi_13_various_0308.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/7771/early_short_stories_14_jabotinsky_0097.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/295/162/Leo-Tolstoy-Detstvo-RUSSIAN-15-Detstvo_0024.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8169/12256/dvoryanskoegnezdo_16_turgenev_0087.wav',
  '/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/15088/silhouettes3_22_aykhenvald_0273.wav'],
 'audio': tensor([[ 2.4414e-04,  6.1035e-05,  6.1035e-05,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 1.678