In [37]:
# Turn off Jedi-based tab completion (sets Completer.use_jedi to False).
%config Completer.use_jedi = False
# Load the autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [38]:
from IPython.display import display, HTML, Video
# Inject CSS that forces the notebook's `.container` element to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [39]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader

import sentencepiece

In [40]:
import os
import glob
import json
import regex

import tqdm.notebook as tqdm

import numpy as np
import pandas as pd

from ipywidgets import GridBox, Audio, HBox, VBox, Box, Label, Layout

import matplotlib.pyplot as plt
import matplotlib_inline

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Optional SVG figure output, currently disabled:
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [41]:
# Root directory under which both corpus directories are resolved.
base_path = '/home/jupyter/mnt/datasets/'

# Per-corpus directories, joined onto the shared root in one pass.
libri_speech_base_path, golos_base_path = (
    os.path.join(base_path, corpus_dir)
    for corpus_dir in ('LibriSpeech_ds', 'golos')
)

In [42]:
from src.dataset import get_libri_speech_dataset, get_golos_dataset

In [43]:
def sample_dataset(dataset, n=4):
    """Build a two-column widget preview of the first ``n`` rows of ``dataset``.

    Every previewed row becomes a ``VBox`` holding a ``Label`` formatted as
    ``"<position>, <text>, <duration>"`` and an ``Audio`` player created from
    the row's ``audio_path`` file.

    Args:
        dataset: Object exposing ``'text'``, ``'duration'`` and
            ``'audio_path'`` columns (a pandas DataFrame in this notebook;
            plain sequences per column also work).
        n: Maximum number of rows to preview. It is clamped to the number of
            available rows, so a dataset shorter than ``n`` no longer raises.

    Returns:
        An ``HBox`` with two ``VBox`` columns: even positions on the left,
        odd positions on the right.
    """
    def positional(column_name):
        column = dataset[column_name]
        # Rows are addressed by position. Label-based `series[idx]` raises
        # (or returns several rows) when the index is not a unique 0..N-1
        # range, e.g. after filtering or `pd.concat`.
        return column.iloc if hasattr(column, 'iloc') else column

    texts = positional('text')
    durations = positional('duration')
    audio_paths = positional('audio_path')

    # Never ask for more rows than the dataset holds.
    row_count = min(n, len(dataset['audio_path']))

    grid = [
        VBox([
            Label('{0:d}, {1}, {2:.1f}'.format(idx, texts[idx], durations[idx])),
            Audio.from_file(audio_paths[idx], autoplay=False, loop=False),
        ])
        for idx in range(row_count)
    ]

    return HBox([VBox(grid[0::2]), VBox(grid[1::2])])

In [44]:
# Load the three LibriSpeech splits in the order dev, test, train.
libri_speech_dev, libri_speech_test, libri_speech_train = (
    get_libri_speech_dataset(libri_speech_base_path, split=split_name)
    for split_name in ('dev', 'test', 'train')
)

# Report how many rows each split contains, one line per split.
print(
    *(
        'Loaded {0:d} objects'.format(len(split_frame['audio_path']))
        for split_frame in (libri_speech_dev, libri_speech_test, libri_speech_train)
    ),
    sep='\n',
)

Loaded 1400 objects
Loaded 1352 objects
Loaded 54472 objects


In [45]:
# Display the dev-split table (audio_path, text, duration columns).
libri_speech_dev

Unnamed: 0,audio_path,text,duration
0,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,воистину еврейки молодой мне дорого душевное с...,4.58
1,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,приди ко мне прелестный ангел мой и мирное при...,7.80
2,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,любезных уст улыбкою довольный царю небес и го...,7.18
3,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,смиренных струн быть может наконец ее пленят ц...,8.00
4,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,властитель он и мыслей и сердец,2.68
...,...,...,...
1395,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,и над тесниной торжествуя как муж на страже в ...,8.90
1396,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,сатрап смутился изумленный и гнев в нем душу п...,4.94
1397,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,и свой совет разноплеменный он любопытный вопр...,7.14
1398,/home/jupyter/mnt/datasets/LibriSpeech_ds/dev/...,и что их сила и кто им вождь и отчего сердца и...,8.40


In [46]:
# Preview the first rows of the dev split (default n=4) as labelled audio players.
sample_dataset(libri_speech_dev)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, воистину еврейки молодой мне дорого душевное спас…

In [47]:
# Preview the first rows of the test split (default n=4) as labelled audio players.
sample_dataset(libri_speech_test)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, для вас души моей царицы красавицы для вас одних …

In [48]:
# Preview the first rows of the train split (default n=4) as labelled audio players.
sample_dataset(libri_speech_train)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, он сделал это так неловко что задел образок моего…

In [49]:
# golos_train = get_golos_dataset(golos_base_path, split='train')
# golos_test_crowd = get_golos_dataset(golos_base_path, split='test/crowd')
# golos_test_farfield = get_golos_dataset(golos_base_path, split='test/farfield')

# print('Loaded {0:d} objects'.format(len(golos_train['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_crowd['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_farfield['audio_path'])))

In [50]:
# sample_dataset(golos_train)

In [51]:
# sample_dataset(golos_test_crowd)

In [52]:
# sample_dataset(golos_test_farfield)

In [53]:
# Named training frames; the Golos entry is currently disabled.
train_datasets = dict([
    ('libri_speech/train', libri_speech_train),
    # ('golos/train', golos_train),
])

# Named evaluation frames; the Golos entries are currently disabled.
test_datasets = dict([
    # ('golos/test/crowd', golos_test_crowd),
    # ('golos/test/farfield', golos_test_farfield),
    ('libri_speech/dev', libri_speech_dev),
    ('libri_speech/test', libri_speech_test),
])

# Create tokenizer

In [54]:
# Display the train-split table (audio_path, text, duration columns).
libri_speech_train

Unnamed: 0,audio_path,text,duration
0,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,он сделал это так неловко что задел образок мо...,9.76
1,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,я высунул нос из под одеяла остановил рукою об...,11.30
2,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,он же в пестром ваточном халате подпоясанном п...,16.08
3,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,отчего он не бьет мух около володи ной постели...,4.01
4,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,нет володя старше меня а я меньше всех оттого ...,4.04
...,...,...,...
54467,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,все персоны захлопали и лорд епископ и городск...,6.46
54468,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,я уже не хочу описывать как мне показывали зав...,5.20
54469,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,достаточно будет одной детали,2.20
54470,/home/jupyter/mnt/datasets/LibriSpeech_ds/trai...,впереди шел епископ он председательствовал на ...,15.10


In [55]:
# # Save text from all train datasets to file
# ### YOUR CODE HERE
# with open('texts.txt', 'w', encoding = 'utf-8') as file:    
#     for key, value in train_datasets.items():
#         for key1, value1 in value.iterrows():
#             value1['text']
#             print(value1['text'], file = file, sep = '\n')
# ...

In [56]:
# # Train sentencepiece tokenizer
# sentencepiece.SentencePieceTrainer.train(
#     input='texts.txt', model_prefix='tokenizer', vocab_size=128, model_type='unigram',
#     bos_id=-1, eos_id=-1, character_coverage=1.0
# )

In [57]:
# Load the SentencePiece model from 'tokenizer.model', a path relative to the
# current working directory. The file must already exist because the training
# cell is commented out.
sp_tokenizer = sentencepiece.SentencePieceProcessor(model_file='tokenizer.model')

In [58]:
# Sanity check: encode a sample phrase into token ids.
# NOTE(review): in the recorded output ',' maps to id 0 — presumably the
# unknown-token id, since the displayed training text has no punctuation; confirm.
sp_tokenizer.encode_as_ids(['привет, как тебя зовут'])

[[89, 7, 62, 0, 87, 1, 47, 25, 11, 1, 17, 15, 7, 12, 9]]

In [59]:
# Same phrase split into subword pieces, to read the segmentation directly.
sp_tokenizer.encode_as_pieces(['привет, как тебя зовут'])

[['▁при',
  'в',
  'ет',
  ',',
  '▁как',
  '▁',
  'те',
  'б',
  'я',
  '▁',
  'з',
  'о',
  'в',
  'у',
  'т']]

# Create Dataset

In [60]:
from src.dataset import AudioDataset, collate_fn

In [61]:
# Duration bounds passed unchanged to every AudioDataset built in this cell.
duration_bounds = {'min_duration': 1.36, 'max_duration': 10.96}

# One training dataset over all training frames concatenated together.
ds_train = AudioDataset(
    pd.concat(train_datasets.values()),
    sp_tokenizer,
    **duration_bounds,
)

# One dataset per named evaluation frame.
ds_test_dict = {
    split_name: AudioDataset(frame, sp_tokenizer, **duration_bounds)
    for split_name, frame in test_datasets.items()
}

# One dataset per named training frame, kept separate from the combined one.
ds_train_dict = {
    split_name: AudioDataset(frame, sp_tokenizer, **duration_bounds)
    for split_name, frame in train_datasets.items()
}



In [62]:
# Inspect the first item of the combined training dataset.
ds_train[0]

('/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/11365/shortstories_11_garshin_0260.wav',
 tensor([-3.0518e-05,  0.0000e+00,  9.1553e-05,  ..., -1.6235e-02,
         -2.4017e-02, -3.4882e-02]),
 21760,
 'быть может еще успею',
 tensor([ 75,  56,  90,  61,   9,   1,   2, 126,   2,  51,   8,  34,   2,  28]),
 14)

# Create Dataloader

In [65]:
batch_size = 8
num_workers = 0

# Settings shared by every loader below; only `shuffle` differs between them.
loader_options = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=False,
    collate_fn=collate_fn,
)

# Shuffled loader over the combined training dataset.
dl_train = DataLoader(ds_train, shuffle=True, **loader_options)

# Unshuffled loaders, one per named evaluation dataset.
dl_test_dict = {
    split_name: DataLoader(split_dataset, shuffle=False, **loader_options)
    for split_name, split_dataset in ds_test_dict.items()
}

# Unshuffled loaders, one per named training dataset.
dl_train_dict = {
    split_name: DataLoader(split_dataset, shuffle=False, **loader_options)
    for split_name, split_dataset in ds_train_dict.items()
}

In [66]:
# Pull a single collated batch from the shuffled training loader to inspect its structure.
next(iter(dl_train))

{'audio_path': ('/home/jupyter/mnt/datasets/LibriSpeech_ds/train/audio/8086/11365/shortstories_18_garshin_0175.wav',
  tensor([-0.0075, -0.0183,  0.0360,  ...,  0.0020,  0.0021,  0.0020]),
  125120,
  'солгал он таким странным голосом что на лице смотревшей ему в глаза дуняши выразилось недоумение',
  tensor([103,   5,  35,   3,   5,  68, 117,   6,   4,   1,  39,  24,  20,  55,
            4,   1,  29,  38,   8,  15,   4,  66,  33,   1,  27,  59,   2,  10,
            4,  15,   9,  40,   7,  95,  13,   1,   2,  91,  18,  92,  30,  17,
            3,   1,  94,  20,  11, 119,  96,  24,  17,   6,  38,  71,  21,  86,
           12,   4,   2,  22,   2]),
  61),
 'audio': tensor([[-0.0075, -0.0183,  0.0360,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0002, -0.0002, -0.0001,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0004,  0.0008, -0.0029,  ...,  0.0003,  0.0003,  0.0007],
         ...,
         [ 0.0000, -0.0002, -0.0003,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0139,  0.0176, 