In [3]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False
# Load the autoreload extension; mode 2 picks up edits to imported modules
# (e.g. the project's `src` package) without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from IPython.display import display, HTML, Video
# Inject CSS that forces the notebook's `.container` element to 90% width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load data

In [5]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader

import sentencepiece

In [6]:
import os
import glob
import json
import regex

import tqdm.notebook as tqdm

import numpy as np
import pandas as pd

from ipywidgets import GridBox, Audio, HBox, VBox, Box, Label, Layout

import matplotlib.pyplot as plt
import matplotlib_inline

# Render matplotlib figures inline in the notebook, using the SVG format.
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [7]:
# Root directory of all corpora, relative to the notebook's working directory.
base_path = './dataset/'

# Per-corpus roots, both resolved underneath `base_path`.
libri_speech_base_path, golos_base_path = (
    os.path.join(base_path, corpus_dir)
    for corpus_dir in ('LibriSpeech/ruls_data', 'golos')
)

In [8]:
from src.dataset import get_libri_speech_dataset, get_golos_dataset

In [9]:
def sample_dataset(dataset, n=4):
    """Build a two-column widget preview of the first ``n`` rows of ``dataset``.

    Every preview cell stacks a label ``"<position>, <text>, <duration>"`` on
    top of an audio player created from the row's audio file.

    Args:
        dataset: Column-indexable container (a pandas DataFrame or a dict of
            sequences) exposing ``'text'``, ``'duration'`` and ``'audio_path'``
            columns of equal length.
        n: Maximum number of rows to preview. Clamped to the dataset length;
            values <= 0 produce an empty preview.

    Returns:
        An ``HBox`` holding two ``VBox`` columns: even positions on the left,
        odd positions on the right.
    """
    def _positional(column):
        # On pandas objects `column[i]` is a *label* lookup when the index is
        # integer-typed, which raises or returns several rows after filtering
        # or `pd.concat`. `.iloc` is always positional. Plain sequences have no
        # `.iloc` and are used as-is.
        return getattr(column, 'iloc', column)

    paths = dataset['audio_path']
    # Never ask for more rows than exist (and treat negative n as zero).
    count = max(0, min(n, len(paths)))

    texts = _positional(dataset['text'])
    durations = _positional(dataset['duration'])
    paths = _positional(paths)

    grid = [
        VBox([
            Label('{0:d}, {1}, {2:.1f}'.format(idx, texts[idx], durations[idx])),
            Audio.from_file(paths[idx], autoplay=False, loop=False),
        ])
        for idx in range(count)
    ]

    # Alternate cells between the two columns so the reading order is row-wise.
    return HBox([VBox(grid[0::2]), VBox(grid[1::2])])

In [10]:
# Load the three LibriSpeech splits in a fixed order: dev, test, train.
libri_speech_dev, libri_speech_test, libri_speech_train = [
    get_libri_speech_dataset(libri_speech_base_path, split=split_name)
    for split_name in ('dev', 'test', 'train')
]

# Report the number of rows per split, in the same order as loaded above.
for split_data in (libri_speech_dev, libri_speech_test, libri_speech_train):
    print('Loaded {0:d} objects'.format(len(split_data['audio_path'])))

Loaded 1400 objects
Loaded 1352 objects
Loaded 54472 objects


In [11]:
# Preview the first rows of the LibriSpeech dev split (label + audio player per row).
sample_dataset(libri_speech_dev)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, воистину еврейки молодой мне дорого душевное спас…

In [12]:
# Preview the first rows of the LibriSpeech test split (label + audio player per row).
sample_dataset(libri_speech_test)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, для вас души моей царицы красавицы для вас одних …

In [13]:
# Preview the first rows of the LibriSpeech train split (label + audio player per row).
sample_dataset(libri_speech_train)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, он сделал это так неловко что задел образок моего…

In [14]:
# Load the Golos splits in a fixed order: train, then the two test conditions.
golos_train, golos_test_crowd, golos_test_farfield = [
    get_golos_dataset(golos_base_path, split=split_name)
    for split_name in ('train', 'test/crowd', 'test/farfield')
]

# Report the number of rows per split, in the same order as loaded above.
for split_data in (golos_train, golos_test_crowd, golos_test_farfield):
    print('Loaded {0:d} objects'.format(len(split_data['audio_path'])))

Loaded 1103799 objects
Loaded 9994 objects
Loaded 1916 objects


In [15]:
# Preview the first rows of the Golos train split (label + audio player per row).
sample_dataset(golos_train)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, алиби, 1.2'), Audio(value=b'RIFFd\x97\x00\x00WAVE…

In [16]:
# Preview the first rows of the Golos crowd test split (label + audio player per row).
sample_dataset(golos_test_crowd)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, афина воспроизведи музыку вперемешку, 4.9'), Audi…

In [17]:
# Preview the first rows of the Golos farfield test split (label + audio player per row).
sample_dataset(golos_test_farfield)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, джой хватит, 1.7'), Audio(value=b'RIFFZ\xa8\x01\x…

In [18]:
# Training corpora keyed by "<corpus>/<split>"; insertion order is preserved.
train_datasets = dict([
    ('libri_speech/train', libri_speech_train),
    ('golos/train', golos_train),
])

# Evaluation corpora, keyed the same way.
test_datasets = dict([
    ('golos/test/crowd', golos_test_crowd),
    ('golos/test/farfield', golos_test_farfield),
    ('libri_speech/dev', libri_speech_dev),
    ('libri_speech/test', libri_speech_test),
])

# Create tokenizer

In [19]:
# Save text from all train datasets to file
# The tokenizer training cell reads 'texts.txt' as a raw corpus with one
# sentence per line, so write every training transcript on its own line.
with open('texts.txt', 'w', encoding='utf-8') as texts_file:
    for train_dataset in train_datasets.values():
        for text in train_dataset['text']:
            # Skip missing / blank transcripts so they do not end up in the corpus.
            if isinstance(text, str) and text.strip():
                texts_file.write('{0}\n'.format(text.strip()))

In [20]:
# Train sentencepiece tokenizer
tokenizer_training_options = dict(
    input='texts.txt',            # raw text corpus to learn the vocabulary from
    model_prefix='tokenizer',     # prefix of the files the trainer writes
    vocab_size=128,
    model_type='unigram',
    bos_id=-1,                    # -1: no BOS piece in the vocabulary
    eos_id=-1,                    # -1: no EOS piece in the vocabulary
    character_coverage=1.0,       # cover every character seen in the corpus
)
sentencepiece.SentencePieceTrainer.train(**tokenizer_training_options)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: texts.txt
  input_format: 
  model_prefix: tokenizer
  model_type: UNIGRAM
  vocab_size: 128
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: -1
  eos_id: -1
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_th

In [22]:
# Load tokenizer model
# 'tokenizer.model' matches the `model_prefix='tokenizer'` used by the training cell.
sp_tokenizer = sentencepiece.SentencePieceProcessor(model_file='tokenizer.model')

In [27]:
# Check if it is working
# Encode one sample sentence into token ids. In the recorded output the comma
# comes back as id 0, which the training log lists as `unk_id`; presumably the
# training transcripts contain no punctuation, so it is out of vocabulary.
sp_tokenizer.encode_as_ids(['привет, как тебя зовут'])

[[34, 56, 58, 8, 0, 78, 90, 1, 21, 68, 10, 8]]

In [28]:
# Same sentence as subword pieces, to see how the tokenizer segments it.
sp_tokenizer.encode_as_pieces(['привет, как тебя зовут'])

[['▁п', 'ри', 'ве', 'т', ',', '▁как', '▁тебя', '▁', 'з', 'ов', 'у', 'т']]

# Create Dataset

In [29]:
from src.dataset import AudioDataset, collate_fn

In [30]:
def _build_audio_dataset(frame):
    """Wrap `frame` in an AudioDataset using the shared tokenizer and duration bounds."""
    # NOTE(review): bounds are presumably seconds of audio -- confirm in AudioDataset.
    return AudioDataset(frame, sp_tokenizer, min_duration=1.36, max_duration=10.96)


# Single training set: all training corpora concatenated into one frame.
ds_train = _build_audio_dataset(pd.concat(train_datasets.values()))

# Per-corpus datasets, keyed exactly like the source registries.
ds_test_dict = {
    key: _build_audio_dataset(frame)
    for key, frame in test_datasets.items()
}
ds_train_dict = {
    key: _build_audio_dataset(frame)
    for key, frame in train_datasets.items()
}

In [31]:
# Inspect one item. Per the recorded output it is a 6-tuple:
# (audio path, waveform tensor, int, transcript, token-id tensor, int).
# NOTE(review): the two ints look like waveform length and token count -- confirm in AudioDataset.
ds_train[0]

('./dataset/golos/train/crowd/9/458139fb831999abeb2b69254d4480ea.wav',
 tensor([ 0.0000e+00,  0.0000e+00,  3.0518e-05,  ...,  2.1362e-04,
         -7.3242e-04, -8.5449e-04]),
 21760,
 'глобус фото',
 tensor([77, 67, 22, 10,  5,  1, 63,  6, 40]),
 9)

# Create Dataloader

In [35]:
# Batch settings shared by every loader built in this cell.
batch_size = 8
num_workers = 0


def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader using the shared batch settings and collate_fn."""
    return DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle,
        num_workers=num_workers, pin_memory=False, collate_fn=collate_fn
    )


# Combined training loader: the only one that shuffles.
dl_train = _make_loader(ds_train, shuffle=True)

# Per-corpus loaders keep a fixed order (shuffle=False).
dl_test_dict = {
    key: _make_loader(corpus_dataset, shuffle=False)
    for key, corpus_dataset in ds_test_dict.items()
}
dl_train_dict = {
    key: _make_loader(corpus_dataset, shuffle=False)
    for key, corpus_dataset in ds_train_dict.items()
}

In [36]:
# Pull a single batch from the shuffled training loader to inspect what
# collate_fn produces (a dict with keys such as 'audio_path' and 'audio').
next(iter(dl_train))

{'audio_path': ('./dataset/LibriSpeech/ruls_data/train/audio/8169/14105/obyknovennayaistoriya_08_goncharov_0136.wav',
  './dataset/golos/train/crowd/4/b880489e1465a3172b757c8d2b45fd31.wav',
  './dataset/golos/train/crowd/1/505ccf5cd724d5ae290ec27e8b5c6f63.wav',
  './dataset/golos/train/farfield/175c04c29b26272231f82570a64aba83.wav',
  './dataset/golos/train/crowd/3/470cc640ad5f32a5f2c3bba137996fef.wav',
  './dataset/golos/train/crowd/7/62a1af671ce1cc629fe217e925f22df1.wav',
  './dataset/golos/train/crowd/0/380dabd87c705a935a9ef14e44587c05.wav',
  './dataset/golos/train/crowd/8/c8dfdd33c23841cd3725983250409f37.wav'),
 'audio': tensor([[-0.0246, -0.0266, -0.0275,  ..., -0.0005, -0.0002,  0.0002],
         [ 0.0005,  0.0005,  0.0004,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0005,  0.0008,  0.0009,  ...,  0.0000,  0.0000,  0.0000]