In [2]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False
# Load the autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from IPython.display import display, HTML, Video
# Inject a CSS rule that widens the notebook container to 90% of the page.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

# Load data

In [4]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader

import sentencepiece

In [5]:
import os
import glob
import json
import regex

import tqdm.notebook as tqdm

import numpy as np
import pandas as pd

from ipywidgets import GridBox, Audio, HBox, VBox, Box, Label, Layout

import matplotlib.pyplot as plt
import matplotlib_inline

%matplotlib inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [6]:
# Root directory that holds every dataset. It can be overridden with the
# DATASETS_BASE_PATH environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original location is used.
base_path = os.environ.get('DATASETS_BASE_PATH', '/home/jupyter/mnt/datasets')

# Per-corpus directories located under the dataset root.
libri_speech_base_path = os.path.join(base_path, 'LibriSpeech_ds')
golos_base_path = os.path.join(base_path, 'golos')

In [7]:
from src.dataset import get_libri_speech_dataset, get_golos_dataset

In [8]:
def sample_dataset(dataset, n=4):
    """Build a two-column widget preview of the first ``n`` items of ``dataset``.

    Every item is rendered as a label ``"<idx>, <text>, <duration>"`` (duration with
    one decimal place) stacked above an audio player for the item's file.

    Args:
        dataset: Object indexable as ``dataset[column][idx]`` that provides the
            columns ``'text'``, ``'duration'`` and ``'audio_path'``.
        n: Maximum number of items to show. It is clamped to the number of
            available items, so a dataset shorter than ``n`` no longer raises an
            indexing error.

    Returns:
        An ``HBox`` holding two ``VBox`` columns: items 0, 2, 4, ... on the left and
        items 1, 3, 5, ... on the right.
    """
    # Never request more items than the dataset actually contains.
    n_items = min(n, len(dataset['audio_path']))

    cards = [
        VBox([
            Label('{0:d}, {1}, {2:.1f}'.format(idx, dataset['text'][idx], dataset['duration'][idx])),
            Audio.from_file(dataset['audio_path'][idx], autoplay=False, loop=False),
        ])
        for idx in range(n_items)
    ]

    # Alternate the cards between the left and the right column.
    return HBox([VBox(cards[0::2]), VBox(cards[1::2])])

In [63]:
! git push https://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@github.com/nakhodnov17/conformer.git

To https://github.com/nakhodnov17/conformer.git
   153201b..1c5b78a  Nazarov -> Nazarov


In [9]:
# Load the three LibriSpeech splits (in the order dev, test, train) from the corpus directory.
libri_speech_dev, libri_speech_test, libri_speech_train = (
    get_libri_speech_dataset(libri_speech_base_path, split=split_name)
    for split_name in ('dev', 'test', 'train')
)

# Report how many items each split contains, in the same order as above.
for loaded_split in (libri_speech_dev, libri_speech_test, libri_speech_train):
    print('Loaded {0:d} objects'.format(len(loaded_split['audio_path'])))

Loaded 1400 objects
Loaded 1352 objects
Loaded 54472 objects


In [10]:
# Preview the first items of the dev split (default n=4): label plus audio player each.
sample_dataset(libri_speech_dev)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, воистину еврейки молодой мне дорого душевное спас…

In [11]:
# Preview the first items of the test split (default n=4): label plus audio player each.
sample_dataset(libri_speech_test)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, для вас души моей царицы красавицы для вас одних …

In [12]:
# Preview the first items of the train split (default n=4): label plus audio player each.
sample_dataset(libri_speech_train)

HBox(children=(VBox(children=(VBox(children=(Label(value='0, он сделал это так неловко что задел образок моего…

In [13]:
# golos_train = get_golos_dataset(golos_base_path, split='train')
# golos_test_crowd = get_golos_dataset(golos_base_path, split='test/crowd')
# golos_test_farfield = get_golos_dataset(golos_base_path, split='test/farfield')

# print('Loaded {0:d} objects'.format(len(golos_train['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_crowd['audio_path'])))
# print('Loaded {0:d} objects'.format(len(golos_test_farfield['audio_path'])))

In [14]:
# sample_dataset(golos_train)

In [15]:
# sample_dataset(golos_test_crowd)

In [16]:
# sample_dataset(golos_test_farfield)

In [17]:
# Corpora used for training, keyed by '<corpus>/<split>'.
train_datasets = dict([
    ('libri_speech/train', libri_speech_train),
    # ('golos/train', golos_train),
])

# Corpora used for evaluation, keyed the same way.
test_datasets = dict([
    # ('golos/test/crowd', golos_test_crowd),
    # ('golos/test/farfield', golos_test_farfield),
    ('libri_speech/dev', libri_speech_dev),
    ('libri_speech/test', libri_speech_test),
])

# Create tokenizer

In [18]:
# # Save text from all train datasets to file
# ### YOUR CODE HERE
# with open('texts.txt', 'w', encoding='utf-8') as f:
#     for i, text in libri_speech_train.iterrows():
#         f.write(text['text'] + '\n')

In [19]:
# # Train sentencepiece tokenizer
# sentencepiece.SentencePieceTrainer.train(
#     input='texts.txt', model_prefix='tokenizer', vocab_size=128, model_type='unigram',
#     bos_id=-1, eos_id=-1, character_coverage=1.0
# )

In [20]:
# Load the SentencePiece model from 'tokenizer.model' (a path relative to the current
# working directory). Presumably this is the file written by the commented-out training
# cell, which uses model_prefix='tokenizer' — confirm the file exists before running.
sp_tokenizer = sentencepiece.SentencePieceProcessor(model_file='tokenizer.model')

In [21]:
# Sanity check: encode a one-sentence batch into token ids (one id list per input string).
sp_tokenizer.encode_as_ids(['привет, как тебя зовут'])

[[89, 7, 62, 0, 87, 1, 47, 25, 11, 1, 17, 15, 7, 12, 9]]

In [22]:
# Same sentence as subword pieces, to inspect how the tokenizer segments the text.
sp_tokenizer.encode_as_pieces(['привет, как тебя зовут'])

[['▁при',
  'в',
  'ет',
  ',',
  '▁как',
  '▁',
  'те',
  'б',
  'я',
  '▁',
  'з',
  'о',
  'в',
  'у',
  'т']]

# Create Dataset

In [23]:
from src.dataset import AudioDataset, collate_fn

In [24]:
# Duration window passed to every AudioDataset below (presumably seconds — TODO confirm
# against AudioDataset). Defined once so the train and evaluation filters cannot drift apart.
duration_limits = dict(min_duration=1.36, max_duration=10.96)

# One training set built from all train corpora. `ignore_index=True` gives the combined
# frame a fresh 0..N-1 index, so rows coming from different corpora never share a label.
ds_train = AudioDataset(
    pd.concat(train_datasets.values(), ignore_index=True), sp_tokenizer, **duration_limits
)

# Per-corpus datasets, keyed exactly like `test_datasets` / `train_datasets`.
ds_test_dict = {
    name: AudioDataset(dataset, sp_tokenizer, **duration_limits)
    for name, dataset in test_datasets.items()
}
ds_train_dict = {
    name: AudioDataset(dataset, sp_tokenizer, **duration_limits)
    for name, dataset in train_datasets.items()
}

In [25]:
# Smoke test: fetch the first training example.
# NOTE(review): the saved output of this cell is "IndexError: index out of bounds";
# the cause is inside AudioDataset (not visible here) — investigate before training.
ds_train[0]

IndexError: index out of bounds

# Create DataLoader

In [None]:
batch_size = 8
num_workers = 0

# Keyword arguments common to every loader created in this cell.
shared_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=False,
    collate_fn=collate_fn,
)

# Training loader over the combined training set; the only loader that shuffles.
dl_train = DataLoader(ds_train, shuffle=True, **shared_loader_kwargs)

# Unshuffled per-corpus loaders, keyed like the dataset dictionaries they wrap.
dl_test_dict = {
    corpus_name: DataLoader(corpus_ds, shuffle=False, **shared_loader_kwargs)
    for corpus_name, corpus_ds in ds_test_dict.items()
}
dl_train_dict = {
    corpus_name: DataLoader(corpus_ds, shuffle=False, **shared_loader_kwargs)
    for corpus_name, corpus_ds in ds_train_dict.items()
}

In [None]:
# Pull a single batch from the training loader to check loading and collation end to end.
next(iter(dl_train))