In [249]:
#!g1.1
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [250]:
#!g1.1
from IPython.display import display, HTML, Video
display(HTML("<style>.container { width:90% !important; }</style>"))

In [251]:
#!g1.1
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader

import sentencepiece

In [252]:
#!g1.1
import os
import glob
import json
import regex

import tqdm.notebook as tqdm

import numpy as np
import pandas as pd

from ipywidgets import GridBox, Audio, HBox, VBox, Box, Label, Layout

import matplotlib.pyplot as plt
# import matplotlib_inline

%matplotlib inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [253]:
#!g1.1
base_path = '/home/jupyter/mnt/datasets'

libri_speech_base_path = os.path.join(base_path, 'LibriSpeech_ds')
golos_base_path = os.path.join(base_path, 'golos')

device = torch.device("cuda", 0)

In [254]:
#!g1.1
from src.dataset import get_libri_speech_dataset, get_golos_dataset

libri_speech_dev = get_libri_speech_dataset(libri_speech_base_path, split='dev')
libri_speech_train = get_libri_speech_dataset(libri_speech_base_path, split='train')
libri_speech_test = get_libri_speech_dataset(libri_speech_base_path, split='test')

print('Loaded {0:d} objects'.format(len(libri_speech_dev['audio_path'])))
print('Loaded {0:d} objects'.format(len(libri_speech_train['audio_path'])))
print('Loaded {0:d} objects'.format(len(libri_speech_test['audio_path'])))

# Load tokenizer model
sp_tokenizer = sentencepiece.SentencePieceProcessor(model_file='tokenizer.model')

Loaded 1400 objects
Loaded 54472 objects
Loaded 1352 objects


In [255]:
#!g1.1
from src.dataset import AudioDataset, collate_fn

libri_speech_dev_ds = AudioDataset(libri_speech_dev, sp_tokenizer, min_duration=1.36, max_duration=10.96)
libri_speech_train_ds = AudioDataset(libri_speech_train, sp_tokenizer, min_duration=1.36, max_duration=10.96)
libri_speech_test_ds = AudioDataset(libri_speech_test, sp_tokenizer, min_duration=1.36, max_duration=10.96)

batch_size = 20
num_workers = 0

libri_speech_dev_dl = DataLoader(
    libri_speech_dev_ds, batch_size=batch_size, shuffle=False,
    num_workers=num_workers, pin_memory=False, collate_fn=collate_fn
)

libri_speech_train_dl = DataLoader(
    libri_speech_train_ds, batch_size=batch_size, shuffle=False,
    num_workers=num_workers, pin_memory=False, collate_fn=collate_fn
)

libri_speech_test_dl = DataLoader(
    libri_speech_test_ds, batch_size=batch_size, shuffle=False,
    num_workers=num_workers, pin_memory=False, collate_fn=collate_fn
)

train_dataloaders = {
    'libri_speech/train': libri_speech_train_dl, 
#     'golos/train': golos_train
}

validate_dataloaders = {
#     'golos/test/crowd': golos_test_crowd,
#     'golos/test/farfield': golos_test_farfield,
    'libri_speech/dev': libri_speech_dev_dl,
    'libri_speech/test': libri_speech_test_dl,
}



In [241]:
#!g1.1
batch = next(libri_speech_dev_dl)
batch["audio"] = batch["audio"].to(device)
batch["audio_len"] = batch["audio_len"].to(device)
batch["audio"].device

TypeError: 'DataLoader' object is not an iterator

In [246]:
#!g1.1
from src.conformer import Conformer

conformer = Conformer()
conformer.to(device);

In [247]:
#!g1.1
weights = torch.load("conformer.pt")
conformer.load_state_dict(weights)

<All keys matched successfully>

In [248]:
#!g1.1
conformer.eval()
log_pb, enc_len, gp = conformer(batch["audio"], batch["audio_len"])
print(log_pb)
print(enc_len)
print(gp)

NameError: name 'conformer' is not defined

In [172]:
#!g1.1
sp_tokenizer.decode(gp[1][gp[1] != 128].tolist())

'на ггде  же перрвый ззваный гость'

In [173]:
#!g1.1
from src.metrics import WERMetric

metric = WERMetric(128, sp_tokenizer)

reference = batch["text"]

metric.update(log_pb, enc_len, reference)

wer, words, scores = metric.compute()
print(wer, words, scores, sep="\n")
print(hypothesis, reference, sep="\n")

0.5777777777777777
45
26
['дай бог чтоб просвветились мы', 'на где же первый званый гость', 'вот что хочет ссяне зоинька', 'м только лень и непоковство', 'теснимма шведов фрайд заратьюг', 'усоты куда свой танный путь', 'давновней и скорасгоралась', 'вн неемрачный дух неснал покоя']
['дай бог чтоб просветились мы', 'но где же первый званый гость', 'вот что хочется мне зоинька', 'в нем только лень и непокорство', 'тесним мы шведов рать за ратью', 'куда свой тайный путь направил', 'давно в ней искра разгоралась', 'в нем мрачный дух не знал покоя']


In [175]:
#!g1.1
a = conformer.loss(torch.transpose(log_pb, 0, 1), batch["tokens"], enc_len, batch["tokens_len"])

In [None]:
#!g1.1

In [184]:
#!g1.1
from src.train import evaluate

wer_res_dev, ctc_res_dev = evaluate(conformer, sp_tokenizer, libri_speech_dev_dl, device)
wer_res_train, ctc_res_train = evaluate(conformer, sp_tokenizer, libri_speech_train_dl, device)
wer_res_test, ctc_res_test = evaluate(conformer, sp_tokenizer, libri_speech_test_dl, device)

100%|██████████| 147/147 [00:10<00:00, 14.48it/s]
100%|██████████| 6153/6153 [14:45<00:00,  6.95it/s]
100%|██████████| 148/148 [00:22<00:00,  6.49it/s]


In [185]:
#!g1.1
print(wer_res_dev, wer_res_train, wer_res_test, sep="\n")

(0.5690275229357799, 13625, 7753)
(0.48950873808021966, 539020, 263855)
(0.573453975491593, 14036, 8049)


In [None]:
#!g1.1
from src.train import train
from src.scheduler import NoamAnnealing

optimizer = torch.optim.AdamW(conformer.parameters(), lr=2, weight_decay=1e-3)
scheduler = NoamAnnealing(optimizer, d_model=conformer.d_model, warmup_steps=600)

train(conformer, sp_tokenizer, None, optimizer, scheduler, 10, libri_speech_dev_dl, validate_dataloaders, device, model_dir="model_train")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=59.0), HTML(value='')))