In [1]:
%load_ext autoreload
%autoreload 2

import os
import argparse
import json
import re
import codecs
import shutil
from IPython.display import Audio
import io
import tempfile
import torch
import torchaudio
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import sentencepiece as sp
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
import fairseq
from fairseq.data.encoders.gpt2_bpe import GPT2BPE, GPT2BPEConfig
from fairseq.tasks import TASK_REGISTRY
from sentencepiece import SentencePieceProcessor
from fairseq.binarizer import VocabularyDatasetBinarizer, FileBinarizer, AlignmentDatasetBinarizer, BinarizeSummary
from fairseq.data import data_utils
from fairseq.data import Dictionary
from fairseq.data import StripTokenDataset, AppendTokenDataset, TruncateDataset, RandomCropDataset, AppendTokenDataset, PrependTokenDataset, ConcatDataset, PadDataset, TokenBlockDataset, \
    MonolingualDataset, LanguagePairDataset, MaskTokensDataset, NumelDataset, ConcatSentencesDataset, NestedDictionaryDataset, RawLabelDataset

from asr_dataset import ASRDataset
plt.style.use("seaborn-v0_8")

## Speech Recognition

In [2]:
# Root of the extracted LibriSpeech archive. The cells below expect it to hold
# the corpus metadata files and a "train-clean-100" sub-directory laid out as
# <reader_id>/<chapter_id>/<reader_id>-<chapter_id>.trans.txt plus .flac clips.
# NOTE(review): hard-coded absolute path; adjust for your machine.
librispeech_path = "/mnt/dl/NLP/LibriSpeech/train-clean-100/LibriSpeech/"

In [3]:
os.listdir(librispeech_path)

['SPEAKERS.TXT',
 'train-clean-100',
 'BOOKS.TXT',
 'CHAPTERS.TXT',
 'README.TXT',
 'LICENSE.TXT']

In [4]:
# Output layout for the flattened training set:
#   <train-dataset>/text   - transcript and index files
#   <train-dataset>/audio  - renumbered .flac clips
librispeech_train_path = "/mnt/dl/NLP/LibriSpeech/train-dataset"
librispeech_train_text_path = os.path.join(librispeech_train_path, "text")
librispeech_train_audio_path = os.path.join(librispeech_train_path, "audio")
for _out_dir in (librispeech_train_path,
                 librispeech_train_audio_path,
                 librispeech_train_text_path):
    # exist_ok keeps the cell idempotent when it is re-run.
    os.makedirs(_out_dir, exist_ok=True)

In [5]:
# Flatten LibriSpeech train-clean-100 into sequentially numbered clips plus
# three line-aligned index files (line N of each file describes clip N):
#   text.txt     - transcript of the utterance
#   audio.txt    - path of the copied, renumbered .flac file
#   metadata.txt - path of the original .flac file inside the corpus
# After the cell runs, `idx` holds the total number of utterances processed.
split_root = os.path.join(librispeech_path, "train-clean-100")
idx = 0
with (open(os.path.join(librispeech_train_text_path, "text.txt"), "w", encoding="utf-8") as train_f,
      open(os.path.join(librispeech_train_text_path, "metadata.txt"), "w", encoding="utf-8") as meta_f,
      open(os.path.join(librispeech_train_text_path, "audio.txt"), "w", encoding="utf-8") as audio_f):
    # Reader directories are numeric ids; visit them in numeric order.
    for reader_id in sorted(os.listdir(split_root), key=int):
        reader_dir = os.path.join(split_root, reader_id)
        # os.listdir() returns entries in arbitrary order; sort the chapters so
        # the idx -> utterance mapping is identical on every run and machine.
        for chap in sorted(os.listdir(reader_dir)):
            chap_dir = os.path.join(reader_dir, chap)
            transcript = os.path.join(chap_dir, f"{reader_id}-{chap}.trans.txt")
            with open(transcript, "r", encoding="utf-8") as tf:
                for line in tf:
                    line = line.strip()
                    if not line:
                        continue
                    # Each transcript line is "<audio_id> <TEXT ...>".
                    audio_id, _, text = line.partition(" ")
                    src_audio = os.path.join(chap_dir, audio_id + ".flac")
                    dst_audio = os.path.join(librispeech_train_audio_path, f"{idx:05}.flac")
                    shutil.copyfile(src_audio, dst_audio)

                    train_f.write(text + "\n")
                    audio_f.write(dst_audio + "\n")
                    meta_f.write(src_audio + "\n")

                    idx += 1

In [6]:
idx

28539

In [7]:
# Directory for everything derived from the SentencePiece model
# (the model file itself, encoded transcripts, dict.txt, tokens.txt).
librispeech_train_pieces_path = os.path.join(librispeech_train_path, "sentencepiece")
os.makedirs(librispeech_train_pieces_path, exist_ok=True)

In [8]:
# Train the SentencePiece model on the flattened transcripts.
bpemode = "unigram"   # SentencePiece model_type
nbpe = 5000           # requested vocabulary size
librispeech_bpe_model_path = os.path.join(librispeech_train_pieces_path, "bpe.m")
# Write the model through a context manager so the file is flushed and closed
# before the next cell loads it; the previous bare handle was never closed.
with open(librispeech_bpe_model_path, "wb") as model_f:
    # Special ids: <pad>=1, </s>=2, <unk>=3; bos_id=-1 means no <s> piece is
    # reserved, so id 0 goes to a regular piece.
    spm = SentencePieceTrainer.train(
        input=os.path.join(librispeech_train_text_path, "text.txt"),
        vocab_size=nbpe,
        model_writer=model_f,
        model_type=bpemode,
        character_coverage=1.,
        input_sentence_size=100000000,
        unk_id=3, eos_id=2, pad_id=1, bos_id=-1,
    )

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /mnt/dl/NLP/LibriSpeech/train-dataset/text/text.txt
  input_format: 
  model_prefix: 
  model_type: UNIGRAM
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 100000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: -1
  eos_id: 2
  pad_id: 1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  

In [9]:
# Load the model file written by the training cell above.
encoder = SentencePieceProcessor(librispeech_bpe_model_path)

In [10]:
encoder.get_piece_size()

5000

In [11]:
# Encode every transcript into SentencePiece pieces.
#   d      - piece -> number of occurrences over the whole corpus
#   labels - one {"text", "piece"} record per transcript line, in file order
# text_pieces.txt receives the space-joined pieces, one line per transcript.
d = dict()
labels = []
# Pieces contain the non-ASCII word-boundary marker "▁", so pin the encoding
# instead of relying on the locale default.
with (open(os.path.join(librispeech_train_text_path, "text.txt"), encoding="utf-8") as rf,
      open(os.path.join(librispeech_train_pieces_path, "text_pieces.txt"), "w", encoding="utf-8") as wf):
    for line in rf:
        line = line.strip()
        line_piece = encoder.EncodeAsPieces(line)
        for p in line_piece:
            d[p] = d.get(p, 0) + 1
        piece_str = " ".join(line_piece)
        wf.write(piece_str + "\n")
        labels.append({"text": line, "piece": piece_str})
      

In [12]:
labels[0]

{'text': 'NORTHANGER ABBEY', 'piece': '▁NORTH ANG ER ▁ABBE Y'}

In [13]:
encoder.piece_to_id("<pad>")

1

In [14]:
encoder.id_to_piece(3)

'<unk>'

In [15]:
# (piece, id-as-string) pairs for SentencePiece ids 0 .. nbpe-1.
vocab = [(encoder.id_to_piece(piece_id), str(piece_id)) for piece_id in range(nbpe)]

In [16]:
vocab[:10]

[('▁THE', '0'),
 ('<pad>', '1'),
 ('</s>', '2'),
 ('<unk>', '3'),
 ('S', '4'),
 ('▁AND', '5'),
 ('▁OF', '6'),
 ('▁TO', '7'),
 ('▁A', '8'),
 ('ED', '9')]

In [17]:
encoder.DecodeIds(4)

'S'

In [18]:
encoder.DecodeIds(3)

' ⁇ '

In [19]:
tokens = [t for t, idx in vocab]
tokens[:10]

['▁THE', '<pad>', '</s>', '<unk>', 'S', '▁AND', '▁OF', '▁TO', '▁A', 'ED']

In [20]:
# (piece, count) pairs ordered from most to least frequent in the corpus.
bpe_dict = sorted(d.items(), key=lambda x: x[1], reverse=True)

In [21]:
bpe_dict[:10]

[('▁THE', 60522),
 ('S', 37967),
 ('▁AND', 33316),
 ('▁OF', 29894),
 ('▁TO', 27992),
 ('▁A', 24032),
 ('ED', 18292),
 ('▁IN', 18013),
 ('▁I', 14580),
 ('▁HE', 13703)]

In [22]:
# dict.txt: "<piece> <count>" per line, most frequent first; it is loaded with
# fairseq's Dictionary.load in the following cells.
# Pieces contain the non-ASCII marker "▁", so write UTF-8 explicitly rather
# than relying on the locale default.
with open(os.path.join(librispeech_train_pieces_path, "dict.txt"), "w", encoding="utf-8") as f:
    for token, count in bpe_dict:
        f.write(f"{token} {count}\n")
# tokens.txt: one piece per line, in SentencePiece id order.
with open(os.path.join(librispeech_train_pieces_path, "tokens.txt"), "w", encoding="utf-8") as f:
    # `_` instead of `idx`, so the global clip counter `idx` is not overwritten.
    for token, _ in vocab:
        f.write(f"{token}\n")


In [23]:
# Load the frequency-sorted piece list into a fairseq Dictionary.
# Its ids are NOT the SentencePiece ids: dictionary.bos() is 0 here, whereas
# SentencePiece id 0 is the piece '▁THE'.
dictionary = Dictionary.load(os.path.join(librispeech_train_pieces_path, "dict.txt"))

In [24]:
dictionary.bos()

0

In [25]:
# Build training data: one manifest record per clip, keyed by the zero-padded
# clip index, of the form
#   {"input":  {"length_ms", "path"},
#    "output": {"text", "token", "tokenid"}}
with open(os.path.join(librispeech_train_text_path, "audio.txt"), encoding="utf-8") as audio_list_f:
    audio_files = [text.strip() for text in audio_list_f]
# audio.txt and the transcripts are line-aligned; fail loudly if they drifted.
assert len(audio_files) == len(labels), "audio.txt and the encoded transcripts are out of sync"

ls_train_data = {}
for i, audio_fname in enumerate(audio_files):
    sample_info = torchaudio.info(audio_fname)
    assert sample_info.num_channels == 1
    label = labels[i]
    # encode_line returns a tensor of dictionary ids; convert it in one call.
    token_ids = dictionary.encode_line(label["piece"], append_eos=False).tolist()
    # Named sample_input / sample_output so the builtin `input` is not shadowed.
    sample_input = {
        # Integer arithmetic avoids losing a millisecond to float rounding,
        # which the previous `/ 0.001` division could do.
        "length_ms": sample_info.num_frames * 1000 // sample_info.sample_rate,
        "path": audio_fname,
    }
    sample_output = {
        "text": label["text"],
        "token": label["piece"],
        "tokenid": ", ".join(map(str, token_ids)),
    }
    ls_train_data[f"{i:05}"] = {"input": sample_input, "output": sample_output}

In [26]:
librispeech_train_path

'/mnt/dl/NLP/LibriSpeech/train-dataset'

In [27]:
# Directory that holds the JSON manifest (train.json) written below.
librispeech_train_data_path = os.path.join(librispeech_train_path, "data")
os.makedirs(librispeech_train_data_path, exist_ok=True)

In [28]:
# Write the manifest through a context manager so the file is flushed and
# closed before later cells read it back; the previous bare handle was never closed.
with open(os.path.join(librispeech_train_data_path, "train.json"), "w", encoding="utf-8") as train_json_f:
    json.dump(ls_train_data, train_json_f, indent=4)

In [42]:
# Speech-recognition task: load a second Dictionary instance from the same
# dict.txt, so the symbol added in the next cell does not modify `dictionary`.
asr_dict = Dictionary.load(os.path.join(librispeech_train_pieces_path, "dict.txt"))

In [43]:
# Register the CTC blank symbol in the ASR dictionary and keep what add_symbol
# returns (presumably the new symbol's index; confirm against fairseq).
# NOTE(review): "blanc" looks like a typo for "blank"; the name is kept because
# later cells may reference it.
ctc_blanc = asr_dict.add_symbol("<ctc_blank>")

In [31]:
# Read the manifest back; the context manager closes the handle, which the
# previous inline codecs.open() call never did.
with open(os.path.join(librispeech_train_data_path, "train.json"), encoding="utf-8") as train_json_f:
    data_samples = json.load(train_json_f)

In [32]:
data_samples['00000']

{'input': {'length_ms': 1965,
  'path': '/mnt/dl/NLP/LibriSpeech/train-dataset/audio/00000.flac'},
 'output': {'text': 'NORTHANGER ABBEY',
  'token': '▁NORTH ANG ER ▁ABBE Y',
  'tokenid': '827, 1460, 54, 3749, 39'}}

In [33]:
# (sample_id, record) pairs ordered from the longest clip to the shortest.
sorted_samples = list(data_samples.items())
sorted_samples.sort(key=lambda entry: entry[1]["input"]["length_ms"], reverse=True)

In [34]:
sorted_samples[0]

('25725',
 {'input': {'length_ms': 24524,
   'path': '/mnt/dl/NLP/LibriSpeech/train-dataset/audio/25725.flac'},
  'output': {'text': 'THERE WAS OF COURSE NO LEGALITY IN THE ACT AND KARL THE GREAT WAS IN NO REAL SENSE THE SUCCESSOR OF HONORIUS AND ROMULUS AUGUSTULUS BUT HE RULED A GROUP OF KINGDOMS WHICH EMBRACED THE LARGER HALF OF THE OLD WESTERN EMPIRE AND FORMED A FAIR EQUIPOISE TO THE REALM NOW RULED BY IRENE FROM EIGHT HUNDRED THEN ONWARD WE HAVE ONCE MORE A WEST ROMAN EMPIRE IN EXISTENCE AS WELL AS THE EAST ROMAN',
   'token': '▁THERE ▁WAS ▁OF ▁COURSE ▁NO ▁LEG AL ITY ▁IN ▁THE ▁ACT ▁AND ▁K AR L ▁THE ▁GREAT ▁WAS ▁IN ▁NO ▁REAL ▁SENSE ▁THE ▁SUCCESS OR ▁OF ▁HONOR IUS ▁AND ▁RO M UL US ▁AUGUST UL US ▁BUT ▁HE ▁RULE D ▁A ▁GROUP ▁OF ▁KINGDOM S ▁WHICH ▁EMBRAC ED ▁THE ▁LARGE R ▁HALF ▁OF ▁THE ▁OLD ▁WESTERN ▁EMPIRE ▁AND ▁FORMED ▁A ▁FAIR ▁E QUI P O ISE ▁TO ▁THE ▁REAL M ▁NOW ▁RULE D ▁BY ▁I RE NE ▁FROM ▁EIGHT ▁HUNDRED ▁THEN ▁ON WARD ▁WE ▁HAVE ▁ONCE ▁MORE ▁A ▁WEST ▁ROMAN ▁EMPIRE ▁IN ▁EXISTENCE ▁AS 

In [35]:
sorted_samples[1]

('19601',
 {'input': {'length_ms': 19985,
   'path': '/mnt/dl/NLP/LibriSpeech/train-dataset/audio/19601.flac'},
  'output': {'text': 'IF WE TAKE IT IN THE WIDEST MEANING THIS WOULD EVIDENTLY INCLUDE EVERY POSSIBLE MEDICAL TASK FROM FILLING A PAINFUL TOOTH TO OPERATING ON A PAINFUL APPENDIX AS IN EVERY CASE WHERE PAIN RESULTS THE MENTAL EQUILIBRIUM IS DISTURBED BY IT AND THE NORMAL MENTAL LIFE OF THE PATIENT REDUCED IN ITS EFFICIENCY',
   'token': '▁IF ▁WE ▁TAKE ▁IT ▁IN ▁THE ▁WIDE ST ▁MEANING ▁THIS ▁WOULD ▁EVIDENTLY ▁INCLUD E ▁EVERY ▁POSSIBLE ▁ME D ICAL ▁TASK ▁FROM ▁FILL ING ▁A ▁PAINFUL ▁TOO TH ▁TO ▁O PER ATING ▁ON ▁A ▁PAINFUL ▁A PP EN DI X ▁AS ▁IN ▁EVERY ▁CASE ▁WHERE ▁PAIN ▁RESULT S ▁THE ▁MENTAL ▁E QUI L I BR IUM ▁IS ▁DISTURB ED ▁BY ▁IT ▁AND ▁THE ▁NORMAL ▁MENTAL ▁LIFE ▁OF ▁THE ▁PATIENT ▁REDUC ED ▁IN ▁ITS ▁E F FICIENCY',
   'tokenid': '66, 52, 213, 17, 11, 4, 894, 102, 1633, 45, 62, 1422, 2199, 35, 196, 653, 48, 32, 518, 1747, 49, 1826, 16, 9, 2797, 183, 202, 8, 179, 674, 944, 36, 9, 27

In [36]:
len(sorted_samples)

28539

In [44]:
# Build the dataset from the JSON manifest and the CTC-extended dictionary.
# ASRDataset is project-local (asr_dataset module). Judging by the outputs
# below, each item is a dict with 'id', 'audio' and 'text', where 'audio' is a
# (frames, 80) float tensor and 'text' a list of token ids.
# NOTE(review): item 0 appears to be the longest clip, so the dataset seems to
# order samples by duration; confirm in asr_dataset.py.
asr_dataset = ASRDataset(fname=os.path.join(librispeech_train_data_path, "train.json"), asr_dict=asr_dict)

In [53]:
asr_dataset[0]

{'id': 0,
 'audio': tensor([[-1.0151, -1.0751, -1.1164,  ..., -0.5994, -0.6832, -0.7371],
         [-1.0151, -1.0751, -1.1164,  ..., -0.5664, -0.6700, -0.6668],
         [-1.0151, -1.0751, -1.1164,  ..., -0.4569, -0.6798, -0.6443],
         ...,
         [-1.0151, -1.0751, -1.1164,  ..., -0.6817, -0.7759, -0.6673],
         [-1.0151, -1.0751, -1.1164,  ..., -0.7449, -0.7478, -0.7143],
         [-1.0151, -1.0751, -1.1164,  ..., -0.8040, -0.7029, -0.6784]]),
 'text': [55,
  14,
  7,
  351,
  56,
  1086,
  95,
  170,
  11,
  4,
  638,
  6,
  460,
  177,
  98,
  4,
  129,
  14,
  11,
  56,
  752,
  731,
  4,
  1295,
  109,
  7,
  1336,
  1770,
  6,
  778,
  70,
  287,
  99,
  2054,
  287,
  99,
  29,
  13,
  1205,
  32,
  9,
  1144,
  7,
  2024,
  5,
  47,
  2518,
  10,
  4,
  410,
  61,
  339,
  7,
  4,
  125,
  2882,
  3132,
  6,
  1748,
  9,
  627,
  225,
  723,
  78,
  85,
  1010,
  8,
  4,
  752,
  70,
  87,
  1205,
  32,
  40,
  12,
  106,
  333,
  49,
  833,
  328,
  74,
  36,
  618

In [54]:
asr_dataset[0]["audio"].size()

torch.Size([2451, 80])

In [55]:
asr_dataset[10]["audio"].size()

torch.Size([1722, 80])