In [1]:
import re
import json
import os
import shutil

import numpy as np

import pandas as pd
pd.options.display.max_columns = 100

import IPython.display as ipd

import datasets as hfd

from transformers import (
    Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, 
)

import torch
print(torch.cuda.is_available())

True


In [2]:
# Root of the local data store; must be supplied through the DATA_HOME environment variable.
DATA_ROOT_DP = os.environ['DATA_HOME']
# Dataset directory that is later passed to `hfd.load_from_disk`.
CV_PROCESSED_DP = DATA_ROOT_DP + '/datasets/cv-corpus-8.0-2022-01-19__be__processed'

In [3]:
# Directory (relative to the notebook's working directory) for generated artifacts.
artifacts_dp = 'artifacts'
# exist_ok=True keeps this cell re-runnable when the directory is already present.
os.makedirs(artifacts_dp, exist_ok=True)

In [4]:
# Load the dataset stored at CV_PROCESSED_DP; the bare name on the last line displays its summary.
ds = hfd.load_from_disk(CV_PROCESSED_DP)
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 314305
    })
    dev: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 15803
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 15801
    })
})

### preprocess text

In [6]:
# Character substitutions used by `preprocess_text`: Latin "n" -> Cyrillic "н", "ґ" -> "г".
# `str.maketrans` converts the single-character keys to code points, as `str.translate` expects.
# NOTE(review): the vocabulary table shown further down contains both Latin "i" (ord 105)
# and Cyrillic "і" (ord 1110); check whether Latin "i" should be mapped here as well.
char_map = str.maketrans({'n': 'н', 'ґ': 'г'})
char_map

{110: 'н', 1169: 'г'}

In [7]:
def preprocess_text(example, char_map):
    """Normalize the transcript stored under ``example['sentence']``.

    Steps, in order: lowercase; drop every character that is not a word
    character, whitespace or an apostrophe; collapse whitespace runs into a
    single space; apply ``char_map``; trim surrounding whitespace.

    Args:
        example: mapping with a string under the ``'sentence'`` key. It is
            updated in place.
        char_map: translation table accepted by ``str.translate`` (for
            example the result of ``str.maketrans``).

    Returns:
        The same ``example`` object, with the normalized ``'sentence'``.
    """
    text = example['sentence'].lower()
    # remove non-word chars (the apostrophe is deliberately kept)
    text = re.sub(r"[^\w\s']", '', text)
    # sub multiple sequential space chars with single space
    text = re.sub(r"\s+", ' ', text)
    text = text.translate(char_map)
    # Dropping punctuation can leave a space at either end ("hello !" -> "hello ");
    # trim it so the transcript does not start or end with a separator.
    example['sentence'] = text.strip()
    return example

In [44]:
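# NOTE(review): left disabled, so `preprocess_text` is not applied in this run; confirm that
# the dataset stored at CV_PROCESSED_DP is already normalized before relying on it.
# ds = ds.map(preprocess_text, fn_kwargs=dict(char_map=char_map))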

### create vocab.json

In [9]:
# def extract_all_chars(batch):
#     s = [set(x) for x in batch['sentence']]
#     s_all = set().union(*s)
#     return {'vocab': [list(s_all)]}

In [10]:
# vocab = ds.map(
#     extract_all_chars, keep_in_memory=True, 
#     batched=True, batch_size=32, remove_columns=ds.column_names["train"]
# )
# vocab = {split: set().union(*vocab[split]['vocab']) for split in vocab}
# vocab = set().union(vocab['train'], vocab['dev'], vocab['test'])
# vocab = sorted(vocab)

  0%|          | 0/9823 [00:00<?, ?ba/s]

  0%|          | 0/494 [00:00<?, ?ba/s]

  0%|          | 0/494 [00:00<?, ?ba/s]

In [21]:
# x = pd.DataFrame({'char': vocab})
# x['ord'] = x['char'].apply(ord)
# x.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
char,,',i,а,б,в,г,д,е,ж,з,й,к,л,м,н,о,п,р,с,т,у,ф,х,ц,ч,ш,ы,ь,э,ю,я,ё,і,ў
ord,32.0,39,105,1072,1073,1074,1075,1076,1077,1078,1079,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1099,1100,1101,1102,1103,1105,1110,1118


In [None]:
# vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}

# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]

# vocab_dict["[UNK]"] = len(vocab_dict)
# vocab_dict["[PAD]"] = len(vocab_dict)

# print(f'vocab len: {len(vocab_dict)}')

In [51]:
# with open(os.path.join(artifacts_dp, 'vocab.json'), 'w') as fout:
#     json.dump(vocab_dict, fout, indent=2)

In [52]:
# `vocab_dict` is only built in a commented-out cell, so printing it directly fails on a
# fresh kernel; read the saved vocabulary file back from the artifacts directory instead.
with open(os.path.join(artifacts_dp, 'vocab.json'), encoding='utf-8') as fin:
    vocab_dict = json.load(fin)
print(vocab_dict)

{"'": 1, 'i': 2, 'а': 3, 'б': 4, 'в': 5, 'г': 6, 'д': 7, 'е': 8, 'ж': 9, 'з': 10, 'й': 11, 'к': 12, 'л': 13, 'м': 14, 'н': 15, 'о': 16, 'п': 17, 'р': 18, 'с': 19, 'т': 20, 'у': 21, 'ф': 22, 'х': 23, 'ц': 24, 'ч': 25, 'ш': 26, 'ы': 27, 'ь': 28, 'э': 29, 'ю': 30, 'я': 31, 'ё': 32, 'і': 33, 'ў': 34, '|': 0, '[UNK]': 35, '[PAD]': 36}


In [55]:
# with open(os.path.join(artifacts_dp, 'vocab.json')) as fin:
#     vocab = json.load(fin)

### processor

In [58]:
# tokenizer = Wav2Vec2CTCTokenizer(
#     os.path.join(artifacts_dp, 'vocab.json'),
#     unk_token="[UNK]", 
#     pad_token="[PAD]", 
#     word_delimiter_token="|"
# )

# feature_extractor = Wav2Vec2FeatureExtractor(
#     feature_size=1, sampling_rate=16_000, padding_value=0.0, 
#     do_normalize=True, return_attention_mask=False
# )

# processor = Wav2Vec2Processor(
#     feature_extractor=feature_extractor, 
#     tokenizer=tokenizer
# )

In [66]:
# processor.save_pretrained(os.path.join(artifacts_dp, 'processor'))

In [9]:
# Load the processor from <artifacts_dp>/processor (the location used by the
# commented-out `save_pretrained` cell).
processor = Wav2Vec2Processor.from_pretrained(os.path.join(artifacts_dp, 'processor'))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Token -> id mapping of the loaded tokenizer; `vocab` is reused by the debug cells.
vocab = processor.tokenizer.get_vocab()
# One-row table (tokens as columns, ids as values) for a compact display.
pd.Series(vocab).to_frame().T

Unnamed: 0,',i,а,б,в,г,д,е,ж,з,й,к,л,м,н,о,п,р,с,т,у,ф,х,ц,ч,ш,ы,ь,э,ю,я,ё,і,ў,|,[UNK],[PAD],<s>,</s>
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,0,35,36,37,38


### Input values

In [11]:
def prepare_dataset(batch):
    """Attach model inputs and label ids to one example.

    Reads ``batch['audio']`` (its ``'array'`` and ``'sampling_rate'`` entries)
    and ``batch['sentence']``, then stores the results under
    ``batch['input_values']`` and ``batch['labels']``. Relies on the
    notebook-level ``processor``.

    Returns:
        The same ``batch`` object with the two new keys set.
    """
    clip = batch['audio']
    features = processor(clip['array'], sampling_rate=clip['sampling_rate'])
    # A single clip is passed in, so keep the first element of 'input_values'.
    batch['input_values'] = features['input_values'][0]

    # Within this context the processor call handles the text (target) side.
    with processor.as_target_processor():
        encoded = processor(batch['sentence'])
    batch['labels'] = encoded['input_ids']

    return batch

In [12]:
# Run `prepare_dataset` over every split with 8 worker processes; the original columns
# (as listed for the train split) are dropped, leaving only the newly added fields.
ds_p = ds.map(prepare_dataset, remove_columns=ds.column_names['train'], num_proc=8)
ds_p

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 314305
    })
    dev: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 15803
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 15801
    })
})

### save to disk

In [14]:
# Output directory for the dataset produced by `prepare_dataset` (input values + labels).
CV_PROCESSED_2_DP = DATA_ROOT_DP + '/datasets/cv-corpus-8.0-2022-01-19__be__processed__2'

In [15]:
# Persist the prepared splits; the path is passed as-is (a one-argument os.path.join adds nothing).
ds_p.save_to_disk(CV_PROCESSED_2_DP)

### reread

In [23]:
# Reload the prepared dataset from CV_PROCESSED_2_DP, replacing the in-memory `ds_p`.
ds_p = hfd.load_from_disk(CV_PROCESSED_2_DP)
ds_p

### debug

In [36]:
# Reverse lookup (id -> token) built from the tokenizer vocabulary in `vocab`.
vocab_dict_inverted = {token_id: token for token, token_id in vocab.items()}

In [41]:
ix = 2054

# Look the prepared row up once instead of indexing `ds_p['train'][ix]` three times.
row = ds_p['train'][ix]
label_ids = row['labels']

# Summary statistics of the model input for this example.
inputs = np.asarray(row['input_values'])
print(inputs.max(), inputs.mean(), inputs.std(), len(inputs))

# Transcript as stored in the text dataset.
print(ds['train'][ix]['sentence'])

# Labels decoded back to text by the processor.
print(processor.decode(label_ids))

# Raw token view: every label id mapped back to its vocabulary token.
y = [vocab_dict_inverted.get(label_id) for label_id in label_ids]
print(''.join(y))

8.660139083862305 1.934119883273166e-09 0.9999900884003534 81216
што гэта свет клінам сышоўся
што гэта свет клінам сышоўся
што|гэта|свет|клінам|сышоўся


In [43]:
ix = 51432

# Look the prepared row up once instead of indexing `ds_p['train'][ix]` three times.
row = ds_p['train'][ix]
label_ids = row['labels']

# Summary statistics of the model input for this example.
inputs = np.asarray(row['input_values'])
print(inputs.max(), inputs.mean(), inputs.std(), len(inputs))

# Transcript as stored in the text dataset.
print(ds['train'][ix]['sentence'])

# Labels decoded back to text by the processor.
print(processor.decode(label_ids))

# Raw token view: every label id mapped back to its vocabulary token.
y = [vocab_dict_inverted.get(label_id) for label_id in label_ids]
print(''.join(y))

5.936769008636475 5.3624984834760684e-09 0.9999971653160846 60480
ужо праз год будаўніцтва было скончана
ужо праз год будаўніцтва было скончана
ужо|праз|год|будаўніцтва|было|скончана
