# Make token vocabulary
Train a WordPiece tokenizer on the transcriptions of the Tira ASR dataset.

In [91]:
import os

import numpy as np
import pandas as pd
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import WhisperTokenizer

In [74]:
# Location of the Tira ASR metadata CSV.
# The TIRA_ASR_METADATA environment variable overrides the path so the
# notebook can run on other machines; when it is unset the original
# location is used unchanged.
metadata = os.environ.get(
    'TIRA_ASR_METADATA',
    '/Users/markjos/projects/malachor5/data/tira-asr/metadata.csv',
)

In [75]:
# Token emitted for anything the trained vocabulary cannot represent.
unk = "[UNK]"

# WordPiece trainer capped at 100 vocabulary entries; the unknown token is
# registered as a special token so it is part of the vocabulary.
trainer = trainers.WordPieceTrainer(
    vocab_size=100,
    special_tokens=[unk],
)

In [76]:
# Load the dataset metadata table from the CSV at `metadata`.
df = pd.read_csv(metadata)


def get_text_corpus():
    """Yield every value of the `transcription` column of `df`, in order.

    `df` is read from the notebook's global scope each time the generator
    is iterated.
    """
    yield from df['transcription']

In [77]:
# Build an untrained WordPiece tokenizer that falls back to `unk`, then fit
# it on the transcriptions streamed by get_text_corpus().
# NOTE(review): no pre-tokenizer is configured here, and the recorded outputs
# contain tokens that include spaces (e.g. '## ', '##à ') -- confirm that
# treating whitespace as ordinary token content is intended.
tokenizer = Tokenizer(
    models.WordPiece(unk_token=unk)
)
tokenizer.train_from_iterator(
    get_text_corpus(),
    trainer=trainer,
)






In [78]:
# Spot-check the trained tokenizer on a single transcription (row label 5000;
# presumably an arbitrary sample -- TODO confirm).
line = df.loc[5000, 'transcription']
encoding = tokenizer.encode(line)

# Show the raw text next to its token strings and token ids.
(line, encoding.tokens, encoding.ids)

('ágánɔ̂nà àɽàŋgár nd̪ɔ̀bà',
 ['a',
  '##́',
  '##g',
  '##á',
  '##n',
  '##ɔ',
  '##̂',
  '##n',
  '##à ',
  '##à',
  '##ɽ',
  '##à',
  '##ŋ',
  '##g',
  '##á',
  '##r',
  '## ',
  '##nd',
  '##̪',
  '##ɔ̀',
  '##b',
  '##à'],
 [2,
  44,
  69,
  89,
  46,
  49,
  76,
  46,
  94,
  85,
  50,
  85,
  56,
  69,
  89,
  45,
  54,
  99,
  53,
  88,
  57,
  85])

In [79]:
# Display the learned token -> id mapping (added/special tokens included,
# which is the library default made explicit).
tokenizer.get_vocab(with_added_tokens=True)

{'##ð': 51,
 '##l': 59,
 '##̂': 76,
 '##d': 48,
 'g': 8,
 'n': 15,
 '##k': 66,
 '##b': 57,
 '##o': 65,
 '##v': 68,
 '##ɜ': 78,
 'h': 9,
 '##ʃ': 79,
 't': 20,
 '##r': 45,
 '##f': 84,
 '##ɛ̀': 87,
 'v': 22,
 'ɾ': 34,
 'c': 4,
 '##j': 61,
 '##h': 83,
 'u': 21,
 '##á': 89,
 '##́ ': 86,
 'j': 11,
 'o': 16,
 '##ì': 90,
 '##ɔ́ ': 95,
 's': 19,
 'ŋ': 25,
 '[UNK]': 0,
 'ɜ': 29,
 '##̪': 53,
 '##g': 69,
 '##p': 75,
 '##ɔ́': 96,
 '##nd': 99,
 '##ɭ': 82,
 '##ú': 97,
 '##e': 72,
 'l': 13,
 'a': 2,
 '##w': 70,
 'd': 5,
 'f': 7,
 '##ʊ': 58,
 '##m': 74,
 'ð': 24,
 'ə': 27,
 '## ': 54,
 'ɟ': 30,
 '##ɽ': 50,
 '##ŋ': 56,
 '̄': 40,
 '##ə': 64,
 '##ɲ': 73,
 '##̄': 80,
 ' ': 1,
 '##u': 62,
 '##à': 85,
 '̂': 39,
 '##c': 63,
 '##ə̀': 91,
 '##ɛ́': 98,
 '##ɛ': 60,
 'ɭ': 31,
 '##ɟ': 81,
 '##ɔ': 49,
 'w': 23,
 '̪': 42,
 '́': 38,
 '̌': 41,
 'i': 10,
 'e': 6,
 'ʊ': 36,
 'm': 14,
 'p': 17,
 'r': 18,
 'ɲ': 32,
 'ɽ': 33,
 '̀': 37,
 '##ɾ': 67,
 'ɛ': 28,
 '##́': 44,
 '##à ': 94,
 '##̀': 47,
 '##n': 46,
 '##a': 43,
 

In [80]:
# Encode the whole corpus a single time and derive both views (token strings
# and token ids) from the same encodings, instead of running encode_batch
# over every transcription twice.
corpus_encodings = tokenizer.encode_batch(df['transcription'])
tokenized_corpus = [enc.tokens for enc in corpus_encodings]
tokenized_corpus_ids = [enc.ids for enc in corpus_encodings]

# Preview only the first few tokenized transcriptions; displaying the full
# list floods the notebook output.
tokenized_corpus[:5]

[['m',
  '##ə̀',
  '##n',
  '##à ',
  '##c',
  '##à',
  '##t̪',
  '##á',
  '## ',
  '##á',
  '##n',
  '## ',
  '##a',
  '##̂',
  '##j',
  '##t̪',
  '##ɔ́ ',
  '##ŋ',
  '##ì',
  '##ð',
  '##ə̀',
  '##n',
  '##ɔ́',
  '##ŋ',
  '## ',
  '##ŋ',
  '##i',
  '##́',
  '##r',
  '##ɔ́',
  '##t̪',
  '##ɛ',
  '##̌',
  '## ',
  '##i',
  '##́',
  '##ŋ',
  '##g',
  '##á',
  '##ŋ',
  '##à',
  '##ɽ',
  '##ì',
  '##ɲ',
  '##à'],
 ['i',
  '##́',
  '##ŋ',
  '##g',
  '##á',
  '##ŋ',
  '##a',
  '##̂',
  '##ɽ',
  '##ì',
  '##ɲ',
  '##a',
  '##̌'],
 ['i',
  '##́',
  '##ŋ',
  '##g',
  '##á',
  '##ŋ',
  '##a',
  '##̂',
  '##ɽ',
  '##ì',
  '##ɲ',
  '##a',
  '##̌'],
 ['i',
  '##́',
  '##ŋ',
  '##g',
  '##á',
  '##ŋ',
  '##a',
  '##̂',
  '##ɽ',
  '##ì',
  '##ɲ',
  '##a',
  '##̌'],
 ['ŋ',
  '##ì',
  '##ð',
  '##ə̀',
  '##n',
  '##i',
  '##́ ',
  '##ŋ',
  '##r',
  '##ɔ̀',
  '##t̪',
  '##ɔ́ ',
  '##t̪',
  '##ɔ́',
  '##w',
  '##ə̀',
  '##n',
  '##ì'],
 ['ŋ',
  '##ì',
  '##ð',
  '##ə̀',
  '##n',
  '##i

In [81]:
# Flatten the per-transcription token lists into one 1-D array of tokens
# and report how many tokens the corpus contains in total.
tokenized_corpus_concat = np.concatenate(tokenized_corpus, axis=0)
tokenized_corpus_concat.shape

(433420,)

In [86]:
# Frequency of each token across the flattened corpus, most frequent first.
token_counts = pd.Series(tokenized_corpus_concat).value_counts()

# Show only the tokens that occur at most 100 times.
token_counts.loc[token_counts.le(100)]

o        90
g        59
s        56
c        50
ə        49
ɽ        45
##ʃ      41
p        38
h        24
ɾ        24
̀        17
́        12
ɟ        11
##h      10
##ɭ       8
ʊ         7
ʃ         4
[UNK]     2
##f       2
̪         1
̂         1
Name: count, dtype: int64

In [83]:
# Number of distinct tokens that actually occur in the tokenized corpus.
np.unique(tokenized_corpus_concat).shape

(95,)

# Get predicted Whisper tokens
Run the Whisper tokenizer over the `croatian` transcriptions of the Tira ASR dataset.

In [123]:
# Location of the CSV holding the transcribed Tira ASR clips.
# The TIRA_ASR_WHISPER_LABELS environment variable overrides the path so the
# notebook can run on other machines; when it is unset the original
# location is used unchanged.
whisper_labels = os.environ.get(
    'TIRA_ASR_WHISPER_LABELS',
    '/Users/markjos/projects/malachor5/data/tira-asr/tira-clean-split-transcribed.csv',
)

# Load the table and preview its first rows.
whisper_df = pd.read_csv(whisper_labels)
whisper_df.head()

Unnamed: 0,path,croatian,split
0,HH01082021-m03s37ms011-m03s39ms552.wav,A prije jedi za hala.,train
1,HH01082021-m03s41ms371-m03s45ms023.wav,Apri-jadi-vanhala.,train
2,HH01082021-m04s43ms401-m04s46ms528.wav,A prije je vledo da hvala.,train
3,HH01082021-m04s48ms835-m04s51ms580.wav,A prije je vledo za hala.,train
4,HH01082021-m05s04ms737-m05s07ms762.wav,Apre je vledo za mala.,train


In [124]:
# Load the pretrained tokenizer that ships with the
# 'openai/whisper-large-v3' checkpoint and display its configuration.
wh_tok = WhisperTokenizer.from_pretrained(
    pretrained_model_name_or_path='openai/whisper-large-v3',
)
wh_tok

WhisperTokenizer(name_or_path='openai/whisper-large-v3', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|startoftranscript|>', '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>', '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>', '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>', '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>', '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>', '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>', '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>', '<|gl|>', '<|mr|>', '<

In [125]:
# Lower-case every entry of the `croatian` column and split it into
# Whisper tokens; the result is a Series holding one token list per row.
croatian_tokenized = (
    whisper_df['croatian']
    .str.lower()
    .map(wh_tok.tokenize)
)

croatian_tokenized

0                 [Ġa, Ġpri, je, Ġjed, i, Ġza, Ġh, ala, .]
1                    [Ġap, ri, -, jadi, -, van, hal, a, .]
2        [Ġa, Ġpri, je, Ġje, Ġv, led, o, Ġda, Ġh, val, ...
3         [Ġa, Ġpri, je, Ġje, Ġv, led, o, Ġza, Ġh, ala, .]
4                [Ġap, re, Ġje, Ġv, led, o, Ġza, Ġmala, .]
                               ...                        
20475                                 [Ġdo, vol, j, ni, .]
20476                                             [Ġne, .]
20477                                 [Ġlo, v, ile, ga, .]
20478                                [Ġto, Ġje, Ġon, o, .]
20479             [Ġl, j, ub, av, ,, Ġh, r, v, ats, ka, .]
Name: croatian, Length: 20480, dtype: object

In [126]:
# Flatten the per-row token lists into one 1-D array of tokens and report
# the total token count.
croatian_tk_concat = np.concatenate(croatian_tokenized.to_list())
croatian_tk_concat.shape

(257589,)

In [127]:
# Frequency of each Whisper token over the flattened token array,
# most frequent first (the defaults, spelled out).
pd.Series(croatian_tk_concat).value_counts(sort=True, ascending=False)

.        15875
,        12673
Ġu        8106
v         7177
j         6583
         ...  
ungen        1
apon         1
onia         1
Ġmet         1
Ġdado        1
Name: count, Length: 2549, dtype: int64

In [120]:
# Inspect the `clip` column of the metadata table.
df.loc[:, 'clip']

0        data/tira-asr/clips/HH20220719-1-m03s04ms240-m...
1        data/tira-asr/clips/HH20220719-1-m04s31ms280-m...
2        data/tira-asr/clips/HH20220719-1-m04s35ms030-m...
3        data/tira-asr/clips/HH20220719-1-m05s04ms295-m...
4        data/tira-asr/clips/HH20220719-1-m05s31ms510-m...
                               ...                        
23944    data/tira-asr/clips/HH20210830-m37s17ms782-m37...
23945    data/tira-asr/clips/HH20210830-m37s29ms861-m37...
23946    data/tira-asr/clips/HH20210830-m38s33ms888-m38...
23947    data/tira-asr/clips/HH10142020-m03s32ms061-m03...
23948    data/tira-asr/clips/HH10142020-m04s26ms973-m04...
Name: clip, Length: 23949, dtype: object