# Make token vocabulary
Train a BPE tokenizer on the training-split transcriptions of the Tira ASR dataset.

In [5]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import WhisperTokenizer
import pandas as pd
import numpy as np
import seaborn
import string
from clap.encoders import *
from transformers import DebertaV2Tokenizer, AutoProcessor
import torch
from scipy.spatial.distance import cdist, cosine, squareform
from tqdm import tqdm
tqdm.pandas()
from unidecode import unidecode
import random
import json

In [6]:
# Location of the dataset's metadata CSV (read with pandas in a later cell).
metadata = '/Users/markjos/projects/malachor5/data/hf-datasets/tira-clean/metadata.csv'

In [14]:
# Load the metadata table from the CSV path held in `metadata`.
df = pd.read_csv(metadata)


def get_text_corpus():
    """Yield each distinct transcription found in the training split.

    Rows of the module-level ``df`` whose ``split`` column equals ``'train'``
    are selected, and repeated values in their ``transcription`` column are
    collapsed before being yielded one at a time.
    """
    is_train = df['split'] == 'train'
    yield from df.loc[is_train, 'transcription'].unique()

In [33]:
# Collect every whitespace-separated word in the training corpus, then
# derive the set of distinct words from that list.
words = []
for sentence in get_text_corpus():
    words.extend(sentence.split())
unique_words = set(words)
# Displayed as (number of distinct words, total number of words).
len(unique_words), len(words)

(7246, 24108)

In [16]:
# Special token reserved in the vocabulary.
unk = "[UNK]"

# BPE trainer configuration: target vocabulary size, the reserved special
# token, an upper bound on token length, and the minimum pair frequency
# required for a merge.
trainer = trainers.BpeTrainer(
    vocab_size=7000,
    special_tokens=[unk],
    max_token_length=8,
    min_frequency=2,
)

In [17]:
# Build an untrained BPE model and train it on the unique training
# transcriptions. `unk_token` is passed explicitly: reserving "[UNK]" through
# the trainer's special_tokens only adds it to the vocabulary, whereas the
# model emits it for unseen symbols only when told which token to use.
# Without it, symbols missing from the vocabulary are dropped from encodings.
tokenizer = Tokenizer(models.BPE(unk_token=unk))
tokenizer.train_from_iterator(get_text_corpus(), trainer=trainer)

# Token -> id mapping learned by the trainer; its size is displayed below.
tira_vocab = tokenizer.get_vocab()
len(tira_vocab)






4152

In [19]:
# Spot-check the trained tokenizer on one randomly chosen transcription.
# random.randint includes its upper bound, so randint(0, len(df)) could return
# len(df) and make .iloc raise IndexError; randrange excludes the upper bound.
i = random.randrange(len(df))
line = df['transcription'].iloc[i]
encoding = tokenizer.encode(line)

# Displayed as (original text, token strings, token ids).
line, encoding.tokens, encoding.ids

('kàðɛ́və̀lèðɔ́', ['kàðɛ́', 'və̀lè', 'ðɔ́'], [1097, 607, 265])

In [20]:
# Length (in code points) of each vocabulary entry after trimming surrounding
# whitespace and removing '#' characters; the mean is displayed.
vocab_lens = [len(token.strip().replace('#', '')) for token in tira_vocab]
np.mean(vocab_lens)

4.921242774566474

In [34]:
# Load the pretrained Whisper tokenizer and encode every unique training
# transcription into token ids, without adding Whisper's special tokens.
wh_tok = WhisperTokenizer.from_pretrained('openai/whisper-tiny')
tokenized_corpus = [
    wh_tok.encode(transcription, add_special_tokens=False)
    for transcription in get_text_corpus()
]
# Number of encoded transcriptions.
len(tokenized_corpus)

7263

In [35]:
# Render each unique training transcription as a line of space-separated
# Whisper token ids (no special tokens).
unique_sentences_tokenized = [
    ' '.join(map(str, wh_tok.encode(sentence, add_special_tokens=False)))
    for sentence in get_text_corpus()
]

# Write one tokenized transcription per line to the output text file.
tira_tkzd_path = '/Users/markjos/projects/malachor5/data/hf-datasets/tira-clean/tira_text_tkzd.txt'
with open(tira_tkzd_path, 'w') as f:
    f.writelines(id_line + '\n' for id_line in unique_sentences_tokenized)