# Make token vocabulary
Train a tokenizer on the Tira ASR dataset

In [5]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from transformers import WhisperTokenizer
import pandas as pd
import numpy as np
import seaborn
import string
from clap.encoders import *
from transformers import DebertaV2Tokenizer, AutoProcessor
import torch
from scipy.spatial.distance import cdist, cosine, squareform
from tqdm import tqdm
tqdm.pandas()
from unidecode import unidecode
import random
import json

In [6]:
metadata='/Users/markjos/projects/malachor5/data/hf-datasets/tira-clean/metadata.csv'

In [7]:
df=pd.read_csv(metadata)
def get_text_corpus():
    train_df=df[df['split']=='train']
    for row in train_df['transcription']:
        yield row

In [8]:
unique_words = set()
for text in get_text_corpus():
    unique_words.update(text.split())
len(unique_words)

7246

In [9]:
unk = "[UNK]"
trainer=trainers.BpeTrainer(vocab_size=7000, special_tokens=[unk], max_token_length=8, min_frequency=2)

In [10]:
tokenizer = Tokenizer(models.BPE())
tokenizer.train_from_iterator(get_text_corpus(), trainer=trainer)
tira_vocab=tokenizer.get_vocab()
len(tira_vocab)






5810

In [11]:
i=random.randint(0,len(df))
line=df['transcription'].iloc[i]
encoding=tokenizer.encode(line)

line, encoding.tokens, encoding.ids

('íɾcɛ́cí ðə̀t̪ɾɛ̀',
 ['í', 'ɾ', 'c', 'ɛ́', 'cí ', 'ðə̀', 't̪', 'ɾɛ̀'],
 [54, 33, 4, 56, 164, 146, 51, 72])

In [12]:
vocab_lens=[]
for item in tira_vocab:
    vocab_lens.append(len(item.strip().replace('#', '')))
np.array(vocab_lens).mean()

5.174526678141136

In [None]:
wh_tok = WhisperTokenizer.from_pretrained('openai/whisper-tiny')
tokenized_corpus = [
    wh_tok.encode(sentence, skip_special_tokens=True) for sentence in get_text_corpus()
]
tokenized_corpus

[[50258,
  50363,
  75,
  7250,
  136,
  222,
  85,
  7250,
  136,
  222,
  75,
  133,
  249,
  136,
  222,
  23436,
  133,
  249,
  32797,
  75,
  344,
  32797,
  77,
  133,
  249,
  136,
  222,
  133,
  122,
  133,
  249,
  136,
  222,
  50257],
 [50258,
  50363,
  129,
  233,
  136,
  222,
  3680,
  32797,
  83,
  133,
  249,
  32797,
  85,
  7250,
  32797,
  75,
  133,
  249,
  136,
  224,
  23436,
  133,
  242,
  32797,
  220,
  273,
  136,
  103,
  133,
  242,
  136,
  222,
  4231,
  136,
  222,
  70,
  133,
  249,
  136,
  222,
  50257],
 [50258,
  50363,
  129,
  233,
  133,
  242,
  32797,
  23436,
  133,
  242,
  32797,
  4423,
  233,
  64,
  32797,
  371,
  7250,
  32797,
  75,
  133,
  249,
  136,
  222,
  23436,
  64,
  136,
  222,
  690,
  108,
  64,
  136,
  222,
  129,
  233,
  64,
  136,
  222,
  875,
  136,
  222,
  220,
  273,
  136,
  103,
  133,
  242,
  136,
  222,
  4231,
  136,
  222,
  50257],
 [50258,
  50363,
  23436,
  7250,
  136,
  222,
  85,
  7250,
  136