# Import Modules

In [1]:
import os

os.chdir("../")

In [2]:
import pandas as pd
import glob
import json
import numpy as np
from tqdm.notebook import tqdm

# Load Datasets

In [3]:
dataset = json.load(open("./datasets/correction_train.jsonl"))

In [4]:
merged_text = [d["from"] for d in dataset]
merged_text = "".join(merged_text)

In [5]:
with open("./datasets/vocab_generic.txt", "w") as file:
    file.write(merged_text)

# Build Tokenizer

In [6]:
!export TOKENIZERS_PARALLELISM=true

In [7]:
from tokenizers import trainers, ByteLevelBPETokenizer
from transformers import T5TokenizerFast

In [8]:
def build_fast_bert_tokenizer(files, max_vocab_size):
    tokenizer = ByteLevelBPETokenizer()
    trainer = trainers.BpeTrainer(show_progress=True)
    tokenizer.train(files=files)
    return T5TokenizerFast(tokenizer_object=tokenizer)

In [9]:
tokenizer = build_fast_bert_tokenizer(files=["./datasets/vocab_generic.txt"], max_vocab_size=40000) 






In [10]:
!rm -rf "./model_artifacts/pretrained_tokenizer_generic/"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
tokenizer.save_pretrained("./model_artifacts/pretrained_tokenizer_generic")

('./model_artifacts/pretrained_tokenizer_generic/tokenizer_config.json',
 './model_artifacts/pretrained_tokenizer_generic/special_tokens_map.json',
 './model_artifacts/pretrained_tokenizer_generic/tokenizer.json')

# Test

In [12]:
from transformers import T5TokenizerFast

In [13]:
tokenizer = T5TokenizerFast.from_pretrained("./model_artifacts/pretrained_tokenizer_generic/")
# tokenizer = T5TokenizerFast.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
tokenizer.vocab_size

30000

In [21]:
input_ids = tokenizer("আমি বাংলায় গান গাই the quick brown fix jumps over the lazy dog THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG", return_tensors="pt").input_ids

In [22]:
input_ids

tensor([[  559,   263,   275,   354,   267,   259,   270,   318,   259,   264,
           318,   259,   288,   958,  4401, 10391,  6914,  7516,  1003,  6197,
          4280,  3325,  6678, 10043,   958,  1241,  5698,    88,  3025,    70,
          1427,    39,    36,  8190,    52, 15031,    42,  1667,    49,    46,
            54,    45,  2696,    46,    55,  4639,    52,    44,    47,    50,
          3173,    53,    36,    49,  1427,    39,    36,  3766,    32,    57,
            56,  1168,    46,    38]])

In [23]:
tokenizer.batch_decode(input_ids)

['আমি বাংলায় গান গাই the quick brown fix jumps over the lazy dog THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG']