# Import Modules

In [1]:
import os

# Move the working directory up one level so the relative "./datasets" and
# "./model_artifacts" paths used further down resolve from the parent directory.
# NOTE(review): this is not idempotent - re-running the cell climbs one more
# level each time, so execute it exactly once per kernel session.
os.chdir("../")

In [2]:
import pandas as pd
import glob
import json
import numpy as np
from tqdm.notebook import tqdm

# Load Datasets

In [3]:
# Read the training records through a context manager so the file handle is
# closed deterministically, and pin UTF-8 so decoding does not depend on the
# platform's default encoding.
# NOTE(review): json.load expects a single JSON document; if this file is true
# JSON Lines (one object per line) it has to be parsed line by line instead.
with open("./datasets/correction_train.jsonl", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

In [4]:
# Concatenate every record's "from" text into one newline-separated string.
merged_text = "\n".join(record["from"] for record in dataset)

In [5]:
# Dump the merged corpus to a plain-text file. The encoding is pinned to UTF-8
# so the write does not depend on the platform's default encoding, which can
# raise UnicodeEncodeError for non-ASCII text.
with open("./datasets/vocab_generic.txt", "w", encoding="utf-8") as file:
    file.write(merged_text)

# Build Fast Tokenizer

In [6]:
# Set the flag in the kernel's own environment. A `!export ...` shell escape
# runs in a throw-away subshell, so the variable would never reach this Python
# process or the libraries imported afterwards.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [7]:
from tokenizers import trainers, ByteLevelBPETokenizer, Tokenizer
from transformers import T5TokenizerFast
from datasets import load_dataset

In [8]:
# Delete the tokenizer output directory (recursively, ignoring a missing path),
# presumably so that artifacts from an earlier run cannot linger.
# NOTE(review): shell-only command; not portable to kernels without `rm`.
!rm -rf "./model_artifacts/pretrained_tokenizer_generic/"

In [9]:
def batch_iterator(dataset, batch_size=1000, column="from"):
    """Yield one column of ``dataset`` in consecutive fixed-size batches.

    Args:
        dataset: Sized object whose slice ``dataset[a:b]`` returns a mapping
            from column name to a list of values (assumed to be a Hugging Face
            ``datasets.Dataset`` in this notebook - confirm for other inputs).
        batch_size: Maximum number of rows per batch; the final batch may be
            shorter when ``len(dataset)`` is not a multiple of it.
        column: Name of the column to pull out of each slice. Defaults to
            ``"from"``, which keeps existing two-argument calls unchanged.

    Yields:
        ``dataset[start:start + batch_size][column]`` for each batch start.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size][column]


def build_fast_tokenizer(
    dataset_path,
    vocab_size=50265,
    min_frequency=2,
    special_tokens=None,
):
    """Train a byte-level BPE tokenizer on a JSON dataset and wrap it for T5.

    Args:
        dataset_path: Path to the JSON / JSON Lines file handed to
            ``load_dataset("json", ...)``; ``batch_iterator`` reads its
            ``"from"`` column.
        vocab_size: Target vocabulary size passed to the BPE trainer.
        min_frequency: Minimum pair frequency passed to the BPE trainer.
        special_tokens: Special tokens to reserve during training. ``None``
            selects ``["<s>", "<pad>", "</s>", "<unk>", "<mask>"]``.

    Returns:
        A ``T5TokenizerFast`` built from the trained tokenizer object.
        NOTE(review): ``T5TokenizerFast`` may register extra special tokens of
        its own on top of the trained vocabulary - confirm the final size with
        ``len(tokenizer)`` before sizing model embeddings.
    """
    if special_tokens is None:
        # Built per call so no mutable default is shared between invocations.
        special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    tokenizer = ByteLevelBPETokenizer()
    dataset = load_dataset("json", data_files=[dataset_path], split="train")
    tokenizer.train_from_iterator(
        batch_iterator(dataset),
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=list(special_tokens),
    )
    return T5TokenizerFast(tokenizer_object=tokenizer)

In [10]:
# Train the tokenizer on the correction training set, using the settings
# defined in build_fast_tokenizer.
tokenizer = build_fast_tokenizer("./datasets/correction_train.jsonl")






In [11]:
# Persist the trained tokenizer to disk; the returned tuple lists the files written.
tokenizer.save_pretrained("./model_artifacts/pretrained_tokenizer_generic")

('./model_artifacts/pretrained_tokenizer_generic/tokenizer_config.json',
 './model_artifacts/pretrained_tokenizer_generic/special_tokens_map.json',
 './model_artifacts/pretrained_tokenizer_generic/tokenizer.json')

# Alternative: SentencePiece Unigram Tokenizer

Reference: https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#train-tokenizer-2
The snippet below is kept for reference only and is not executed in this notebook.
```python
from utils.tokenizer import SentencePieceUnigramTokenizer

# Target vocabulary size. Written as 32_000 (thirty-two thousand); a literal
# such as `32_0` is valid Python but evaluates to only 320.
vocab_size = 32_000
# None means "use every row of the dataset" (resolved inside batch_iterator).
input_sentence_size = None


dataset = load_dataset("json", data_files=["./datasets/correction_train.jsonl"], split="train")

tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


def batch_iterator(input_sentence_size=None):
    """Yield the "from" column of the module-level ``dataset`` in batches of 100 rows."""
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i: i + batch_length]["from"]


# Train the tokenizer on the batched text.
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save files to disk
tokenizer.save("./hudai/tokenizer.json")
```

# Test

In [12]:
from transformers import T5TokenizerFast

In [13]:
# Reload the tokenizer from the saved directory to check the artifacts are usable.
tokenizer = T5TokenizerFast.from_pretrained(
    "./model_artifacts/pretrained_tokenizer_generic/"
)
# Uncomment to load the published "t5-small" tokenizer instead, for comparison:
# tokenizer = T5TokenizerFast.from_pretrained("t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Sanity check: expected to match the vocab_size requested at training time (50265).
tokenizer.vocab_size

50265

In [15]:
# Encode a sample mixing Bengali, lowercase English and uppercase English text.
# return_tensors="pt" requests PyTorch tensors, so torch must be installed.
input_ids = tokenizer(
    "আমি বাংলায় গান গাই the quick brown fix jumps over the lazy dog THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",
    return_tensors="pt",
).input_ids

In [16]:
# Display the encoded token ids for the sample sentence.
input_ids

tensor([[  563,   268,   280,   359,   272,   264,   275,   323,   264,   269,
           323,   264,   293,   963, 37177,  6844,  7445,  1004,  6147,  4260,
          3310,  6612,  9920,   963,  1241,  5653,    93,  3020,    75,  1425,
            44,    41,  8099,    57, 14804,    47,  1665,    54,    51,    59,
            50,  2682,    51,    60,  4609,    57,    49,    52,    55,  3155,
            58, 33097,  1425,    44,    41,  3744,    37,    62,    61,  1170,
            51,    43]])

In [17]:
# Round-trip check: decoding the ids should reproduce the original sentence.
tokenizer.batch_decode(input_ids)

['আমি বাংলায় গান গাই the quick brown fix jumps over the lazy dog THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG']