<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/main/Malayalam-Language-Model/training_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [None]:
!pip install -qq tokenizer transformers datasets

[K     |████████████████████████████████| 311 kB 12.0 MB/s 
[K     |████████████████████████████████| 134 kB 25.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 54.1 MB/s 
[K     |████████████████████████████████| 212 kB 52.8 MB/s 
[K     |████████████████████████████████| 144 kB 54.8 MB/s 
[K     |████████████████████████████████| 271 kB 46.6 MB/s 
[K     |████████████████████████████████| 94 kB 4.4 MB/s 
[?25h

In [1]:
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import Unigram, WordPiece
from tokenizers.trainers import UnigramTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Digits
from tokenizers.normalizers import Strip

from datasets import load_dataset
import pandas as pd

### Dataset Loading

In [2]:
dset = load_dataset("lkarjun/Malayalam-Articles")
dset

Using custom data configuration lkarjun--Malayalam-Articles-d44c52244000c266
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/lkarjun--Malayalam-Articles-d44c52244000c266/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'is_valid'],
        num_rows: 26617
    })
    validation: Dataset({
        features: ['content', 'is_valid'],
        num_rows: 8873
    })
})

In [7]:
df = pd.concat([
                dset['train'].to_pandas().dropna(),
                dset['validation'].to_pandas().dropna()
])

## Traning Bert Tokenizer

In [3]:
from tokenizers import BertWordPieceTokenizer

In [5]:
bert_tok = BertWordPieceTokenizer()

In [4]:
def batch_iterator(bs = 5000, col = 'content'):
  for i in range(0, len(df), bs):
    yield df[i: i + bs][col]

In [None]:
bert_tokenizer = bert_tok.train_from_iterator(batch_iterator(), 
                                              vocab_size=50000)

In [None]:
bert_tokenizer.save_pretrained("bert-tokenizer")

## Training WordPiece Tokenizer 

In [None]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=1000,
                           min_frequency=4,
                           special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                           show_progress=True)

In [None]:
pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(sample, trainer)

In [None]:
tokenizer.post_processor = TemplateProcessing(
                                single="[CLS] $A [SEP]",
                                pair="[CLS] $A [SEP] $B:1 [SEP]:1",
                                special_tokens=[
                                        ("[CLS]", tokenizer.token_to_id("[CLS]")),
                                        ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                  ],
                              )
tokenizer.decoders = decoders.WordPiece()

In [None]:
tokenizer.enable_padding(direction="right", pad_id=3, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=500)

In [None]:
output = tokenizer.encode(sample_txt[:20])
print(output.tokens)

['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', '[SEP]']


In [None]:
print(output.ids)

[1, 33, 862, 108, 596, 584, 286, 2]


In [None]:
tokenizer.decode(output.ids, skip_special_tokens=True)

'ഇ ##ന്ത് ##യ ##യിലെ ആദ്യ ##ത്തെ'

In [None]:
check_unk = tokenizer.encode("Hello")
check_unk.tokens, check_unk.ids, tokenizer.id_to_token(0)

(['[CLS]', '[UNK]', '[SEP]'], [1, 0, 2], '[UNK]')

In [None]:
output = tokenizer.encode_batch([sample_txt[:30], sample_txt[:10]])
print(output[1].tokens)
print(output[0].tokens)

['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', 'വ', '##നി', '##ത', '##ാ', 'ഐ', '##\u200c', '##എ', '##\u200c', '[SEP]']


In [None]:
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Training Tokenizer

In [None]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=75000,
                           min_frequency=4,
                           special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                           show_progress=True)

pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(df['file_path'], trainer)

tokenizer.post_processor = TemplateProcessing(
                                single="[CLS] $A [SEP]",
                                pair="[CLS] $A [SEP] $B:1 [SEP]:1",
                                special_tokens=[
                                        ("[CLS]", tokenizer.token_to_id("[CLS]")),
                                        ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                  ],
                              )

tokenizer.decoders = decoders.WordPiece()

tokenizer.enable_padding(direction="right", pad_id=3, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=500)

In [None]:
tokenizer.save("tokenizer-malayalam.json")

In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
print(
    wrapped_tokenizer.pad_token_id,
    wrapped_tokenizer.cls_token_id,
    wrapped_tokenizer.sep_token_id
)

3 1 2


In [None]:
wrapped_tokenizer.save_pretrained("Tokenizer")

('Tokenizer\\tokenizer_config.json',
 'Tokenizer\\special_tokens_map.json',
 'Tokenizer\\tokenizer.json')