<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/language-model/training_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset Loading

In [None]:
!pip install -qq dvc[gdrive]

!dvc get https://github.com/lkarjun/malayalam-language-model \
Datasets/

In [2]:
!unzip -q 'Datasets/*.zip' -d Datasets/

In [3]:
import pandas as pd
from pathlib import Path

DS_Path = Path("../Datasets")

csv_files = ["magazine_files.csv", 
             "wikitext_files.csv",
             "article_files.csv"
             ]
             
df = pd.concat([pd.read_csv(DS_Path/csv) for csv in csv_files])

In [15]:
sample = df['file_path'][:10].to_list()

with open(sample[0], "r", encoding="utf-8") as file:
  sample_txt = file.read()

### Imports

In [4]:
!pip install -qq tokenizer transformers

[K     |████████████████████████████████| 77 kB 3.2 MB/s 
[K     |████████████████████████████████| 3.5 MB 32.8 MB/s 
[K     |████████████████████████████████| 6.8 MB 51.8 MB/s 
[K     |████████████████████████████████| 596 kB 61.4 MB/s 
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[K     |████████████████████████████████| 895 kB 49.4 MB/s 
[?25h

In [2]:
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import Unigram, WordPiece
from tokenizers.trainers import UnigramTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Digits
from tokenizers.normalizers import Strip

## Training Subword Tokenizer For Malayalm 

In [16]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=1000,
                           min_frequency=4,
                           special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                           show_progress=True)

In [27]:
pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(sample, trainer)

In [28]:
tokenizer.post_processor = TemplateProcessing(
                                single="[CLS] $A [SEP]",
                                pair="[CLS] $A [SEP] $B:1 [SEP]:1",
                                special_tokens=[
                                        ("[CLS]", tokenizer.token_to_id("[CLS]")),
                                        ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                  ],
                              )
tokenizer.decoders = decoders.WordPiece()

In [29]:
tokenizer.enable_padding(direction="right", pad_id=3, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=500)

In [30]:
output = tokenizer.encode(sample_txt[:20])
print(output.tokens)

['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', '[SEP]']


In [31]:
print(output.ids)

[1, 33, 862, 108, 596, 584, 286, 2]


In [32]:
tokenizer.decode(output.ids, skip_special_tokens=True)

'ഇ ##ന്ത് ##യ ##യിലെ ആദ്യ ##ത്തെ'

In [33]:
check_unk = tokenizer.encode("Hello")
check_unk.tokens, check_unk.ids, tokenizer.id_to_token(0)

(['[CLS]', '[UNK]', '[SEP]'], [1, 0, 2], '[UNK]')

In [34]:
output = tokenizer.encode_batch([sample_txt[:30], sample_txt[:10]])
print(output[1].tokens)
print(output[0].tokens)

['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', 'വ', '##നി', '##ത', '##ാ', 'ഐ', '##\u200c', '##എ', '##\u200c', '[SEP]']


In [35]:
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Training Tokenizer

In [36]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=75000,
                           min_frequency=4,
                           special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                           show_progress=True)

pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(df['file_path'], trainer)

tokenizer.post_processor = TemplateProcessing(
                                single="[CLS] $A [SEP]",
                                pair="[CLS] $A [SEP] $B:1 [SEP]:1",
                                special_tokens=[
                                        ("[CLS]", tokenizer.token_to_id("[CLS]")),
                                        ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                  ],
                              )

tokenizer.decoders = decoders.WordPiece()

tokenizer.enable_padding(direction="right", pad_id=3, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=500)

In [37]:
tokenizer.save("tokenizer-malayalam.json")

In [39]:
from transformers import PreTrainedTokenizerFast

In [40]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [52]:
print(
    wrapped_tokenizer.pad_token_id,
    wrapped_tokenizer.cls_token_id,
    wrapped_tokenizer.sep_token_id
)

3 1 2


In [53]:
wrapped_tokenizer.save_pretrained("Tokenizer")

('Tokenizer\\tokenizer_config.json',
 'Tokenizer\\special_tokens_map.json',
 'Tokenizer\\tokenizer.json')