<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/language-model/training_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset Loading

In [None]:
!pip install -qq dvc[gdrive]

!dvc get https://github.com/lkarjun/malayalam-language-model \
Datasets/

In [2]:
!unzip -q 'Datasets/*.zip' -d Datasets/

In [1]:
import pandas as pd
from pathlib import Path

DS_Path = Path("/content/Datasets")

csv_files = ["magazine_files.csv", 
             "wikitext_files.csv",
             "article_files.csv"
             ]
             
df = pd.concat([pd.read_csv(DS_Path/csv) for csv in csv_files])

In [2]:
sample = df['file_path'][:10].to_list()

with open(sample[0], "r") as file:
  sample_txt = file.read()

### Imports

In [None]:
!pip install -qq tokenizer transformers

In [3]:
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import Unigram, WordPiece
from tokenizers.trainers import UnigramTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Digits
from tokenizers.normalizers import Strip

## Training Subword Tokenizer For Malayalm 

In [5]:
tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=1000,
                           min_frequency=4,
                           special_tokens=["<unk>", "<bos>", "<eos>", "<pad>", "<mask>"],
                           show_progress=True)

In [7]:
pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(sample, trainer)

In [8]:
tokenizer.post_processor = TemplateProcessing(
                                single="<bos> $A <eos>",
                                pair="<bos> $A <eos> <bos> $B:1 <eos>:1",
                                special_tokens=[
                                        ("<bos>", tokenizer.token_to_id("<bos>")),
                                        ("<eos>", tokenizer.token_to_id("<eos>")),
                                  ],
                              )
tokenizer.decoders = decoders.WordPiece()

In [9]:
tokenizer.enable_padding(pad_id=3, pad_token="<pad>")
tokenizer.enable_truncation(max_length=500)

In [10]:
output = tokenizer.encode(sample_txt[:20])
print(output.tokens)

['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', '<eos>']


In [11]:
print(output.ids)

[1, 33, 861, 117, 596, 583, 287, 2]


In [12]:
tokenizer.decode(output.ids, skip_special_tokens=True)

'ഇ ##ന്ത് ##യ ##യിലെ ആദ്യ ##ത്തെ'

In [13]:
check_unk = tokenizer.encode("Hello")
check_unk.tokens, check_unk.ids, tokenizer.id_to_token(0)

(['<bos>', '<unk>', '<eos>'], [1, 0, 2], '<unk>')

In [16]:
output = tokenizer.encode_batch([sample_txt[:30], sample_txt[:10]])
print(output[1].tokens)
print(output[0].tokens)

['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', 'വ', '##നി', '##ത', '##ാ', 'ഐ', '##\u200c', '##എ', '##\u200c', '<eos>']


In [17]:
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Training Tokenizer

In [18]:
tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=100000,
                           min_frequency=4,
                           special_tokens=["<unk>", "<bos>", "<eos>", "<pad>", "<mask>"],
                           show_progress=True)

pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(df['file_path'], trainer)

tokenizer.enable_padding(pad_id=3, pad_token="<pad>")
tokenizer.enable_truncation(max_length=500)

tokenizer.post_processor = TemplateProcessing(
                                single="<bos> $A <eos>",
                                pair="<bos> $A <eos> <bos> $B:1 <eos>:1",
                                special_tokens=[
                                        ("<bos>", tokenizer.token_to_id("<bos>")),
                                        ("<eos>", tokenizer.token_to_id("<eos>")),
                                  ],
                              )

tokenizer.decoders = decoders.WordPiece()

In [19]:
tokenizer.save("tokenizer-malayalam.json")

In [20]:
from transformers import PreTrainedTokenizerFast

In [21]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<bos>",
    sep_token="<eos>",
    mask_token="<mask>",
)

In [24]:
wrapped_tokenizer(sample_txt)

{'input_ids': [1, 4119, 3785, 8630, 81675, 1322, 38133, 5036, 2309, 9826, 34450, 36, 2721, 7114, 3182, 2355, 1298, 54142, 19, 2570, 2111, 7422, 5176, 3711, 4703, 34497, 7682, 3579, 98571, 19, 62827, 1267, 36210, 11545, 4822, 10428, 17338, 22177, 7310, 5570, 3182, 2355, 1298, 12164, 1312, 19509, 2093, 34327, 2947, 19, 11491, 14297, 54015, 10160, 17, 10152, 60903, 40243, 1264, 3674, 25485, 30493, 2570, 17, 4119, 3785, 8630, 81675, 1322, 38133, 5036, 19, 37784, 2073, 34964, 1304, 10757, 18426, 2406, 8832, 12882, 2570, 19, 2745, 13851, 15076, 2721, 2447, 19, 25930, 6094, 4209, 18, 2896, 3991, 65618, 8841, 7316, 3182, 17, 11240, 14049, 2448, 5292, 19807, 16340, 9973, 5288, 19, 11559, 18, 4970, 12905, 8752, 48719, 2893, 2461, 3182, 20150, 19, 7891, 4877, 3906, 2111, 9986, 12905, 8752, 6815, 8773, 73887, 1312, 7326, 25789, 2950, 14345, 9004, 19, 11180, 29434, 544, 47724, 6449, 28998, 19, 2593, 17, 74140, 13056, 2390, 3182, 18391, 4355, 19, 4518, 4679, 2079, 2136, 35503, 2338, 6449, 17031, 240

In [27]:
wrapped_tokenizer.save_pretrained("Tokenizer")

('Tokenizer/tokenizer_config.json',
 'Tokenizer/special_tokens_map.json',
 'Tokenizer/tokenizer.json')