<a href="https://colab.research.google.com/github/lkarjun/malayalam-language-model/blob/language-model/training_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset Loading

In [None]:
!pip install -qq dvc[gdrive]

!dvc get https://github.com/lkarjun/malayalam-language-model \
Datasets/

In [2]:
!unzip -q 'Datasets/*.zip' -d Datasets/

In [1]:
import pandas as pd
from pathlib import Path

DS_Path = Path("/content/Datasets")

csv_files = ["magazine_files.csv", 
             "wikitext_files.csv",
             "article_files.csv"
             ]
             
df = pd.concat([pd.read_csv(DS_Path/csv) for csv in csv_files])

In [2]:
sample = df['file_path'][:10].to_list()

with open(sample[0], "r") as file:
  sample_txt = file.read()

### Imports

In [4]:
!pip install -qq tokenizer transformers

[K     |████████████████████████████████| 77 kB 3.2 MB/s 
[K     |████████████████████████████████| 3.5 MB 32.8 MB/s 
[K     |████████████████████████████████| 6.8 MB 51.8 MB/s 
[K     |████████████████████████████████| 596 kB 61.4 MB/s 
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
[K     |████████████████████████████████| 895 kB 49.4 MB/s 
[?25h

In [3]:
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import Unigram, WordPiece
from tokenizers.trainers import UnigramTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Digits
from tokenizers.normalizers import Strip

## Training Subword Tokenizer For Malayalm 

In [5]:
tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=1000,
                           min_frequency=4,
                           special_tokens=["<unk>", "<bos>", "<eos>", "<pad>", "<mask>"],
                           show_progress=True)

In [7]:
pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(sample, trainer)

In [8]:
tokenizer.post_processor = TemplateProcessing(
                                single="<bos> $A <eos>",
                                pair="<bos> $A <eos> <bos> $B:1 <eos>:1",
                                special_tokens=[
                                        ("<bos>", tokenizer.token_to_id("<bos>")),
                                        ("<eos>", tokenizer.token_to_id("<eos>")),
                                  ],
                              )
tokenizer.decoders = decoders.WordPiece()

In [9]:
tokenizer.enable_padding(pad_id=3, pad_token="<pad>")
tokenizer.enable_truncation(max_length=500)

In [10]:
output = tokenizer.encode(sample_txt[:20])
print(output.tokens)

['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', '<eos>']


In [11]:
print(output.ids)

[1, 33, 861, 117, 596, 583, 287, 2]


In [12]:
tokenizer.decode(output.ids, skip_special_tokens=True)

'ഇ ##ന്ത് ##യ ##യിലെ ആദ്യ ##ത്തെ'

In [13]:
check_unk = tokenizer.encode("Hello")
check_unk.tokens, check_unk.ids, tokenizer.id_to_token(0)

(['<bos>', '<unk>', '<eos>'], [1, 0, 2], '<unk>')

In [16]:
output = tokenizer.encode_batch([sample_txt[:30], sample_txt[:10]])
print(output[1].tokens)
print(output[0].tokens)

['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'ഇ', '##ന്ത്', '##യ', '##യിലെ', 'ആദ്യ', '##ത്തെ', 'വ', '##നി', '##ത', '##ാ', 'ഐ', '##\u200c', '##എ', '##\u200c', '<eos>']


In [17]:
print(output[1].attention_mask)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Training Tokenizer

In [18]:
tokenizer = Tokenizer(WordPiece(unk_token="<unk>"))
tokenizer.normalizer = Strip()
tokenizer.decoders = decoders.WordPiece()

trainer = WordPieceTrainer(vocab_size=100000,
                           min_frequency=4,
                           special_tokens=["<unk>", "<bos>", "<eos>", "<pad>", "<mask>"],
                           show_progress=True)

pre_tokenizer = pre_tokenizers.Sequence([
                                         Whitespace(),  
                                         Digits(individual_digits=False)
                                        ])


tokenizer.pre_tokenizer = pre_tokenizer

# training tokenizer
tokenizer.train(df['file_path'], trainer)

tokenizer.enable_padding(pad_id=3, pad_token="<pad>")
tokenizer.enable_truncation(max_length=500)

tokenizer.post_processor = TemplateProcessing(
                                single="<bos> $A <eos>",
                                pair="<bos> $A <eos> <bos> $B:1 <eos>:1",
                                special_tokens=[
                                        ("<bos>", tokenizer.token_to_id("<bos>")),
                                        ("<eos>", tokenizer.token_to_id("<eos>")),
                                  ],
                              )

tokenizer.decoders = decoders.WordPiece()

In [19]:
tokenizer.save("tokenizer-malayalam.json")

In [5]:
from transformers import PreTrainedTokenizerFast

In [9]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="Tokenizer/tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
    eos_token="<bos>",
    bos_token="<eos>",
    mask_token="<mask>",
)

In [15]:
wrapped_tokenizer.save_pretrained("Tokenizer")

('Tokenizer/tokenizer_config.json',
 'Tokenizer/special_tokens_map.json',
 'Tokenizer/tokenizer.json')