In [55]:
import csv

import pandas as pd

# Sentence pairs longer than this many whitespace-separated words (on either side) are dropped.
MAX_WORDS = 10

# The file has no header row, so columns are addressed by position.
# quoting=csv.QUOTE_NONE: the fields are free-form sentences that may contain literal
# double quotes. With pandas' default quoting, a field that starts with '"' is parsed as a
# CSV-quoted field: its quotes are stripped and an unbalanced quote swallows the following
# tabs/newlines, silently merging rows. QUOTE_NONE treats quotes as ordinary characters.
# NOTE(review): assumes the TSV was not written with CSV-style quoting - confirm.
translations_raw = pd.read_csv(
    'data/pt-en.tsv',
    delimiter='\t',
    encoding='utf-8',
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Columns 1 and 3 are kept as the Portuguese / English sentences; rows missing either are dropped.
sentence_pairs = (
    translations_raw[[1, 3]]
    .rename(columns={1: 'pt', 3: 'en'})
    .dropna()
)

# Keep a pair only when both sides have at most MAX_WORDS words.
pt_is_short = sentence_pairs['pt'].str.split().str.len() <= MAX_WORDS
en_is_short = sentence_pairs['en'].str.split().str.len() <= MAX_WORDS
translations = sentence_pairs[pt_is_short & en_is_short]

translations


Unnamed: 0,pt,en
0,Uma menina chorando abriu a porta.,A crying girl opened the door.
1,Vamos tentar alguma coisa!,Let's try something.
2,Preciso ir dormir.,I have to go to sleep.
3,Preciso ir dormir.,I need to go to sleep.
4,O sinal '&' significa 'e'.,The sign '&' stands for 'and'.
...,...,...
295629,Somos da França.,We're from France.
295630,Nós somos da França.,We're from France.
295631,Tom está apenas se divertindo.,Tom is just having fun.
295632,Ele faleceu? Eu nem sabia que ele estava doente!,He's dead? I didn't even know he was sick!


Next, we follow the step-by-step [tutorial from the Tokenizers library](https://huggingface.co/docs/tokenizers/pipeline#all-together-a-bert-tokenizer-from-scratch) to build a BERT tokenizer from scratch and train it on our corpus.

In [None]:
from tokenizers import Tokenizer, normalizers, decoders
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

def create_tokenizer(vocab_size=30522):
    """Build an untrained BERT-style WordPiece tokenizer and a matching trainer.

    The tokenizer normalizes text with NFD + lowercasing + accent stripping,
    pre-tokenizes with ``Whitespace`` and wraps encoded sequences in
    ``[CLS]`` / ``[SEP]`` markers.

    Args:
        vocab_size: Target vocabulary size passed to the trainer. Defaults to
            30522, the value previously hard-coded here.

    Returns:
        A ``(tokenizer, trainer)`` tuple; pass the trainer to one of the
        tokenizer's ``train*`` methods to learn the vocabulary.
    """
    # Single source of truth for the special tokens. The trainer assigns ids in
    # list order, so the ids given to the post-processor below are derived from
    # this list instead of being hard-coded separately.
    special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()  # Splits on whitespace and punctuation

    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", special_tokens.index("[CLS]")),
            ("[SEP]", special_tokens.index("[SEP]")),
        ],
    )

    # Lets decode() merge word pieces back into whole words.
    bert_tokenizer.decoder = decoders.WordPiece()

    trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
    return bert_tokenizer, trainer

# Portuguese: build a fresh tokenizer/trainer pair and fit it on the 'pt' column.
pt_tokenizer, pt_trainer = create_tokenizer()
pt_tokenizer.train_from_iterator(translations['pt'], pt_trainer)

# English: a separate tokenizer/trainer pair, fit on the 'en' column.
en_tokenizer, en_trainer = create_tokenizer()
en_tokenizer.train_from_iterator(translations['en'], en_trainer)

In [71]:
# Sanity check: encode a Portuguese sample, show its token ids, then decode them back.
output = pt_tokenizer.encode("Oi, como vai?")
pt_sample_ids = output.ids
print(pt_sample_ids)

pt_tokenizer.decode(pt_sample_ids)

[1, 3166, 14, 313, 317, 32, 2]


'oi, como vai?'

In [73]:
# Sanity check: encode an English sample, show its token ids, then decode them back.
output = en_tokenizer.encode("Hi, how are you?")
en_sample_ids = output.ids
print(en_sample_ids)

en_tokenizer.decode(en_sample_ids)

[1, 2596, 16, 272, 207, 152, 34, 2]


'hi, how are you?'

In [74]:
# Now, let's save our tokenizers: each trained tokenizer is written to its own JSON file.
for trained_tokenizer, json_path in (
    (pt_tokenizer, 'data/pt-tokens.json'),
    (en_tokenizer, 'data/en-tokens.json'),
):
    trained_tokenizer.save(json_path)