In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import os

In [4]:
# Directory holding the plain-text corpus the tokenizer is trained on.
data_dir = '../data/'

# Collect the path of every regular file directly inside the corpus directory.
# Sub-directories (for example a `.ipynb_checkpoints` folder) are skipped,
# because this list is later passed as `files=` to the tokenizer trainer.
# A missing directory still yields an empty list.
file_names = []
if os.path.isdir(data_dir):
    file_names = [
        os.path.join(data_dir, entry)
        for entry in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, entry))
    ]

file_names

['../data/codigo_penal.txt',
 '../data/codigo_de_etica_publica.txt',
 '../data/codigo_procesal_penal_federal.txt',
 '../data/codigo_aeronautico.txt',
 '../data/codigo_procesal_civil_y_comercial.txt',
 '../data/codigo_aduanero.txt',
 '../data/codigo_electoral_nacional.txt',
 '../data/codigo_civil_y_comercial.txt',
 '../data/codigo_alimentario_argentino.txt',
 '../data/constitucion-nacional.txt',
 '../data/codigo_de_mineria.txt']

In [5]:
tokenizer = ByteLevelBPETokenizer()

# Special tokens reserved in the vocabulary during training.
# NOTE(review): RoBERTa's reference vocabulary orders these as
# <s>, <pad>, </s>, <unk> (ids 0-3). If ids are assigned in list order here,
# <pad> gets id 0 rather than 1 -- confirm the downstream model config sets
# its pad/bos/eos token ids to match this vocabulary.
special_tokens = [
    "<pad>",
    "<unk>",
    "<s>",
    "</s>",
    "<mask>"
]

# Fail early with a clear message instead of training on an empty corpus.
if not file_names:
    raise FileNotFoundError(
        "file_names is empty: no corpus files to train the tokenizer on"
    )

tokenizer.train(files=file_names,
                vocab_size=30000,
                min_frequency=2,
                special_tokens=special_tokens,
                show_progress=True)






Save files to disk

In [6]:
# Directory that receives the trained vocabulary and merge rules.
token_dir = '../LawBERTarg/'
# exist_ok=True makes this a no-op when the directory is already there,
# so the cell can be re-run without a separate existence check.
os.makedirs(token_dir, exist_ok=True)
tokenizer.save_model(token_dir)

['../LawBERTarg/vocab.json', '../LawBERTarg/merges.txt']

## Testing tokenizer

In [7]:
# Rebuild the tokenizer from the vocabulary and merges files on disk,
# replacing the in-memory instance with one loaded from those files.
vocab_path = '../LawBERTarg/vocab.json'
merges_path = '../LawBERTarg/merges.txt'
tokenizer = ByteLevelBPETokenizer(vocab=vocab_path, merges=merges_path, lowercase=False)

Example encodings:

In [8]:
# Encode a short lowercase phrase and show the resulting token strings.
short_encoding = tokenizer.encode("la ley prohibe a")
print(short_encoding.tokens)

['la', 'Ġley', 'Ġproh', 'ibe', 'Ġa']


In [9]:
# Encode a phrase containing accented characters and show its token strings.
accented_encoding = tokenizer.encode("Aquel que dañe obra pública u obrare en")
print(accented_encoding.tokens)

['A', 'quel', 'Ġque', 'Ġda', 'Ã±e', 'Ġobra', 'ĠpÃºblica', 'Ġu', 'Ġobrare', 'Ġen']


Add start and end tokens:

In [10]:
# Look up the ids of the two boundary tokens in the loaded vocabulary.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))

# The first pair ends up after the sequence and the second before it
# (see the encoded example below: '<s>' ... '</s>').
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [11]:
# Encode a longer sentence to check that the start/end tokens are now added.
wrapped_encoding = tokenizer.encode("legalidad, gestión y auditoría de toda la actividad de la administración pública centralizada y descentralizada")
print(wrapped_encoding.tokens)

['<s>', 'leg', 'alidad', ',', 'ĠgestiÃ³n', 'Ġy', 'ĠauditorÃŃa', 'Ġde', 'Ġtoda', 'Ġla', 'Ġactividad', 'Ġde', 'Ġla', 'ĠadministraciÃ³n', 'ĠpÃºblica', 'Ġcentralizada', 'Ġy', 'Ġdescentralizada', '</s>']
