In [1]:
from transformers import AutoTokenizer
import torch
import torch.neuron
from compiled_tokenizer import CompiledTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Get original tokenizer

In [2]:
# Load the reference Hugging Face tokenizer and run it once on a sample sentence.
# `tokenizer_output` is the baseline that the compiled variants are compared with later.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

sample_input = 'This is an example sentence. We use it to compare tokenizers and their compiled variants.'

tokenizer_output = tokenizer(sample_input)
tokenizer_output

{'input_ids': [0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442, 47, 69101, 47, 1098, 52825, 7, 136, 2363, 375, 5974, 297, 20117, 7, 5, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [3]:
# Show which bound method handles `tokenizer(...)` on the original tokenizer.
tokenizer.__call__

<bound method PreTrainedTokenizerBase.__call__ of BertTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})>

In [4]:
# Write the original tokenizer's files to ./hf_tokenizer; the call returns the saved paths.
tokenizer.save_pretrained('./hf_tokenizer')

('./hf_tokenizer/tokenizer_config.json',
 './hf_tokenizer/special_tokens_map.json',
 './hf_tokenizer/unigram.json',
 './hf_tokenizer/added_tokens.json',
 './hf_tokenizer/tokenizer.json')

### Test `__init__` + inference

In [5]:
# Sequence length handed to the compiled tokenizer as `max_length`.
MAX_LENGTH = 50

# Build the wrapper through the constructor, then call it once with the default
# return type and once asking for PyTorch tensors.
compiled_tokenizer_1 = CompiledTokenizer(tokenizer, max_length=MAX_LENGTH)
compiled_tokenizer_1_output = compiled_tokenizer_1(sample_input)
compiled_tokenizer_1_torch_output = compiled_tokenizer_1(sample_input, return_tensors='pt')

In [6]:
# Inspect the constructor-built wrapper: class, stored tokenization settings,
# `compiled` flag and the bound `__call__`.
(
    compiled_tokenizer_1.__class__,
    compiled_tokenizer_1.tokenization_settings,
    compiled_tokenizer_1.compiled,
    compiled_tokenizer_1.__call__,
)

(compiled_tokenizer.CompiledTokenizer,
 {'padding': 'max_length',
  'truncation': True,
  'add_special_tokens': True,
  'max_length': 50},
 True,
 <bound method CompiledTokenizer.__call__ of <compiled_tokenizer.CompiledTokenizer object at 0x7ff0372d6130>>)

### Test `.from_tokenizer` + inference

In [7]:
# Same sequence length as the constructor-based test, restated so this cell runs on its own.
MAX_LENGTH = 50

# Build the wrapper through the `from_tokenizer` factory and run the same two calls
# (default return type, then PyTorch tensors).
compiled_tokenizer_2 = CompiledTokenizer.from_tokenizer(tokenizer, max_length=MAX_LENGTH)
compiled_tokenizer_2_output = compiled_tokenizer_2(sample_input)
compiled_tokenizer_2_torch_output = compiled_tokenizer_2(sample_input, return_tensors='pt')

In [8]:
# Inspect the factory-built wrapper: class, stored tokenization settings,
# `compiled` flag and the bound `__call__`.
(
    compiled_tokenizer_2.__class__,
    compiled_tokenizer_2.tokenization_settings,
    compiled_tokenizer_2.compiled,
    compiled_tokenizer_2.__call__,
)

(compiled_tokenizer.CompiledTokenizer,
 {'padding': 'max_length',
  'truncation': True,
  'add_special_tokens': True,
  'max_length': 50},
 True,
 <bound method CompiledTokenizer.__call__ of <compiled_tokenizer.CompiledTokenizer object at 0x7ff0372d6dc0>>)

In [9]:
# Show the original tokenizer's output next to the outputs of both compiled
# wrappers (default return type first, then the PyTorch-tensor variants).
(
    tokenizer_output,
    compiled_tokenizer_1_output,
    compiled_tokenizer_2_output,
    compiled_tokenizer_1_torch_output,
    compiled_tokenizer_2_torch_output,
)

({'input_ids': [0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442, 47, 69101, 47, 1098, 52825, 7, 136, 2363, 375, 5974, 297, 20117, 7, 5, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442, 47, 69101, 47, 1098, 52825, 7, 136, 2363, 375, 5974, 297, 20117, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 {'input_ids': [0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442, 47, 69101, 47, 1098, 52825, 7, 136, 

### Test `save_pretrained` + `from_pretrained`

In [10]:
# Export the constructor-built wrapper to the local 'compiled_tokenizer' directory.
compiled_tokenizer_1.save_pretrained('compiled_tokenizer')

In [11]:
# Reload the wrapper from the directory written by `save_pretrained`, then inspect
# the same attributes as for the freshly built instances.
compiled_tokenizer_reloaded = CompiledTokenizer.from_pretrained('compiled_tokenizer')
(
    compiled_tokenizer_reloaded.__class__,
    compiled_tokenizer_reloaded.tokenization_settings,
    compiled_tokenizer_reloaded.compiled,
    compiled_tokenizer_reloaded.__call__,
)

(compiled_tokenizer.CompiledTokenizer,
 {'padding': 'max_length',
  'truncation': True,
  'add_special_tokens': True,
  'max_length': 50},
 True,
 <bound method CompiledTokenizer.__call__ of <compiled_tokenizer.CompiledTokenizer object at 0x7ff037821730>>)

In [12]:
# Run the reloaded wrapper on the same sample, with the default return type and as PyTorch tensors.
compiled_tokenizer_reloaded_output = compiled_tokenizer_reloaded(sample_input)
compiled_tokenizer_reloaded_torch_output = compiled_tokenizer_reloaded(sample_input, return_tensors='pt')

In [13]:
# The output of the reloaded wrapper should equal that of the instance it was saved from.
compiled_tokenizer_1_output == compiled_tokenizer_reloaded_output

True

In [14]:
# Check, field by field, that the tensor outputs agree across all three ways of
# obtaining a CompiledTokenizer. The comparisons are asserted: a bare expression
# inside a `for` loop is neither displayed nor checked, so without `assert` this
# cell could never fail.
for token_type in compiled_tokenizer_1_torch_output:
    # Reconcile `__init__` with `.from_tokenizer`.
    assert bool(
        torch.all(
            compiled_tokenizer_1_torch_output[token_type].eq(compiled_tokenizer_2_torch_output[token_type])
        )
    ), f"{token_type}: __init__ and from_tokenizer outputs differ"
    # Reconcile across export (`save_pretrained`) and re-import (`from_pretrained`).
    assert bool(
        torch.all(
            compiled_tokenizer_2_torch_output[token_type].eq(compiled_tokenizer_reloaded_torch_output[token_type])
        )
    ), f"{token_type}: output changed after save_pretrained/from_pretrained"

### Test delegated methods

In [15]:
token_ids = compiled_tokenizer_1.encode_plus(sample_input)
encodings = compiled_tokenizer_1.encode(sample_input)
sample_input_reconsctructed = compiled_tokenizer_1.decode(encodings)
token_type_ids = compiled_tokenizer_1.create_token_type_ids_from_sequences(token_ids_0=token_ids['input_ids'])
tokens_converted_to_string = compiled_tokenizer_1.convert_tokens_to_string(token_ids)
cleaned_string = compiled_tokenizer_1.clean_up_tokenization(" 're 've some word ! ?")

In [16]:
(
    token_ids == compiled_tokenizer_1.tokenizer.encode_plus(sample_input),
    encodings == compiled_tokenizer_1.tokenizer.encode(sample_input),
    sample_input_reconsctructed == compiled_tokenizer_1.tokenizer.decode(encodings),
    token_type_ids == compiled_tokenizer_1.tokenizer.create_token_type_ids_from_sequences(token_ids_0=token_ids['input_ids']),
    tokens_converted_to_string == compiled_tokenizer_1.tokenizer.convert_tokens_to_string(token_ids),
    cleaned_string == compiled_tokenizer_1.tokenizer.clean_up_tokenization(" 're 've some word ! ?")
)


(True, True, True, True, True, True)

In [17]:
# Display the mapping returned by the delegated `encode_plus` call.
token_ids

{'input_ids': [0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442, 47, 69101, 47, 1098, 52825, 7, 136, 2363, 375, 5974, 297, 20117, 7, 5, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
# Display the first ten ids returned by the delegated `encode` call.
encodings[:10]

[0, 3293, 83, 142, 27781, 149357, 5, 1401, 4527, 442]

In [19]:
# Display the text obtained by decoding `encodings` with the delegated `decode` call.
sample_input_reconsctructed

'<s> This is an example sentence. We use it to compare tokenizers and their compiled variants.</s>'

In [20]:
# Display the first ten entries returned by `create_token_type_ids_from_sequences`.
token_type_ids[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
# Display the string returned by the delegated `convert_tokens_to_string` call.
tokens_converted_to_string

'input_idstoken_type_idsattention_mask'

In [22]:
# Display the result of the delegated `clean_up_tokenization` call.
cleaned_string

"'re've some word!?"