In [73]:
import tensorflow as tf
import tensorflow_text as text

## WordPieceTokenizer Demo

In [19]:
# !pip install -q -U tensorflow-text

In [20]:
# Prepare the vocabulary.
# Skip this part if you already have a vocab lookup table or want to use a pre-built vocab file.
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [21]:
# Options for `text.BertTokenizer`; reused later when the BertTokenizer itself is built.
bert_tokenizer_params = {"lower_case": True}

# Special tokens that must be present in the learned vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments for `bert_vocab.bert_vocab_from_dataset`.
bert_vocab_args = {
    # Target vocabulary size.
    "vocab_size": 1000,
    # Reserved tokens that must be included in the vocabulary.
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`.
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`.
    "learn_params": {},
}

In [22]:
# Read the whole corpus into memory as a single string.
# The encoding is passed explicitly because the text contains non-ASCII characters
# (the learned vocab includes Arabic and CJK pieces) and the platform default
# encoding is not always UTF-8. Assumes the file is UTF-8 encoded.
with open('wiki_usa.txt', encoding='utf-8') as f:
    str_wiki_usa = f.read()

In [23]:
# Split the corpus on whitespace into a flat list of word strings
# (punctuation stays attached to its word, e.g. 'Wikipedia,').
str_list_usa = str_wiki_usa.split()

In [24]:
# Sanity check: show the first 10 words of the split corpus.
print(str_list_usa[:10])

['Page', 'semi-protected', 'United', 'States', 'From', 'Wikipedia,', 'the', 'free', 'encyclopedia', 'Jump']


In [25]:
# Wrap the word list in a tf.data.Dataset with one scalar string element per word.
tmp = tf.data.Dataset.from_tensor_slices(str_list_usa)

2022-01-10 16:53:34.751568: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [35]:
# Peek at the first 10 (index, element) pairs of the dataset.
# `take(10)` bounds the iteration directly, so no manual counter/break is needed.
for item in tmp.enumerate().take(10):
    print(item)

(<tf.Tensor: shape=(), dtype=int64, numpy=0>, <tf.Tensor: shape=(), dtype=string, numpy=b'Page'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=1>, <tf.Tensor: shape=(), dtype=string, numpy=b'semi-protected'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=2>, <tf.Tensor: shape=(), dtype=string, numpy=b'United'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=3>, <tf.Tensor: shape=(), dtype=string, numpy=b'States'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=4>, <tf.Tensor: shape=(), dtype=string, numpy=b'From'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=5>, <tf.Tensor: shape=(), dtype=string, numpy=b'Wikipedia,'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=6>, <tf.Tensor: shape=(), dtype=string, numpy=b'the'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=7>, <tf.Tensor: shape=(), dtype=string, numpy=b'free'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=8>, <tf.Tensor: shape=(), dtype=string, numpy=b'encyclopedia'>)
(<tf.Tensor: shape=(), dtype=int64, numpy=9>, <tf.Tensor: shape=(), dtype=string, numpy=b'J

In [37]:
%%time
# NOTE(review): the commented-out `pt_vocab` call below references `train_pt`, which is
# not defined in any visible cell — presumably a leftover; confirm and delete.
# pt_vocab = bert_vocab.bert_vocab_from_dataset(
#     train_pt.batch(1000).prefetch(2),
#     **bert_vocab_args
# )

# Build the vocabulary from the word dataset, fed in batches of 1000 with 2 batches
# prefetched; the remaining options come from `bert_vocab_args`.
en_vocab = bert_vocab.bert_vocab_from_dataset(
    tmp.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 9.73 s, sys: 24.9 ms, total: 9.76 s
Wall time: 9.71 s


In [39]:
# Inspect windows of the learned vocabulary: the head, a window in the middle,
# a window starting at index 1000, and the tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['##s', 'in', 'states', 'united', 'retrieved', 'to', '##ing', 'american', '##e', '##ed']
[]
['##ع', '##ل', '##ي', '##–', '##—', '##•', '##′', '##−', '##中', '##文']


In [40]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to a text file, one token per line.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.

    The file is always written as UTF-8 so that non-ASCII tokens are stored
    the same way on every platform instead of depending on the locale's
    default encoding.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for token in vocab:
            print(token, file=f)

In [41]:
# Persist the learned vocabulary to 'en_vocab.txt' (one token per line) so the
# tokenizers below can load it by file path.
write_vocab_file('en_vocab.txt', en_vocab)

In [51]:
# WordPiece tokenizer configured to return the sub-word pieces as strings.
en_wordpiece_tokenzier = text.WordpieceTokenizer(
    # Vocabulary file written by `write_vocab_file`.
    vocab_lookup_table="en_vocab.txt",
    unknown_token="[UNK]",
    suffix_indicator="##",
    split_unknown_characters=False,
    max_bytes_per_word=100,
    max_chars_per_token=None,
    token_out_type=tf.string,
)

In [52]:
# The sentence is whitespace-split into words before being handed to the tokenizer;
# each word comes back as its own list of pieces.
words = "hello world, my name is frank".split()
en_wordpiece_tokenzier.tokenize(words)

<tf.RaggedTensor [[b'h', b'##ell', b'##o'], [b'world', b'##,'], [b'm', b'##y'], [b'n', b'##ame'], [b'is'], [b'f', b'##ra', b'##n', b'##k']]>

In [53]:
# Rebuild the WordPiece tokenizer so that it returns int32 vocabulary ids
# instead of strings; this rebinds the name used for the string tokenizer.
en_wordpiece_tokenzier = text.WordpieceTokenizer(
    # Vocabulary file written by `write_vocab_file`.
    vocab_lookup_table="en_vocab.txt",
    unknown_token="[UNK]",
    suffix_indicator="##",
    split_unknown_characters=False,
    max_bytes_per_word=100,
    max_chars_per_token=None,
    token_out_type=tf.int32,
)

In [61]:
# The sentence is whitespace-split into words before being handed to the tokenizer;
# each word comes back as its own list of ids.
words = "hello world, this is a tokenizer,".split()
en_wordpiece_tokenzier.tokenize(words)

<tf.RaggedTensor [[43, 640, 156], [122, 926], [320], [116], [36], [105, 552, 121, 174, 364, 127, 926]]>

In [62]:
# Flatten the per-word id lists (copied from the tokenizer output) into a single
# flat sequence of sub-word ids.
ids_per_word = [
    [43, 640, 156],
    [122, 926],
    [320],
    [116],
    [36],
    [105, 552, 121, 174, 364, 127, 926],
]
combined = [token_id for word_ids in ids_per_word for token_id in word_ids]
print(combined)

[43, 640, 156, 122, 926, 320, 116, 36, 105, 552, 121, 174, 364, 127, 926]


In [63]:
# Map the flat id sequence back to words; it is wrapped in a list so the
# tokenizer receives a single-row batch.
en_wordpiece_tokenzier.detokenize([combined])

<tf.RaggedTensor [[b'hello', b'world,', b'this', b'is', b'a', b'tokenizer,']]>

## BertTokenizer Demo

In [64]:
# BertTokenizer can reuse the same vocab, but the input sentence no longer needs to be pre-tokenized into words.

In [66]:
# Build a BertTokenizer from the same vocab file, forwarding the options
# in `bert_tokenizer_params` (lower_case=True).
en_bert_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

In [67]:
# Tokenize a raw, unsplit sentence; in the recorded output the commas come back
# as their own single-id tokens.
en_bert_tokenizer.tokenize("hello world, this is a tokenizer,")

<tf.RaggedTensor [[[43, 640, 156], [122], [15], [320], [116], [36], [105, 552, 121, 174, 364, 127], [15]]]>

In [70]:
# Flatten the per-token id lists (copied from the BertTokenizer output) into a
# single flat sequence of sub-word ids.
ids_per_token = [
    [43, 640, 156],
    [122],
    [15],
    [320],
    [116],
    [36],
    [105, 552, 121, 174, 364, 127],
    [15],
]
combined = [token_id for token_ids in ids_per_token for token_id in token_ids]
print(combined)

[43, 640, 156, 122, 15, 320, 116, 36, 105, 552, 121, 174, 364, 127, 15]


In [72]:
# Map the flat id sequence back to words; it is wrapped in a list so the
# tokenizer receives a single-row batch.
en_bert_tokenizer.detokenize([combined])

<tf.RaggedTensor [[b'hello', b'world', b',', b'this', b'is', b'a', b'tokenizer', b',']]>