In [35]:
import logging

import tensorflow_datasets as tfds
import tensorflow as tf

import os

# Import tf_text to load the ops used by the tokenizer saved model
import tensorflow_text as text
import re

In [4]:
# Only let TensorFlow log records at ERROR level or above through (hides warnings).
tf_logger = logging.getLogger('tensorflow')
tf_logger.setLevel(logging.ERROR)

In [5]:
# Load the Japanese/English "OpenSubtitles" subset of the OPUS corpus via TFDS.
config = tfds.translate.opus.OpusConfig(
    version=tfds.core.Version('0.1.0'),
    language_pair=("ja", "en"),
    subsets=["OpenSubtitles"]
)
# The prepared dataset lives in ../datasets/opus relative to the directory the
# notebook is run from; os.path.join keeps the separators platform-correct.
data_dir = os.path.join(os.getcwd(), "..", "datasets", "opus")
builder = tfds.builder("opus", config=config, data_dir=data_dir)
builder.download_and_prepare()
# Only the 'train' split is requested: the first 80% is used for training and
# the remaining 20% is held out as test data.
train_examples, test_examples = builder.as_dataset(split=['train[:80%]', 'train[80%:]'])



In [6]:
# Show the first three sentence pairs: the Japanese lines, a blank line,
# then the matching English lines.
for batch in train_examples.batch(3).take(1):
    ja_examples = batch['ja']
    for raw_ja in ja_examples.numpy():
        print(str(raw_ja, 'utf-8'))

    print()
    en_examples = batch['en']
    for raw_en in en_examples.numpy():
        print(str(raw_en, 'utf-8'))

- シールドは機能します

苦しんでいるのよ

こんな事を読んだけど...


-Sir, the shields are functional.

He's suffering, man.

I read this thing the other day about how...



# Tokenizer

In [7]:
# Split the paired training set into one stream of raw sentences per language;
# these feed the vocabulary learner.
train_en = train_examples.map(lambda example: example['en'])
train_ja = train_examples.map(lambda example: example['ja'])

In [8]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow_text as text

In [9]:
# Keyword arguments forwarded to `text.BertTokenizer`.
bert_tokenizer_params = {"lower_case": True}
# Special tokens that must be present in every learned vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Arguments for `bert_vocab.bert_vocab_from_dataset` (English).
bert_vocab_args_en = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

# Japanese uses the same settings but a four times larger target vocabulary
# (presumably because its much larger character inventory needs more word
# pieces -- TODO confirm the size empirically). `learn_params` gets its own
# dict so the two configurations never share mutable state.
bert_vocab_args_ja = {
    **bert_vocab_args_en,
    "vocab_size": 32000,
    "learn_params": {},
}

In [71]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.prefetch(2),
    **bert_vocab_args_en
)

Wall time: 11min 39s


In [75]:
%%time
ja_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ja.prefetch(2),
    **bert_vocab_args_ja
)

Wall time: 31min 19s


In [56]:
# Peek at four windows of the Japanese vocabulary: the head (reserved tokens
# and punctuation), two windows further in, and the tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(ja_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '%', "'", '(', ')']
['ほ', 'ま', 'み', 'む', 'め', 'も', 'ゃ', 'や', 'ゆ', 'ょ']
['##の', '##よ', '##は', '##と', 'って', '##ね', '##に', '##た', '##な', '##を']
['##骨', '##高', '##髪', '##魔', '##鳥', '##鳴', '##鹿', '##麻', '##黒', '##黙']


In [24]:
# Peek at four windows of the English vocabulary: the head (reserved tokens
# and punctuation), two windows further in, and the tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])
# NOTE: the English vocabulary also contains Japanese/CJK, Cyrillic and Greek
# pieces (see the output). Presumably the 'en' side of the corpus contains
# some non-English text -- TODO confirm by inspecting the raw examples.

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', "'"]
['ο', 'σ', 'τ', 'а', 'в', 'г', 'о', 'р', 'т', 'у']
['entire', 'officer', 'walter', 'station', 'straight', 'wall', 'using', '##el', 'nick', 'mark']
['##締', '##胸', '##苏', '##茅', '##裂', '##言', '##迎', '##里', '##\ue0e1', '##�']


In [10]:
def write_vocab_file(filepath, vocab):
  """Write `vocab` to `filepath` as UTF-8 text, one token per line.

  Args:
    filepath: Destination path; an existing file is overwritten.
    vocab: Iterable of tokens; each is written via `str()` followed by a
      newline.
  """
  with open(filepath, mode='w', encoding="utf-8") as out_file:
    out_file.writelines(str(entry) + "\n" for entry in vocab)
# Persist both learned vocabularies, one token per line.
# NOTE: this needs `ja_vocab` and `en_vocab` to already exist in the kernel
# (they are produced by the vocabulary-learning cells); run on a fresh kernel
# without them, it raises NameError.
write_vocab_file('ja_vocab.txt', ja_vocab)
write_vocab_file('en_vocab.txt', en_vocab)

NameError: name 'ja_vocab' is not defined

In [25]:
def load_vocab(filepath):
    """Read a UTF-8 vocabulary file that holds one token per line.

    Args:
        filepath: Path of the vocabulary file to read.

    Returns:
        A list of tokens in file order, with line terminators removed.
    """
    with open(filepath, 'r', encoding="utf-8") as f:
        # Strip only an actual trailing newline. Blindly dropping the last
        # character would truncate the final token when the file does not end
        # with a newline. str.splitlines() is avoided on purpose: it also
        # splits on Unicode separators that could be legitimate tokens.
        return [line[:-1] if line.endswith("\n") else line for line in f]
# Reload both vocabularies from the files on disk, so the following cells do
# not need the vocabulary-learning cells to have been run in this kernel.
en_vocab = load_vocab("en_vocab.txt")
ja_vocab = load_vocab("ja_vocab.txt")

In [12]:
# Build one tokenizer per language from the saved vocabulary files, passing
# the same `bert_tokenizer_params` that were used when learning the vocabularies.
ja_tokenizer = text.BertTokenizer('ja_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)

In [14]:
# Re-fetch the first three sentence pairs. The Japanese side is decoded for
# display; the English side is printed as raw bytes so the trailing newline
# in each sentence is visible. `en_examples` is reused by the cells below.
for batch in train_examples.batch(3).take(1):
    ja_examples = batch['ja']
    for raw_ja in ja_examples.numpy():
        print(str(raw_ja, 'utf-8'))

    print()
    en_examples = batch['en']
    for en_sentence in en_examples:
        print(en_sentence.numpy())

- シールドは機能します

苦しんでいるのよ

こんな事を読んだけど...


b'-Sir, the shields are functional.\n'
b"He's suffering, man.\n"
b'I read this thing the other day about how...\n'


In [15]:
# Tokenize the examples -> (batch, word, word-piece), then merge the word and
# word-piece axes -> (batch, tokens).
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)

# One list of token ids per sentence.
for token_ids in token_batch.to_list():
  print(token_ids)

[15, 353, 14, 188, 6465, 212, 5601, 509, 16]
[200, 9, 57, 2851, 14, 283, 16]
[47, 711, 199, 321, 188, 355, 348, 234, 242, 16, 16, 16]


In [26]:
# Map every token id to its vocabulary entry ...
txt_tokens = tf.gather(en_vocab, token_batch)
# ... and glue each row together with single spaces.
tf.strings.reduce_join(txt_tokens, axis=-1, separator=' ')

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'- sir , the shields are function ##al .',
       b"he ' s suffering , man .",
       b'i read this thing the other day about how . . .'], dtype=object)>

In [27]:
# Let the tokenizer merge word pieces back into whole words, then join each
# row into one space-separated string.
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, axis=-1, separator=' ')

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'- sir , the shields are functional .',
       b"he ' s suffering , man .",
       b'i read this thing the other day about how . . .'], dtype=object)>

In [28]:
# Positions of the "[START]" and "[END]" markers within `reserved_tokens`,
# used below as the token ids of those markers.
_reserved = tf.constant(reserved_tokens)
START = tf.argmax(tf.equal(_reserved, "[START]"))
END = tf.argmax(tf.equal(_reserved, "[END]"))

def add_start_end(ragged):
  """Wrap every row of `ragged` in the START and END token ids.

  Args:
    ragged: Batch of token-id rows; must provide `bounding_shape()`
      (e.g. a `tf.RaggedTensor` shaped (batch, tokens)).

  Returns:
    The batch with `START` prepended and `END` appended to each row.
  """
  # One extra column per row on each side.
  column_shape = [ragged.bounding_shape()[0], 1]
  start_column = tf.fill(column_shape, START)
  end_column = tf.fill(column_shape, END)
  return tf.concat([start_column, ragged, end_column], axis=1)

In [29]:
# Round-trip check: add the start/end markers, detokenize, and join each row
# into a string so the markers show up in the text.
framed_batch = add_start_end(token_batch)
words = en_tokenizer.detokenize(framed_batch)
tf.strings.reduce_join(words, axis=-1, separator=' ')

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] - sir , the shields are functional . [END]',
       b"[START] he ' s suffering , man . [END]",
       b'[START] i read this thing the other day about how . . . [END]'],
      dtype=object)>

In [31]:
def cleanup_text(reserved_tokens, token_txt):
  """Remove reserved tokens (except "[UNK]") and join each row into a string.

  Args:
    reserved_tokens: Token strings to strip from the text; "[UNK]" is kept.
    token_txt: Batch of rows of token/word strings.

  Returns:
    One space-joined string per row of `token_txt`.
  """
  # A single alternation of the escaped tokens; used with a *full* match, so
  # only cells that are exactly a reserved token get dropped.
  droppable_re = "|".join(
      re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")
  is_reserved = tf.strings.regex_full_match(token_txt, droppable_re)

  kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)
  return tf.strings.reduce_join(kept, separator=' ', axis=-1)

In [32]:
# Show the raw English example batch (byte strings) that is tokenized below.
en_examples.numpy()

array([b'-Sir, the shields are functional.\n', b"He's suffering, man.\n",
       b'I read this thing the other day about how...\n'], dtype=object)

In [33]:
# Tokenize into (batch, word, word-piece), flatten to (batch, tokens), then
# detokenize back into words and display them.
word_pieces = en_tokenizer.tokenize(en_examples)
token_batch = word_pieces.merge_dims(-2, -1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'-', b'sir', b',', b'the', b'shields', b'are', b'functional', b'.'],
 [b'he', b"'", b's', b'suffering', b',', b'man', b'.'],
 [b'i', b'read', b'this', b'thing', b'the', b'other', b'day', b'about',
  b'how', b'.', b'.', b'.']                                            ]>

In [34]:
# Run the detokenized words through cleanup_text and show the joined strings.
cleanup_text(reserved_tokens, words).numpy()

array([b'- sir , the shields are functional .',
       b"he ' s suffering , man .",
       b'i read this thing the other day about how . . .'], dtype=object)

In [38]:
class CustomTokenizer(tf.Module):
  """Exportable WordPiece tokenizer for a single language.

  Wraps a `text.BertTokenizer` built from a vocabulary file and exposes
  `tokenize`, `detokenize`, `lookup` and a few getters as `tf.function`s.
  Each of them is traced in `__init__` for the input signatures listed there
  ("signatures for export").

  Relies on the notebook-level helpers `load_vocab`, `add_start_end` and
  `cleanup_text`.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the exported signatures.

    Args:
      reserved_tokens: List of reserved token strings; `detokenize` strips
        them (via `cleanup_text`) and `get_reserved_tokens` returns them.
      vocab_path: Path of the vocabulary file, one token per line.
    """
    # Initialise tf.Module itself so that `name` / `name_scope` are defined.
    super().__init__()
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Keep the vocabulary as a variable so `lookup` can gather from it.
    vocab = load_vocab(vocab_path)
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for int64 inputs of
    # shape [batch, tokens], both as dense `Tensors` and as `RaggedTensors`.
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Tokenize a batch of strings into token ids wrapped in START/END."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Turn token ids back into one cleaned-up string per row."""
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Map token ids to their vocabulary strings (no word-piece merging)."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file asset this tokenizer was built from."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

In [39]:
# Bundle both language tokenizers under one module so they can be saved
# together; they share the reserved tokens but use separate vocabulary files.
tokenizers = tf.Module()
tokenizers.ja = CustomTokenizer(reserved_tokens, 'ja_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')

In [40]:
# Export the bundled tokenizers as a SavedModel under `model_name`.
model_name = 'model/tokenizer'
tf.saved_model.save(tokenizers, model_name)

In [41]:
# Sanity check: load the SavedModel back and read the English vocabulary size.
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

7729

In [42]:
# Tokenize a sample sentence with the reloaded English tokenizer and show
# the resulting token ids.
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2,  444,  855, 6287,  368, 1017, 3531,    4,    3]],
      dtype=int64)

In [43]:
# Look up the vocabulary string behind each token id (word pieces are shown
# as-is, not merged).
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hello', b'ten', b'##so', b'##r', b'##f', b'##low', b'!',
  b'[END]']]>

In [44]:
# Detokenize the ids again and print the first (only) sentence as text.
round_trip = reloaded_tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hello tensorflow !
