In [None]:
# Mount Google Drive into the Colab VM so later cells can read and write
# files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base directory on the mounted Drive; later cells prefix it to the vocab and
# saved-model names. Paths are built by plain string concatenation, so the
# trailing slash is required.
data_save_path = r'/content/drive/My Drive/data/'

In [None]:
# Install the text-processing and dataset packages imported in the next cell.
# NOTE(review): versions are not pinned, so a re-run may pull different
# releases — consider pinning for reproducibility.
!pip install -q -U tensorflow-text
!pip install -q tensorflow_datasets

[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
[?25h

In [None]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [None]:
# Restrict TensorFlow's logger to messages at ERROR level.
tf.get_logger().setLevel('ERROR')
# Current working directory of the kernel; not referenced by the cells shown here.
pwd = pathlib.Path.cwd()

In [None]:
# Load the Russian -> English TED dataset as (ru, en) pairs plus its metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/ru_to_en',
    with_info=True,
    as_supervised=True,
)
# Keep handles to the two splits used by this notebook.
train_examples = examples['train']
val_examples = examples['validation']

[1mDownloading and preparing dataset ted_hrlr_translate/ru_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incomplete91RZ13/ted_hrlr_translate-train.tfrecord


  0%|          | 0/208106 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incomplete91RZ13/ted_hrlr_translate-validation.tfrecord


  0%|          | 0/4805 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0.incomplete91RZ13/ted_hrlr_translate-test.tfrecord


  0%|          | 0/5476 [00:00<?, ? examples/s]

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/ru_to_en/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Show the first training pair, decoding the raw bytes for display.
for ru, en in train_examples.take(1):
    for label, sentence in (("Russian: ", ru), ("English:   ", en)):
        print(label, sentence.numpy().decode('utf-8'))

Russian:  к : успех , перемены возможны только с оружием в руках .
English:    c : success , the change is only coming through the barrel of the gun .


In [None]:
# Project the (ru, en) pairs onto single-language datasets for vocab learning.
train_ru = train_examples.map(lambda ru, _en: ru)
train_en = train_examples.map(lambda _ru, en: en)

In [None]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [None]:
# Options shared by vocabulary learning and by `text.BertTokenizer`.
bert_tokenizer_params = {"lower_case": True}
# Special tokens that must be present in every learned vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments forwarded to `bert_vocab.bert_vocab_from_dataset`.
bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
%%time
ru_vocab = bert_vocab.bert_vocab_from_dataset(train_ru.batch(1000).prefetch(2), **bert_vocab_args)

CPU times: user 15min 48s, sys: 13.7 s, total: 16min 1s
Wall time: 16min 5s


In [None]:
# Print four ten-token windows of the vocabulary: head, two interior, tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(ru_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'і', '՛']
['трудно', 'хотела', 'далеко', 'качестве', 'мою', '##3', '##де', '##ила', 'планеты', 'большие']
['##’', '##“', '##”', '##„', '##•', '##′', '##⁄', '##∇', '##♪', '##♫']


In [None]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to a text file, one token per line.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.
    """
    # Pin the encoding and line ending: the vocabularies contain Cyrillic and
    # symbol tokens, and the platform-default codec is not guaranteed to be
    # UTF-8.
    with open(filepath, 'w', encoding='utf-8', newline='\n') as f:
        for token in vocab:
            print(token, file=f)

# Persist the learned Russian vocabulary under the Drive data directory.
write_vocab_file(data_save_path+'ru_vocab.txt', ru_vocab)

In [None]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(train_en.batch(1000).prefetch(2), **bert_vocab_args)

CPU times: user 4min 10s, sys: 10.9 s, total: 4min 21s
Wall time: 4min 42s


In [None]:
# Print four ten-token windows of the vocabulary: head, two interior, tail.
for window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
    print(en_vocab[window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['##s', 'have', 'but', 'what', 'on', 'do', 'with', 'can', 'there', 'about']
['revolution', '200', 'basic', 'potential', 'english', 'led', 'message', 'perfect', '##ce', 'nine']
['##–', '##—', '##‘', '##’', '##“', '##”', '##•', '##∇', '##♪', '##♫']


In [None]:
# Persist the learned English vocabulary under the Drive data directory.
write_vocab_file(data_save_path+'en_vocab.txt', en_vocab)

In [None]:
# Build one tokenizer per language from the saved vocab files, passing the same
# options (`bert_tokenizer_params`) that were used when learning the vocabularies.
ru_tokenizer = text.BertTokenizer(data_save_path+'ru_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer(data_save_path+'en_vocab.txt', **bert_tokenizer_params)

In [None]:
# Take one batch of three sentence pairs and print the raw English byte strings.
# `en_examples` stays bound after the loop and is reused by later cells.
for ru_examples, en_examples in train_examples.batch(3).take(1):
    for ex in en_examples:
        print(ex.numpy())

b'c : success , the change is only coming through the barrel of the gun .'
b'the documentation and the hands-on teaching methodology is also open-source and released as the creative commons .'
b"( video ) didi pickles : it 's four o'clock in the morning ."


In [None]:
# Tokenize to (batch, word, word-piece), then flatten the last two axes so
# each example becomes a single sequence of token ids: (batch, tokens).
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2, -1)

# Show the ids of each example on its own line.
for ex in token_batch.to_list():
    print(ex)

[41, 28, 1103, 14, 84, 243, 93, 200, 389, 218, 84, 6405, 87, 84, 2473, 16]
[84, 3914, 464, 85, 84, 702, 15, 104, 1495, 2346, 2024, 93, 187, 435, 15, 942, 85, 2533, 111, 84, 1068, 5725, 16]
[10, 400, 11, 168, 379, 1026, 1125, 28, 90, 9, 57, 316, 53, 9, 2501, 89, 84, 813, 16]


In [None]:
# Lookup each token id in the vocabulary.
# The result keeps word pieces separate (e.g. 'document ##ation' in the output below).
txt_tokens = tf.gather(en_vocab, token_batch)
# Join with spaces.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'c : success , the change is only coming through the barrel of the gun .',
       b'the document ##ation and the hands - on teaching method ##ology is also open - source and released as the creative commons .',
       b"( video ) did ##i pick ##les : it ' s four o ' clock in the morning ."],
      dtype=object)>

In [None]:
# Convert the token ids back to words, then join each example's words with spaces.
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'c : success , the change is only coming through the barrel of the gun .',
       b'the documentation and the hands - on teaching methodology is also open - source and released as the creative commons .',
       b"( video ) didi pickles : it ' s four o ' clock in the morning ."],
      dtype=object)>

In [None]:
# Ids of the sequence markers, i.e. their positions within `reserved_tokens`.
# They are kept as `tf.argmax` results so they remain tensors.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
    """Return `ragged` with a START id prepended and an END id appended to every row."""
    # One marker per row: build [rows, 1] columns and concatenate along axis 1.
    marker_shape = [ragged.bounding_shape()[0], 1]
    return tf.concat(
        [tf.fill(marker_shape, START), ragged, tf.fill(marker_shape, END)],
        axis=1)

In [None]:
# Detokenize after adding the START/END ids to confirm the markers appear
# around each sentence.
words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] c : success , the change is only coming through the barrel of the gun . [END]',
       b'[START] the documentation and the hands - on teaching methodology is also open - source and released as the creative commons . [END]',
       b"[START] ( video ) didi pickles : it ' s four o ' clock in the morning . [END]"],
      dtype=object)>

In [None]:
def cleanup_text(reserved_tokens, token_txt):
    """Remove reserved tokens (all except "[UNK]") and join each row into one string.

    Args:
        reserved_tokens: Sequence of reserved token strings.
        token_txt: String tokens, passed to `tf.ragged.boolean_mask`.

    Returns:
        The remaining tokens of each row joined with single spaces.
    """
    # "[UNK]" is deliberately kept in the output.
    droppable = [tok for tok in reserved_tokens if tok != "[UNK]"]
    # Alternation of the escaped tokens; a cell is dropped only on a full match.
    pattern = "|".join([re.escape(tok) for tok in droppable])
    is_reserved = tf.strings.regex_full_match(token_txt, pattern)
    kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)
    return tf.strings.reduce_join(kept, separator=' ', axis=-1)

In [None]:
# Display the raw English sentences of the sample batch for comparison.
en_examples.numpy()

array([b'c : success , the change is only coming through the barrel of the gun .',
       b'the documentation and the hands-on teaching methodology is also open-source and released as the creative commons .',
       b"( video ) didi pickles : it 's four o'clock in the morning ."],
      dtype=object)

In [None]:
# Tokenize to flat per-example id sequences, then detokenize back to words.
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'c', b':', b'success', b',', b'the', b'change', b'is', b'only', b'coming', b'through', b'the', b'barrel', b'of', b'the', b'gun', b'.'], [b'the', b'documentation', b'and', b'the', b'hands', b'-', b'on', b'teaching', b'methodology', b'is', b'also', b'open', b'-', b'source', b'and', b'released', b'as', b'the', b'creative', b'commons', b'.'], [b'(', b'video', b')', b'didi', b'pickles', b':', b'it', b"'", b's', b'four', b'o', b"'", b'clock', b'in', b'the', b'morning', b'.']]>

In [None]:
# Drop reserved tokens and join each example's words into a single string.
cleanup_text(reserved_tokens, words).numpy()

array([b'c : success , the change is only coming through the barrel of the gun .',
       b'the documentation and the hands - on teaching methodology is also open - source and released as the creative commons .',
       b"( video ) didi pickles : it ' s four o ' clock in the morning ."],
      dtype=object)

In [None]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around `text.BertTokenizer` for a single language.

  Bundles the tokenizer, its vocabulary and the reserved tokens into one
  `tf.Module`, and traces the methods in `__init__` so they are available on
  a saved and reloaded copy.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the functions to export.

    Args:
      reserved_tokens: List of reserved token strings.
      vocab_path: Path to a vocab text file with one token per line.
    """
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    # Register the vocab file as an asset of the saved model.
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Decode explicitly as UTF-8: the vocab holds non-ASCII tokens and the
    # platform-default codec is not guaranteed to be UTF-8.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Tokenize a batch of strings into token ids wrapped in START/END ids."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids to one string per row, with reserved tokens removed."""
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Map token ids to their vocabulary strings (word pieces are not merged)."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocab file asset registered with this module."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a string tensor."""
    return tf.constant(self._reserved_tokens)

In [None]:
# Group both language tokenizers under one module so they can be saved together.
tokenizers = tf.Module()
tokenizers.ru = CustomTokenizer(reserved_tokens, data_save_path+'ru_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, data_save_path+'en_vocab.txt')

In [None]:
# Export the tokenizer module as a SavedModel under the Drive data directory.
model_name = 'ted_hrlr_translate_ru_en_converter'
tf.saved_model.save(tokenizers, data_save_path + model_name)

In [None]:
# Load the export back and check that the Russian vocabulary size is available.
reloaded_tokenizers = tf.saved_model.load(data_save_path + model_name)
reloaded_tokenizers.ru.get_vocab_size().numpy()

7832

In [None]:
# Tokenize a sample Russian sentence with the reloaded tokenizer.
tokens = reloaded_tokenizers.ru.tokenize(['Привет, друзья!'])
tokens.numpy()

array([[   2, 3322,   14, 1753,    4,    3]])

In [None]:
# Map the ids back to their vocabulary strings (shown as UTF-8 bytes).
text_tokens = reloaded_tokenizers.ru.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'\xd0\xbf\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82', b',', b'\xd0\xb4\xd1\x80\xd1\x83\xd0\xb7\xd1\x8c\xd1\x8f', b'!', b'[END]']]>

In [None]:
# Detokenize the ids and print the first (only) sentence as readable text.
round_trip = reloaded_tokenizers.ru.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

привет , друзья !


In [None]:
!zip -r {model_name}.zip {data_save_path + model_name}


zip error: Nothing to do! (try: zip -r ted_hrlr_translate_ru_en_converter.zip . -i /content/drive/My Drive/data/ted_hrlr_translate_ru_en_converter)
