In [1]:
import logging
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text

In [2]:
# List every attribute of the "tensorflow" logger whose name mentions "level".
[attr for attr in dir(logging.getLogger("tensorflow")) if "level" in attr.lower()]

['getEffectiveLevel', 'level', 'setLevel']

In [3]:
# Numeric values of the standard logging levels, from most to least severe.
(logging.ERROR, logging.WARNING, logging.INFO)

(40, 30, 20)

In [4]:
# Show both the effective level and the level set directly on the logger.
tf_logger = logging.getLogger("tensorflow")
print(tf_logger.getEffectiveLevel())
print(tf_logger.level)

20
20


In [5]:
# Optional: uncomment to raise the "tensorflow" Python logger's threshold to
# logging.ERROR (40). Left disabled here, so the logger keeps the level 20
# (logging.INFO) printed by the previous cell.
#logging.getLogger("tensorflow").setLevel(logging.ERROR)

## Download The Dataset

In [6]:
# Load the TED-talk Portuguese-to-English corpus.
# - with_info=True makes tfds.load return the dataset metadata as a second value.
# - as_supervised=True yields each example as a (portuguese, english) pair.
examples, metadata = tfds.load(
    name="ted_hrlr_translate/pt_to_en", with_info=True, as_supervised=True
)
train_examples = examples["train"]
val_examples = examples["validation"]

2022-10-01 13:04:03.059522: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-10-01 13:04:03.059566: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: T460p
2022-10-01 13:04:03.059574: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: T460p
2022-10-01 13:04:03.059698: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.76.0
2022-10-01 13:04:03.059725: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.76.0
2022-10-01 13:04:03.059733: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.76.0
2022-10-01 13:04:03.060929: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operatio

In [7]:
# Peek at a single batch of three sentence pairs. The tensors hold raw bytes,
# so every sentence is decoded as UTF-8 before it is printed.
for pt_batch, en_batch in train_examples.batch(3).take(1):
    print('> Examples in Portuguese:')
    for pt in pt_batch.numpy():
        print(str(pt, 'utf-8'))
    print()  # blank line between the two languages

    print('> Examples in English:')
    for en in en_batch.numpy():
        print(str(en, 'utf-8'))


> Examples in Portuguese:
e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
mas e se estes fatores fossem ativos ?
mas eles não tinham a curiosidade de me testar .

> Examples in English:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


2022-10-01 13:04:03.203582: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [8]:
# Download the tokenizer archive into the current directory and extract it
# there. The call is the cell's last expression, so the returned path of the
# downloaded zip file is displayed.
model_name = "ted_hrlr_translate_pt_en_converter"
tf.keras.utils.get_file(
    fname=f"{model_name}.zip",
    origin=f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
    cache_dir=".",
    cache_subdir="",
    extract=True,
)

'./ted_hrlr_translate_pt_en_converter.zip'

In [9]:
# Load the SavedModel from the directory named after `model_name`
# (presumably created by extracting the archive downloaded above).
tokenizers = tf.saved_model.load(export_dir=model_name)

In [10]:
# Public (non-underscore) attributes of the loaded object.
[attr for attr in dir(tokenizers) if not attr.startswith('_')]

['en',
 'graph_debug_info',
 'pt',
 'signatures',
 'tensorflow_git_version',
 'tensorflow_version']

`en` and `pt` are the two tokenizers contained in `tokenizers`.  
Both of them have the same set of methods.

In [11]:
# Public (non-underscore) attributes of the English tokenizer.
[attr for attr in dir(tokenizers.en) if not attr.startswith('_')]

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

The `tokenize` method applies the following steps to the input before mapping each token to a token id:
- splits punctuation
- lowercases
- Unicode-normalizes

In [12]:
# Three short English sentences used to exercise the tokenizer below.
en_example = tf.constant(
    [
        "What are you talking about?",
        "It's none of your business.",
        "I am a businessman",
    ],
    dtype=tf.string,
)

In [13]:
# Encode the sample sentences into token ids. The result is a ragged tensor
# because the sentences tokenize to different lengths.
token_ids = tokenizers.en.tokenize(en_example)
token_ids

<tf.RaggedTensor [[2, 90, 86, 79, 351, 95, 30, 3],
 [2, 76, 9, 55, 1686, 74, 135, 457, 15, 3], [2, 45, 340, 37, 457, 950, 3]]>

The `detokenize` method not only maps the ids back to their tokens but also concatenates the tokens to restore the sentences (in their normalized form: lowercased, with punctuation split off).

In [14]:
# Map the token ids back to text; the result holds one string per input sentence.
round_trip = tokenizers.en.detokenize(token_ids)
round_trip

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'what are you talking about ?', b"it ' s none of your business .",
       b'i am a businessman'], dtype=object)>

If we don't want the concatenation, we can use the `lookup` method instead, which returns the individual tokens.

In [15]:
# Look up the token text for every id, keeping the tokens separate instead of
# joining them into sentences.
tokens = tokenizers.en.lookup(token_ids)
tokens

<tf.RaggedTensor [[b'[START]', b'what', b'are', b'you', b'talking', b'about', b'?', b'[END]'],
 [b'[START]', b'it', b"'", b's', b'none', b'of', b'your', b'business', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'i', b'am', b'a', b'business', b'##man', b'[END]']]>

Note that
- `"[START]"` and `"[END]"` map to the token ids `2` and `3`, respectively
- `detokenize` strips `"[START]"` and `"[END]"` from the sentences it restores