# Visualizing Data Using the Embedding Projector in TensorBoard

## Setup

In [4]:
import os 
import tensorflow as tf
from tensorflow.keras import (
    layers,
    Sequential,
    losses
)
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

## IMDB Data

In [2]:
# Input-pipeline settings shared by the train and test splits.
SHUFFLE_BUFFER_SIZE = 1000
BATCH_SIZE = 10


def _shuffle_and_pad(dataset):
    """Shuffle `dataset` and group it into padded batches.

    Elements are `(text, label)` pairs (the splits are loaded with
    `as_supervised=True`). The `(None,)` padded shape lets every text
    sequence be padded up to the longest one in its batch, while `()`
    leaves the scalar label untouched.
    """
    shuffled = dataset.shuffle(SHUFFLE_BUFFER_SIZE)
    return shuffled.padded_batch(BATCH_SIZE, padded_shapes=((None,), ()))


# Load the subword-encoded IMDB reviews as (text, label) pairs, together
# with the dataset metadata.
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,
    as_supervised=True,
)

# The text feature's encoder exposes the vocabulary (`subwords`,
# `vocab_size`) used by the later cells.
encoder = info.features['text'].encoder

train_batches = _shuffle_and_pad(train_data)
test_batches = _shuffle_and_pad(test_data)

# Pull a single batch out of the training pipeline for inspection.
train_batch, train_labels = next(iter(train_batches))

2022-03-16 12:26:54.106223: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/mmenendezg/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0...[0m


Dl Size...: 100%|██████████| 84125825/84125825 [00:00<00:00, 10120153854.78 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00, 95.03 url/s] 
2022-03-16 12:28:40.765744: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-16 12:28:40.766122: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-03-16 12:28:40.859734: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[1mDataset imdb_reviews downloaded and prepared to /Users/mmenendezg/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0. Subsequent calls will reuse this data.[0m
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-03-16 12:28:40.899634: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Keras Embedding Layer

In [6]:
# Size of the vector learned for every entry of the vocabulary.
embedding_dim = 16
embedding = layers.Embedding(encoder.vocab_size, embedding_dim)

# Embedding -> average over the sequence axis -> small dense classifier.
# The last layer has no activation, so the model outputs raw logits.
model = Sequential([
    embedding,
    layers.GlobalAvgPool1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1),
])

model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    # The plain 'accuracy' string resolves to binary accuracy with a 0.5
    # threshold, which is only right for probabilities. The model emits
    # logits, so the decision boundary is 0.0. The metric keeps the name
    # 'accuracy' so the keys in `history.history` and the logs are unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

# One pass over the training batches; validation only looks at the first
# 20 batches of the test pipeline.
history = model.fit(
    train_batches,
    epochs=1,
    validation_data=test_batches,
    validation_steps=20,
)

   1/2500 [..............................] - ETA: 14:21 - loss: 0.6896 - accuracy: 0.3000

2022-03-16 12:33:25.264318: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-03-16 12:35:52.798696: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-03-16 12:35:53.378457: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Saving Data for TensorBoard

In [7]:
# Directory TensorBoard will read the projector files from.
logdir = 'logs/imdb-example'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(logdir, exist_ok=True)

# Write one label per line, in vocabulary order. Subwords may contain
# non-ASCII characters, so the encoding is pinned to UTF-8 rather than left
# to the platform default.
with open(os.path.join(logdir, 'metadata.tsv'), 'w', encoding='utf-8') as f:
    for subword in encoder.subwords:
        f.write(f'{subword}\n')
    # The vocabulary has more ids than named subwords; give the remaining
    # ids placeholder labels so every saved embedding row has a label.
    for unknown in range(1, encoder.vocab_size - len(encoder.subwords)):
        f.write(f'unknown #{unknown}\n')

# Row 0 of the embedding matrix has no line in metadata.tsv (presumably the
# id reserved for padding -- confirm against the encoder), so it is dropped
# to keep the rows aligned with the labels written above.
weights = tf.Variable(model.layers[0].get_weights()[0][1:])

# Save the weights in a checkpoint under the attribute name `embedding`.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(logdir, 'embedding.ckpt'))

# Describe the saved tensor to the projector plugin. A dedicated name is
# used for the config entry so it does not rebind `embedding`, which is the
# Keras layer created in the model cell.
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()

# Key under which the checkpoint stores the `embedding` attribute.
embedding_config.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
# Same file name as the metadata written into `logdir` above.
embedding_config.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(logdir, config)

In [8]:
%load_ext tensorboard

%tensorboard --logdir logs/imdb-example

Reusing TensorBoard on port 6007 (pid 14036), started 0:00:55 ago. (Use '!kill 14036' to kill it.)