In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf


In [2]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [3]:
dataset, info = tfds.load('imdb_reviews', with_info=True,
                          as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

train_dataset.element_spec

2025-12-10 12:40:37.602526: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-12-10 12:40:37.602701: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-12-10 12:40:37.602973: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-12-10 12:40:37.604009: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-12-10 12:40:37.605139: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [4]:
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


2025-12-10 12:41:06.415866: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-12-10 12:41:06.419886: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [6]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [7]:
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'Even if this film was allegedly a joke in response to critics it\'s still an awful film. If one is going to commit to that sort of thing at least make it a good joke.....first off, Jeroen Krabb\xc3\xa9 is i guess the poor man\'s Gerard Depardieu.....naturally i hate Gerard Depardieu even though he was very funny in the \'Iron Mask\' three musketeer one. Otherwise to me he is box office poison and Jeroen Krabb\xc3\xa9 is worse than that. The poor man\'s box office poison....really that is not being fair to the economically disenfranchised. If the \'4th Man\' is supposed to be some sort of critique of the Bourgeoisie....what am i saying? it isn\'t. Let\'s just say hypothetically, if it was supposed to be, it wasn\'t sharp enough. Satire is a tricky thing....if it isn\'t sharp enough the viewer becomes the butt of the joke instead......i think that is what happened. The story just ends up as a bunch of miserable disgusting characters doing nothing that anyone would care about a

2025-12-10 12:41:47.801870: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-12-10 12:41:47.807778: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

2025-12-10 12:42:03.102298: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
vocab = np.array(encoder.get_vocabulary())
vocab[:50]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be',
       'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so',
       'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some',
       'there', 'what'], dtype='<U14')

In [11]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[ 54,  45,  11, ...,   0,   0,   0],
       [ 11,   7,   4, ...,   0,   0,   0],
       [ 10, 209,   6, ...,   0,   0,   0]])

In [12]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

Original:  b'Even if this film was allegedly a joke in response to critics it\'s still an awful film. If one is going to commit to that sort of thing at least make it a good joke.....first off, Jeroen Krabb\xc3\xa9 is i guess the poor man\'s Gerard Depardieu.....naturally i hate Gerard Depardieu even though he was very funny in the \'Iron Mask\' three musketeer one. Otherwise to me he is box office poison and Jeroen Krabb\xc3\xa9 is worse than that. The poor man\'s box office poison....really that is not being fair to the economically disenfranchised. If the \'4th Man\' is supposed to be some sort of critique of the Bourgeoisie....what am i saying? it isn\'t. Let\'s just say hypothetically, if it was supposed to be, it wasn\'t sharp enough. Satire is a tricky thing....if it isn\'t sharp enough the viewer becomes the butt of the joke instead......i think that is what happened. The story just ends up as a bunch of miserable disgusting characters doing nothing that anyone would care about

In [13]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [14]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [15]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(tf.constant([sample_text], dtype=tf.string))
print(predictions[0])

2025-12-10 12:43:35.966575: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 868ms/step
[-0.00512693]


In [16]:
# predict on a sample text with padding

text_with_padding = sample_text + "the " * 2000
predictions = model.predict(tf.constant([text_with_padding], dtype=tf.string))
print(predictions[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
[0.01997026]


In [17]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [20]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10


KeyboardInterrupt: 