### Original tutorial

Example from: https://www.tensorflow.org/tutorials/text/text_classification_rnn


In [18]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

## Turn off the tensorflow_datasets progress bars for the rest of the notebook
tfds.disable_progress_bar()

In [19]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Plot a training metric per epoch and, when recorded, its validation curve.

  Args:
    history: Object exposing a `history` dict that maps metric names to
      per-epoch value sequences (this notebook passes the result of
      `model.fit`).
    metric: Name of the training metric to plot, e.g. 'accuracy' or 'loss'.
      The validation series is looked up under 'val_' + metric.

  Draws on the current pyplot axes and returns nothing.
  """
  val_metric = 'val_' + metric
  plt.plot(history.history[metric])
  legend_labels = [metric]
  # A history without a 'val_' + metric entry (for example a run that had no
  # validation data) gets only the training curve instead of a KeyError.
  if val_metric in history.history:
    plt.plot(history.history[val_metric], '')
    legend_labels.append(val_metric)
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  # Label only the curves that were actually drawn
  plt.legend(legend_labels)

In [20]:
## Fetch the IMDB reviews dataset. Every text input comes with a binary label
## saying whether the review judges the movie good or bad (positive or negative).
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
## Pull the train and test splits out of the returned mapping
train_dataset = dataset['train']
test_dataset = dataset['test']
## Inspect the data types of one (input, label) element
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [21]:
## View one (input text, label) pair from the training split.
## take(1) yields a single element, so the loop body runs once.
## NOTE: `example` and `label` stay bound at notebook scope after the loop.
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


2021-11-22 13:53:13.601339: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [22]:
## Number of examples in the training split.
## NOTE: this is not the last expression of the cell, so the value is not displayed.
len(train_dataset)
## The shuffle buffer should be equal to or greater than the size of the train
## dataset for a fully random shuffle.
## NOTE(review): 10000 looks smaller than the training split (the training log
## reports 391 batches of 64 per epoch), so shuffling is only approximate --
## confirm this trade-off is intended.
BUFFER_SIZE = 10000
## Number of examples per batch
BATCH_SIZE = 64

In [23]:
## Shuffle and batch the training split; the test split is only batched.
## prefetch() lets the input pipeline prepare upcoming batches while the
## current one is being consumed; it does not change the data produced.
## NOTE: both names are rebound in place, so re-running this cell without
## re-running the loading cell would batch already-batched data.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [24]:
## Toy demonstration of shuffle + repeat on a three-element dataset:
## the three values are emitted twice, each pass shuffled.
## A dedicated name is used so the `dataset` mapping returned by tfds.load
## is not overwritten by this side experiment.
demo_dataset = (
    tf.data.Dataset.range(3)
    .shuffle(3, reshuffle_each_iteration=True)
    .repeat(2)
)
for eg in demo_dataset:
    print(eg)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)


In [25]:
## Take one batch and view the first 3 input texts and output labels.
## NOTE: `example` (the batch of texts) stays bound after the loop and is
## reused by the encoding and round-trip cells further down.
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'Repugnant Bronson thriller. Unfortunately, it\'s technically good and I gave it 4/10, but it\'s so utterly vile that it would be inconceivable to call it "entertainment". Far more disturbing than a typical slasher film.'
 b"I've watched this movie twice now on DVD, and both times it didn't fail to impress me with its unique impartial attitude. It seems more like a depiction of reality than most other Hollywood fare, especially on a topic that is still hotly discussed. Even though it sticks closely with the southern viewpoint, it doesn't fail to question it, and in the end the only sentence passed is that the war is lost, not matter what, and cruelty is a common denominator.<br /><br />What really makes this movie outstanding is the refusal to over-dramatize. Nowadays truly good movies (in a nutshell) are few and far apart, with mainstream fare being enjoyable (if you don't have high expectations), but terribly commercially spirited. I think this movie comes off as a truly go

2021-11-22 13:53:13.835594: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [26]:
## Upper bound on the number of unique words kept when tokenizing the data;
## any word outside the vocabulary is encoded as the unknown token ([UNK]).
VOCAB_SIZE = 1000
## The TextVectorization preprocessor converts sentences into integer tokens
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
## Build the vocabulary from the training texts only (labels are dropped)
train_texts = train_dataset.map(lambda text, label: text)
encoder.adapt(train_texts)

2021-11-22 13:53:13.884045: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [27]:
## View the first 20 entries of the vocabulary learned by the encoder.
## `vocab` is kept as a NumPy array so it can be indexed with arrays of
## token ids when decoding in a later cell.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [28]:
## View the token ids of the first 3 texts in `example`, the batch left over
## from the batch-preview loop earlier in the notebook.
## NOTE(review): the trailing zeros in the output appear to be padding up to
## the longest text in the batch -- confirm.
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[  1,   1, 773, ...,   0,   0,   0],
       [195, 284,  11, ...,   0,   0,   0],
       [ 10, 103,   9, ...,   0,   0,   0]])

In [29]:
## Compare each of the 3 raw reviews with the text rebuilt from its token ids
for n in range(3):
  original_text = example[n].numpy()
  # Index the vocabulary array with the row of token ids to decode it
  round_trip_text = " ".join(vocab[encoded_example[n]])
  print("Original: ", original_text)
  print("Round-trip: ", round_trip_text)
  print()

Original:  b'Repugnant Bronson thriller. Unfortunately, it\'s technically good and I gave it 4/10, but it\'s so utterly vile that it would be inconceivable to call it "entertainment". Far more disturbing than a typical slasher film.'
Round-trip:  [UNK] [UNK] thriller unfortunately its [UNK] good and i gave it [UNK] but its so [UNK] [UNK] that it would be [UNK] to call it entertainment far more [UNK] than a typical [UNK] film                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [30]:
## The sentiment classification model architecture.
## Trainable layer that learns to embed each token as a 64-dimensional real-valued vector
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)
## LSTM wrapped so the sequence is processed in both directions
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
model = tf.keras.Sequential([
    encoder,  ## Tokenisation layer: converts a batch of input text to integer tokens
    embedding_layer,
    recurrent_layer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),  ## Single output unit with no activation
])

In [31]:
## Run the not-yet-trained model on one hand-written review as a smoke test
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
## predict() expects a batch, so wrap the single string in a 1-element array
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
print(predictions[0])
## Output shape of the second layer in the model (index 1)
second_layer = model.layers[1]
print(second_layer.output_shape)

2021-11-22 13:53:16.097769: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:16.221526: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:16.260129: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


[0.00463221]
(None, None, 64)


In [32]:
## Binary cross-entropy computed on raw logits (the model's last layer has no
## activation). Accuracy is thresholded at 0.0 to match: a logit above 0
## corresponds to a sigmoid probability above 0.5. The plain 'accuracy' string
## would resolve to binary accuracy with a 0.5 threshold applied to the logits.
## The metric keeps the name 'accuracy' so the history keys
## ('accuracy', 'val_accuracy') are unchanged.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [33]:
## Train for 10 epochs. Both datasets are already batched, so `batch_size` is
## not passed to fit(); Keras documents that it must be omitted for dataset inputs.
## Each epoch's validation pass runs on 30 batches of the test set.
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10


2021-11-22 13:53:18.116337: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:18.427679: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:18.517368: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:20.975437: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 13:53:21.076547: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


  3/391 [..............................] - ETA: 1:01:23 - loss: 0.6932 - accuracy: 0.4896

KeyboardInterrupt: 

In [None]:
## Evaluate the model on the whole test set. The result is unpacked as the
## loss followed by the single metric configured in compile().
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [None]:
## Side-by-side training curves: accuracy on the left, loss on the right
plt.figure(figsize=(16, 8))

## Left panel: accuracy, with the y-axis capped at 1
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(top=1)

## Right panel: loss, with the y-axis starting at 0
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(bottom=0)

In [None]:
## Do a prediction on one hand-written review.
## The model ends in Dense(1) with no activation and is trained with
## from_logits=True, so the displayed value is a raw logit, not a probability.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
predictions