### Original tutorial

Example from: https://www.tensorflow.org/tutorials/text/text_classification_rnn


In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

## Turn off the tensorflow_datasets progress bar display
tfds.disable_progress_bar()

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Plot a metric per epoch and, when it was recorded, its validation twin.

  Args:
    history: object exposing a `history` dict that maps metric names to
      per-epoch value sequences (this notebook passes the result of
      `model.fit`).
    metric: key of the training metric to draw, e.g. 'accuracy' or 'loss'.
      The validation series is looked up under 'val_' + metric.

  Raises:
    KeyError: if `metric` itself is missing from `history.history`.
  """
  curves = history.history
  val_metric = 'val_' + metric

  plt.plot(curves[metric])
  legend_labels = [metric]

  ## A run without validation data has no 'val_*' entry; draw the training
  ## curve alone in that case instead of failing with a KeyError.
  if val_metric in curves:
    plt.plot(curves[val_metric], '')
    legend_labels.append(val_metric)

  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

In [3]:
## Fetch the IMDB review dataset. Every text input comes with a binary label
## saying whether the review of the movie is positive or negative.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
## Pull the train and test splits out of the returned mapping
train_dataset = dataset['train']
test_dataset = dataset['test']
## Show the tensor specs (dtype / shape) of one (input, label) element
train_dataset.element_spec

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVR1EE8/imdb_reviews-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVR1EE8/imdb_reviews-test.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteVR1EE8/imdb_reviews-unsupervised.tfrecord




[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [4]:
## Peek at a single (review text, label) pair from the training split
for example, label in train_dataset.take(1):
  text_value, label_value = example.numpy(), label.numpy()
  print('text: ', text_value)
  print('label: ', label_value)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [5]:
## shuffle() samples from a buffer of BUFFER_SIZE elements, so a fully uniform
## shuffle needs a buffer at least as large as the training set. Compare the
## value chosen here with the dataset size displayed by this cell.
BUFFER_SIZE = 10000
BATCH_SIZE = 64
## Kept as the last expression so the notebook actually displays the number of
## training examples (as a leading statement its value was silently discarded).
len(train_dataset)

In [6]:
## Training data: shuffle through a BUFFER_SIZE-element buffer, then group into batches
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)
## Test data: batched only, original order kept
test_dataset = test_dataset.batch(batch_size=BATCH_SIZE)

In [7]:
## Toy demonstration of shuffle(): three integers, shuffled with
## reshuffle_each_iteration=True and repeated twice, so each of the two passes
## can come out in a different order.
## A dedicated name is used so the `dataset` mapping returned by tfds.load()
## in the earlier cell is not overwritten by this experiment.
shuffle_demo = tf.data.Dataset.range(3)
shuffle_demo = shuffle_demo.shuffle(3, reshuffle_each_iteration=True)
shuffle_demo = shuffle_demo.repeat(2)
for element in shuffle_demo:
    print(element)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(1, shape=(), dtype=int64)


In [8]:
## Grab a single batch and show its first three review texts and labels
for example, label in train_dataset.take(1):
  leading_texts = example.numpy()[:3]
  leading_labels = label.numpy()[:3]
  print('texts: ', leading_texts)
  print()
  print('labels: ', leading_labels)

texts:  [b'No hidden agenda. Pure scifi. All fun.<br /><br />I saw the original on TV and was scared pretty bad. I was a kid :)<br /><br />The original one can be appreciated more when compared to the new one which I saw and have forgotten. The original one, starring the great movie star Steve McQueen (BULLET), is by far the better and only version anyone should see.<br /><br />The movie production is dated, but the fx used to make the Blob stands up the test of time. I was convinced that that thing was moving on its own accord. 10/10<br /><br />-Zafoid'
 b'I bought this movie for 1 euro, not knowing what it was all about. I thought "hmmm, a movie named mutilation man must be if not very funny at least filled with gore". It wasn\'t funny alright. It was disturbing. Very disturbing. And I don\'t mind disturbing movies but this one just didn\'t mean anything, except that child abuse is not a good thing to do. hmmm... The quality of the images were terrible. The acting...there was no acti

In [9]:
## VOCAB_SIZE caps how many distinct tokens the encoder keeps when tokenizing;
## a word that falls outside the vocabulary is encoded as the '[UNK]' token.
VOCAB_SIZE = 1000
## TextVectorization is the preprocessing layer that turns raw sentences into integer tokens
TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
## Build the vocabulary from the training texts only (labels are dropped)
training_texts = train_dataset.map(lambda text, label: text)
encoder.adapt(training_texts)

In [10]:
## Show the first 20 entries of the learned vocabulary
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [11]:
## Encode the batch held in `example` and keep the token ids of its first 3 texts
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[57,  1,  1, ...,  0,  0,  0],
       [10,  1, 11, ...,  0,  0,  0],
       [ 1,  7,  4, ...,  0,  0,  0]])

In [12]:
## Compare each raw text with the words recovered from its token ids
for idx in range(3):
  raw_text = example[idx].numpy()
  print("Original: ", raw_text)
  recovered_words = vocab[encoded_example[idx]]
  print("Round-trip: ", " ".join(recovered_words))
  print()

Original:  b'No hidden agenda. Pure scifi. All fun.<br /><br />I saw the original on TV and was scared pretty bad. I was a kid :)<br /><br />The original one can be appreciated more when compared to the new one which I saw and have forgotten. The original one, starring the great movie star Steve McQueen (BULLET), is by far the better and only version anyone should see.<br /><br />The movie production is dated, but the fx used to make the Blob stands up the test of time. I was convinced that that thing was moving on its own accord. 10/10<br /><br />-Zafoid'
Round-trip:  no [UNK] [UNK] [UNK] scifi all [UNK] br i saw the original on tv and was [UNK] pretty bad i was a kid br br the original one can be [UNK] more when [UNK] to the new one which i saw and have [UNK] the original one [UNK] the great movie star [UNK] [UNK] [UNK] is by far the better and only version anyone should [UNK] br the movie production is [UNK] but the [UNK] used to make the [UNK] [UNK] up the [UNK] of time i was [UNK]

In [13]:
## Sentiment classifier, assembled layer by layer
model = tf.keras.Sequential()
## 1. Tokenization: converts a batch of input text to integer token ids
model.add(encoder)
## 2. Trainable embedding: maps each token id to a 64-dimensional real vector.
##    mask_zero=True is used to handle the variable sequence lengths.
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    )
)
## 3. LSTM with 64 units wrapped in Bidirectional
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
## 4. Dense head: 64 ReLU units followed by a single output unit with no activation
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [15]:
## Push one hand-written review through the model before any training
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
print(predictions[0])
## layers[0] is the encoder, so layers[1] is the Embedding layer
embedding_layer = model.layers[1]
print(embedding_layer.output_shape)

[-0.00218701]
(None, None, 64)


In [16]:
## Binary cross-entropy computed on logits, Adam with a 1e-4 learning rate,
## and accuracy reported as the metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [17]:
## Train for 10 epochs; validation after each epoch uses 30 batches of the test set
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=10,
)

Epoch 1/10
 33/391 [=>............................] - ETA: 11:49 - loss: 0.6933 - accuracy: 0.4915

KeyboardInterrupt: ignored

In [None]:
## Score the model on the test set and unpack the two values evaluate() returns
evaluation = model.evaluate(test_dataset)
test_loss, test_acc = evaluation

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
  print(template.format(value))

In [None]:
## One figure, two side-by-side panels: accuracy on the left, loss on the right
plt.figure(figsize=(16, 8))

## Left panel: accuracy, y-axis capped at 1
plt.subplot(121)
plot_graphs(history, 'accuracy')
plt.ylim(top=1)

## Right panel: loss, y-axis starting at 0
plt.subplot(122)
plot_graphs(history, 'loss')
plt.ylim(bottom=0)

In [None]:
## Predict on a hand-written review. The model's last layer has no activation,
## so the displayed value is a raw score rather than a probability.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
predictions