In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from collections import Counter

In [None]:
# --- Text / vocabulary settings ---
VOCAB_SIZE = 10_000       # number of most frequent tokens kept in the vocabulary
NUM_OOV_BUCKETS = 1_000   # hash buckets shared by all out-of-vocabulary tokens
MAX_LENGTH = 300          # each review is cut to this length (bytes, tf.strings.substr default unit)

# --- Model / input pipeline settings ---
EMBEDDING_SIZE = 128      # output dimension of the Embedding layer
BATCH_SIZE = 32           # reviews per batch in the training pipeline

In [None]:
# Load the IMDB reviews dataset together with its metadata. With
# as_supervised=True every example is a (text, label) pair.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_data = dataset['train']
test_data = dataset['test']

# Example counts come from the dataset metadata, so nothing has to be iterated.
train_size, test_size = (info.splits[split].num_examples for split in ('train', 'test'))
print(f"train_size = {train_size}, test_size = {test_size}")

train_size = 25000, test_size = 25000


In [None]:
# Peek at one batch of two raw reviews and their labels. `take(1)` already
# limits the dataset to a single batch, so no explicit `break` is needed.
# `x_batch` / `y_batch` stay bound afterwards and are reused by the next cell.
for x_batch, y_batch in train_data.batch(2).take(1):
    print(f"x_batch = \n{x_batch}")
    print(f"y_batch = \n{y_batch}")

x_batch = 
[b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
 b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot de

In [None]:
def preprocess(x_batch, y_batch):
    """Turn a batch of raw review strings into a padded batch of word tokens.

    Steps, in order:
      1. keep only the first MAX_LENGTH bytes of every review;
      2. replace HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a space;
      3. replace every character that is not an ASCII letter or an apostrophe
         with a space;
      4. split on whitespace and pad the ragged result with ``b"<pad>"``.

    Note: because truncation happens first, the last token of a long review
    may be a word fragment (or the remains of a cut-off tag).

    Args:
        x_batch: string tensor holding the raw reviews.
        y_batch: labels; passed through unchanged.

    Returns:
        A ``(tokens, y_batch)`` tuple where ``tokens`` is a dense string tensor
        padded to the longest review in the batch.
    """
    truncated = tf.strings.substr(x_batch, 0, MAX_LENGTH)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    ragged_tokens = tf.strings.split(letters_only)
    padded_tokens = ragged_tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [None]:
# Tokenise the two sample reviews. The result is bound to new names so the raw
# `x_batch` / `y_batch` are not overwritten with this cell's own output; the
# cell can therefore be re-run without feeding tokens back into `preprocess`.
sample_tokens, sample_labels = preprocess(x_batch, y_batch)
sample_tokens

<tf.Tensor: shape=(2, 53), dtype=string, numpy=
array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
        b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
        b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
        b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
        b'their', b'worst', b'role', b'in', b'history', b'Even',
        b'their', b'great', b'acting', b'could', b'not', b'redeem',
        b'this', b"movie's", b'ridiculous', b'storyline', b'This',
        b'movie', b'is', b'an', b'early', b'nineties', b'US',
        b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
       [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
        b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
        b'to', b'a', b'combination', b'of', b'things', b'including',
        b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
        b'on', b'the', b'sette', b'and', b'having', b'just', b'eaten',

In [None]:
# Count how often every token occurs in the preprocessed training set.
# The b"<pad>" filler inserted by `preprocess` is counted as well; its count
# depends on how reviews are grouped, so the batching uses the same
# BATCH_SIZE as the training pipeline instead of a separate literal.
vocab_counter = Counter()
for x_batch, y_batch in train_data.batch(BATCH_SIZE).map(preprocess):
    # One update per batch: flattening the 2-D token array visits the same
    # tokens in the same (row-major) order as walking it review by review.
    vocab_counter.update(x_batch.numpy().ravel().tolist())

print(f"vocab_counter.most_common()[:3] = {vocab_counter.most_common()[:3]}")
print(f"len(vocab_counter) = {len(vocab_counter)}")

vocab_counter.most_common()[:3] = [(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]
len(vocab_counter) = 53893


In [None]:
# Build the vocabulary: the padding token followed by the most frequent words,
# VOCAB_SIZE entries at most.
# The Embedding layer is created with mask_zero=True, so id 0 must be the
# padding token. It is pinned to index 0 explicitly rather than relying on
# b"<pad>" happening to be the most frequent token in the counter.
vocab = [b"<pad>"] + [
    word for word, _ in vocab_counter.most_common(VOCAB_SIZE) if word != b"<pad>"
][:VOCAB_SIZE - 1]

# Plain-Python mapping from token to its integer id (position in `vocab`).
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Build an in-graph lookup table: token -> integer id.
words = tf.constant(vocab)
# Each word's id is its position in `vocab`; int64 to match the id dtype used by the table.
word_idxes = tf.range(len(vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_idxes)
# Tokens missing from `vocab` are hashed into one of NUM_OOV_BUCKETS extra ids
# placed after the vocabulary ids.
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=NUM_OOV_BUCKETS,
)

In [None]:
# Sanity check: look up the ids of a short, hand-tokenised sentence
# (a batch containing a single list of words).
demo_words = tf.constant([b"This was an absolutely terrible movie".split()])
table.lookup(demo_words)

<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[ 22,  11,  28, 337, 302,  12]])>

In [None]:
def encode_words(x_batch, y_batch):
    """Replace every token in the batch with its integer id from `table`.

    Args:
        x_batch: string tensor of tokens (as produced by `preprocess`).
        y_batch: labels; passed through unchanged.

    Returns:
        A ``(token_ids, y_batch)`` tuple, where ``token_ids`` has the same
        shape as ``x_batch``.
    """
    token_ids = table.lookup(x_batch)
    return token_ids, y_batch

In [None]:
# Training input pipeline. `repeat()` without a count makes the dataset
# endless, which is why `fit` is given an explicit `steps_per_epoch`.
train_data_preprocessed = (
    train_data
    .repeat()
    .batch(BATCH_SIZE)
    .map(preprocess)     # raw strings -> padded word tokens
    .map(encode_words)   # word tokens -> integer ids
    .prefetch(1)         # prepare the next batch while the current one is consumed
)

In [None]:
# Show one fully encoded training batch: token ids first, then the labels.
for x_batch, y_batch in train_data_preprocessed.take(1):
    print(x_batch, y_batch, sep="\n")

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [None]:
# Binary sentiment classifier: embedding -> two stacked GRU layers -> sigmoid output.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(
    input_dim=VOCAB_SIZE + NUM_OOV_BUCKETS,  # vocabulary ids plus the OOV bucket ids
    output_dim=EMBEDDING_SIZE,
    mask_zero=True,                          # id 0 is padding and is masked for the layers below
    input_shape=[None],                      # variable-length sequences
))
# The first GRU returns the full sequence so the second GRU has a sequence to consume.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The training dataset repeats forever, so the length of an epoch is set explicitly.
history = model.fit(
    train_data_preprocessed,
    epochs=5,
    steps_per_epoch=train_size // BATCH_SIZE,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
