In [1]:
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset as (text, label) pairs, along with its metadata.
datasets, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)
# Number of examples in the training split, as reported by the dataset metadata.
train_split_info = info.splits["train"]
train_size = train_split_info.num_examples

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3ZXU20/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3ZXU20/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete3ZXU20/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [2]:
def preprocess(text, label):
    """Normalize and tokenize raw review text.

    The text is lowercased, HTML line breaks are replaced by spaces, every
    character that is not an ASCII letter or an apostrophe is replaced by a
    space, and the result is split on whitespace.

    Args:
        text: String tensor holding the raw review text (the training
            pipeline passes a batch of reviews).
        label: Label(s) belonging to `text`; returned unchanged.

    Returns:
        A `(tokens, label)` tuple where `tokens` is the output of
        `tf.strings.split` (a ragged tensor when `text` is a batch, since
        reviews contain different numbers of words).
    """
    text = tf.cast(text, tf.string)
    text = tf.strings.lower(text)
    # Match every spelling of the line-break tag ("<br />", "<br/>", "<br>").
    # A literal "<br />" pattern would let the other spellings fall through to
    # the next substitution, which strips "<", "/" and ">" and leaves a
    # spurious "br" token behind.
    text = tf.strings.regex_replace(text, r"<br\s*/?>", " ")
    # Keep only letters and apostrophes; everything else becomes a separator.
    text = tf.strings.regex_replace(text, "[^a-zA-Z']", " ")
    text = tf.strings.split(text)
    return text, label

In [3]:
from collections import Counter
import tensorflow as tf

# Count how often each token occurs across the whole training split.
vocabulary = Counter()
tokenized_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in tokenized_batches:
    for review in X_batch:
        # Convert the review's tokens to a plain list of bytes objects
        # before counting them.
        vocabulary.update(review.numpy().tolist())

In [4]:
# Show the three most frequent tokens together with their counts.
vocabulary.most_common(3)

[(b'the', 336195), (b'and', 164138), (b'a', 163108)]

In [5]:
# Keep only the `vocab_size` most frequent tokens, ordered from most to least
# common; the counts themselves are dropped.
vocab_size = 10000
truncated_vocabulary = [
    word for word, _count in vocabulary.most_common(vocab_size)
]

In [6]:
# Build a lookup table mapping each kept word to its rank in
# `truncated_vocabulary`; words that are not in the vocabulary are assigned to
# one of `num_oov_buckets` additional out-of-vocabulary ids.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    vocab_init,
    num_oov_buckets,
)

In [7]:
# Sanity-check the table on one whitespace-tokenized sentence.
# NOTE(review): the sentence is not passed through preprocess(), so the
# capitalised "This" is looked up as-is; the vocabulary was built from
# lowercased text, which is why it lands in an out-of-vocabulary bucket
# (id >= vocab_size), just like the made-up word "faaaaaantastic".
sample_tokens = tf.constant([b"This movie was faaaaaantastic".split()])
table.lookup(sample_tokens)

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[10745,    15,    11, 10053]])>

In [8]:
def encode_words(X_batch, y_batch):
    """Replace every token in `X_batch` by its integer id from `table`.

    Args:
        X_batch: Tokens to look up in the vocabulary table.
        y_batch: Labels belonging to `X_batch`; passed through unchanged.

    Returns:
        A `(ids, y_batch)` tuple, where `ids` is the result of the lookup.
    """
    ids = table.lookup(X_batch)
    return ids, y_batch

# Training input pipeline: batch the raw reviews, tokenize them, map the tokens
# to vocabulary ids, and prefetch one batch ahead.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [9]:
from tensorflow import keras

embed_size = 128

# Sentiment classifier: word embeddings -> two stacked GRU layers -> a single
# sigmoid unit producing a score between 0 and 1.
model = keras.models.Sequential()
# One embedding row per vocabulary word plus one per out-of-vocabulary bucket.
model.add(keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                                 input_shape=[None]))
# The first GRU returns its full output sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
import numpy as np

# Tokenize the sentence with the same preprocess() used to build the training
# set. A bare .split() would leave capitals and punctuation in the tokens
# (e.g. b"This"), and since the vocabulary was built from lowercased,
# punctuation-free text those tokens would all fall into out-of-vocabulary
# buckets.
review_tokens = preprocess(tf.constant([b"This movie sucks"]), None)[0]
# Look up the ids of the single review and restore the batch dimension.
review_ids = tf.expand_dims(table.lookup(review_tokens[0]), axis=0)
predict_x = model.predict(review_ids)

# The model outputs one sigmoid score per review; take it as a plain float.
score = float(predict_x[0, 0])
# NOTE(review): the model is compiled with a binary cross-entropy loss, so the
# middle band presumably reflects an uncertain prediction rather than a learned
# "neutral" class; the 0.7 / 0.4 cut-offs are hand-picked.
if score >= 0.7:
    print('Good')
elif score >= 0.4:
    print('Neutral')
else:
    print('Bad')

Bad


In [14]:
import numpy as np

# Tokenize the sentence with the same preprocess() used to build the training
# set. A bare .split() would leave capitals and punctuation in the tokens
# (e.g. b"This"), and since the vocabulary was built from lowercased,
# punctuation-free text those tokens would all fall into out-of-vocabulary
# buckets.
review_tokens = preprocess(tf.constant([b"This movie is awesome"]), None)[0]
# Look up the ids of the single review and restore the batch dimension.
review_ids = tf.expand_dims(table.lookup(review_tokens[0]), axis=0)
predict_x = model.predict(review_ids)

# The model outputs one sigmoid score per review; take it as a plain float.
score = float(predict_x[0, 0])
# NOTE(review): the model is compiled with a binary cross-entropy loss, so the
# middle band presumably reflects an uncertain prediction rather than a learned
# "neutral" class; the 0.7 / 0.4 cut-offs are hand-picked.
if score >= 0.7:
    print('Good')
elif score >= 0.4:
    print('Neutral')
else:
    print('Bad')

Good


In [24]:
import numpy as np

# Tokenize the sentence with the same preprocess() used to build the training
# set. A bare .split() would leave capitals and punctuation in the tokens
# (e.g. b"I", b"movie,"), and since the vocabulary was built from lowercased,
# punctuation-free text those tokens would all fall into out-of-vocabulary
# buckets.
review_tokens = preprocess(
    tf.constant([b"I like the movie, but it was okay"]), None)[0]
# Look up the ids of the single review and restore the batch dimension.
review_ids = tf.expand_dims(table.lookup(review_tokens[0]), axis=0)
predict_x = model.predict(review_ids)

# The model outputs one sigmoid score per review; take it as a plain float.
score = float(predict_x[0, 0])
# NOTE(review): the model is compiled with a binary cross-entropy loss, so the
# middle band presumably reflects an uncertain prediction rather than a learned
# "neutral" class; the 0.7 / 0.4 cut-offs are hand-picked.
if score >= 0.7:
    print('Good')
elif score >= 0.4:
    print('Neutral')
else:
    print('Bad')

Neutral
