In [1]:
import numpy as np
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
VOCAB_SIZE = 10000
OOV_TOKEN = '<OOV>'
MAX_LENGTH = 120
TRUNCATING = 'post'
PADDING = 'post'
EMBEDDING_DIM = 16

In [3]:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_data, test_data = imdb['train'], imdb['test']

In [4]:
train_sentences = []
train_labels = []

val_sentences = []
val_labels = []

for sentence, label in train_data:
    train_sentences.append(sentence.numpy().decode('utf8'))
    train_labels.append(label.numpy())

for sentence, label in test_data:
    val_sentences.append(sentence.numpy().decode('utf8'))
    val_labels.append(label.numpy())

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

In [5]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [6]:
def decode_sequence(sequence):
    return ' '.join([reverse_word_index.get(i, '?') for i in sequence])

In [7]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding=PADDING, truncating=TRUNCATING)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding=PADDING, truncating=TRUNCATING)
print(f"train_padded.shape = {train_padded.shape}, val_padded.shape = {val_padded.shape}")

train_padded.shape = (25000, 120), val_padded.shape = (25000, 120)


In [8]:
model = keras.Sequential([
    keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
    keras.layers.Flatten(),
    keras.layers.Dense(6, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(
    train_padded, 
    train_labels,
    epochs=10, 
    validation_data=(val_padded, val_labels)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2d43b7537c8>

In [10]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
padded = pad_sequences(sequence, maxlen=MAX_LENGTH, truncating=TRUNCATING)
model.predict(padded)

array([[0.7519765]], dtype=float32)