In [1]:
import numpy as np
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# --- Text preprocessing configuration --------------------------------------
# Handed to the Tokenizer as `num_words`.
VOCAB_SIZE = 10_000
# Handed to the Tokenizer as `oov_token`; stands in for words it does not keep.
OOV_TOKEN = "<OOV>"
# Handed to pad_sequences as `maxlen`: every padded row has this many ids.
MAX_LENGTH = 120
# Handed to pad_sequences as `truncating` / `padding`.
TRUNCATING = "post"
PADDING = "post"

In [3]:
# Fetch the IMDB reviews dataset. `with_info=True` makes tfds.load return a
# (datasets, info) pair; `as_supervised=True` yields (text, label) tuples,
# which is the shape the loading loops below unpack.
imdb, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_ds = imdb['train']
test_ds = imdb['test']

In [4]:
def _collect_examples(dataset):
    """Materialise a dataset of (text, label) pairs into in-memory containers.

    Each text element is converted with ``.numpy()`` and decoded as utf8;
    each label is converted with ``.numpy()``.

    Returns
    -------
    (list of str, numpy.ndarray)
        The decoded texts and the labels, in dataset iteration order.
    """
    texts = []
    labels = []
    for text_tensor, label_tensor in dataset:
        texts.append(text_tensor.numpy().decode('utf8'))
        labels.append(label_tensor.numpy())
    return texts, np.array(labels)


# The training split is read first, then the test split. Note that the
# "val" variables are populated from the dataset's test split.
train_sentences, train_labels = _collect_examples(train_ds)
val_sentences, val_labels = _collect_examples(test_ds)

In [5]:
# Build the vocabulary from the training sentences only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(train_sentences)

# word -> id mapping learned by the tokenizer, plus its inverse (id -> word),
# which decode_sequence uses to turn ids back into text.
word_index = tokenizer.word_index
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [6]:
# Map every training sentence to its list of token ids, then show the
# container type and the first encoded review as a sanity check.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
print(
    f"type(train_sequences) = {type(train_sequences)}",
    f"train_sequences[0] = {train_sequences[0]}",
    sep="\n",
)

type(train_sequences) = <class 'list'>
train_sequences[0] = [12, 14, 33, 425, 392, 18, 90, 28, 1, 9, 32, 1366, 3585, 40, 486, 1, 197, 24, 85, 154, 19, 12, 213, 329, 28, 66, 247, 215, 9, 477, 58, 66, 85, 114, 98, 22, 5675, 12, 1322, 643, 767, 12, 18, 7, 33, 400, 8170, 176, 2455, 416, 2, 89, 1231, 137, 69, 146, 52, 2, 1, 7577, 69, 229, 66, 2933, 16, 1, 2904, 1, 1, 1479, 4940, 3, 39, 3900, 117, 1584, 17, 3585, 14, 162, 19, 4, 1231, 917, 7917, 9, 4, 18, 13, 14, 4139, 5, 99, 145, 1214, 11, 242, 683, 13, 48, 24, 100, 38, 12, 7181, 5515, 38, 1366, 1, 50, 401, 11, 98, 1197, 867, 141, 10]


In [7]:
# Bring every sequence to MAX_LENGTH ids using the padding/truncating
# settings from the configuration cell, then inspect the result.
train_padded = pad_sequences(
    train_sequences,
    maxlen=MAX_LENGTH,
    padding=PADDING,
    truncating=TRUNCATING,
)
print(
    f"type(train_padded) = {type(train_padded)}",
    f"train_padded.shape = {train_padded.shape}",
    f"train_padded[0] = \n{train_padded[0]}",
    sep="\n",
)

type(train_padded) = <class 'numpy.ndarray'>
train_padded.shape = (25000, 120)
train_padded[0] = 
[  12   14   33  425  392   18   90   28    1    9   32 1366 3585   40
  486    1  197   24   85  154   19   12  213  329   28   66  247  215
    9  477   58   66   85  114   98   22 5675   12 1322  643  767   12
   18    7   33  400 8170  176 2455  416    2   89 1231  137   69  146
   52    2    1 7577   69  229   66 2933   16    1 2904    1    1 1479
 4940    3   39 3900  117 1584   17 3585   14  162   19    4 1231  917
 7917    9    4   18   13   14 4139    5   99  145 1214   11  242  683
   13   48   24  100   38   12 7181 5515   38 1366    1   50  401   11
   98 1197  867  141   10    0    0    0]


In [8]:
def decode_sequence(sequence, index_to_word=None, unknown='?'):
    """Turn a sequence of token ids back into a space-separated string.

    Parameters
    ----------
    sequence : iterable of int
        Token ids to decode, e.g. one row of ``train_padded``.
    index_to_word : mapping, optional
        Mapping from token id to word. When omitted, the notebook-level
        ``reverse_word_index`` is used, which keeps existing calls working
        while allowing the function to be used with any vocabulary.
    unknown : str, optional
        Placeholder emitted for ids missing from the mapping. In this
        notebook the 0 values added by ``pad_sequences`` are rendered this
        way. Defaults to ``'?'``.

    Returns
    -------
    str
        The decoded words joined by single spaces; an empty string for an
        empty sequence.
    """
    if index_to_word is None:
        # Fall back to the mapping built in the tokenizer cell.
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(token_id, unknown) for token_id in sequence)

In [9]:
# Compare the round-tripped (tokenised -> padded -> decoded) first review
# with its original text.
rule = '-' * 16
decoded_first = decode_sequence(train_padded[0])
print(f"{rule} decoded {rule} \n{decoded_first}")
print(f"{rule} original {rule} \n{train_sentences[0]}")

---------------- decoded ---------------- 
this was an absolutely terrible movie don't be <OOV> in by christopher walken or michael <OOV> both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the <OOV> rebels were making their cases for <OOV> maria <OOV> <OOV> appeared phony and her pseudo love affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actor's like christopher <OOV> good name i could barely sit through it ? ? ?
---------------- original ---------------- 
This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's