In [48]:
import tensorflow as tf
import tensorflow_datasets as tfds 
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [49]:
# Load the IMDB reviews dataset together with its metadata object.
# The data is requested in supervised form, i.e. every record comes back as
# (tensor containing the input, tensor containing the label).
imdb, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)
# `imdb` is a dict mapping each split name to its own <tf.data.Dataset>.
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [50]:
# Print the dataset description, the feature schema and the split sizes,
# each on its own line.
print(info.description, info.features, info.splits, sep="\n")

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(), dtype=tf.string),
})
{'test': <tfds.core.SplitInfo num_examples=25000>, 'train': <tfds.core.SplitInfo num_examples=25000>, 'unsupervised': <tfds.core.SplitInfo num_examples=50000>}


* Each of the 25000 records in the <tf.data.Dataset> (for the train set and test set) is described by a FeaturesDict. 
* The FeaturesDict consists of a **string tensor** called "text" (containing the review) and an **integer tensor** called "label" (containing the label). Because the dataset was loaded with `as_supervised = True`, each record is yielded as a `(text, label)` tuple.
* We need to convert the string tensor of each record to a Python string and the integer tensor to a NumPy integer, using the tensors' `.numpy()` method.

In [51]:
train_data, test_data = imdb['train'], imdb['test']

reviews_train = []
labels_train = []
reviews_test = []
labels_test = []

# Each record is a (review, label) pair of tensors.
# Tensors are explicitly converted to np arrays using their .numpy() method.
# review.numpy() is b"This was an absolutely terrible movie. Don't ..." , of <class 'bytes'>
#
# A string is a sequence of characters, i.e. unicode symbols that can't be directly stored on disk.
# A byte string is a sequence of bytes - things that can be stored on disk.
# The mapping between bytes and unicode symbols is an encoding.
# There are many different types of encodings.
# We need to convert the byte string to an actual string, using the decode() function of the byte string.
for review, label in train_data:
    reviews_train.append(review.numpy().decode("utf8"))
    # Collect the label alongside its review so that labels_train stays
    # aligned with reviews_train (same length, same order).
    labels_train.append(label.numpy())

# Same conversion for the test split.
for review, label in test_data:
    reviews_test.append(review.numpy().decode("utf8"))
    labels_test.append(label.numpy())

In [52]:
# Sanity check: show the first training review after decoding it to a plain string.
print(reviews_train[0])

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.


In [53]:
# --- Text preprocessing settings ---
vocab_size = 10_000     # passed to the Tokenizer as num_words
max_length = 120        # length every sequence is padded / truncated to
trunc_type = "post"     # passed to pad_sequences as its truncating mode
oov_tok = "UNK"         # placeholder for any word that is not part of the vocab_size words

# --- Model settings ---
embedding_dim = 16      # size of the vector learned for each token

# Tokenize words and vectorize sentences

In [54]:
# Build the vocabulary from the training reviews only; the test reviews are
# encoded with this same tokenizer, so words it cannot map become oov_tok.
tokenizer = Tokenizer(num_words = vocab_size, oov_token= oov_tok)
tokenizer.fit_on_texts(reviews_train)
word_index = tokenizer.word_index

# Turn each review into a list of token ids, then pad / truncate to max_length.
seqs_train = tokenizer.texts_to_sequences(reviews_train)
padded_train = pad_sequences(seqs_train, maxlen=max_length, truncating=trunc_type)

# The test set must be truncated the same way as the training set (trunc_type);
# otherwise long test reviews would keep a different part of the text than
# long training reviews do.
seqs_test = tokenizer.texts_to_sequences(reviews_test)
padded_test = pad_sequences(seqs_test, maxlen=max_length, truncating=trunc_type)

In [55]:
# Invert the tokenizer's word -> id mapping so that ids can be looked up as words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(paddedseq):
    """Turn a padded sequence of token ids back into a space-separated string.

    Ids that have no entry in reverse_word_index are rendered as "?".
    Thus, all the zeros in paddedseq (i.e. the pad characters) will be "?".
    """
    words = []
    for token in paddedseq:
        # Rather than dict[key], dict.get(key, "?") lets us return "?" if a key is missing.
        words.append(reverse_word_index.get(token, "?"))
    return " ".join(words)

In [56]:
# Round-trip check: map the first padded training sequence back to words.
print(decode_review(padded_train[0]))

? ? ? this was an absolutely terrible movie don't be UNK in by christopher walken or michael UNK both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the UNK rebels were making their cases for UNK maria UNK UNK appeared phony and her pseudo love affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining actor's like christopher UNK good name i could barely sit through it


In [57]:
# Show the raw token ids of the first padded training sequence.
padded_train[0]

array([   0,    0,    0,   12,   14,   33,  425,  392,   18,   90,   28,
          1,    9,   32, 1366, 3585,   40,  486,    1,  197,   24,   85,
        154,   19,   12,  213,  329,   28,   66,  247,  215,    9,  477,
         58,   66,   85,  114,   98,   22, 5675,   12, 1322,  643,  767,
         12,   18,    7,   33,  400, 8170,  176, 2455,  416,    2,   89,
       1231,  137,   69,  146,   52,    2,    1, 7577,   69,  229,   66,
       2933,   16,    1, 2904,    1,    1, 1479, 4940,    3,   39, 3900,
        117, 1584,   17, 3585,   14,  162,   19,    4, 1231,  917, 7917,
          9,    4,   18,   13,   14, 4139,    5,   99,  145, 1214,   11,
        242,  683,   13,   48,   24,  100,   38,   12, 7181, 5515,   38,
       1366,    1,   50,  401,   11,   98, 1197,  867,  141,   10],
      dtype=int32)

# Create model
## Create a 16-dimensional embedding, for each token (i.e. each tokenized word) in each padded sequence (i.e. each vectorized sentence)

In [None]:
# Start the model with an embedding layer that maps each of the vocab_size
# token ids to an embedding_dim-dimensional vector.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))