In [1]:
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
# Load the IMDB reviews bundled with Keras and unpack the train/test splits.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_train[0][:10]  # the 1st review: its first 10 words, encoded as integer ids

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

**(?)** How to get rid of the `VisibleDeprecationWarning`?

In [3]:
# Keep the return value un-unpacked so its nested structure can be inspected.
imdb_data = keras.datasets.imdb.load_data()
type(imdb_data)

tuple

In [4]:
# Number of top-level entries in the returned tuple.
len(imdb_data)

2

In [5]:
# Show the type and length of each of the two top-level entries.
for i in (0, 1):
    split = imdb_data[i]
    print(i, type(split), len(split))

0 <class 'tuple'> 2
1 <class 'tuple'> 2


In [6]:
# For position i inside each split, show the type found in the first and in
# the second top-level entry side by side.
first_split, second_split = imdb_data[0], imdb_data[1]
for i in (0, 1):
    print(i, type(first_split[i]), type(second_split[i]))

0 <class 'numpy.ndarray'> <class 'numpy.ndarray'>
1 <class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [7]:
# Compare array shapes across the two splits. Position i selects the reviews
# (0) or the labels (1), following the unpacking order used with load_data().
# First column: training split (imdb_data[0]); second: test split (imdb_data[1]).
for i in range(2):
    print(i, imdb_data[0][i].shape, imdb_data[1][i].shape)

0 (25000,) (25000,)
1 (25000,) (25000,)


## Decode a Review

In [8]:
# Fetch the word -> index mapping that accompanies the Keras IMDB dataset.
word_index = keras.datasets.imdb.get_word_index()
type(word_index)

dict

In [9]:
# Number of entries (distinct words) in the mapping.
len(word_index)

88584

In [10]:
# Spot-check the indices stored for two words.
word_index["movie"], word_index["montage"] 

(17, 4223)

`word_index` is a dictionary whose keys are words (i.e. strings) and whose values are the encoded indices.

In [11]:
# id_to_word is almost the inverse of word_index: every index is moved up
# by 3 so that ids 0, 1 and 2 stay free for the special tokens.
id_to_word = {}
for word, index in word_index.items():
    id_to_word[index + 3] = word
# enumerate() yields (0, "<pad>"), (1, "<sos>"), (2, "<unk>") as key/value pairs.
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

**(?)** Why `index + 3`?<br>
**(R)** Note the difference between the two entities:

01. `index`
  - `index` is what `keras.datasets.imdb.get_word_index()` gave us.
02. `id_`
  - `id_` is `index` shifted up by 3 to make room for the 3 special tokens `"<pad>", "<sos>", "<unk>"`, which take the ids 0, 1 and 2.

In [12]:
# Decode the first 10 ids of the first training review back into words.
first_ten_ids = X_train[0][:10]
example_review = " ".join(id_to_word[token_id] for token_id in first_ten_ids)
example_review

'<sos> this film was just brilliant casting location scenery story'

```python
[word_index[word]+3 for word in example_review.split(" ")]
```
<br>

```
KeyError: '<sos>'
```

In [13]:
# Invert id_to_word, then check that re-encoding the decoded review gives
# back the original ids.
word_to_id = dict(zip(id_to_word.values(), id_to_word.keys()))
reencoded_ids = [word_to_id[word] for word in example_review.split(" ")]
print(reencoded_ids)
print(X_train[0][:10])
# word_index has no entry for the special tokens, so "<sos>" is left out
# here and the +3 shift is applied by hand.
plain_words = "this film was just brilliant casting location scenery story".split(" ")
print([word_index[word] + 3 for word in plain_words])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
[14, 22, 16, 43, 530, 973, 1622, 1385, 65]


Let's handle the preprocessing exclusively in TensorFlow, so that the entire processing pipeline lives inside the model and can thus be deployed outside Python, e.g. to mobile devices and web browsers.

In [14]:
import tensorflow_datasets as tfds

In [15]:
# Load the raw-text IMDB reviews from TensorFlow Datasets.
# with_info=True makes the call return a second value (a DatasetInfo object);
# with as_supervised=True each element is a 2-tuple, shown as (tf.string,
# tf.int64) in the `datasets` repr.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
type(datasets), type(info)



(dict, tensorflow_datasets.core.dataset_info.DatasetInfo)

In [16]:
# Number of splits in the returned dict.
len(datasets)

3

In [17]:
# Display the split names with the element shapes and dtypes of each dataset.
datasets

{'test': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [18]:
# Number of training examples, read from the dataset metadata rather than by
# iterating over the dataset.
train_size = info.splits["train"].num_examples
train_size

25000

In [19]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Written with TensorFlow string ops only, so it can be used inside
    ``Dataset.map``.

    Args:
        X_batch: batch of review texts (string tensor).
        y_batch: the matching labels; passed through unchanged so the function
            fits the ``(features, labels)`` signature that ``Dataset.map`` uses
            on a supervised dataset.

    Returns:
        A ``(tokens, y_batch)`` pair. ``tokens`` is a dense string tensor in
        which every row is padded with ``b"<pad>"`` up to the length of the
        longest review in this batch.
    """
    # Keep only the first 300 characters of each review.
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Replace HTML line breaks (<br>, <br/>, <br />) with a space.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    # Replace everything except letters and apostrophes with a space.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Splitting on whitespace yields a ragged result (rows of different
    # lengths); to_tensor() pads it into a regular dense tensor.
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

**(?)** Try to dig deeper into why these particular functions, such as `tf.strings.substr`, were chosen. In particular, what form does `X_batch` take before entering `preprocess`? And does `to_tensor()` mean that every instance in a batch will have exactly the same number of words (i.e. ragged tensor $\implies$ tensor)?

In [20]:
# Look at one preprocessed batch: shapes first, then the contents.
one_batch = datasets["train"].batch(32).map(preprocess).take(1)
for X, y in one_batch:
    print(f"X.shape = {X.shape}")
    print(f"y.shape = {y.shape}")
    print(f"X = {X}")
    print(f"y = {y}")

X.shape = (32, 66)
y.shape = (32,)
X = [[b'I' b"haven't" b'seen' ... b'<pad>' b'<pad>' b'<pad>']
 [b'This' b'was' b'a' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Oh' b'yeah' b'Jenna' ... b'<pad>' b'<pad>' b'<pad>']
 ...
 [b'Great' b'fun' b'I' ... b'<pad>' b'<pad>' b'<pad>']
 [b'What' b'can' b'I' ... b'<pad>' b'<pad>' b'<pad>']
 [b'This' b'film' b'really' ... b'<pad>' b'<pad>' b'<pad>']]
y = [1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0]


In [21]:
# Shapes only, for the first five batches: the second dimension of X changes
# from batch to batch.
five_batches = datasets["train"].batch(32).map(preprocess).take(5)
for X, y in five_batches:
    print(f"X.shape = {X.shape}")
    print(f"y.shape = {y.shape}")

X.shape = (32, 66)
y.shape = (32,)
X.shape = (32, 63)
y.shape = (32,)
X.shape = (32, 66)
y.shape = (32,)
X.shape = (32, 64)
y.shape = (32,)
X.shape = (32, 62)
y.shape = (32,)


In [22]:
# Batching returns another dataset object, not a tensor.
type(datasets["train"].batch(32))

tensorflow.python.data.ops.dataset_ops.BatchDataset

**(?)** We are not modifying `y_batch`. Why not just use input arg `X_batch` and return `X_batch` alone?<br>
**(R)** Later on, there will be a line of code
```python
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
```
This line shows how we would like to use our function `preprocess` and also the reason why it cannot be a function of `X_batch` alone (i.e. must include `y_batch` as input arg as well.)

### Construct the vocabulary
01. going thru the whole training set
02. applying our `preprocess()` function
03. using a `Counter` (in `collections` module) to count the number of occurrences of each word

In [23]:
from collections import Counter

# Count how often each token (including b"<pad>") occurs over one full pass
# of the preprocessed training set.
vocabulary = Counter()
preprocessed_train = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in preprocessed_train:
    # X_batch.numpy() is a 2-D array of bytes objects; each row is one review.
    for review_tokens in X_batch.numpy():
        vocabulary.update(review_tokens.tolist())

**(?)** Try to understand and explain the convoluted line `vocabulary.update(list(review.numpy()))`.

In [24]:
import numpy as np

In [25]:
# Pick one random review from the first batch and show what .numpy() returns.
first_batch_only = datasets["train"].batch(32).map(preprocess).take(1)
for X_batch, y_batch in first_batch_only:
    review = X_batch[np.random.randint(0, len(X_batch))]
    print(f"type(review) = {type(review)}")
    print(f"type(review.numpy()) = {type(review.numpy())}")
    print(f"review.numpy().dtype = {review.numpy().dtype}")
    print(f"review.numpy() = {review.numpy()}")

type(review) = <class 'tensorflow.python.framework.ops.EagerTensor'>
type(review.numpy()) = <class 'numpy.ndarray'>
review.numpy().dtype = object
review.numpy() = [b'I' b'think' b'that' b'the' b'idea' b'of' b'the' b'plot' b'is'
 b'perfect' b'for' b'exploring' b'first' b'of' b'all' b'the' b'emotional'
 b'experiences' b'of' b'the' b'people' b'involved' b'and' b'second' b'as'
 b'someone' b'else' b'wrote' b'in' b'a' b'comment' b'the' b'implications'
 b'of' b'this' b'kind' b'of' b'relationships' b'incest' b'and'
 b'lesbianism' b'in' b'the' b'romanian' b'society' b'so' b'to' b'begin'
 b'with' b'the' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>'
 b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>'
 b'<pad>' b'<pad>']


**(?)** Must we convert the ndarray `review.numpy()` (of data type `object`) into a list before feeding it into the `Counter`'s `update()` method?<br>
**(R)** Not necessarily, it seems.

In [26]:
# Same counting pass as for `vocabulary`, but the ndarray is handed to
# update() directly, without converting it to a list first.
vocabulary2 = Counter()
preprocessed_batches = datasets["train"].batch(32).map(preprocess)
for X_batch, y_batch in preprocessed_batches:
    for review in X_batch:
        vocabulary2.update(review.numpy())

In [27]:
# Display the counter built without the list() conversion.
vocabulary2

Counter({b'I': 27019,
         b'absolutely': 460,
         b'LOVED': 25,
         b'this': 18490,
         b'movie': 14945,
         b'when': 2812,
         b'was': 14950,
         b'a': 38564,
         b'kid': 283,
         b'cried': 33,
         b'every': 867,
         b'time': 3126,
         b'watched': 1124,
         b'it': 17690,
         b'It': 4038,
         b"wasn't": 638,
         b'weird': 160,
         b'to': 27707,
         b'me': 2923,
         b'totally': 320,
         b'identified': 6,
         b'with': 9073,
         b'the': 61137,
         b'characters': 1371,
         b'would': 2451,
         b'love': 1539,
         b'see': 2620,
         b'again': 772,
         b'and': 33431,
         b'hope': 199,
         b'wont': 21,
         b'be': 5888,
         b'disappointed': 325,
         b'Pufnstuf': 4,
         b'rocks': 32,
         b'really': 2924,
         b'drawn': 96,
         b'in': 18966,
         b'fantasy': 153,
         b'world': 649,
         b'And': 1063,
    

In [28]:
# Display the counter built with the list() conversion, for comparison.
vocabulary

Counter({b'Just': 297,
         b'because': 2072,
         b'someone': 419,
         b'is': 25719,
         b'under': 248,
         b'the': 61137,
         b'age': 247,
         b'of': 33983,
         b'does': 962,
         b'not': 6325,
         b'mean': 405,
         b'they': 3008,
         b'are': 5665,
         b'stupid': 395,
         b'If': 1282,
         b'your': 991,
         b'child': 274,
         b'likes': 98,
         b'this': 18490,
         b'film': 10668,
         b"you'd": 71,
         b'better': 1169,
         b'have': 7031,
         b'him': 952,
         b'her': 2322,
         b'tested': 5,
         b'I': 27019,
         b'am': 1094,
         b'continually': 9,
         b'amazed': 78,
         b'at': 4814,
         b'how': 1791,
         b'so': 4521,
         b'many': 1630,
         b'people': 2086,
         b'can': 2337,
         b'be': 5888,
         b'involved': 240,
         b'in': 18966,
         b'something': 979,
         b'that': 14752,
         b'turns': 185,

In [29]:
# Counter equality needs every key to have the same count, so a single
# differing count is enough to give False.
# NOTE(review): the per-word counts checked in the next cells match, so the
# difference is presumably only in b"<pad>": padding depends on which reviews
# share a batch, and batch contents differ between passes -- confirm by
# comparing vocabulary[b"<pad>"] with vocabulary2[b"<pad>"].
vocabulary2 == vocabulary

False

In [30]:
# Spot-check: compare one word's count in both counters.
vocabulary[b"this"], vocabulary2[b"this"]

(18490, 18490)

In [31]:
# Spot-check a second, capitalised word (tokens are case-sensitive).
vocabulary[b"If"], vocabulary2[b"If"]

(1282, 1282)

In [32]:
# The three most frequent tokens, most frequent first.
vocabulary.most_common(3)

[(b'<pad>', 214837), (b'the', 61137), (b'a', 38564)]

As expected, there should be a lot of `"<pad>"`.

In [33]:
# The seven most frequent tokens, most frequent first.
vocabulary.most_common(7)

[(b'<pad>', 214837),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431),
 (b'to', 27707),
 (b'I', 27019)]

In [34]:
# Keep only the vocab_size most frequent tokens, ordered from most to least
# frequent; the counts themselves are dropped.
vocab_size = 10_000
most_frequent = vocabulary.most_common(vocab_size)
truncated_vocabulary = [word for word, _ in most_frequent]

Map byte strings to indices before further mapping them into embedding vectors.

In [35]:
# Both a signed and an unsigned 64-bit integer dtype exist in TensorFlow.
tf.int64, tf.uint64

(tf.int64, tf.uint64)

In [36]:
# Build a lookup table: each kept word maps to its rank in
# truncated_vocabulary (0 = most frequent).
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
# On "can dtype be tf.uint64?": StaticVocabularyTable is documented as
# requiring int64 values, so presumably not -- verify on your TF version.
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
# Words outside the vocabulary are spread over this many extra ids; the
# recorded lookups show such a word getting an id >= vocab_size (e.g. 10053).
n_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(initializer=vocab_init,
                                        num_oov_buckets=n_oov_buckets)

Try looking up the IDs of a few words:

In [37]:
# Plain Python: bytes.split() splits on whitespace and keeps punctuation
# attached to the words.
b"I am fine. And you?".split()

[b'I', b'am', b'fine.', b'And', b'you?']

In [38]:
# Look up a 1-D tensor of words; the last word is not in the vocabulary.
table.lookup(tf.constant(b"This movie was faaaaaantastic".split()))

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([   22,    12,    11, 10053])>

In [39]:
# Same words wrapped in an extra list: the result keeps the (1, 4) input shape.
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [40]:
# A batch of two sentences with the same number of words gives a (2, 4) result.
table.lookup(tf.constant([b"This movie was faaaaaantastic".split(),
                          b"I agree with you".split()]))

<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[   22,    12,    11, 10053],
       [    6,   754,    16,    27]])>

In [41]:
def encode_words(X_batch, y_batch):
    """Replace every word token in ``X_batch`` by its integer id from ``table``.

    Args:
        X_batch: batch of word tokens, as produced by ``preprocess``.
        y_batch: the matching labels, returned unchanged.

    Returns:
        A ``(ids, y_batch)`` pair, where ``ids`` is the result of looking up
        ``X_batch`` in the module-level ``table``.
    """
    encoded = table.lookup(X_batch)
    return encoded, y_batch

# Training pipeline: batch the raw reviews, tokenise them, encode the tokens
# as ids, and prefetch one batch ahead.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [42]:
# Look at one encoded batch: shapes first, then the id matrix and the labels.
one_encoded_batch = train_set.take(1)
for X, y in one_encoded_batch:
    print(f"X.shape = {X.shape}")
    print(f"y.shape = {y.shape}")
    print(f"X = {X}")
    print(f"y = {y}")

X.shape = (32, 66)
y.shape = (32,)
X = [[   22     7     1 ...   188   264   503]
 [  500    82   361 ...     0     0     0]
 [    6    21    73 ...     0     0     0]
 ...
 [  231  2089  1168 ...     0     0     0]
 [10597    22     7 ...     0     0     0]
 [   22     7   235 ...     0     0     0]]
y = [0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0]


In [43]:
# Shapes of the first five encoded batches; the sequence length still changes
# from batch to batch.
five_encoded_batches = train_set.take(5)
for X, y in five_encoded_batches:
    print(f"X.shape = {X.shape}")
    print(f"y.shape = {y.shape}")

X.shape = (32, 66)
y.shape = (32,)
X.shape = (32, 64)
y.shape = (32,)
X.shape = (32, 66)
y.shape = (32,)
X.shape = (32, 64)
y.shape = (32,)
X.shape = (32, 60)
y.shape = (32,)


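Note that the sequences are still **of different lengths across batches**. `to_tensor()` does not enforce one global length: it only pads each batch up to the length of that batch's own longest sequence.

**(?)** Wait, I thought they should have become homogeneous after `to_tensor()`?<br>
**(R)** They are homogeneous *within* each batch (every `X` above is a regular 2-D tensor such as `(32, 66)`); only the second dimension differs from one batch to the next. To force a single length everywhere, pass the `shape` argument to `to_tensor()`.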

In [44]:
# Print the docstring of to_tensor() to check its padding behaviour and the
# optional `shape` argument.
help(tf.RaggedTensor.to_tensor)

Help on function to_tensor in module tensorflow.python.ops.ragged.ragged_tensor:

to_tensor(self, default_value=None, name=None, shape=None)
    Converts this `RaggedTensor` into a `tf.Tensor`.
    
    If `shape` is specified, then the result is padded and/or truncated to
    the specified shape.
    
    Examples:
    
    >>> rt = tf.ragged.constant([[9, 8, 7], [], [6, 5], [4]])
    >>> print(rt.to_tensor())
    tf.Tensor(
        [[9 8 7] [0 0 0] [6 5 0] [4 0 0]], shape=(4, 3), dtype=int32)
    >>> print(rt.to_tensor(shape=[5, 2]))
    tf.Tensor(
        [[9 8] [0 0] [6 5] [4 0] [0 0]], shape=(5, 2), dtype=int32)
    
    Args:
      default_value: Value to set for indices not specified in `self`. Defaults
        to zero.  `default_value` must be broadcastable to
        `self.shape[self.ragged_rank + 1:]`.
      name: A name prefix for the returned tensors (optional).
      shape: The shape of the resulting dense tensor.  In particular,
        `result.shape[i]` is `shape[i]` (if

In [45]:
import tensorflow.keras as keras

More precisely, as the answer to the following Stack Overflow post indicates,
<https://stackoverflow.com/questions/58479556/notimplementederror-cannot-convert-a-symbolic-tensor-2nd-target0-to-a-numpy>

the problem was with NumPy. In my case, I use conda, and it suffices to run the following to make the code cell work:
```bash
conda install -n homl2e numpy=1.19
```

In [None]:
# First version of the model: the embedding layer does not mask padding.
embed_size = 128
model = keras.models.Sequential()
# One embedding row per in-vocabulary word plus one per OOV bucket;
# input_shape=[None] accepts sequences of any length.
model.add(keras.layers.Embedding(vocab_size + n_oov_buckets, embed_size,
                                 input_shape=[None]))
# The first GRU returns the whole sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
# A single sigmoid unit for the binary label.
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["acc"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
     15/Unknown - 44s 861ms/step - loss: 0.6936 - acc: 0.5769

In [46]:
# Second version of the model: same architecture, with padding masked out.
embed_size = 128
model = keras.models.Sequential([
    # One embedding row per in-vocabulary word plus one per OOV bucket.
    # mask_zero=True makes downstream layers ignore id 0, which is b"<pad>"
    # here because it is the most frequent token and ids follow frequency rank.
    keras.layers.Embedding(vocab_size + n_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    # The first GRU returns the whole sequence so the second GRU can consume it.
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# train_set is finite (it is built without .repeat()), so steps_per_epoch is
# not passed: Keras then runs every epoch over the whole dataset. Passing
# steps_per_epoch with a non-repeating dataset makes Keras keep a single
# iterator across epochs, which runs out of data shortly after the first epoch.
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5




