In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [4]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_train[0][:10]  # the 1st review, its first 10 words

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

**(?)** How to get rid of the `VisibleDeprecationWarning`?

In [5]:
imdb_data = keras.datasets.imdb.load_data()
type(imdb_data)

tuple

In [6]:
len(imdb_data)

2

In [7]:
# Show the type and length of each of the two top-level entries of imdb_data.
for position, split in enumerate(imdb_data):
    print(position, type(split), len(split))

0 <class 'tuple'> 2
1 <class 'tuple'> 2


In [8]:
# Pair up the i-th component of the train split (imdb_data[0]) with the i-th
# component of the test split (imdb_data[1]) and show both container types.
for i, (train_part, test_part) in enumerate(zip(*imdb_data)):
    print(i, type(train_part), type(test_part))

0 <class 'numpy.ndarray'> <class 'numpy.ndarray'>
1 <class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [22]:
# Compare the shapes of the train split (imdb_data[0]) and the test split
# (imdb_data[1]); i = 0 selects the reviews, i = 1 the labels.
# The first printed shape must come from imdb_data[0], otherwise the test
# split is simply printed twice and the train split is never inspected.
for i in range(2):
    print(i, imdb_data[0][i].shape, imdb_data[1][i].shape)

0 (25000,) (25000,)
1 (25000,) (25000,)


## Decode a Review

In [9]:
word_index = keras.datasets.imdb.get_word_index()
type(word_index)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


dict

In [24]:
len(word_index)

88584

In [26]:
word_index["movie"], word_index["montage"] 

(17, 4223)

`word_index` is a dictionary whose keys are words (i.e. strings) and whose values are the encoded indices.

In [32]:
# id_to_word is (almost) the inverse of word_index: every index is moved up
# by 3 so that the ids 0, 1 and 2 stay free for the three special tokens.
id_to_word = {idx + 3: vocab_word for vocab_word, idx in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

**(?)** Why `index + 3`?<br>
**(R)** Note the difference between the two entities:

01. `index`
  - `index` is what `keras.datasets.imdb.get_word_index()` gave us.
02. `id_`
  - `id_` is `index` shifted up by 3 to make room for the 3 special tokens `"<pad>", "<sos>", "<unk>"`, which take the ids 0, 1 and 2.

In [33]:
# Decode the first ten ids of the first training review back into words.
first_ten_ids = X_train[0][:10]
example_review = " ".join(id_to_word[id_] for id_ in first_ten_ids)
example_review

'<sos> this film was just brilliant casting location scenery story'

```python
[word_index[word]+3 for word in example_review.split(" ")]
```
<br>

```
KeyError: '<sos>'
```

In [39]:
# Build the reverse mapping (word -> id) and check that re-encoding the
# decoded review gives back the original ids.
word_to_id = {token: key for key, token in id_to_word.items()}
reencoded_ids = [word_to_id[token] for token in example_review.split(" ")]
print(reencoded_ids)
print(X_train[0][:10])
# word_index has no entry for "<sos>", so only the plain words are looked up
# there, with the +3 offset applied by hand.
plain_words = "this film was just brilliant casting location scenery story".split(" ")
print([word_index[token] + 3 for token in plain_words])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
[14, 22, 16, 43, 530, 973, 1622, 1385, 65]


Let's handle the preprocessing exclusively in TensorFlow, so that the entire processing pipeline lives inside the model and can thus be deployed outside Python, e.g. on mobile devices and in web browsers.

In [1]:
import tensorflow_datasets as tfds

In [2]:
# Load the raw-text IMDB reviews. `datasets` is a dict of splits whose
# elements are (text, label) pairs; `info` is the accompanying DatasetInfo
# object requested via with_info=True.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
type(datasets), type(info)

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /home/phunc20/tensorflow_datasets/imdb_reviews/plain_text/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/10 [00:00<?, ? shard/s]

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/10 [00:00<?, ? shard/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/20 [00:00<?, ? shard/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /home/phunc20/tensorflow_datasets/imdb_reviews/plain_text/0.1.0. Subsequent calls will reuse this data.[0m


(dict, tensorflow_datasets.core.dataset_info.DatasetInfo)

In [3]:
len(datasets)

3

In [4]:
datasets

{'test': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <_OptionsDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [5]:
train_size = info.splits["train"].num_examples
train_size

25000

In [6]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a dense batch of word tokens.

    Steps, applied to every review in the batch:
      1. keep only the leading part of the text (length 300, as passed to
         `tf.strings.substr`),
      2. replace HTML line breaks such as `<br />` by a space,
      3. replace every character that is neither a letter nor an apostrophe
         by a space,
      4. split on whitespace into word tokens,
      5. pad the resulting variable-length rows with b"<pad>" so that the
         batch becomes a regular tensor.

    Args:
        X_batch: batch of review texts (string tensor).
        y_batch: batch of labels; passed through untouched so the function
            can be used directly with `Dataset.map` on (X, y) pairs.

    Returns:
        A `(tokens, y_batch)` pair, where `tokens` is the padded token tensor.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

**(?)** Try to dig deeper into why these particular functions (`tf.strings.substr`, etc.) were chosen. In particular, what form does `X_batch` take before it enters `preprocess`?

In [7]:
type(datasets["train"].batch(32))

tensorflow.python.data.ops.dataset_ops.BatchDataset

**(?)** We are not modifying `y_batch`. Why not just use input arg `X_batch` and return `X_batch` alone?<br>
**(R)** Later on, there will be a line of code
```python
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
```
This line shows how we would like to use our function `preprocess` and also the reason why it cannot be a function of `X_batch` alone (i.e. must include `y_batch` as input arg as well.)

### Construct the vocabulary
01. going thru the whole training set
02. applying our `preprocess()` function
03. using a `Counter` (in `collections` module) to count the number of occurrences of each word

In [19]:
from collections import Counter
# Count how often every token occurs over the whole preprocessed training set.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    # X_batch is the padded token tensor returned by preprocess();
    # iterating over it yields one review (one row of tokens) at a time.
    for review in X_batch:
        # review.numpy() is an ndarray of bytes tokens; the padding token
        # b"<pad>" is counted like any other token.
        vocabulary.update(list(review.numpy()))

**(?)** Try to understand and explain the convoluted line `vocabulary.update(list(review.numpy()))`.

In [17]:
# Peek at one randomly chosen review of the first preprocessed batch to see
# what the inner loop of the vocabulary construction actually receives.
# Requires `import numpy as np` in the notebook's import cell; without it this
# cell fails with a NameError on a fresh kernel.
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess).take(1):
    k = np.random.randint(0, len(X_batch))  # random row index within the batch
    review = X_batch[k]
    review_array = review.numpy()  # convert once and reuse for all prints
    print(f"type(review) = {type(review)}")
    print(f"type(review.numpy()) = {type(review_array)}")
    print(f"review.numpy().dtype = {review_array.dtype}")
    print(f"review.numpy() = {review_array}")

type(review) = <class 'tensorflow.python.framework.ops.EagerTensor'>
type(review.numpy()) = <class 'numpy.ndarray'>
review.numpy().dtype = object
review.numpy() = [b'This' b'is' b'the' b'most' b'depressing' b'film' b'I' b'have' b'ever'
 b'seen' b'I' b'first' b'saw' b'it' b'as' b'a' b'child' b'and' b'even'
 b'thinking' b'about' b'it' b'now' b'really' b'upsets' b'me' b'I' b'know'
 b'it' b'was' b'set' b'in' b'a' b'time' b'when' b'life' b'was' b'hard'
 b'and' b'I' b'know' b'these' b'people' b'were' b'poor' b'and' b'the'
 b'crops' b'were' b'vital' b'Yes' b'I' b'get' b'all' b'that' b'What' b'I'
 b'find' b'hard' b'to' b'take' b'is' b'I' b"can't" b'remember' b'o']


**(?)** Must we convert the ndarray `review.numpy()` (of data type `object`) into a list before feeding it into the `Counter`'s `update()` method?<br>
**(R)** Not necessarily, it seems.

In [20]:
# Same counting pass as for `vocabulary`, but feeding the ndarray returned by
# review.numpy() to Counter.update() directly, without wrapping it in a list.
vocabulary2 = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary2.update(review.numpy())

In [21]:
vocabulary2

Counter({b'This': 6672,
         b'was': 14950,
         b'a': 38564,
         b'wonderfully': 78,
         b'clever': 132,
         b'and': 33431,
         b'entertaining': 418,
         b'movie': 14945,
         b'that': 14752,
         b'I': 27019,
         b'shall': 27,
         b'never': 1416,
         b'tire': 13,
         b'of': 33983,
         b'watching': 1437,
         b'many': 1630,
         b'times': 792,
         b'The': 11879,
         b'casting': 148,
         b'magnificent': 41,
         b'in': 18966,
         b'matching': 5,
         b'up': 2508,
         b'the': 61137,
         b'young': 846,
         b'with': 9073,
         b'older': 134,
         b'characters': 1371,
         b'There': 1230,
         b'are': 5665,
         b'those': 1035,
         b'us': 575,
         b'out': 3376,
         b'here': 919,
         b'who': 4266,
         b'really': 2924,
         b'do': 1593,
         b'appreciate': 118,
         b'good': 3727,
         b'actors': 1087,
         b'an'

In [22]:
vocabulary

Counter({b'Just': 297,
         b'because': 2072,
         b'someone': 419,
         b'is': 25719,
         b'under': 248,
         b'the': 61137,
         b'age': 247,
         b'of': 33983,
         b'does': 962,
         b'not': 6325,
         b'mean': 405,
         b'they': 3008,
         b'are': 5665,
         b'stupid': 395,
         b'If': 1282,
         b'your': 991,
         b'child': 274,
         b'likes': 98,
         b'this': 18490,
         b'film': 10668,
         b"you'd": 71,
         b'better': 1169,
         b'have': 7031,
         b'him': 952,
         b'her': 2322,
         b'tested': 5,
         b'I': 27019,
         b'am': 1094,
         b'continually': 9,
         b'amazed': 78,
         b'at': 4814,
         b'how': 1791,
         b'so': 4521,
         b'many': 1630,
         b'people': 2086,
         b'can': 2337,
         b'be': 5888,
         b'involved': 240,
         b'in': 18966,
         b'something': 979,
         b'that': 14752,
         b'turns': 185,

In [23]:
# NOTE(review): this evaluates to False although the spot checks on individual
# words agree. Presumably the two passes grouped the reviews into different
# batches (the two Counters list their keys in different orders), so the
# number of b"<pad>" tokens differs, since padding length depends on the
# longest review in each batch.
# TODO confirm by comparing vocabulary[b"<pad>"] with vocabulary2[b"<pad>"].
vocabulary2 == vocabulary

False

In [24]:
vocabulary[b"this"], vocabulary2[b"this"]

(18490, 18490)

In [25]:
vocabulary[b"If"], vocabulary2[b"If"]

(1282, 1282)

In [50]:
# The three most frequent tokens. Passing n to most_common() returns the same
# top-n list as slicing the full result, without sorting the whole vocabulary.
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

As expected, there should be a lot of `"<pad>"`.