In [2]:
import tensorflow.keras as keras
import tensorflow as tf

In [3]:
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
filepath

'/home/phunc20/.keras/datasets/shakespeare.txt'

In [4]:
# Pass the encoding explicitly so the text decodes the same way on every platform
# (the default used by open() depends on the locale).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Peek at the beginning of the corpus.
shakespeare_text[:150]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nA'

In [5]:
# char_level=True: tokenize at the character level rather than at the word level.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x7f1606ef0950>

In [6]:
help(tokenizer.fit_on_texts)

Help on method fit_on_texts in module keras_preprocessing.text:

fit_on_texts(texts) method of keras_preprocessing.text.Tokenizer instance
    Updates internal vocabulary based on a list of texts.
    
    In the case where texts contains lists,
    we assume each entry of the lists to be a token.
    
    Required before using `texts_to_sequences` or `texts_to_matrix`.
    
    # Arguments
        texts: can be a list of strings,
            a generator of strings (for memory-efficiency),
            or a list of list of strings.



In [7]:
# fit_on_texts() expects a list of texts, but the raw string is passed here on purpose.
# A string is iterated character by character, so every character is counted as one
# "text"; this is why tokenizer.document_count later equals len(shakespeare_text).
tokenizer.fit_on_texts(shakespeare_text)

In [8]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [9]:
tokenizer.texts_to_sequences("First")

[[20], [6], [9], [8], [3]]

In [10]:
L = list("First")
L

['F', 'i', 'r', 's', 't']

In [11]:
tokenizer.texts_to_sequences(L)

[[20], [6], [9], [8], [3]]

In [12]:
[L]

[['F', 'i', 'r', 's', 't']]

In [13]:
tokenizer.texts_to_sequences([L])

[[20, 6, 9, 8, 3]]

In [14]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [15]:
tokenizer.sequences_to_texts([[20], [6], [9], [8], [3]])

['f', 'i', 'r', 's', 't']

In [16]:
max_id = len(tokenizer.word_index)
max_id  # num of distinct characters

39

In [17]:
tokenizer.sequences_to_texts([range(1, max_id+1)])

["  e t o a i h s r n \n l d u m y w , c f g b p : k v . ' ; ? ! - j q x z 3 & $"]

**Remark.** `sequences_to_texts()` joins the decoded tokens with a single space, which is why the characters above appear separated by an extra space character.

In [18]:
# Because fit_on_texts() was given the raw string (not a list), document_count
# matches the number of characters in the corpus (compare with len(shakespeare_text)).
dataset_size = tokenizer.document_count
dataset_size

1115394

In [19]:
len(shakespeare_text)

1115394

In [20]:
tokenizer.sequences_to_texts([[0]])

['']

In [21]:
tokenizer.sequences_to_texts([[0]])[0] == ''

True

In [22]:
import numpy as np

In [23]:
# Encode the whole corpus as one sequence and shift the IDs from 1-39 down to 0-38.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1
encoded

array([19,  5,  8, ..., 20, 26, 10])

In [24]:
len(encoded)

1115394

In [25]:
train_size = dataset_size * 90 // 100
train_size

1003854

In [26]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

## Chopping the Sequential Dataset into Multiple Windows

In [27]:
n_steps = 100  # number of input characters per training sequence
window_length = n_steps + 1  # target = input shifted 1 character ahead
# shift=1: consecutive windows start one character apart, so they overlap heavily.
# drop_remainder=True: every window kept has exactly window_length elements.
# Each window is itself a nested dataset, hence the flat_map() calls further down.
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [28]:
# Flatten the nested windows back into a stream of scalars and peek at the first 10.
ds_just_flat = dataset.flat_map(lambda window: window)
for idx, scalar in enumerate(ds_just_flat):
    if idx >= 10:
        break
    print(scalar)

tf.Tensor(19, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(18, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)


In [29]:
# batch(window_length) turns each nested window dataset into a single tensor of
# window_length elements; flat_map() then flattens the result into a dataset of tensors.
dataset = dataset.flat_map(lambda ds: ds.batch(window_length))

In [30]:
# Show the shape of the first 10 windows.
for idx, window in enumerate(dataset):
    if idx >= 10:
        break
    print(window.shape)

(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)


In [31]:
batch_size = 32
dataset = dataset.shuffle(10_000).batch(batch_size)
# Recall that the arg inside shuffle() is the buffer size

In [32]:
# Print the first 3 shuffled batches.
for idx, shuffled_batch in enumerate(dataset):
    if idx >= 3:
        break
    print(shuffled_batch)

tf.Tensor(
[[10 21 13 ... 10  5  9]
 [ 1 12  0 ... 12  5  1]
 [ 6  1  0 ... 12  0 14]
 ...
 [ 7 22  1 ...  4  2  0]
 [ 0  8  1 ...  0  7  3]
 [ 1  0 18 ... 13  9  5]], shape=(32, 101), dtype=int64)
tf.Tensor(
[[ 4  2  0 ...  0 15  3]
 [ 0 16  6 ...  9  3  2]
 [11  0 12 ...  2  3  0]
 ...
 [ 3  0 22 ...  1 11 25]
 [ 7 10  2 ...  6  0  4]
 [ 9  5 13 ... 13  9  7]], shape=(32, 101), dtype=int64)
tf.Tensor(
[[ 9  5 14 ... 11 11  3]
 [22  5  9 ...  0 20  5]
 [ 2  6  1 ...  9  0  4]
 ...
 [ 0  3 19 ... 15  3 13]
 [31 18  8 ...  0  2  3]
 [ 6  3 13 ...  3 19  0]], shape=(32, 101), dtype=int64)


In [33]:
# Split every window into (inputs, targets): inputs drop the last character,
# targets drop the first, i.e. the target is the input shifted one step ahead.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [34]:
# Inspect the (inputs, targets) structure of the first 2 batches.
for idx, pair in enumerate(dataset):
    if idx >= 2:
        break
    print(type(pair), len(pair), pair[0].shape, pair[1].shape)

<class 'tuple'> 2 (32, 100) (32, 100)
<class 'tuple'> 2 (32, 100) (32, 100)


**(?)** Why is the target a tensor of length `100` rather than a single character?

In [35]:
# One-hot encode only the inputs; the targets stay as integer character IDs.
dataset = dataset.map(lambda X_batch, Y_batch:
                      (tf.one_hot(X_batch, depth=max_id), Y_batch))

Let's verify that `tf.one_hot()` behaves as expected.

In [36]:
# maxval is exclusive, so use max_id (not max_id - 1) to sample every ID in 0..max_id-1.
tf.random.uniform(shape=(32, 100), maxval=max_id, dtype=tf.int32)

<tf.Tensor: shape=(32, 100), dtype=int32, numpy=
array([[23, 25, 22, ..., 31, 22, 27],
       [11,  0, 17, ..., 15, 16, 22],
       [32, 16, 10, ..., 37, 18, 16],
       ...,
       [19,  5, 19, ..., 27, 17,  5],
       [ 8, 30, 20, ...,  1, 21,  0],
       [33, 14, 30, ..., 12, 27, 24]], dtype=int32)>

In [37]:
# Random character IDs covering the full range 0..max_id-1 (maxval is exclusive,
# so max_id - 1 would never produce the last ID).
X_batch = tf.random.uniform(shape=(32, 100), maxval=max_id, dtype=tf.int32)
# depth=max_id: one slot per distinct character.
one_hotted = tf.one_hot(X_batch, depth=max_id)
one_hotted

<tf.Tensor: shape=(32, 100, 39), dtype=float32, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1.,

In [38]:
# List every name in tf.math that mentions "sum".
sum_like = [name for name in dir(tf.math) if "sum" in name]
for name in sum_like:
    print(name)

cumsum
cumulative_logsumexp
reduce_logsumexp
reduce_sum
segment_sum
unsorted_segment_sum


In [39]:
help(tf.math.reduce_sum)

Help on function reduce_sum in module tensorflow.python.ops.math_ops:

reduce_sum(input_tensor, axis=None, keepdims=False, name=None)
    Computes the sum of elements across dimensions of a tensor.
    
    Reduces `input_tensor` along the dimensions given in `axis`.
    Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
    of the entries in `axis`, which must be unique. If `keepdims` is true, the
    reduced dimensions are retained with length 1.
    
    If `axis` is None, all dimensions are reduced, and a
    tensor with a single element is returned.
    
    For example:
    
    >>> # x has a shape of (2, 3) (two rows and three columns):
    >>> x = tf.constant([[1, 1, 1], [1, 1, 1]])
    >>> x.numpy()
    array([[1, 1, 1],
           [1, 1, 1]], dtype=int32)
    >>> # sum all the elements
    >>> # 1 + 1 + 1 + 1 + 1+ 1 = 6
    >>> tf.reduce_sum(x).numpy()
    6
    >>> # reduce along the first dimension
    >>> # the result is [1, 1, 1] + [1, 1, 1] = [2, 

In [40]:
# Summing over the last (one-hot) axis should give exactly 1 for every character.
tf.math.reduce_sum(one_hotted, axis=-1)

<tf.Tensor: shape=(32, 100), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>

In [41]:
# The total number of ones should equal the number of characters: 32 * 100 = 3200.
tf.math.reduce_sum(one_hotted)

<tf.Tensor: shape=(), dtype=float32, numpy=3200.0>

In [42]:
dataset = dataset.prefetch(1)

In [43]:
# Final sanity check: inputs are one-hot (32, 100, 39), targets are IDs (32, 100).
for idx, pair in enumerate(dataset):
    if idx >= 2:
        break
    print(type(pair), len(pair), pair[0].shape, pair[1].shape)

<class 'tuple'> 2 (32, 100, 39) (32, 100)
<class 'tuple'> 2 (32, 100, 39) (32, 100)


## Building and Training the Char-RNN Model

**(?)** Why use `sparse_categorical_crossentropy`?

**(?)** `GRU` with `128` units. What does that really mean?

**(?)** What do `dropout` and `recurrent_dropout` mean in `keras.layers.GRU()`?

In [58]:
# Two stacked GRU layers followed by a per-time-step softmax over the character IDs.
model = keras.models.Sequential([
    # input_shape=[None, max_id]: sequences of any length, one one-hot vector per step.
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    # return_sequences=True on both GRUs: an output is needed at every time step,
    # because the target is a whole sequence (the input shifted by one character).
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    # Applied at every time step: one probability per character ID.
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax")),
])
# "sparse" loss because the targets are integer character IDs, not one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [None]:
history = model.fit(dataset, epochs=20)

## Using the Char-RNN Model

In [44]:
def preprocess(texts):
    """One-hot encode texts so they can be fed to the char-RNN.

    Args:
        texts: texts to encode with the fitted ``tokenizer``. Pass a list of
            strings of equal length; sequences of different lengths cannot be
            turned into a rectangular array. A bare string is treated as a
            list of one-character texts.

    Returns:
        A float tensor with one extra trailing axis of size ``max_id``
        holding the one-hot encoding of each character.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    zero_based_ids = char_ids - 1  # tokenizer IDs start at 1, one-hot slots at 0
    return tf.one_hot(zero_based_ids, depth=max_id)

In [47]:
# A bare string is treated as 10 one-character texts, hence the (10, 1, 39) shape.
X_new = preprocess("How are yo")
X_new.shape

TensorShape([10, 1, 39])

In [49]:
# Wrapping the string in a list gives a single sequence of 10 characters: (1, 10, 39).
X_new = preprocess(["How are yo"])
X_new.shape

TensorShape([1, 10, 39])

`[1, 10, 39]` is a shape similar to a batch's shape `(32, 100, 39)` above. Let's try to input a list of two strings.

```python
X_new = preprocess(["How are yo", "I'm fin"])
X_new.shape
----------------------------------------------
```
```
TypeError                                 Traceback (most recent call last)
<ipython-input-48-1dc23507da9a> in <module>
----> 1 X_new = preprocess(["How are yo", "I'm fin"])
      2 X_new.shape

<ipython-input-44-113c814f9c80> in preprocess(texts)
      1 def preprocess(texts):
----> 2     X = np.array(tokenizer.texts_to_sequences(texts)) - 1
      3     return tf.one_hot(X, max_id)  # equiv. to tf.one_hot(X, depth=max_id)

TypeError: unsupported operand type(s) for -: 'list' and 'int'
```

In [50]:
tokenizer.texts_to_sequences(["How are yo", "I'm fin"])

[[7, 4, 17, 1, 5, 9, 2, 1, 16, 4], [6, 28, 15, 1, 20, 6, 10]]

In [53]:
# Rebuild the sequences explicitly instead of relying on `_` (the previous cell's
# output), which breaks when cells are re-run in a different order. The two
# sequences have different lengths, so NumPy needs dtype=object to build the array.
np.array(tokenizer.texts_to_sequences(["How are yo", "I'm fin"]), dtype=object)

  """Entry point for launching an IPython kernel.


array([list([7, 4, 17, 1, 5, 9, 2, 1, 16, 4]),
       list([6, 28, 15, 1, 20, 6, 10])], dtype=object)

This results in a NumPy array of `dtype=object` (more precisely, an array of lists) because the two lists have different lengths. That's why `1` cannot be subtracted from it.

In [59]:
# Sequential.predict_classes() is deprecated and was removed in TensorFlow 2.6.
# Taking the argmax over the last (class) axis of predict() yields the same class IDs.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
Y_pred



array([[34, 34, 34, 13, 13, 13, 22,  9, 13, 13]])

In [64]:
Y_pred.shape

(1, 10)

In [62]:
tokenizer.sequences_to_texts(Y_pred + 1)

['x x x u u u p n u u']

Note that our input was **a single batch** containing one sequence of `10` characters, so the output `Y_pred` has shape `(1, 10)`. Mapped back to the corresponding characters, it gives `10` characters.

In [60]:
tokenizer.sequences_to_texts(Y_pred + 1)[0]

'x x x u u u p n u u'

In [61]:
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'u'

**(?)** Why add `1` to `Y_pred` before passing it to `tokenizer.sequences_to_texts()`?

In [63]:
tokenizer.sequences_to_texts(Y_pred)

['q q q d d d b r d d']

**(R)** We have to add `1` to `Y_pred` because the RNN model only outputs class IDs in `[0..38]`, whereas our `tokenizer` encodes the characters as `[1..39]`.

**(?)** `predict()` vs `predict_classes()`?