In [2]:
import tensorflow.keras as keras
import tensorflow as tf

In [4]:
# Download the Shakespeare corpus via the Keras helper and keep the local
# path it returns (displayed below).
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
filepath

Downloading data from https://homl.info/shakespeare


'/home/phunc20/.keras/datasets/shakespeare.txt'

In [7]:
# Read the whole corpus into memory. The encoding is spelled out so the
# result does not depend on the platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Peek at the first 150 characters to confirm the file loaded as expected.
shakespeare_text[:150]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nA'

In [8]:
# char_level=True: tokens are single characters rather than words
# (the encoding experiments below return one id per character).
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x7f077b183dd0>

In [9]:
# Check what kind of argument fit_on_texts() expects before calling it.
help(tokenizer.fit_on_texts)

Help on method fit_on_texts in module keras_preprocessing.text:

fit_on_texts(texts) method of keras_preprocessing.text.Tokenizer instance
    Updates internal vocabulary based on a list of texts.
    
    In the case where texts contains lists,
    we assume each entry of the lists to be a token.
    
    Required before using `texts_to_sequences` or `texts_to_matrix`.
    
    # Arguments
        texts: can be a list of strings,
            a generator of strings (for memory-efficiency),
            or a list of list of strings.



In [10]:
# What would happen if we forgot its arg has to be a list?
# A bare string is consumed one character at a time, so every character ends
# up counted as a separate text in tokenizer.document_count.
tokenizer.fit_on_texts(shakespeare_text)

In [12]:
# A list holding one string is one text: a single sequence of 5 character ids.
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [11]:
# A bare string is handled as 5 one-character texts: 5 sequences of one id each.
tokenizer.texts_to_sequences("First")

[[20], [6], [9], [8], [3]]

In [15]:
# Split the word into a list of single characters for the next experiments.
L = list("First")
L

['F', 'i', 'r', 's', 't']

In [16]:
# A flat list of characters gives the same result as the bare string:
# each list element is encoded as its own text.
tokenizer.texts_to_sequences(L)

[[20], [6], [9], [8], [3]]

In [17]:
# Wrap the character list in an outer list: one entry made of 5 tokens.
[L]

[['F', 'i', 'r', 's', 't']]

In [18]:
# A list of lists: the inner list is taken as the tokens of a single text,
# so the result matches the one obtained for ["First"].
tokenizer.texts_to_sequences([L])

[[20, 6, 9, 8, 3]]

In [13]:
# Decoding one sequence yields one string with the tokens separated by
# spaces; note the result is lowercase ('f', not the original 'F').
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [14]:
# Five one-id sequences decode to five separate one-character strings.
tokenizer.sequences_to_texts([[20], [6], [9], [8], [3]])

['f', 'i', 'r', 's', 't']

In [20]:
# The size of the tokenizer's vocabulary; since ids start at 1 this is also
# the largest id in use.
max_id = len(tokenizer.word_index)
max_id  # num of distinct characters

39

In [26]:
# Decode ids 1..max_id in order to list the whole character vocabulary.
tokenizer.sequences_to_texts([range(1, max_id+1)])

["  e t o a i h s r n \n l d u m y w , c f g b p : k v . ' ; ? ! - j q x z 3 & $"]

In [21]:
# Total number of characters in the corpus. Taken from the text itself rather
# than tokenizer.document_count: that counter only equals the character count
# because fit_on_texts() was fed a bare string, and it keeps accumulating if
# the fitting cell is executed again.
dataset_size = len(shakespeare_text)
dataset_size

1115394

In [27]:
# Character count of the raw text, for comparison with dataset_size.
len(shakespeare_text)

1115394

In [30]:
# Character ids run from 1 to max_id, so id 0 has no character and decodes
# to an empty string.
tokenizer.sequences_to_texts([[0]])

['']

In [31]:
# Same check as a boolean: decoding id 0 gives exactly ''.
tokenizer.sequences_to_texts([[0]])[0] == ''

True

In [33]:
import numpy as np

In [35]:
# map each letter to its ID (0-38 instead of 1-39)
# The whole text is passed as a one-element list, so the result has a single
# row; the `[encoded] = ...` destructuring unpacks that row into a 1-D array.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
encoded

array([19,  5,  8, ..., 20, 26, 10])

In [36]:
# One id per character: the length should equal the character count of the text.
len(encoded)

1115394

In [37]:
# First 90% of the characters are used for training (integer arithmetic, so
# the result is floored to a whole number of characters).
train_size = dataset_size * 90 // 100
train_size

1003854

In [38]:
# Build a dataset over the training slice only; each element is one
# character id (a scalar tensor, as the printout further down shows).
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

## Chopping the Sequential Dataset into Multiple Windows

In [39]:
# Each training example is a window of n_steps input characters plus one
# extra character, so that the target can be the input shifted by one.
# NOTE: this cell rebinds `dataset`; re-running it on its own would window
# the already-windowed dataset, so re-run from the from_tensor_slices cell.
n_steps = 100
window_length = n_steps + 1  # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [43]:
# Each element of the windowed dataset is itself a dataset; flat_map with the
# identity function concatenates them back into one flat stream of scalars.
ds_just_flat = dataset.flat_map(lambda ds: ds)
# Print only the first 10 elements; take() bounds the iteration without a
# manual counter and break.
for tensor in ds_just_flat.take(10):
    print(tensor)

tf.Tensor(19, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(18, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)


In [44]:
# Batching each nested window dataset by window_length collapses it into a
# single tensor, so the flattened dataset yields one tensor per window
# (shape (101,) in the check below).
dataset = dataset.flat_map(lambda ds: ds.batch(window_length))

In [48]:
# Sanity check: every window should be a 1-D tensor of window_length ids.
# take(10) limits the check to the first 10 windows.
for tensor in dataset.take(10):
    print(tensor.shape)

(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)
(101,)


In [49]:
# Shuffle the windows, then group them into batches of batch_size windows
# (each batch has shape (32, 101) in the printout below).
batch_size = 32
dataset = dataset.shuffle(10_000).batch(batch_size)
# Recall that the arg inside shuffle() is the buffer size

In [51]:
# Inspect the first 3 shuffled batches; take(3) bounds the iteration without
# a manual counter and break.
for batch in dataset.take(3):
    print(batch)

tf.Tensor(
[[26 10 10 ...  1 26 10]
 [ 2  6  3 ...  1 26 10]
 [ 0  3  9 ... 26 10 10]
 ...
 [13  8  0 ...  0  6  5]
 [ 3 11  1 ...  2  6  1]
 [35  1  9 ... 10 16  6]], shape=(32, 101), dtype=int64)
tf.Tensor(
[[17  0 16 ...  1  0 16]
 [ 8  9  0 ...  9 12  0]
 [20  1  9 ...  8  9  7]
 ...
 [ 3 14 22 ...  0 15  3]
 [ 7  1  0 ...  5  9  0]
 [ 5  9  7 ...  5  9 12]], shape=(32, 101), dtype=int64)
tf.Tensor(
[[ 0  6  1 ...  0 22  1]
 [ 2  3  9 ...  1  0 12]
 [ 8  9  7 ...  7  0  2]
 ...
 [ 3  9  7 ...  5 18  1]
 [13  7  0 ... 11 11  0]
 [ 5  2  6 ... 16  6  4]], shape=(32, 101), dtype=int64)


In [52]:
# Split every window into (inputs, targets): inputs drop the last character,
# targets drop the first, i.e. the targets are the inputs shifted one
# character ahead.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [58]:
# Confirm that each element is now an (inputs, targets) pair and that both
# halves have n_steps columns. take(2) limits the check to two batches.
for batch in dataset.take(2):
    print(type(batch), len(batch), batch[0].shape, batch[1].shape)

<class 'tuple'> 2 (32, 100) (32, 100)
<class 'tuple'> 2 (32, 100) (32, 100)


**(?)** Why is the target a tensor of length `100` rather than a single character?

In [91]:
# One-hot encode the inputs only, with one slot per distinct character;
# the targets are passed through unchanged as integer ids.
dataset = dataset.map(lambda X_batch, Y_batch:
                      (tf.one_hot(X_batch, depth=max_id), Y_batch))

Let's verify that `tf.one_hot()` behaves as expected.

In [73]:
# Draw random character ids shaped like one input batch. maxval is exclusive,
# so it has to be max_id (not max_id-1) for the largest id, max_id-1, to be
# generated at all.
tf.random.uniform(maxval=max_id, dtype=tf.int32, shape=(32,100))

<tf.Tensor: shape=(32, 100), dtype=int32, numpy=
array([[18,  2,  0, ..., 34, 28, 10],
       [11, 34,  6, ..., 25, 11,  3],
       [28, 23, 31, ..., 28, 35,  2],
       ...,
       [30, 34,  0, ..., 14, 32, 35],
       [18, 27, 26, ...,  9,  8, 19],
       [ 6, 31, 32, ...,  0, 16,  8]], dtype=int32)>

In [75]:
# Build a fake input batch of random ids and one-hot encode it the same way
# the pipeline does. maxval is exclusive, so max_id is needed for the ids to
# cover the full range 0..max_id-1, including the last one-hot slot.
X_batch = tf.random.uniform(maxval=max_id, dtype=tf.int32, shape=(32,100))
one_hotted = tf.one_hot(X_batch, depth=max_id)
one_hotted

<tf.Tensor: shape=(32, 100, 39), dtype=float32, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0.,

In [85]:
# List every public name in tf.math that mentions "sum", one per line.
for f in filter(lambda name: "sum" in name, dir(tf.math)):
    print(f)

cumsum
cumulative_logsumexp
reduce_logsumexp
reduce_sum
segment_sum
unsorted_segment_sum


In [88]:
# Look up how reduce_sum treats the `axis` argument before using it.
help(tf.math.reduce_sum)

Help on function reduce_sum in module tensorflow.python.ops.math_ops:

reduce_sum(input_tensor, axis=None, keepdims=False, name=None)
    Computes the sum of elements across dimensions of a tensor.
    
    Reduces `input_tensor` along the dimensions given in `axis`.
    Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
    of the entries in `axis`, which must be unique. If `keepdims` is true, the
    reduced dimensions are retained with length 1.
    
    If `axis` is None, all dimensions are reduced, and a
    tensor with a single element is returned.
    
    For example:
    
    >>> # x has a shape of (2, 3) (two rows and three columns):
    >>> x = tf.constant([[1, 1, 1], [1, 1, 1]])
    >>> x.numpy()
    array([[1, 1, 1],
           [1, 1, 1]], dtype=int32)
    >>> # sum all the elements
    >>> # 1 + 1 + 1 + 1 + 1+ 1 = 6
    >>> tf.reduce_sum(x).numpy()
    6
    >>> # reduce along the first dimension
    >>> # the result is [1, 1, 1] + [1, 1, 1] = [2, 

In [87]:
# Summing over the one-hot axis should give exactly 1 at every position:
# each character id sets a single slot.
tf.math.reduce_sum(one_hotted, axis=-1)

<tf.Tensor: shape=(32, 100), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>

In [90]:
# The grand total should equal the number of positions, 32 * 100 = 3200.
tf.math.reduce_sum(one_hotted)

<tf.Tensor: shape=(), dtype=float32, numpy=3200.0>

In [92]:
# Final pipeline stage: prefetch with a buffer of 1 batch.
dataset = dataset.prefetch(1)

## Building and Training the Char-RNN Model

**(?)** Why use `sparse_categorical_crossentropy`?

**(?)** `GRU` with `128` units. What does that really mean?

**(?)** What do `dropout` and `recurrent_dropout` mean in `keras.layers.GRU()`?

In [93]:
# Two stacked GRU layers followed by a per-time-step softmax over the
# max_id possible characters.
model = keras.models.Sequential([
    # input_shape=[None, max_id]: the sequence length is left unspecified and
    # the last dimension matches the one-hot depth used in the pipeline.
    # NOTE(review): a non-zero recurrent_dropout presumably rules out the fast
    # cuDNN GRU kernel -- confirm against the Keras GRU docs; the interrupted
    # run below reports roughly 310ms/step.
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    # return_sequences=True on both GRU layers keeps one output per time
    # step, in line with the targets holding one id per input character.
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax")),
])
# The targets are integer character ids (only the inputs were one-hot
# encoded), hence the sparse variant of the cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=20)

Epoch 1/20
     75/Unknown - 30s 310ms/step - loss: 3.2848

KeyboardInterrupt: 

## Using the Char-RNN Model

In [None]:
def preprocess(texts):
    X = np.array(keras)