# Example Problem

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Seed NumPy and TensorFlow up front so the random steps further down are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

# Toy example: cut the integers 0..14 into windows of n_steps elements,
# each window starting 2 elements after the previous one.
n_steps = 5
dataset = (
    tf.data.Dataset.from_tensor_slices(tf.range(15))
    .window(size=n_steps, shift=2, drop_remainder=True)
    # Every window is itself a small dataset; batching it with its full
    # length turns it back into a single tensor.
    .flat_map(lambda win: win.batch(n_steps))
)

In [3]:
# Shuffle the windows, split each one into (input, target) where the target is
# the same window shifted one step ahead, then group them into batches of 3.
dataset = dataset.shuffle(buffer_size=10).map(lambda window: (window[:-1], window[1:]))
dataset = dataset.batch(3).prefetch(1)

# Print every batch so the input/target alignment can be checked by eye.
for index, (X_batch, Y_batch) in enumerate(dataset):
    print('_' * 20, 'Batch', index, '\nX_batch')
    print(X_batch.numpy())
    # Label is "Y_batch"; outputs recorded from earlier runs show the old typo "Y)batch".
    print('=' * 5, '\nY_batch')
    print(Y_batch.numpy())

____________________ Batch 0 
X_batch
[[6 7 8 9]
 [2 3 4 5]
 [4 5 6 7]]
===== 
Y)batch
[[ 7  8  9 10]
 [ 3  4  5  6]
 [ 5  6  7  8]]
____________________ Batch 1 
X_batch
[[ 0  1  2  3]
 [ 8  9 10 11]
 [10 11 12 13]]
===== 
Y)batch
[[ 1  2  3  4]
 [ 9 10 11 12]
 [11 12 13 14]]


In [5]:
# Download the Tiny Shakespeare corpus; get_file returns the path of the local copy.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default encoding.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [8]:
# Show the first 250 characters as a quick sanity check of the loaded text.
preview = shakespeare_text[:250]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [9]:
# Every distinct character of the lower-cased corpus, sorted and shown as one string.
distinct_chars = sorted(set(shakespeare_text.lower()))
"".join(distinct_chars)

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [12]:
# Character-level tokenizer: with char_level=True each character (rather than
# each word) is mapped to its own integer id.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)

In [13]:
# Build the character vocabulary from the corpus.
# NOTE(review): the raw string is passed here, not a list of texts. This
# presumably makes every character count as one "document", which is what lets
# tokenizer.document_count be used as the corpus length further down -- confirm
# before wrapping the text in a list.
tokenizer.fit_on_texts(shakespeare_text)

In [14]:
# Encode a short sample word: the result holds one integer id per character.
sample_word = 'sample'
tokenizer.texts_to_sequences([sample_word])

[[8, 5, 15, 23, 12, 2]]

In [15]:
# Decode the ids of the sample word back into text; the characters come back
# separated by spaces.
sample_ids = [8, 5, 15, 23, 12, 2]
tokenizer.sequences_to_texts([sample_ids])

['s a m p l e']

In [16]:
# max_id: how many distinct characters the tokenizer has indexed.
max_id = len(tokenizer.word_index)
# dataset_size: the tokenizer's document count, used below as the corpus length.
dataset_size = tokenizer.document_count
print(max_id, dataset_size, sep='\n')

39
1115394


In [17]:
# Encode the whole corpus as a single sequence of character ids, shifted down
# by one (presumably because tokenizer ids start at 1 -- the one-hot encoding
# applied later uses depth=max_id).
sequences = tokenizer.texts_to_sequences([shakespeare_text])
encoded = np.array(sequences)[0] - 1

# The first 90% of the sequence is the training portion.
train_size = dataset_size * 90 // 100
train_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_ids)

In [18]:
n_steps = 100
# One extra character per window: the target is the input shifted 1 character ahead.
window_length = n_steps + 1

# Repeat the training ids, then slide a window over them one character at a time.
dataset = dataset.repeat()
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [19]:
# Each window is a nested dataset; batching it with its full length turns it
# into one tensor of window_length ids.
dataset = dataset.flat_map(lambda win: win.batch(window_length))

In [20]:
batch_size = 32

# Shuffle the windows, batch them, then split every batch into inputs (all but
# the last character) and targets (all but the first character).
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda batch: (batch[:, :-1], batch[:, 1:]))
)

In [21]:
# One-hot encode the input ids (depth=max_id); the targets stay as integer ids.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [22]:
# Prefetch with a buffer of one element.
dataset = dataset.prefetch(buffer_size=1)

In [23]:
# Pull a single batch from the pipeline and report the input and target shapes.
X_batch, Y_batch = next(iter(dataset.take(1)))
print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


In [29]:
# Character-level model: two stacked GRU layers that each return the full
# sequence, followed by a softmax over the max_id characters at every time step.
# NOTE(review): per the Keras GRU documentation a non-zero recurrent_dropout
# appears to rule out the fast cuDNN kernel, which may explain the multi-hour
# ETA in the recorded run -- confirm before changing it.
gru_options = dict(units=128, return_sequences=True, dropout=.2, recurrent_dropout=.2)
first_gru = tf.keras.layers.GRU(input_shape=[None, max_id], **gru_options)
second_gru = tf.keras.layers.GRU(**gru_options)
char_probabilities = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(units=max_id, activation=tf.nn.softmax)
)
model = tf.keras.models.Sequential([first_gru, second_gru, char_probabilities])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# The training dataset was built with repeat(), so the number of steps that
# make up one epoch is passed explicitly.
steps_per_epoch = train_size // batch_size
history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=10)

Train for 31370 steps
Epoch 1/10
 1208/31370 [>.............................] - ETA: 2:59:21 - loss: 2.0936

KeyboardInterrupt: 