In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import utils

In [16]:
# Raw corpus: one long sentence held in a single string. Every later cell
# (character set, tokenizer, training windows) is derived from this value, so the
# counts shown in the recorded outputs depend on it character for character.
text = "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way— in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only."

In [18]:
# Distinct characters of the lower-cased corpus, sorted and joined into one string.
charset = "".join(sorted({ch for ch in text.lower()}))
print("len(charset) = {}".format(len(charset)))
charset

len(charset) = 26


' ,.abcdefghiklmnoprstuvwy—'

In [19]:
# Character-level tokenizer: each distinct character gets its own integer id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# NOTE: `text` is passed as a bare string, not wrapped in a list. The recorded
# document_count of 612 in the next cell suggests that every character is being
# counted as its own "document" — keep that in mind before changing this call.
tokenizer.fit_on_texts(text)

In [20]:
# Number of "documents" the tokenizer counted while fitting (displayed as the cell result).
tokenizer.document_count

612

In [21]:
# Show the learned character -> id mapping; in the recorded output ids start at 1
# (the space character) and go up to 26.
print(tokenizer.word_index)

{' ': 1, 'e': 2, 't': 3, 'i': 4, 'o': 5, 's': 6, 'a': 7, 'h': 8, 'r': 9, 'n': 10, 'w': 11, 'f': 12, ',': 13, 'd': 14, 'g': 15, 'l': 16, 'p': 17, 'c': 18, 'b': 19, 'm': 20, 'u': 21, 'v': 22, 'y': 23, 'k': 24, '—': 25, '.': 26}


In [22]:
# Round-trip check, step 1: encode a short sample string into character ids.
# `seq` is reused by the decoding cell that follows.
seq = tokenizer.texts_to_sequences(["times"])
seq

[[3, 4, 20, 2, 6]]

In [25]:
# Round-trip check, step 2: decode the ids back to text. The recorded output shows
# the characters separated by spaces rather than the original contiguous string.
tokenizer.sequences_to_texts(seq)

['t i m e s']

In [26]:
# Number of distinct characters known to the tokenizer.
vocab_size = len(tokenizer.word_index)
# Total number of characters in the corpus. Measured directly from `text` instead of
# `tokenizer.document_count`: that counter only equals the character count because
# `fit_on_texts` was handed a bare string, and would silently become 1 (making
# train_size 0) if the call were ever changed to `fit_on_texts([text])`.
dataset_size = len(text)
# The first 90% of the characters form the training split.
train_size = int(dataset_size * 0.9)
print(f"vocab_size = {vocab_size}, dataset_size = {dataset_size}, train_size = {train_size}")

vocab_size = 26, dataset_size = 612, train_size = 550


In [27]:
# Encode the whole corpus as a single document, take that one id list, and shift
# the ids down by one so they start at 0 instead of 1.
sequences = np.array(tokenizer.texts_to_sequences([text])[0]) - 1

In [28]:
# NOTE(review): `reset_session` comes from the local `utils` module, which is not
# visible here; presumably it clears Keras/TF state and reseeds RNGs — confirm there.
utils.reset_session()

num_steps = 100               # length of each input sequence
window_size = num_steps + 1   # one extra character so targets are inputs shifted by one
batch_size = 32

# Stream of individual character ids from the training split.
dataset = tf.data.Dataset.from_tensor_slices(sequences[:train_size])
# Slide a window over the *finite* training text, one character at a time.
# Windowing must happen before `repeat()`: repeating first lets windows run across
# the seam where the text restarts, splicing the end of the corpus onto its
# beginning and producing training examples that never occur in the real text.
dataset = dataset.window(window_size, shift=1, drop_remainder=True)
# `window` yields nested datasets; batching each one by its full length turns it
# into a single tensor of `window_size` ids.
dataset = dataset.flat_map(lambda window: window.batch(window_size))
# Inputs are all but the last id, targets are all but the first (next-character labels).
dataset = dataset.map(lambda windows: (windows[:-1], windows[1:]))
# Repeat only now, so the stream is still infinite (as before) but every example is
# a contiguous slice of the text; then shuffle and group into batches.
dataset = dataset.repeat().shuffle(10000).batch(batch_size)
# One-hot encode the inputs; targets stay as integer class ids.
dataset = dataset.map(lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=vocab_size), y_batch))
# Prepare the next batch while the current one is being consumed.
dataset = dataset.prefetch(1)

In [29]:
# Sanity check: pull a single batch and print its shapes followed by its contents.
# The recorded output shows inputs of shape (32, 100, 26) and targets of shape (32, 100).
for x_batch, y_batch in dataset.take(1):
    print(x_batch.shape, y_batch.shape)
    print(x_batch, y_batch)

(32, 100, 26) (32, 100)
tf.Tensor(
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [