In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
# Path of the corpus file; it is opened and decoded as UTF-8 further down.
DATA_FILE = './shakespeare.txt'
# Length of each input sequence. Chunks are cut one element longer than this
# so that input and target can be offset by one position.
MAX_SEQUENCE_LENGTH = 100

In [3]:
# Number of (input, target) sequence pairs grouped into one batch of train_ds.
BATCH_SIZE = 64
# Buffer size passed to Dataset.shuffle() when train_ds is built.
SHUFFLE_BUFFER_SIZE = 10000

In [4]:
# Read the whole corpus as bytes and decode it as UTF-8. Binary mode keeps the
# file's newline bytes untouched, and the context manager closes the handle as
# soon as the read finishes instead of leaving that to the garbage collector.
with open(DATA_FILE, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Total number of characters in the corpus (shown as the cell output).
len(text)

1115394

In [5]:
# Preview the first 250 characters of the corpus.
text[:250]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n'

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

65 unique characters


In [7]:
# Lookup table from each vocabulary character to its integer index,
# i.e. its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [8]:
# Inverse lookup: an array whose element at position i is the character with
# index i, so indexing it with integer ids maps them back to characters.
idx2char = np.array(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [9]:
# Encode the corpus: look up the integer index of every character, in order.
# A character missing from char2idx raises KeyError.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
len(text_as_int)

1115394

In [10]:
# Show the first 13 characters next to their integer encoding as a sanity check.
print(f"text[:13] = {text[:13]}")
print(f"text_as_int[:13] = {text_as_int[:13]}")

text[:13] = First Citizen
text_as_int[:13] = [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [11]:
# Dataset that yields the encoded corpus one element (character index) at a time.
char_ds = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first five elements back to characters.
for i in char_ds.take(5):
    print(idx2char[i.numpy()])

F
i
r
s
t


In [12]:
# Group consecutive characters into chunks of MAX_SEQUENCE_LENGTH + 1 elements;
# the extra element lets each chunk be split into an input and a target that is
# shifted by one position. drop_remainder=True discards a shorter final chunk.
batched_ds = char_ds.batch(MAX_SEQUENCE_LENGTH + 1, drop_remainder=True)

# Show the shape and the decoded contents of the first chunk.
for batch in batched_ds.take(1):
    print(f"batch.shape = {batch.shape}")
    print(idx2char[batch.numpy()])

batch.shape = (101,)
['F' 'i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f'
 'o' 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y'
 ' ' 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' '
 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a'
 'k' ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C'
 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u' ' ']


In [13]:
def x_y_split(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence.

    Returns:
        A tuple ``(x, y)`` where ``x`` is ``chunk`` without its last element
        and ``y`` is ``chunk`` without its first element, so ``y[i]`` is the
        element that follows ``x[i]`` in ``chunk``.
    """
    inputs = chunk[:-1]
    targets = chunk[1:]
    return inputs, targets

# Turn every chunk into an (input, target) pair via x_y_split.
x_y_ds = batched_ds.map(x_y_split)

# Print the shapes and decoded contents of the first two pairs.
# Note: despite the batch_* names, each element here is a single sequence
# pair; grouping into batches has not happened yet at this point.
for batch_x, batch_y in x_y_ds.take(2):
    print("=" * 32)
    print(f"batch_x.shape = {batch_x.shape}")
    print(f"batch_y.shape = {batch_y.shape}")
    print(f"batch_x = \n{idx2char[batch_x.numpy()]}")
    print(f"batch_y = \n{idx2char[batch_y.numpy()]}")

batch_x.shape = (100,)
batch_y.shape = (100,)
batch_x = 
['F' 'i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f'
 'o' 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y'
 ' ' 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' '
 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a'
 'k' ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C'
 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u']
batch_y = 
['i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f' 'o'
 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y' ' '
 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' ' 's'
 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a' 'k'
 ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C' 'i'
 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u' ' ']
batch_x.shape = (100,)
batch_y.shape = (100,)
batch_x = 
['a' 'r' 'e' ' ' 'a' 'l' 'l' ' ' 'r' 'e' 's' 'o

In [14]:
# Shuffle the sequence pairs through a buffer of SHUFFLE_BUFFER_SIZE elements,
# then group them into batches of BATCH_SIZE; drop_remainder=True discards a
# final batch that holds fewer than BATCH_SIZE pairs.
# NOTE(review): shuffle() is called without a seed, so the batch printed below
# will presumably differ between runs — pass seed=... if reproducible output
# is wanted.
train_ds = x_y_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Show the shapes and decoded contents of the first batch.
for batch_x, batch_y in train_ds.take(1):
    print(f"batch_x.shape = {batch_x.shape}")
    print(f"batch_y.shape = {batch_y.shape}")
    print(f"batch_x = \n{idx2char[batch_x.numpy()]}")
    print(f"batch_y = \n{idx2char[batch_y.numpy()]}")

batch_x.shape = (64, 100)
batch_y.shape = (64, 100)
batch_x = 
[['n' 't' 'e' ... 'A' 's' ' ']
 ['o' 'u' 's' ... 't' '.' '\n']
 ['y' ' ' 'l' ... 'i' 's' 'p']
 ...
 ['o' 'm' 'e' ... 'o' 'r' ',']
 ['e' "'" 's' ... 'y' ' ' 't']
 ['e' ',' ' ' ... 'f' 'o' 'r']]
batch_y = 
[['t' 'e' 'r' ... 's' ' ' 'n']
 ['u' 's' ' ' ... '.' '\n' '\n']
 [' ' 'l' 'i' ... 's' 'p' 'a']
 ...
 ['m' 'e' '?' ... 'r' ',' ' ']
 ["'" 's' ' ' ... ' ' 't' 'h']
 [',' ' ' 'I' ... 'o' 'r' ' ']]
