### Setup

In [1]:
# NOTE: import TF and other libraries

import os
# Deliberately set ahead of `import tensorflow` further down.
# NOTE(review): presumably TF reads this variable when it loads and "2" hides
# INFO/WARNING native logs — confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import time
import warnings
# Installs a blanket "ignore" filter for Python warnings; remove it while
# debugging if a library warning might be relevant.
warnings.filterwarnings("ignore")

import numpy as np
import tensorflow as tf


In [2]:
# NOTE: download Shakespeare dataset

# Keras fetches the corpus and hands back the path of the local copy.
path_to_file = tf.keras.utils.get_file(
    fname="shakespeare.txt",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

In [3]:
# NOTE: read the data

# Read the raw bytes inside a context manager so the file handle is closed
# as soon as the read finishes (the bare open(...).read() left it dangling),
# then decode the bytes as UTF-8.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")
print(f"[DEBUG] length of text: {len(text)} characters")
print(f"{text[:250]=}")

# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = sorted(set(text))
print(f"{len(vocab)} unique characters")

[DEBUG] length of text: 1115394 characters
text[:250]='First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n'
65 unique characters


### Process the text

In [4]:
# NOTE: vectorize the text

# Forward lookup: character -> integer id, built from the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Inverse lookup: integer id -> character. It reuses the forward layer's
# vocabulary so the two mappings stay in agreement.
chars_from_ids = tf.keras.layers.StringLookup(
    mask_token=None,
    invert=True,
    vocabulary=ids_from_chars.get_vocabulary(),
)

# Round-trip a couple of sample strings through both layers.
example_texts = ["abcdefg", "xyz"]

# strings -> per-string sequences of characters
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")
print(chars)

# sequences of characters -> numerical representation
ids = ids_from_chars(chars)
print(ids)

# numerical representation -> sequences of characters
chars = chars_from_ids(ids)
print(chars)

# sequence of ids to string
def text_from_ids(ids):
    """Turn character ids back into joined text.

    Each id is mapped to its character with ``chars_from_ids`` and the
    resulting characters are concatenated along the last axis.

    Args:
        ids: Character ids accepted by the ``chars_from_ids`` lookup layer.

    Returns:
        The result of ``tf.strings.reduce_join`` over the last axis.
    """
    looked_up_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(looked_up_chars, axis=-1)

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>
<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>
<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>


In [6]:
# NOTE: the prediction task:
# given a sequence of characters,
# what is the most probable next character?

# NOTE: create training examples and targets
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, "UTF-8")
)
print(f"{all_ids=}")

# One dataset element per character id; peek at the first ten of them.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
for ids in ids_dataset.take(10):
    single_char = chars_from_ids(ids).numpy()
    print(single_char.decode("utf-8"))

# batching
# NOTE: training set; given sequence(seq_length), predict next character(+1),
# so every chunk holds seq_length + 1 ids.
seq_length = 100
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length
sequences = ids_dataset.batch(chunk_length, drop_remainder=True)
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

all_ids=<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>
F
i
r
s
t
 
C
i
t
i
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [7]:
# NOTE: preparing dataset: (input, label)
# `input`: current character
# `label`: next character

def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input drops the last element and the target drops the first, so
    ``target[i]`` is the element that comes right after ``input[i]``.

    Args:
        sequence: Anything that supports slicing (the notebook passes both a
            plain list and the elements of ``sequences``).

    Returns:
        A tuple ``(input_text, target_text)`` holding the two slices.
    """
    return sequence[:-1], sequence[1:]

# Build the (input, target) dataset by applying the split to every sequence.
dataset = sequences.map(split_input_target)

# Sanity check, kept as the cell's final expression: a notebook only echoes
# the value of the last expression, so when this call sat before the
# assignment its result was silently discarded.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])