In [2]:
import tensorflow as tf

In [3]:
url = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
shakespear_raw_text = tf.keras.utils.get_file(fname="shakespeare.txt", origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
shakespear_raw_text

'/root/.keras/datasets/shakespeare.txt'

In [5]:
text = ""
with open(shakespear_raw_text,"rb") as read_obj:
    # print(read_obj.read().decode(encoding='utf-8')[:1000])
    text = read_obj.read().decode(encoding='utf-8')
print(text[:10])

text = text[:1000]

First Citi


In [None]:
vocabs = sorted(set(text))
print(f"Vocabs = {vocabs}")
print(f"Len of Vocab = {len(vocabs)}")

Vocabs = ['\n', ' ', '!', "'", ',', '.', ':', ';', '?', 'A', 'B', 'C', 'F', 'I', 'L', 'M', 'N', 'O', 'R', 'S', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']
Len of Vocab = 46


Preprocess Text

In [None]:
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocabs), mask_token=None)
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
def text_preprocessing(data, reverse=False, axis=True):

    if reverse:
        # numbers to chars
        chars = chars_from_ids(data)
        # chars join to make sentence
        if not axis:
            return tf.strings.reduce_join(chars).numpy()
        else:
            return tf.strings.reduce_join(chars, axis=-1).numpy()
    else:
        # splitting
        data = tf.strings.unicode_split(data, "UTF-8")
        # converting char to numbers
        return ids_from_chars(data)

In [None]:
all_ids = text_preprocessing(text)

In [None]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
for id in ids_dataset.take(10):
    char = text_preprocessing(id,reverse=True, axis=False)
    print(char.decode("UTF-8"))

F
i
r
s
t
 
C
i
t
i


In [None]:
batch=100
from pprint import pprint
batches = ids_dataset.batch(batch+1, drop_remainder=True)
print(pprint(batches.__dict__))
for ids in batches.take(1):
    chars = text_preprocessing(ids, reverse=True)
    # print(chars)

{'_batch_size': <tf.Tensor: shape=(), dtype=int64, numpy=101>,
 '_drop_remainder': <tf.Tensor: shape=(), dtype=bool, numpy=True>,
 '_graph_attr': <tensorflow.python.framework.ops.Graph object at 0x7f1516cf1960>,
 '_input_dataset': <_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>,
 '_name': None,
 '_options_attr': <tensorflow.python.data.ops.options.Options object at 0x7f1505951240>,
 '_structure': TensorSpec(shape=(101,), dtype=tf.int64, name=None),
 '_variant_tensor_attr': <tf.Tensor: shape=(), dtype=variant, value=<BatchDatasetV2Op(101)::Dataset>>}
None


In [None]:
def get_target_label(data):
    input = data[:-1]
    label = data[1:]
    print(input, label)
    return input, label

print(get_target_label(b"Madhusudhan reddy"))

b'Madhusudhan redd' b'adhusudhan reddy'
(b'Madhusudhan redd', b'adhusudhan reddy')


In [None]:
dataset = batches.map(get_target_label)

Tensor("strided_slice:0", shape=(100,), dtype=int64) Tensor("strided_slice_1:0", shape=(100,), dtype=int64)


In [None]:
for d in dataset.take(1):
    print("Input = ",text_preprocessing(d[0], reverse=True))
    print("Label = ",text_preprocessing(d[1], reverse=True))

Input =  b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Label =  b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [None]:
# Models

vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim =64
rnn_units = 1024


class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            return_state = True,
        )
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        # print("Before Prediction = ",text_preprocessing(x,reverse=True))
        x = self.embedding(x, training=training)
        # print("embeddings ",x)
        if states is None:
            states = self.gru.get_initial_state(x)
            # print("states ",states)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        # print("After prediction = ",text_preprocessing(x,reverse=True))
        # print("final ",x)
        if return_state:
            return x, states
        else:
            return x

In [None]:
model = MyModel(vocab_size=vocab_size,embedding_dim=embedding_dim,rnn_units=rnn_units)

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    print(f"input_example_batch= {input_example_batch}")
    example_batch_predictions = model(input_example_batch, )
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(f"example_batch_predictions= {example_batch_predictions}")

In [None]:
# model.summary()

In [None]:
sample_indeces = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sample_indeces = tf.squeeze(sample_indeces, axis=-1).numpy()
print(text_preprocessing(input_example_batch[0], reverse=True))
print(text_preprocessing(sample_indeces, reverse=True))

b's[UNK]v [UNK][UNK]w [UNK]w[UNK][UNK]C\nf[UNK] [UNK]sv [UNK]w .[UNK]us[UNK]wv: [UNK]w[UNK][UNK][UNK][UNK][UNK][UNK] [UNK]w [UNK][UNK][UNK][UNK][UNK]v [UNK]s[UNK]w [UNK]ws[UNK]v\nk[UNK]w [UNK]s[UNK][UNK][UNK] [UNK][UNK]v[UNK][UNK]z[UNK] [UNK]y [UNK][UNK][UNK] z[UNK][UNK]v w[UNK]us[UNK]w?'
b"[UNK];jb[UNK]njd[UNK]N[UNK]'![UNK]zOmdjRe;Iop[UNK][UNK]om[UNK]pjnk[UNK]M:[UNK][UNK].p[UNK][UNK][UNK]mcck[UNK]aflz.[UNK][UNK]Bnbw[UNK][UNK]oMRd[UNK]j[UNK],Or[UNK]f[UNK]BWr?zFbd?[UNK][UNK][UNK]d\n[UNK]d[UNK]I[UNK]S![UNK]fBO"


In [None]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)


Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.189749, shape=(), dtype=float32)


In [None]:
tf.exp(example_batch_mean_loss).numpy()

66.0062

In [None]:
# model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
import os
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
EPOCHS = 20

In [None]:
class CustomTraining(MyModel):
  @tf.function
  def train_step(self, inputs):
      inputs, labels = inputs
      with tf.GradientTape() as tape:
          predictions = self(inputs, training=True)
          loss = self.loss(labels, predictions)
      grads = tape.gradient(loss, model.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, model.trainable_variables))

      return {'loss': loss}

model = CustomTraining(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
# model.fit(dataset, epochs=1)

In [None]:
# EPOCHS = 5

# mean = tf.metrics.Mean()
# import time
# for epoch in range(EPOCHS):
#     start = time.time()

#     mean.reset_states()
#     for (batch_n, (inp, target)) in enumerate(dataset):
#         logs = model.train_step([inp, target])
#         mean.update_state(logs['loss'])

#         if batch_n % 50 == 0:
#             template = f"Epoch {epoch+1} Batch {batch_n} Loss {logs['loss']:.4f}"
#             print(template)

#     # saving (checkpoint) the model every 5 epochs
#     if (epoch + 1) % 5 == 0:
#         model.save_weights(checkpoint_prefix.format(epoch=epoch))

#     print()
#     print(f'Epoch {epoch+1} Loss: {mean.result().numpy():.4f}')
#     print(f'Time taken for 1 epoch {time.time() - start:.2f} sec')
#     print("_"*80)

# model.save_weights(checkpoint_prefix.format(epoch=epoch))

In [None]:
test_text = ["Hello"]
test_ids = text_preprocessing(test_text)
print(test_ids)
pred, state = model(inputs=test_ids, return_state=True)
test_op = text_preprocessing(pred, reverse=True)
print(test_op)

<tf.RaggedTensor [[0, 27, 34, 34, 37]]>
[[b'[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]'
  b'[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]'
  b'[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]'
  b'[UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UN

In [None]:
# print(tf.random.categorical(pred,  num_samples=1))
# print(pred[0])
sample_indeces = tf.random.categorical(pred[0], num_samples=1)
print(sample_indeces)
sample_indeces = tf.squeeze(sample_indeces, axis=-1).numpy()
print(sample_indeces)
# print(text_preprocessing(input_example_batch[0], reverse=True))
print(text_preprocessing(sample_indeces, reverse=True))

tf.Tensor(
[[30]
 [21]
 [15]
 [ 0]
 [10]], shape=(5, 1), dtype=int64)
[30 21 15  0 10]
b'hWL[UNK]A'


In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    print(f"input_example_batch= {input_example_batch}")
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(f"example_batch_predictions= {example_batch_predictions}")