In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import numpy as np
import os
import time

In [67]:
# Fetch the King James Bible text from Project Gutenberg; the returned value is
# the local path that the next cell opens.
path_to_file = tf.keras.utils.get_file(
    fname='bible.txt',
    origin='http://www.gutenberg.org/cache/epub/10/pg10.txt')

Downloading data from http://www.gutenberg.org/cache/epub/10/pg10.txt


In [68]:
# Read the raw bytes, then decode for py2 compat.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until the object happens to be garbage collected.
# NOTE: decoding with plain 'utf-8' keeps the file's leading BOM ('\ufeff') as
# an ordinary character of `text`.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 4452517 characters


In [69]:
# Preview the opening 250 characters of the corpus
preview = text[:250]
print(preview)

﻿The Project Gutenberg EBook of The King James Bible


**********************************************************************
EBOOK (#10) WAS ONE OF PROJECT GUTENBERG'S EARLY FILES PRODUCED AT A
TIME WHEN PROOFING METHODS AND TOOLS WERE NOT WELL


In [70]:
# Every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

85 unique characters


In [0]:
# Lookup tables between characters and integer ids; an id is the character's
# position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [72]:
# Pretty-print the first 20 entries of the character -> id table.
print('{')
for char, index in list(char2idx.items())[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), index))
print('  ...\n}')

{
  '\n':   0,
  '\r':   1,
  ' ' :   2,
  '!' :   3,
  '"' :   4,
  '#' :   5,
  '$' :   6,
  '%' :   7,
  "'" :   8,
  '(' :   9,
  ')' :  10,
  '*' :  11,
  ',' :  12,
  '-' :  13,
  '.' :  14,
  '/' :  15,
  '0' :  16,
  '1' :  17,
  '2' :  18,
  '3' :  19,
  ...
}


In [73]:
# Compare the first 13 characters of the text with their integer encoding
first_chars = text[:13]
first_ids = text_as_int[:13]
print ('{} ---- characters mapped to int ---- > {}'.format(repr(first_chars), first_ids))

'\ufeffThe Project ' ---- characters mapped to int ---- > [84 49 65 62  2 45 75 72 67 62 60 77  2]


In [74]:
# Number of characters in a single model input
seq_length = 100
# Each example uses seq_length + 1 characters of the corpus
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and show the first five ids of the stream
for char_id in char_dataset.take(5):
  print(idx2char[char_id.numpy()])

﻿
T
h
e
 


In [75]:
# Group the character stream into chunks of seq_length + 1 ids, dropping a
# final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first five chunks decoded back to text
for chunk in sequences.take(5):
  chunk_text = ''.join(idx2char[chunk.numpy()])
  print(repr(chunk_text))

'\ufeffThe Project Gutenberg EBook of The King James Bible\r\n\r\n\r\n*******************************************'
"***************************\r\nEBOOK (#10) WAS ONE OF PROJECT GUTENBERG'S EARLY FILES PRODUCED AT A\r\nTI"
'ME WHEN PROOFING METHODS AND TOOLS WERE NOT WELL DEVELOPED. THERE IS\r\nAN IMPROVED EDITION OF THIS TIT'
'LE WHICH MAY VIEWED AT EBOOK http://www.gutenberg.org/files/10900/10900-h/10900-h.htm\r\n(There is no t'
'ext file for this ebook)\r\n**********************************************************************\r\n\r\n\r'


In [0]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk of `sequences` into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [77]:
# Decode the first (input, target) pair back to text for inspection.
for input_example, target_example in dataset.take(1):
  input_chars = ''.join(idx2char[input_example.numpy()])
  target_chars = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(input_chars))
  print ('Target data:', repr(target_chars))

Input data:  '\ufeffThe Project Gutenberg EBook of The King James Bible\r\n\r\n\r\n******************************************'
Target data: 'The Project Gutenberg EBook of The King James Bible\r\n\r\n\r\n*******************************************'


In [78]:
# Walk through the first five positions of the example pair: at each step the
# expected output is the character that follows the input character.
first_inputs = input_example[:5]
first_targets = target_example[:5]
for step, (input_idx, target_idx) in enumerate(zip(first_inputs, first_targets)):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 84 ('\ufeff')
  expected output: 49 ('T')
Step    1
  input: 49 ('T')
  expected output: 65 ('h')
Step    2
  input: 65 ('h')
  expected output: 62 ('e')
Step    3
  input: 62 ('e')
  expected output: 2 (' ')
Step    4
  input: 2 (' ')
  expected output: 45 ('P')


In [79]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of rebinding `dataset`
# in terms of itself, so re-running this cell does not batch data that has
# already been batched.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True))

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
# Length of the vocabulary in chars (number of distinct characters in the corpus)
vocab_size = len(vocab)

# The embedding dimension (size of the vector each character id is mapped to)
embedding_dim = 256

# Number of RNN units (passed to the GRU layer in build_model)
rnn_units = 1024

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> GRU -> Dense character model.

  Args:
    vocab_size: Number of distinct character ids; used as the embedding input
      size and as the width of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch size baked into the input shape (the GRU is
      stateful); the sequence length is left unspecified.

  Returns:
    A `tf.keras.Sequential` model made of the three layers.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Instantiate the model at the training batch size.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [83]:
# Run one batch through the model and check the shape of its output.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  predictions_shape = example_batch_predictions.shape
  print(predictions_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 85) # (batch_size, sequence_length, vocab_size)


In [84]:
# Print the layers, their output shapes and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 256)           21760     
_________________________________________________________________
gru_4 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_4 (Dense)              (64, None, 85)            87125     
Total params: 4,047,189
Trainable params: 4,047,189
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Sample one character id per position from the outputs for the first example
# of the batch, then drop the trailing sample axis and convert to numpy.
first_example_logits = example_batch_predictions[0]
sampled_column = tf.random.categorical(first_example_logits, num_samples=1)
sampled_indices = tf.squeeze(sampled_column, axis=-1).numpy()

In [86]:
# Display the sampled character ids.
sampled_indices

array([10, 79, 41, 64, 11, 70,  6, 47,  7,  4, 36, 16, 76, 41, 28, 44, 19,
       11, 73, 28, 52,  4, 27, 23, 19,  5, 34, 32, 12, 54, 46, 61, 25, 21,
       35, 84, 72, 59, 17, 72, 66, 26, 64, 41, 78, 31, 56, 24, 68, 42, 81,
       84, 58, 55, 59, 48, 65, 82, 15, 77,  1, 64, 54, 12, 17, 57, 29, 72,
       46, 74, 63, 25,  2, 42, 16, 24, 30,  3, 66, 46, 69, 72, 62, 70, 53,
       21, 49, 38, 47,  1, 35, 60, 13, 17, 57, 47, 80, 76, 54, 38])

In [87]:
# Decode the first input of the batch and the ids sampled for it back to text.
input_text = "".join(idx2char[input_example_batch[0]])
sampled_text = "".join(idx2char[sampled_indices ])
print("Input: \n", repr(input_text))
print()
print("Next Char Predictions: \n", repr(sampled_text))

Input: 
 'Pharaoh, Thus\r\nsaith the LORD God of Israel, Let my people go, that they may hold a\r\nfeast unto me i'

Next Char Predictions: 
 ')vLg*m$R%"G0sL?O3*p?W";73#EC,YQd95F\ufeffob1oi:gLuB[8kMx\ufeffaZbShy/t\rgY,1]@oQqf9 M08A!iQloemX5TIR\rFc-1]RwsYI'


In [88]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)

# Evaluate the loss on the example batch and report its mean.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
mean_example_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_example_loss)

Prediction shape:  (64, 100, 85)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.442077


In [0]:
# Configure training: Adam optimizer with the `loss` function defined in this notebook.
model.compile(optimizer='adam', loss=loss)

In [0]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'
# File name pattern; "{epoch}" is filled in for each saved checkpoint
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that stores only the model weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [0]:
# Number of passes over the dataset for model.fit
EPOCHS=10

In [93]:
# Train for EPOCHS epochs with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train for 688 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [95]:
# Show the path of the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [0]:
# Rebuild the network with a batch size of 1 and restore the most recent
# checkpoint's weights into it.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1)

latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [97]:
# Print the layers of the rebuilt batch-size-1 model.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 256)            21760     
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_5 (Dense)              (1, None, 85)             87125     
Total params: 4,047,189
Trainable params: 4,047,189
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by repeatedly sampling the next character from `model`.

  Args:
    model: The trained model. It is called on a batch holding one sequence of
      character ids, and its states are reset before generation starts.
    start_string: Non-empty seed text. Every character must be a key of
      `char2idx`.
    num_generate: Number of characters to generate (default 1000).
    temperature: Positive divisor applied to the model outputs before
      sampling. Low temperatures give more predictable text, higher
      temperatures give more surprising text (default 1.0).

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Validate up front: an empty seed gives the model nothing to condition on,
  # and a non-positive temperature makes the scaling below meaningless.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to pick the next character;
      # only the sample for the last position of the input is used
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model;
      # the model is stateful, so the previous hidden state is carried along
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [100]:
# Generate and print text seeded with "GOD: ".
print(generate_text(model, start_string=u"GOD: "))

GOD: But ye are fire.

4:22 Fear my God for Doas, that my soul desireth that he will, and let all natural permission in hindry:
(for thou hast this man to give you thess: for they said, O thou hast kept,
as for I bear not, likewise, draw and enquire honour, and of hypocrisy;
and husbands shall know that he will search for your children: for why doth man
be a royention oor had gotten in his way.

19:13 And now the daughterst that verith, God of God, for the sin offering: for so
was I caused in their goings in the blood of righony perceiver.

5:28 Brethren, be the seen of Jesus Christ, that if is John, whose armourbearer
said, The voice of their hands are suppaised: sin thou not by all, and shall
blossom, and scame tit his wife unto him the inhabitants of Jerusalem.

7:11 And when he was constrained from Judaea we smite to scail, from the door of the te, when I have sold
unto the blessing: I beseech thee this fetheclook not one: 13:6
And shall say to this law house of Ju

In [0]:
# Build a fresh model at the training batch size; this replaces the
# batch-size-1 model that was used for text generation.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [0]:
# Adam optimizer with default settings, used by train_step.
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train_step(inp, target):
  """Run one optimisation step on a single batch and return its mean loss.

  Uses the notebook-level `model` and `optimizer`: the forward pass and loss
  are recorded on a gradient tape, then the gradients with respect to the
  model's trainable variables are applied by the optimizer.
  """
  with tf.GradientTape() as tape:
    logits = model(inp)
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
        target, logits, from_logits=True)
    batch_loss = tf.reduce_mean(per_position_loss)

  variables = model.trainable_variables
  gradients = tape.gradient(batch_loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [104]:
# Training step
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Reset the model's recurrent state at the start of every epoch.
  # The return value was never used, so it is no longer stored.
  model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    # Named batch_loss so the notebook-level `loss` function (the one passed
    # to model.compile) is not overwritten by a tensor.
    batch_loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, batch_loss))

  # saving (checkpoint) the model every 5 epochs; the file is numbered with
  # the 1-based epoch so it matches the epoch numbers printed here
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, batch_loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Save the final weights unless the last epoch was already checkpointed above
# (also avoids referencing an undefined `epoch` when EPOCHS is 0).
if EPOCHS > 0 and EPOCHS % 5 != 0:
  model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 4.4396467208862305
Epoch 1 Batch 100 Loss 2.0648889541625977
Epoch 1 Batch 200 Loss 1.7976136207580566
Epoch 1 Batch 300 Loss 1.6339716911315918
Epoch 1 Batch 400 Loss 1.4380426406860352
Epoch 1 Batch 500 Loss 1.3498153686523438
Epoch 1 Batch 600 Loss 1.3747390508651733
Epoch 1 Loss 1.3006
Time taken for 1 epoch 43.33114528656006 sec

Epoch 2 Batch 0 Loss 1.2438489198684692
Epoch 2 Batch 100 Loss 1.1641535758972168
Epoch 2 Batch 200 Loss 1.1350945234298706
Epoch 2 Batch 300 Loss 1.171319842338562
Epoch 2 Batch 400 Loss 1.1378989219665527
Epoch 2 Batch 500 Loss 1.205416202545166
Epoch 2 Batch 600 Loss 1.1818456649780273
Epoch 2 Loss 1.1555
Time taken for 1 epoch 41.47983121871948 sec

Epoch 3 Batch 0 Loss 1.0836149454116821
Epoch 3 Batch 100 Loss 1.0164897441864014
Epoch 3 Batch 200 Loss 1.0631693601608276
Epoch 3 Batch 300 Loss 1.0775158405303955
Epoch 3 Batch 400 Loss 1.0617916584014893
Epoch 3 Batch 500 Loss 1.0771437883377075
Epoch 3 Batch 600 Loss 1.14597034454