<a href="https://colab.research.google.com/github/nixphix/ChibiOS/blob/master/intro_to_tf_nlp_w4_shakespeare_text_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

import tensorflow as tf
import tensorflow.keras as k

In [2]:
# Fetch the Shakespeare corpus; get_file returns the path of the local copy.
path_to_file = k.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Display the local path where the corpus was stored.
path_to_file

'/root/.keras/datasets/shakespeare.txt'

In [4]:
# Read the whole corpus into memory as text. The context manager closes the
# file handle deterministically, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open(path_to_file, encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()
# Preview the first 100 characters.
print(corpus[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
all_chars = np.array(sorted(set(corpus)))
print(f"Length of corpus: {len(corpus)}")
print(f"Char size: {all_chars.size}")
print(f"Some chars are: {all_chars[:20]}")

Length of corpus: 1115394
Char size: 65
Some chars are: ['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G']


In [6]:
# Lookup tables between characters and integer ids.
# char -> id: each character's id is its position in the sorted vocabulary.
char2idx = dict(zip(all_chars, range(len(all_chars))))
# id -> char: the dict keeps insertion order, so position i holds the
# character whose id is i.
idx2char = np.array(list(char2idx))
print(f"Char2Idx: {char2idx}")
print(f"Idx2Char: {idx2char}")

Char2Idx: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
Idx2Char: ['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [7]:
# Translate every character of the corpus into its integer id and preview
# the first 30 characters next to their ids.
encoded_corpus = np.fromiter(
    (char2idx[c] for c in corpus),
    dtype=np.uint8,
    count=len(corpus),
)
print(f"First 30 char\n\nCorpus:\n{corpus[:30]}\n\nEncoded Corpus:\n{encoded_corpus[:30]}")

First 30 char

Corpus:
First Citizen:
Before we proce

Encoded Corpus:
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43]


In [8]:
# Wrap the encoded corpus in a tf.data dataset (one element per character id)
# and decode the first five elements back to characters as a sanity check.
dataset = tf.data.Dataset.from_tensor_slices(encoded_corpus)
for data in dataset.take(5):
    print(idx2char[data.numpy()])

F
i
r
s
t


In [9]:
# Group the character stream into fixed-length chunks. Each chunk carries one
# extra character so it can later be split into an input and a target that is
# shifted by one position.
TRAIN_SEQ_LEN = 100
sequence = dataset.batch(TRAIN_SEQ_LEN + 1, drop_remainder=True)
print(f"Number of sequences: {len(sequence)}")
# Show the first chunk both as ids and as decoded text.
for item in sequence.take(1):
    ids = item.numpy()
    print(f"Number Sequence: {ids}")
    print("Char Sequence:\n", repr("".join(idx2char[ids])))

Number of sequences: 11043
Number Sequence: [18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1]
Char Sequence:
 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [10]:
# a function to split the sequence into data and label by mapping
def split_data_label(seq):
    """Split one chunk into an (input, target) pair offset by one step.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return seq[:-1], seq[1:]

# Apply the split to every chunk, yielding (input, target) pairs.
split_sequence = sequence.map(split_data_label)

In [11]:
# Decode the first (input, target) pair to confirm that the target is the
# input shifted by one character.
for input_text, target_text in split_sequence.take(1):
    print("Input: ", repr("".join(all_chars[input_text])))
    print("Output:", repr("".join(all_chars[target_text])))

Input:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Output: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [12]:
# Walk through the first five time steps: at each step the model receives one
# character and is expected to produce the character that follows it.
for input_idx, target_idx in zip(input_text[:5], target_text[:5]):
    in_id, out_id = input_idx.numpy(), target_idx.numpy()
    print(f"Input text:     {idx2char[in_id]!r}({in_id})")
    print(f"Expected text:  {idx2char[out_id]!r}({out_id})")
    print()

Input text:     'F'(18)
Expected text:  'i'(47)

Input text:     'i'(47)
Expected text:  'r'(56)

Input text:     'r'(56)
Expected text:  's'(57)

Input text:     's'(57)
Expected text:  't'(58)

Input text:     't'(58)
Expected text:  ' '(1)



In [13]:
# Size of the shuffle buffer used when building the training dataset.
BUFFER = 10000
# Number of sequences per training batch.
BATCH_SIZE = 64

# Number of distinct characters; used as both the embedding input size and
# the width of the final Dense layer.
VOCAB_SIZE = len(all_chars)

# Width of the embedding vectors.
EMBEDDING_DIM = 100
# Number of units in the GRU layer.
RNN_UNITS = 1024


In [14]:
# Training pipeline: shuffle the (input, target) pairs, then group them into
# full batches only (a trailing partial batch is dropped).
train_data = (
    split_sequence
    .shuffle(BUFFER)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [15]:
# build model with embedding, gru and dense layer
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the Embedding -> GRU -> Dense character model and print its summary.

    Args:
        vocab_size: number of distinct character ids; sizes both the embedding
            table and the output layer.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch dimension baked into the input shape (the GRU
            is stateful); the sequence length is left unspecified.

    Returns:
        The Keras Sequential model. Its last layer has no activation, so the
        outputs are unnormalised scores per character id.
    """
    embedding = k.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = k.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    to_logits = k.layers.Dense(vocab_size)

    model = k.Sequential([embedding, recurrent, to_logits])
    model.summary()
    return model

In [16]:
# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 100)           6500      
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3459072   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 3,532,197
Trainable params: 3,532,197
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Smoke test: run a single batch through the untrained model and check the
# shape of its output.
for iptext, optext in train_data.take(1):
    dry_prediction = model(iptext)
    print(f"Shape of prediction {dry_prediction.shape}")

Shape of prediction (64, 100, 65)


In [18]:
# Decode the first sequence of the batch next to the untrained model's most
# likely character at each position.
print(f"Input text:\n{''.join(idx2char[iptext[0].numpy()])}\n")
print(f"Dry prediction:\n{''.join(idx2char[dry_prediction[0].numpy().argmax(axis=1)])}")

Input text:
Your princely father and my loving lord!

EDWARD:
O, speak no more, for I have heard too much.

RICH

Dry prediction:
,Dymm$FmT3dwSW33Veded3333gFVwNaZTF3wNFnnn'vu3?suM'WudddddVV3'VNNF,,d3zFFP3e33dddddFv3ezzNNe3dsgg''ll


In [19]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    The result is not averaged here; callers reduce it themselves.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Mean loss of the untrained model on the sample batch. The value printed
# below (~4.17) is close to ln(65), i.e. a uniform guess over the vocabulary.
print(f"Dry prediction loss: {loss(optext, dry_prediction).numpy().mean()}")

Dry prediction loss: 4.173456192016602


In [20]:
# Configure training with the Adam optimizer and the logits-based loss above.
model.compile(optimizer='adam', loss=loss)

In [21]:
import os

# Directory and filename pattern for the weight checkpoints; '{epoch}' is a
# placeholder for the epoch number.
check_pt_dir = './check_pt'
check_pt_file_fmt = os.path.join(check_pt_dir, 'check_{epoch}')

# Callback that writes weights only (not the full model) to the pattern above.
check_pt = k.callbacks.ModelCheckpoint(check_pt_file_fmt, save_weights_only=True)

In [22]:
# Train for 30 epochs, with the checkpoint callback attached.
model.fit(train_data, epochs=30, callbacks=[check_pt])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fb0006f8470>

In [23]:
# Rebuild the same architecture with batch size 1 for text generation, then
# restore the weights from the most recent checkpoint in check_pt_dir.
model_new = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)
model_new.load_weights(tf.train.latest_checkpoint(check_pt_dir))
model_new.build(tf.TensorShape([1, None]))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 100)            6500      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3459072   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 3,532,197
Trainable params: 3,532,197
Non-trainable params: 0
_________________________________________________________________


In [24]:
def predict_text(model, start_text, num_text=200, temp=1.0):
    """Generate text character by character from a seed string and print it.

    The seed is fed to the model once; afterwards each sampled character is
    fed back as the next input while the model's recurrent state carries the
    context forward.

    Args:
        model: stateful model built for batch size 1 that maps character ids
            of shape (1, T) to logits of shape (1, T, vocab).
        start_text: non-empty seed string; every character must be a key of
            ``char2idx``.
        num_text: number of characters to generate after the seed.
        temp: sampling temperature. The logits are divided by it before
            sampling, so values below 1 sharpen the distribution and values
            above 1 flatten it.

    Raises:
        ValueError: if ``start_text`` is empty or ``temp`` is not positive.
        KeyError: if ``start_text`` contains a character missing from
            ``char2idx``.
    """
    if not start_text:
        raise ValueError("start_text must contain at least one character")
    if temp <= 0:
        raise ValueError("temp must be a positive number")

    # Encode the seed and add the batch dimension: shape (1, len(start_text)).
    sequence = tf.expand_dims([char2idx[s] for s in start_text], 0)

    generated_text = []
    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_text):
        # Only the logits of the last time step decide the next character.
        last_logits = model(sequence)[:, -1, :] / temp
        pred = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()
        # The sampled id is the sole input of the next step.
        sequence = tf.expand_dims([pred], 0)
        generated_text.append(idx2char[pred])
    print(start_text + "".join(generated_text))

In [26]:
# Sample text seeded with "Romeo:"; per the original note, the output below
# was produced with weights from 10 epochs of training.
predict_text(model_new, "Romeo:") # 10 epoch

Romeo: as highness is at his.

KING RICHARD III:
But to your lordship will than they were. But least a Roman through
A mother place and deach, and what means he is, erentley say it is.

NORTHUMBERLAND:
He s


In [25]:
# Sample text seeded with "Romeo:"; per the original note, the output below
# was produced with weights from 30 epochs of training.
predict_text(model_new, "Romeo:") # 30 epoch

Romeo:
Of that sheard spread suspicion, as
no loding wrong:
Some she privilege divide, as it
was so fitting their absolver.

ELBOW:
Commend me to the occasion how having those whose that I provided
As is th


In [51]:
# Second, freshly initialised model with the same architecture; this one is
# trained with the hand-written training loop further down, not model.fit.
model_other = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 100)           6500      
_________________________________________________________________
gru_3 (GRU)                  (64, None, 1024)          3459072   
_________________________________________________________________
dense_3 (Dense)              (64, None, 65)            66625     
Total params: 3,532,197
Trainable params: 3,532,197
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Separate checkpoint directory and filename pattern for the custom-loop
# model, so its weights do not overwrite those saved under ./check_pt.
check_pt_dir_other = './check_pt_other'
check_pt_file_other_fmt = os.path.join(check_pt_dir_other, 'check_{epoch}')

In [53]:
# Optimizer used by the custom training step.
optimizer = tf.keras.optimizers.Adam()


def loss_fn(true, pred):
    """Mean sparse categorical cross-entropy of a batch, computed from logits."""
    per_position = tf.keras.losses.sparse_categorical_crossentropy(
        true, pred, from_logits=True)
    return tf.reduce_mean(per_position)

In [54]:
@tf.function
def train(ipt, opt):
    """Run one optimisation step on a batch and return its mean loss.

    Args:
        ipt: batch of input id sequences fed to ``model_other``.
        opt: batch of target id sequences the outputs are scored against.

    Returns:
        The batch loss computed by ``loss_fn`` before the weights are updated.
    """
    # Record the forward pass so gradients can be taken through it.
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(opt, model_other(ipt))
    weights = model_other.trainable_variables
    gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    return batch_loss

In [50]:
import time

# Custom training loop: EPOCH passes over train_data, one train() call per batch.
EPOCH = 10
for ep in range(1, EPOCH + 1):
    # Start the timer and clear the GRU's carried-over state for the new epoch.
    start = time.time()
    model_other.reset_states()
    # Deliberately named batch_loss rather than loss: assigning to `loss` at
    # notebook level would overwrite the loss() function defined earlier and
    # break any later re-run of the cell that compiles the model with it.
    batch_loss = None
    for idx, (ipt, tgt) in enumerate(train_data):
        batch_loss = train(ipt, tgt)
        # Report progress on every 100th batch.
        if idx % 100 == 0:
            print(f"Epoch {ep} Batch {idx} loss: {batch_loss}")
    # Checkpoint the weights every 5th epoch.
    if ep % 5 == 0:
        print("Saving model wts...")
        model_other.save_weights(check_pt_file_other_fmt.format(epoch=ep))
    # The loss reported here is that of the epoch's last batch.
    print(f"Epoch {ep} time taken {int(time.time() - start)}s loss: {batch_loss}\n")
# Save the final weights, unless the last epoch was already checkpointed by
# the every-5th-epoch rule above or no epoch ran at all.
if EPOCH > 0 and EPOCH % 5 != 0:
    model_other.save_weights(check_pt_file_other_fmt.format(epoch=EPOCH))

Epoch 1 Batch 0 loss: 0.987172544002533
Epoch 1 Batch 100 loss: 1.082219123840332
Epoch 1 time taken 9s loss: 1.0874640941619873

Epoch 2 Batch 0 loss: 0.942598283290863
Epoch 2 Batch 100 loss: 1.0414557456970215
Epoch 2 time taken 9s loss: 1.0571722984313965

Epoch 3 Batch 0 loss: 0.9457034468650818
Epoch 3 Batch 100 loss: 0.9817665219306946
Epoch 3 time taken 9s loss: 1.014931321144104

Epoch 4 Batch 0 loss: 0.8751521110534668
Epoch 4 Batch 100 loss: 0.9925789833068848
Epoch 4 time taken 9s loss: 1.0095241069793701

Epoch 5 Batch 0 loss: 0.855405867099762
Epoch 5 Batch 100 loss: 0.9215734601020813
Saving model wts...

Epoch 5 time taken 9s loss: 0.9686150550842285

Epoch 6 Batch 0 loss: 0.8056967854499817
Epoch 6 Batch 100 loss: 0.903012216091156
Epoch 6 time taken 9s loss: 0.9309359788894653

Epoch 7 Batch 0 loss: 0.7813970446586609
Epoch 7 Batch 100 loss: 0.8896231055259705
Epoch 7 time taken 9s loss: 0.8960713744163513

Epoch 8 Batch 0 loss: 0.7353395223617554
Epoch 8 Batch 100 lo

In [57]:
# Rebuild the architecture with batch size 1 for text generation, then restore
# the custom-loop weights from the most recent checkpoint in check_pt_dir_other.
model_other_new = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)
model_other_new.load_weights(tf.train.latest_checkpoint(check_pt_dir_other))
model_other_new.build(tf.TensorShape([1, None]))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 100)            6500      
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           3459072   
_________________________________________________________________
dense_5 (Dense)              (1, None, 65)             66625     
Total params: 3,532,197
Trainable params: 3,532,197
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Sample text from the model trained with the custom loop, seeded with "Romeo:".
predict_text(model_other_new, "Romeo:")

Romeo:
By the stooping but against the outorn him,
His discovered join, with cripportary
My man turns to quench my country, and our hearts
Should ne'er be cured by the charge.

First Gentleman:
Where lius?

