In [1]:
%load_ext tensorboard

In [2]:
import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
print("Tensorflow version = {}".format(tf.__version__))

Tensorflow version = 2.9.1


In [4]:
datafile_path = "../Dataset/Harry Potter and the Half-Blood Prince.txt"

In [5]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the garbage
# collector. Binary mode + explicit decode keeps the original line endings
# ('\r\n') untouched, so they remain part of the character vocabulary.
with open(datafile_path, 'rb') as book_file:
    text = book_file.read().decode(encoding='utf-8')
print("The book contains a total of {} characters".format(len(text)))

The book contains a total of 1010864 characters


In [6]:
print(text[204:1000])

CHAPTER ONE


                          THE OTHER MINISTER

   It was nearing midnight and the Prime Minister was sitting alone in his office, reading a long
memo that was slipping through his brain without leaving the slightest trace of meaning behind. He
was waiting for a call from the President of a far distant country, and between wondering when the
wretched man would telephone, and trying to suppress unpleasant memories of what had been a
very long, tiring, and difficult week, there was not much space in his head for anything else. The
more he attempted to focus on the print on the page before him, the more clearly the Prime
Minister could see the gloating face of one of his political opponents. This particular opponent had
appeared on the news that very day, not only 


# Prepare the Text
<ul>
    <li>Removing the starting characters that state the author and the name of the book</li>
</ul>

In [7]:
# Drop the first 204 characters (the author/title preamble) so that the text
# starts at "CHAPTER ONE".
# NOTE: this rebinds `text`, so the cell is not idempotent -- running it again
# without re-running the loading cell strips a further 204 characters.
text = text[204:]

In [8]:
print(text[:500])

CHAPTER ONE


                          THE OTHER MINISTER

   It was nearing midnight and the Prime Minister was sitting alone in his office, reading a long
memo that was slipping through his brain without leaving the slightest trace of meaning behind. He
was waiting for a call from the President of a far distant country, and between wondering when the
wretched man would telephone, and trying to suppress unpleasant memories of what had been a
very long, tiring, and difficult week, ther


In [9]:
# Counting the unique vocabulary in the text
vocab = sorted(set(text))
print("{} number of unique charcters.".format(len(vocab)))

82 number of unique charcters.


# Character to Integer Mapping

In [10]:
# Forward lookup: character -> integer id (ids follow the order of `vocab`).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, stored as an array so it can be
# indexed with whole arrays of ids at once.
idx2char = np.array(vocab)

# Encode the full text as an array of integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [11]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print(' ...\n}')

{
  '\n':   0,
  '\x0c':   1,
  '\r':   2,
  ' ' :   3,
  '!' :   4,
  '(' :   5,
  ')' :   6,
  ',' :   7,
  '-' :   8,
  '.' :   9,
  '0' :  10,
  '1' :  11,
  '2' :  12,
  '3' :  13,
  '4' :  14,
  '5' :  15,
  '6' :  16,
  '7' :  17,
  '8' :  18,
  ':' :  19,
 ...
}


In [12]:
print('{} \n ---- char-2-int ---- \n{}'.format(repr(text[50:100]), text_as_int[50:100]))

'ER MINISTER\r\n\r\n   It was nearing midnight and the ' 
 ---- char-2-int ---- 
[26 39  3 34 30 35 30 40 41 26 39  2  0  2  0  3  3  3 30 67  3 70 48 66
  3 61 52 48 65 56 61 54  3 60 56 51 61 56 54 55 67  3 48 61 51  3 67 55
 52  3]


# Prepare the Dataset
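<p>The text is cut into consecutive chunks of <code>seq_length + 1</code> characters; each chunk is then split into an input sequence and a target sequence shifted by one character (a one-step sliding window).</p>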

In [13]:
# Length, in characters, of each input sequence fed to the model
seq_length = 100
# Number of whole (seq_length + 1)-character chunks in the text; the extra
# character is there because every chunk is later split into an input and a
# target that is shifted by one position.
examples_per_epoch = len(text)//(seq_length+1)

In [14]:
#Creating training examples and training batches
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(20):
    print(idx2char[i.numpy()])

C
H
A
P
T
E
R
 
O
N
E









 
 
 


In [15]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(10):
    print(repr(''.join(idx2char[item.numpy()])))
    print('-'*115)

'CHAPTER ONE\r\n\r\n\r\n                          THE OTHER MINISTER\r\n\r\n   It was nearing midnight and the P'
-------------------------------------------------------------------------------------------------------------------
'rime Minister was sitting alone in his office, reading a long\r\nmemo that was slipping through his bra'
-------------------------------------------------------------------------------------------------------------------
'in without leaving the slightest trace of meaning behind. He\r\nwas waiting for a call from the Preside'
-------------------------------------------------------------------------------------------------------------------
'nt of a far distant country, and between wondering when the\r\nwretched man would telephone, and trying'
-------------------------------------------------------------------------------------------------------------------
' to suppress unpleasant memories of what had been a\r\nvery long, tiring, and difficult week, there was

In [16]:
def split_input_target(chunk):
    """
    Split one chunk into an (input, target) pair offset by one position.
    Parameters:
        chunk: sliceable sequence of length N
    Returns:
        tuple: (chunk without its last element, chunk without its first
        element), so target[i] is the element that follows input[i]
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [17]:
for input_example, target_example in dataset.take(1):
    print('Input Data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target Data: ', repr(''.join(idx2char[target_example.numpy()])))

Input Data:  'CHAPTER ONE\r\n\r\n\r\n                          THE OTHER MINISTER\r\n\r\n   It was nearing midnight and the '
Target Data:  'HAPTER ONE\r\n\r\n\r\n                          THE OTHER MINISTER\r\n\r\n   It was nearing midnight and the P'


In [18]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input : {}  ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {}  ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input : 24  ('C')
  expected output: 29  ('H')
Step    1
  input : 29  ('H')
  expected output: 22  ('A')
Step    2
  input : 22  ('A')
  expected output: 37  ('P')
Step    3
  input : 37  ('P')
  expected output: 41  ('T')
Step    4
  input : 41  ('T')
  expected output: 26  ('E')


# Prepare Training Batch

In [19]:
# Batch size
BATCH_SIZE = 64
# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

In [20]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print("Dataset Shape={}".format(dataset))

Dataset Shape=<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>


# Prepare Model

We prepare the utility function to generate the architecture of our deep learning based language model. We leverage the high level tf.keras API for creating this model.

In [21]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the character-level language model: Embedding -> GRU -> Dense.
    Parameters:
        vocab_size: number of unique characters; used both as the embedding
                    input size and as the width of the output layer
        embedding_dim: size of embedding vector. Typically in powers of 2
        rnn_units: number of GRU units to be used
        batch_size: fixed batch dimension of the model input; the sequence
                    dimension is left open (None)
    Returns:
        tf.keras model object
    """
    # The batch dimension is pinned here because the GRU below is stateful.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True: emit an output for every position of the input.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation: the layer outputs one raw score (logit) per character.
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [22]:
#length of the vocabulary in chars
vocab_size = len(vocab)

#embedding dimension
embedding_dim = 256

#number of RNN units
rnn_units = 1024

In [23]:
# Training model: its batch dimension is fixed to BATCH_SIZE.
# Uses the `vocab_size` hyperparameter defined in the previous cell so the
# training model and the later inference model are built from the same value.
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           20992     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 82)            84050     
                                                                 
Total params: 4,043,346
Trainable params: 4,043,346
Non-trainable params: 0
_________________________________________________________________


In [25]:
def loss(labels, logits):
    """
    Sparse categorical cross-entropy between integer labels and raw scores.
    Parameters:
        labels: integer ids of the expected characters
        logits: unnormalised model outputs (hence from_logits=True)
    Returns:
        the value returned by tf.keras.losses.sparse_categorical_crossentropy
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [26]:
model.compile(optimizer='adam', loss=loss)

# Setup Callbacks
     We set up one callback to store training checkpoints and a second one to write TensorBoard logs.

In [27]:
# Directory (relative to the notebook) that receives the training checkpoints
checkpoint_dir = r'data/training_checkpoints'
# Name the checkpoint files; "{epoch}" is a placeholder that is filled in with
# the epoch number when a checkpoint is written (e.g. ckpt_40)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: store only the weights, not the full model, so the
# architecture has to be rebuilt with build_model before loading them
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [28]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# Train the Language Model

    Now we shall train the model based on the training dataset prepared.
    We train for a few epochs first to check if the model is learning or not.

In [29]:
EPOCHS = 40

history=model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback, tensorboard_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [30]:
%tensorboard --logdir logs

In [31]:
from tensorboard import notebook
notebook.list() #to view TensorBoard instances

Known TensorBoard instances:
  - port 6006: logdir logs (started 0:00:00 ago; pid 22692)


In [32]:
notebook.display(port=6006, height=1000)

Selecting TensorBoard with logdir logs (started 0:00:00 ago; port 6006, pid 22692).


# Generate Text
    Based on the training done, we generate some text and see what the model has learned.

In [33]:
tf.train.latest_checkpoint(checkpoint_dir)

'data/training_checkpoints\\ckpt_40'

# Model Load 

In [34]:
# Rebuild the same architecture with batch_size=1: build_model pins the batch
# dimension of the (stateful) network, and text generation feeds a single
# sequence at a time. Note that this rebinds `model`, replacing the training model.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint found in checkpoint_dir.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with the inference input shape: batch of 1, open sequence length.
model.build(tf.TensorShape([1, None]))

In [35]:
def generate_text(model, context_string, num_generate=1000, temperature=1.0):
    """
    Generate text one character at a time, starting from a seed string.

    Parameters:
        model: tf.keras object trained on a sufficient sized corpus; it is
               called with a batch of exactly one sequence
        context_string: non-empty seed text; every character must be a key
                        of char2idx
        num_generate: number of characters to be generated
        temperature: parameter to control randomness of outputs; must be > 0.
                     The logits are divided by it, so values below 1 make the
                     sampling more conservative and values above 1 more random
    Returns:
        string: context_string + text_generated
    Raises:
        ValueError: if temperature is not positive or context_string is empty
        KeyError: if context_string contains a character missing from char2idx
    """
    # Validate up front: a zero temperature would divide the logits by zero and
    # a negative one would invert the distribution instead of failing.
    if temperature <= 0:
        raise ValueError(
            "temperature must be positive, got {!r}".format(temperature))
    # An empty seed would give the model an empty sequence to predict from.
    if not context_string:
        raise ValueError("context_string must contain at least one character")

    # Same exception type as a plain dictionary lookup, but with a message
    # that names the offending characters.
    unknown_chars = sorted(set(context_string) - set(char2idx))
    if unknown_chars:
        raise KeyError(
            "context_string contains characters outside the vocabulary: "
            "{!r}".format(unknown_chars))

    #vectorizing: convert context string into string indices
    input_eval = [char2idx[s] for s in context_string]
    # Add the batch dimension -> shape (1, len(context_string))
    input_eval = tf.expand_dims(input_eval, 0)

    #Characters generated so far
    text_generated = []

    # Start from a clean recurrent state rather than one left by a previous call
    model.reset_states()

    #Loop till required number of characters are generated
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension -> one row of logits per input position
        predictions = tf.squeeze(predictions, 0)

        #temperature helps in controlling the character returned by the model.
        predictions = predictions/temperature
        #Sampling over a categorical distribution; only the sample drawn for
        #the last position is the prediction of the next character
        prediction_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        #predicted character acts as input for next step
        input_eval = tf.expand_dims([prediction_id], 0)

        text_generated.append(idx2char[prediction_id])
    return (context_string + ''.join(text_generated))

In [36]:
print(generate_text(model, context_string=u"This particular opponent", num_generate=100))

This particular opponent that she could show any time down asl,” said Harry, “the restrid tell him, I return from whired awa
