# Word prediction and sequence generation for shakespeare play with LSTM and GRU. Mehran Piran

In [1]:
import os
import traceback

import shutil
import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

from termcolor import colored


2024-07-14 18:03:46.729764: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
os.chdir('/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs')

<a name="1"></a>
## 1) Load data

In [85]:

dirname = '/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs'
filename = 'shakespeare_data.txt'
file_path = os.path.join(dirname, filename)
with open(file_path, 'r') as file:
    # Read all lines into a list
    lines = file.readlines()


In [86]:
lines[1:10]

['\n',
 '\n',
 '\n',
 'FROM off a hill whose concave womb reworded\n',
 'A plaintful story from a sistering vale,\n',
 'My spirits to attend this double voice accorded,\n',
 'And down I laid to list the sad-tuned tale;\n',
 'Ere long espied a fickle maid full pale,\n',
 'Tearing of papers, breaking rings a-twain,\n']

In [87]:
print("\n".join(lines[1:10]))







FROM off a hill whose concave womb reworded

A plaintful story from a sistering vale,

My spirits to attend this double voice accorded,

And down I laid to list the sad-tuned tale;

Ere long espied a fickle maid full pale,

Tearing of papers, breaking rings a-twain,



In [88]:

lines = [] # storing all the lines in a variable. 

counter = 0

with open(file_path, 'r') as file:
    for line in file:        
        # remove leading and trailing whitespace
        pure_line = line.strip().lower()

        # if pure_line is not the empty string,
        if pure_line:
            # append it to the list
            lines.append(pure_line)
            
n_lines = len(lines)
print(f"Number of lines: {n_lines}")

Number of lines: 125097


In [89]:
l1 = lines[1:10]
print(len(l1))
print(type(l1))
l1

9
<class 'list'>


['from off a hill whose concave womb reworded',
 'a plaintful story from a sistering vale,',
 'my spirits to attend this double voice accorded,',
 'and down i laid to list the sad-tuned tale;',
 'ere long espied a fickle maid full pale,',
 'tearing of papers, breaking rings a-twain,',
 "storming her world with sorrow's wind and rain.",
 'upon her head a platted hive of straw,',
 'which fortified her visage from the sun,']

In [90]:
l2 = "\n".join(lines[1:10])
print(len(l2))
print(type(l2))
l2

389
<class 'str'>


"from off a hill whose concave womb reworded\na plaintful story from a sistering vale,\nmy spirits to attend this double voice accorded,\nand down i laid to list the sad-tuned tale;\nere long espied a fickle maid full pale,\ntearing of papers, breaking rings a-twain,\nstorming her world with sorrow's wind and rain.\nupon her head a platted hive of straw,\nwhich fortified her visage from the sun,"

<a name="1"></a>
## 2) Create vocabulary

Because we are performing sequence generation, A vocabulary of letters are created, not words 

In [91]:
# 2) Create vocabulary  
text = "\n".join(lines)
# The unique characters in the file
vocab = sorted(set(text))
vocab.insert(0,"[UNK]") # Add a special character for any unknown
vocab.insert(1,"") # Add the empty character for padding.
print(vocab)

['[UNK]', '', '\t', '\n', ' ', '!', '$', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']


In [92]:
print(vocab.index('a'))
print(vocab.index('e'))
print(vocab.index('i'))

29
33
37


<a name="1"></a>
## 3) Convert each line to a tensor and tokenization

In [93]:
line = "Hello world!"
chars = tf.strings.unicode_split(line, input_encoding='UTF-8')
print(chars)

tf.Tensor([b'H' b'e' b'l' b'l' b'o' b' ' b'w' b'o' b'r' b'l' b'd' b'!'], shape=(12,), dtype=string)


In [94]:
ids = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)(chars)
print(ids)
#  By setting mask_token=None, you are indicating that no token in the vocabulary should be treated as a mask token

tf.Tensor([ 0 33 40 40 43  4 51 43 46 40 32  5], shape=(12,), dtype=int64)


In [95]:
def line_to_tensor(line, vocab):
    """
    Converts a line of text into a tensor of integer values representing characters.

    Args:
        line (str): A single line of text.
        vocab (list): A list containing the vocabulary of unique characters.

    Returns:
        tf.Tensor(dtype=int64): A tensor containing integers (unicode values) corresponding to the characters in the `line`.
    """

    # Split the input line into individual characters
    chars = list(line)
    # Map characters to their respective integer values using StringLookup
    ids = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)(chars)
    

    return ids


You will also need a function that produces text given a numeric tensor

In [96]:

def text_from_ids(ids, vocab):
    
    """
    Converts a tensor of integer values into human-readable text.

    Args:
        ids (tf.Tensor): A tensor containing integer values (unicode IDs).
        vocab (list): A list containing the vocabulary of unique characters.

    Returns:
        str: A string containing the characters in human-readable format.
    """
    
    # Initialize the StringLookup layer to map integer IDs back to characters
    chars_from_ids = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True, mask_token=None)
    
    # Use the layer to decode the tensor of IDs into human-readable text
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


<a name="1"></a>
## 3.1) Prepare your data for training and testing

In [97]:
train_lines = lines[:-1000] # Leave the rest for training
eval_lines = lines[-1000:] # Create a holdout validation set

print(f"Number of training lines: {len(train_lines)}")
print(f"Number of validation lines: {len(eval_lines)}")

Number of training lines: 124097
Number of validation lines: 1000


### Create input and output for your model

In many sequence modeling tasks (such as language modeling or sequence prediction), you often need to predict the next element in the sequence given the previous elements. This is why you might use seq_length + 1 instead of just seq_length.

You have to predict the next character in a sequence. The following function creates 2 tensors, each with a length of seq_length out of the input sequence of lenght seq_length + 1. The first one contains the first seq_length elements and the second one contains the last seq_length elements. For example, if you split the sequence ['H', 'e', 'l', 'l', 'o'], you will obtain the sequences ['H', 'e', 'l', 'l'] and ['e', 'l', 'l', 'o'].


In [132]:
def split_input_target(sequence):
    """
    Splits the input sequence into two sequences, where one is shifted by one position.

    Args:
        sequence (tf.Tensor or list): A list of characters or a tensor.

    Returns:
        tf.Tensor, tf.Tensor: Two tensors representing the input and output sequences for the model.
    """
    # Create the input sequence by excluding the last character
    input_text = list(sequence[:-1])
    # Create the target sequence by excluding the first character
    target_text = list(sequence[1:])

    return input_text, target_text

In [133]:
split_input_target("Tensorflow")

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

<a name="1"></a>
## 3.2) Data generator

Create a TensorFlow DataSet from your numeric tensors using tf.data.Dataset.from_tensor_slices() function

In [134]:
def split_input_target(sequence):
    """
    Splits the input sequence into two sequences, where one is shifted by one position.

    Args:
        sequence (tf.Tensor or list): A list of characters or a tensor.

    Returns:
        tf.Tensor, tf.Tensor: Two tensors representing the input and output sequences for the model.
    """
    # Create the input sequence by excluding the last character
    input_text = sequence[:-1]
    # Create the target sequence by excluding the first character
    target_text = sequence[1:]

    return input_text, target_text

In [135]:
split_input_target("Tensorflow")

('Tensorflo', 'ensorflow')

In [136]:
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [140]:
lines = train_lines[0:5]
print(lines)
single_line_data = "\n".join(lines)
single_line_data

["a lover's complaint", 'from off a hill whose concave womb reworded', 'a plaintful story from a sistering vale,', 'my spirits to attend this double voice accorded,', 'and down i laid to list the sad-tuned tale;']


"a lover's complaint\nfrom off a hill whose concave womb reworded\na plaintful story from a sistering vale,\nmy spirits to attend this double voice accorded,\nand down i laid to list the sad-tuned tale;"

In [128]:
all_ids = line_to_tensor(single_line_data, vocab)
all_ids

<tf.Tensor: shape=(197,), dtype=int64, numpy=
array([29,  4, 40, 43, 50, 33, 46,  8, 47,  4, 31, 43, 41, 44, 40, 29, 37,
       42, 48,  3, 34, 46, 43, 41,  4, 43, 34, 34,  4, 29,  4, 36, 37, 40,
       40,  4, 51, 36, 43, 47, 33,  4, 31, 43, 42, 31, 29, 50, 33,  4, 51,
       43, 41, 30,  4, 46, 33, 51, 43, 46, 32, 33, 32,  3, 29,  4, 44, 40,
       29, 37, 42, 48, 34, 49, 40,  4, 47, 48, 43, 46, 53,  4, 34, 46, 43,
       41,  4, 29,  4, 47, 37, 47, 48, 33, 46, 37, 42, 35,  4, 50, 29, 40,
       33, 11,  3, 41, 53,  4, 47, 44, 37, 46, 37, 48, 47,  4, 48, 43,  4,
       29, 48, 48, 33, 42, 32,  4, 48, 36, 37, 47,  4, 32, 43, 49, 30, 40,
       33,  4, 50, 43, 37, 31, 33,  4, 29, 31, 31, 43, 46, 32, 33, 32, 11,
        3, 29, 42, 32,  4, 32, 43, 51, 42,  4, 37,  4, 40, 29, 37, 32,  4,
       48, 43,  4, 40, 37, 47, 48,  4, 48, 36, 33,  4, 47, 29, 32, 12, 48,
       49, 42, 33, 32,  4, 48, 29, 40, 33, 25])>

In [147]:
seq_length = 20
BUFFER_SIZE = 1000
batch_size = 64

lines = train_lines[0:1000]
single_line_data = "\n".join(lines)


In [148]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [149]:
data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True)
data_generator

<_BatchDataset element_spec=TensorSpec(shape=(21,), dtype=tf.int64, name=None)>

In [150]:
dataset_xy = data_generator.map(split_input_target)
dataset_xy

<_MapDataset element_spec=(TensorSpec(shape=(20,), dtype=tf.int64, name=None), TensorSpec(shape=(20,), dtype=tf.int64, name=None))>

In [151]:
dataset = (dataset_xy.shuffle(BUFFER_SIZE).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE))  
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 20), dtype=tf.int64, name=None), TensorSpec(shape=(None, 20), dtype=tf.int64, name=None))>

In [152]:

def create_batch_dataset(lines, vocab, seq_length=100, batch_size=64):
    
    """
    Creates a batch dataset from a list of text lines.

    Args:
        lines (list): A list of strings with the input data, one line per row.
        vocab (list): A list containing the vocabulary.
        seq_length (int): This parameter specifies the length of each sequence sample that will be used for training. 
        batch_size (int): The batch size. It determines how many sequence samples are processed together in parallel during each training iteration

    Returns:
        tf.data.Dataset: A batch dataset generator.
    """
    
    # Buffer size to shuffle the dataset
    # (TF data is designed to work with possibly infinite sequences,
    # so it doesn't attempt to shuffle the entire sequence in memory. Instead,
    # it maintains a buffer in which it shuffles elements).
    BUFFER_SIZE = 10000
    
    # For simplicity, just join all lines into a single line
    single_line_data  = "\n".join(lines)

    # Convert your data into a tensor using the given vocab
    all_ids = line_to_tensor(single_line_data, vocab)
    # Create a TensorFlow dataset from the data tensor
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
    # Create a batch dataset
    data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True) # drop_remainder=True means that if the total number of elements in the dataset is not perfectly divisible by the batch size, the remaining elements that do not form a full batch will be discarded.
    
    dataset_xy = data_generator.map(split_input_target) # Applies the split_input_target function to each batch. This function typically splits each batch into input and target sequences.
    
    # Assemble the final dataset with shuffling, batching, and prefetching
    dataset = (                                   
        dataset_xy                                
        .shuffle(BUFFER_SIZE)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)  
        )            
    
    return dataset



In [None]:

# ids_dataset.batch(seq_length + 1, drop_remainder=True):
# This operation batches the dataset into sequences of length seq_length + 1. Each batch produced by this method consists of seq_length + 1 elements. This approach is typically used when preparing data for sequence prediction tasks, where each batch includes both input sequences (first seq_length elements) and target sequences (next element).

# dataset.batch(batch_size):
# After mapping split_input_target to create (inputs, targets) pairs from each batch, dataset.batch(batch_size) then batches these pairs into batches of size batch_size. This batching step ensures that during training, batch_size number of (inputs, targets) pairs are processed simultaneously.


### Test your data generator

In [215]:
dataset = create_batch_dataset(train_lines[0:10], vocab, seq_length=16, batch_size=2)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 16), dtype=tf.int64, name=None), TensorSpec(shape=(None, 16), dtype=tf.int64, name=None))>

In [168]:
text_from_ids([22,30,31,20] , vocab).numpy()

b'8bc6'

In [162]:

for input_example, target_example in dataset.take(1):
    print(text_from_ids(input_example[0], vocab).numpy())
    print(text_from_ids(target_example[0], vocab).numpy())
    
    print("\n", text_from_ids(input_example[1], vocab).numpy())
    print(text_from_ids(target_example[1], vocab).numpy())
    

b"sorrow's wind an"
b"orrow's wind and"

 b'attend this doub'
b'ttend this doubl'


2024-07-15 18:40:44.397278: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [409]
	 [[{{node Placeholder/_0}}]]
2024-07-15 18:40:44.397611: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [409]
	 [[{{node Placeholder/_0}}]]


In [172]:
line = "Hello world!"
line = line.lower()

char_to_id = {c: i for i, c in enumerate(list(set(line)))}
id_to_char = {i: c for i, c in enumerate(list(set(line)))}
print(char_to_id)
print(id_to_char)
char_to_id = {c: i for i, c in enumerate(vocab)}
id_to_char = {i: c for i, c in enumerate(vocab)}
print("\n",char_to_id)
print("\n",id_to_char)
all_ids = [char_to_id[c] for c in line]
print("\n",all_ids)

ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print("\n",ids_dataset)

seq_length = 5
data_generator = ids_dataset.batch(seq_length + 1, drop_remainder=True)
print("\n",data_generator)

{'e': 0, '!': 1, 'l': 2, ' ': 3, 'd': 4, 'r': 5, 'w': 6, 'h': 7, 'o': 8}
{0: 'e', 1: '!', 2: 'l', 3: ' ', 4: 'd', 5: 'r', 6: 'w', 7: 'h', 8: 'o'}

 {'[UNK]': 0, '': 1, '\t': 2, '\n': 3, ' ': 4, '!': 5, '$': 6, '&': 7, "'": 8, '(': 9, ')': 10, ',': 11, '-': 12, '.': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54, '|': 55}

 {0: '[UNK]', 1: '', 2: '\t', 3: '\n', 4: ' ', 5: '!', 6: '$', 7: '&', 8: "'", 9: '(', 10: ')', 11: ',', 12: '-', 13: '.', 14: '0', 15: '1', 16: '2', 17: '3', 18: '4', 19: '5', 20: '6', 21: '7', 22: '8', 23: '9', 24: ':', 25: ';', 26: '?', 27: '[', 28: ']', 29: 'a', 30: 'b', 31: 'c', 32: 'd', 33: 'e', 34: 'f', 35: 'g', 36: 'h', 37: 'i', 38: 'j', 39:

<a name="1"></a>
## 3.3) Create training dataset

In [217]:
BATCH_SIZE = 64
dataset = create_batch_dataset(train_lines, vocab, seq_length=100, batch_size=BATCH_SIZE)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None, 100), dtype=tf.int64, name=None))>

In [218]:
len(dataset.take(1000)) # Dataset contains 791 batches

791

<a name="1"></a>
## 4) Constructing the LSTM and GRU language models

In [175]:
len(vocab)

56

In [275]:

class LSTMLM(tf.keras.Model):
    
    """
    A LSTM-based language model that maps from a tensor of tokens to activations over a vocabulary.

    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        embedding_dim (int, optional): Depth of embedding. Defaults to 256.
        rnn_units (int, optional): Number of units in the LSTM cell. Defaults to 128.

    Returns:
        tf.keras.Model: A GRULM language model.
    """
    
    def __init__(self, vocab_size=256, embedding_dim=256, rnn_units=128):
        super().__init__(self)
        
        
        # Create an embedding layer to map token indices to embedding vectors
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Define a GRU (Gated Recurrent Unit) layer for sequence modeling
        self.lstm = tf.keras.layers.LSTM(units=rnn_units, return_sequences=True, return_state=True) # return_state=True, the GRU layer will return the last hidden state in addition to the sequence of outputs
        # Apply a dense layer with log-softmax activation to predict next tokens
        self.dense = tf.keras.layers.Dense(units=vocab_size, activation=tf.nn.log_softmax)
        
        
    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        # Map input tokens to embedding vectors
        x = self.embedding(x, training=training)
        if states is None:
            # Get initial state from the LSTM layer
           states = self.lstm.get_initial_state(x)
        x, h_states, c_states = self.lstm(x, initial_state=states, training=training)
        # Predict the next tokens and apply log-softmax activation
        x = self.dense(x, training=training)
        if return_state:
            return x, h_states
        else:
            return x



In [276]:

# Length of the vocabulary in StringLookup Layer
vocab_size = 56

# The embedding dimension
embedding_dim = 256

# RNN layers
rnn_units = 512

model = LSTMLM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units = rnn_units)


In [None]:
# Optional: Build the model for inspection
model.build(input_shape=(batch_size, sequence_length))
model.summary()

# Optional: Call the model to ensure it works with the given input shape
dummy_input = tf.keras.Input(shape=(sequence_length,))
model(dummy_input)

In [277]:
# testing your model
try:
    # Simulate inputs of length 100. This allows to compute the shape of all inputs and outputs of our network
    model.build(input_shape=(BATCH_SIZE, 100))
    model.call(Input(shape=(100)))
    model.summary() 
except:
    print("\033[91mError! \033[0mA problem occurred while building your model. This error can occur due to wrong initialization of the return_sequences parameter\n\n")
    traceback.print_exc()



2024-07-16 01:32:56.510125: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 01:32:56.511550: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 01:32:56.512794: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "lstmlm_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 100, 256)          14336     
                                                                 
 lstm_8 (LSTM)               [(None, 100, 512),        1574912   
                              (None, 512),                       
                              (None, 512)]                       
                                                                 
 dense_11 (Dense)            (None, 100, 56)           28728     
                                                                 
Total params: 1,617,976
Trainable params: 1,617,976
Non-trainable params: 0
_________________________________________________________________


2024-07-16 01:32:56.720045: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 01:32:56.721408: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 01:32:56.722447: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [272]:

class GRULM(tf.keras.Model):
    """
    A GRU-based language model that maps from a tensor of tokens to activations over a vocabulary.

    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        embedding_dim (int, optional): Depth of embedding. Defaults to 256.
        rnn_units (int, optional): Number of units in the GRU cell. Defaults to 128.

    Returns:
        tf.keras.Model: A GRULM language model.
    """
    def __init__(self, vocab_size=256, embedding_dim=256, rnn_units=128):
        super().__init__(self)

        ### START CODE HERE ###

        # Create an embedding layer to map token indices to embedding vectors
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Define a GRU (Gated Recurrent Unit) layer for sequence modeling
        self.gru = tf.keras.layers.GRU(units=rnn_units, return_sequences=True, return_state=True)
        # Apply a dense layer with log-softmax activation to predict next tokens
        self.dense = tf.keras.layers.Dense(units=vocab_size, activation=tf.nn.log_softmax)
        
        ### END CODE HERE ###
    
    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        # Map input tokens to embedding vectors
        x = self.embedding(x, training=training)
        if states is None:
            # Get initial state from the GRU layer
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        # Predict the next tokens and apply log-softmax activation
        x = self.dense(x, training=training)
        if return_state:
            return x, states
        else:
            return x


# Length of the vocabulary in StringLookup Layer
vocab_size = 56

# The embedding dimension
embedding_dim = 256

# RNN layers
rnn_units = 512

model = GRULM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units = rnn_units)


# testing your model

try:
    # Simulate inputs of length 100. This allows to compute the shape of all inputs and outputs of our network
    model.build(input_shape=(BATCH_SIZE, 100))
    model.call(Input(shape=(100)))
    model.summary() 
except:
    print("\033[91mError! \033[0mA problem occurred while building your model. This error can occur due to wrong initialization of the return_sequences parameter\n\n")
    traceback.print_exc()



2024-07-16 01:30:29.098165: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 01:30:29.099853: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 01:30:29.101117: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "grulm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 100, 256)          14336     
                                                                 
 gru_2 (GRU)                 [(None, 100, 512),        1182720   
                              (None, 512)]                       
                                                                 
 dense_10 (Dense)            (None, 100, 56)           28728     
                                                                 
Total params: 1,225,784
Trainable params: 1,225,784
Non-trainable params: 0
_________________________________________________________________


why
x, h_states, c_states = self.lstm(x, initial_state=states, training=training)
while
x, states = self.gru(x, initial_state=states, training=training)
?

when using the GRU (self.gru) in your GRULM model, you unpack the outputs into x and states because the GRU only returns the hidden state. On the other hand, with the LSTM (self.lstm) in your LSTMLM model, you unpack the outputs into x, h_states, and c_states to capture both the hidden state and the cell state.


### Prefdiction before training

In [243]:
for input_example_batch, target_example_batch in dataset.take(2):
    print("Input: ", input_example_batch[0].numpy()) # Lets use only the first sequence on the batch
    print("Target: ", target_example_batch[0].numpy())
    example_batch_predictions = model(tf.constant([input_example_batch[0].numpy()]))
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print("\n Predictions: " , tf.argmax(example_batch_predictions, axis=-1))
    
    print("\n" , example_batch_predictions[0][99].numpy())
    print(example_batch_predictions[0][99].numpy().shape)
    print("\n" , example_batch_predictions)
    
    print("\n\n\n")

2024-07-15 23:49:31.316950: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [5107435]
	 [[{{node Placeholder/_0}}]]
2024-07-15 23:49:31.317297: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [5107435]
	 [[{{node Placeholder/_0}}]]


Input:  [32 37 47 48 29 34 34 25  4 29 42 32  4 37  3 36 43 44 33  4 48 43  4 47
 33 33  4 29  4 36 43 49 47 33 51 37 34 33  4 48 29 39 33  4 48 36 33 33
  4 30 33 48 51 33 33 42  4 36 33 46  4 40 33 35 47  3 29 42 32  4 47 44
 37 42  4 37 48  4 43 34 34 13  3 47 37 46  4 29 42 32 46 33 51  2 34 29
 37 48 36 11]
Target:  [37 47 48 29 34 34 25  4 29 42 32  4 37  3 36 43 44 33  4 48 43  4 47 33
 33  4 29  4 36 43 49 47 33 51 37 34 33  4 48 29 39 33  4 48 36 33 33  4
 30 33 48 51 33 33 42  4 36 33 46  4 40 33 35 47  3 29 42 32  4 47 44 37
 42  4 37 48  4 43 34 34 13  3 47 37 46  4 29 42 32 46 33 51  2 34 29 37
 48 36 11  4]
(1, 100, 56) # (batch_size, sequence_length, vocab_size)

 Predictions:  tf.Tensor(
[[25 25 43 37 51 22 32 10 10 10 10 12 16 25 16 20 22 18 32 20 20 30 20 16
  32 32 52 53 16 20 18 32 32 32 24 52 32 32 52 20 53 51  1 45 51 20 32 32
  52 54 54 20 24 32 32 54 20 20 32 54 16 10 54 33 21 32 29 20 54 16 51 13
  43  6 10 16 37 37 37 22 32 24 32 32 43 55 39 51 54 54 54 54 54 

**Predictions is a tensor of log probabilities**              
The tf.nn.log_softmax function computes the logarithm of the softmax activation along the last dimension of the tensor. Softmax itself ensures that the outputs are probabilities (values between 0 and 1), but applying log_softmax afterwards converts these probabilities into log-probabilities, which can be negative.

Here's why log-probabilities (negative values) are commonly used:
Numerical Stability: When dealing with probabilities in machine learning models, it's often more numerically stable to work with log-probabilities, especially during computation of gradients and when dealing with small probabilities.
Loss Calculation: Many loss functions, like cross-entropy, expect log-probabilities as inputs because they penalize the distance between predicted and actual probabilities.

In [298]:
# Prediction for the last character in a sequence
last_character = tf.math.argmax(example_batch_predictions[0][99]).numpy()
print(last_character)
print(text_from_ids([last_character] , vocab).numpy())

36
b'h'


In [254]:
predictions = tf.argmax(example_batch_predictions, axis=-1)
print(predictions)
print(predictions.shape)

tf.Tensor(
[[36 16 16 54 11 54 22 49 43 22 20 10 10 54 16 16 36 16 10 30 24 43 45 20
  18 24  0 10 30 24 10 53 51 10 52 16 16 37 20 22 30 20 35 29 34  6 37 32
  32 16 51 51 50 35 18 13 32 32 45 20 20 18 29  1  1  8 20 30 24  0 10 30
  24 10 53 51 10 52 43 45 43 16 16 51 51 10 36 37 20 50 32 33 20 51 36 16
  10 30 24 23]], shape=(1, 100), dtype=int64)
(1, 100)


In [264]:
list(text_from_ids(predictions , vocab).numpy())

[b"h22z,z8uo86))z22h2)b:oq64:[UNK])b:)yw)x22i68b6gaf$idd2wwvg4.ddq664a'6b:[UNK])b:)yw)xoqo22ww)hi6vde6wh2)b:9"]

In [257]:
print(vocab)

['[UNK]', '', '\t', '\n', ' ', '!', '$', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']


<a name="1"></a>
## 5) Training

In [278]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00125)
model.compile(optimizer=optimizer , loss=loss)

In [279]:
EPOCHS = 10
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/10


2024-07-16 01:33:24.988258: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 01:33:24.989877: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 01:33:24.991627: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [284]:
model.summary()

Model: "lstmlm_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 100, 256)          14336     
                                                                 
 lstm_8 (LSTM)               [(None, 100, 512),        1574912   
                              (None, 512),                       
                              (None, 512)]                       
                                                                 
 dense_11 (Dense)            (None, 100, 56)           28728     
                                                                 
Total params: 1,617,976
Trainable params: 1,617,976
Non-trainable params: 0
_________________________________________________________________


In [290]:
#model.save('/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs/lstm_model_Shekespeare.h5')
#os.chdir('/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs/')
#model.save('lstm_model_Shekespeare', save_format='tf')


In [289]:
# Save model architecture to JSON
model_json = model.to_json()
with open("/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs/lstm_model_Shekespeare.json", "w") as json_file:
    json_file.write(model_json)

# Save weights to HDF5
model.save_weights("/mnt/market/anclab-rstudio-server/home/mpir0002/NLP_labs/lstm_model_Shekespeare_weights.h5")


### Prefdictions after training

In [294]:
for input_example_batch, target_example_batch in dataset.take(2):
    print("Input: ", input_example_batch[0].numpy()) # Lets use only the first sequence on the batch
    print("Target: ", target_example_batch[0].numpy())
    example_batch_predictions = model(tf.constant([input_example_batch[0].numpy()]))
    print("\n Predictions shape", example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print("Predictions: " , tf.argmax(example_batch_predictions, axis=-1))
    
    print("\n" , example_batch_predictions[0][99].numpy())
    print(example_batch_predictions[0][99].numpy().shape)
    print("\n" , example_batch_predictions)
    
    print("\n\n\n")

2024-07-16 05:53:29.512833: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [5107435]
	 [[{{node Placeholder/_0}}]]
2024-07-16 05:53:29.513275: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [5107435]
	 [[{{node Placeholder/_0}}]]


Input:  [44 44 33 29 46 47  4 29  4 34 29 31 33  3 48 36 29 48  4 43 50 33 46 12
 35 43 33 47  4 41 53  4 30 40 49 42 48  4 37 42 50 33 42 48 37 43 42  4
 45 49 37 48 33 11  3 32 49 40 40 37 42 35  4 41 53  4 40 37 42 33 47  4
 29 42 32  4 32 43 37 42 35  4 41 33  4 32 37 47 35 46 29 31 33 13  3 51
 33 46 33  4]
Target:  [44 33 29 46 47  4 29  4 34 29 31 33  3 48 36 29 48  4 43 50 33 46 12 35
 43 33 47  4 41 53  4 30 40 49 42 48  4 37 42 50 33 42 48 37 43 42  4 45
 49 37 48 33 11  3 32 49 40 40 37 42 35  4 41 53  4 40 37 42 33 47  4 29
 42 32  4 32 43 37 42 35  4 41 33  4 32 37 47 35 46 29 31 33 13  3 51 33
 46 33  4 37]

 Predictions shape (1, 100, 56) # (batch_size, sequence_length, vocab_size)
Predictions:  tf.Tensor(
[[46 33 29 46 33  4 48 42 47 29 37 33  4 43 36 29 48  4 51 42 33 46 48 46
  33 43 47  4 48 53  4 40 46 43 47 48  4 48 42  4 37 42 48 37 43 42  4 48
  49 37 48 33  4  3 29 33 46 40  4 42 35  4 48 33  4 34 43 34 33  4  4 29
  42 32  4 47 37 49 42 35  4 48 43 42 48 33 47 

<a name="1"></a>
## 6) Evaluating model using log perplexity

In the following, we will write a program takes in preds and target. preds is a tensor of log probabilities. You can use tf.one_hot to transform the target into the same dimension. You then multiply them and sum them.

In [299]:

def log_perplexity(preds, target):
    """
    Function to calculate the log perplexity of a model.

    Args:
        preds (tf.Tensor): Predictions of a list of batches of tensors corresponding to lines of text.
        target (tf.Tensor): Actual list of batches of tensors corresponding to lines of text.

    Returns:
        float: The log perplexity of the model.
    """
    
    PADDING_ID = 1
    
    target_one_hot = tf.one_hot(target, depth=preds.shape[-1])
    
    log_p = np.sum(target_one_hot * preds.numpy(), axis= -1)   # Identify non-padding elements in the target
    
    non_pad = 1.0 - np.equal(target, PADDING_ID)   # The line is used to create a mask that identifies non-padding elements in the target tensor.
    
    log_p = log_p * non_pad       # Apply non-padding mask to log probabilities to exclude padding
    
    log_ppx = np.sum(log_p, axis=-1) / np.sum(non_pad, axis=-1) # Calculate the log perplexity by taking the sum of log probabilities and dividing by the sum of non-padding elements
    
    log_ppx = np.mean(log_ppx) # Compute the mean of the previous expression
        
    return -log_ppx

In [300]:
len(eval_lines)

1000

In [303]:

eval_text = "\n".join(eval_lines)
eval_ids = line_to_tensor(eval_text, vocab)
input_ids, target_ids = split_input_target(eval_ids)

preds, status = model(tf.expand_dims(input_ids, 0), training=False, states=None, return_state=True)

#Get the log perplexity
log_ppx = log_perplexity(preds, tf.expand_dims(target_ids, 0))
print(f'The log perplexity and perplexity of your model are {log_ppx} and {np.exp(log_ppx)} respectively')

The log perplexity and perplexity of your model are 1.2003743380375573 and 3.321360001441171 respectively


In [306]:
tf.expand_dims(input_ids, 0)

<tf.Tensor: shape=(1, 41573), dtype=int64, numpy=array([[27, 33, 52, ..., 49, 42, 48]])>

In [309]:
print(status.shape) # Last hidden state dimension
print(preds.shape)

(1, 512)
(1, 41573, 56)


<a name="1"></a>
## 7) Generative language model