# Natural Language Processing with RNN
By: Matthew Fernandez
<br>01/12/2022

In [1]:
%tensorflow_version 2.x
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


### Dataset
We will look at the Shakespeare text dataset, which we download with the `get_file` helper from the `tf.keras.utils` package.

In [2]:
# Download the Shakespeare corpus and keep the local file path.
SHAKESPEARE_URL = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
path_to_file = tf.keras.utils.get_file(fname="shakespeare.txt", origin=SHAKESPEARE_URL)

Loading your own data

In [3]:
# from google.colab import files
# path_to_file = list(files.upload().keys())[0]

In [4]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done
# (the previous bare open(...).read() never closed it).
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [5]:
# Show the first 250 characters of the corpus.
preview = text[:250]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



### Encoding

In [6]:
# Every distinct character in the corpus, in sorted order.
vocab = sorted(set(text))
# Lookup tables: character -> index, and index -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode a string as a NumPy array holding the `char2idx` index of each character."""
  indices = [char2idx[character] for character in text]
  return np.array(indices)

text_as_int = text_to_int(text)

In [7]:
# Encode a short sample to check the character -> index mapping.
sample = text[:13]
print("Text: ", sample)
print("Encoded: ", text_to_int(sample))

Text:  First Citizen
Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52]


Function to convert our numeric values to text.

In [8]:
def int_to_text(ints):
  """Decode character indices back into a string.

  Args:
    ints: Indices into `idx2char`. May be an object exposing `.numpy()`
      (it is converted first) or anything NumPy can index with directly.

  Returns:
    The characters looked up in `idx2char`, joined into one string.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No `.numpy()` method: use the value as-is. Only this case is ignored,
    # so unrelated errors are no longer silently swallowed.
    pass
  return "".join(idx2char[ints])

### Training Examples
The training examples we prepare use a sequence of *sequence_length* characters as the input and a sequence of the same length as the output, where the output is the input shifted one character to the right.<br>Example:
- Input: Hell | output: ello

In [9]:
sequence_length = 100
# Each training chunk holds one extra character so it can be split into
# an input and a shifted target of `sequence_length` characters each.
chunk_length = sequence_length + 1
examples_per_epoch = len(text) // chunk_length

# create training examples from the dataset: a stream of single character indices
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Next let us use the batch method to turn this stream of characters into batches of desired length.

In [10]:
# Group the character stream into chunks of sequence_length + 1, dropping the short final chunk.
sequences = char_dataset.batch(batch_size=sequence_length + 1, drop_remainder=True)

Now we need to take these sequences of length 101 and split each one into an input and an output.

In [11]:
def split_input_target(chunk):
  """Split a chunk into (input, target), where target is the input shifted by one.

  The input is every element except the last; the target is every element
  except the first. Example: "Hello" -> ("Hell", "ello").
  """
  return chunk[:-1], chunk[1:]
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [12]:
# Print the first two (input, target) pairs as readable text.
for x, y in dataset.take(2):
  print('\n\nExample\nInput', int_to_text(x), sep='\n')
  print('\nOutput', int_to_text(y), sep='\n')



Example
Input
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

Output
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


Example
Input
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

Output
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Finally, create the training batches.

In [13]:
# Model / training hyperparameters
BATCH_SIZE = 128
VOCAB_SIZE = len(vocab)  # number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Size of the buffer the dataset shuffles within
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into full batches only.
shuffled_dataset = dataset.shuffle(BUFFER_SIZE)
data = shuffled_dataset.batch(BATCH_SIZE, drop_remainder=True)

### Building the Model
Now we build the model. Let us use an embedding layer, an LSTM layer, and one dense layer that contains a node for each unique character in our training data. The dense layer will give us a score (logit) for every node, which defines a probability distribution over all characters.

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense character model.

    Args:
        vocab_size: Size of the embedding vocabulary and of the Dense output.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size of the (stateful) model's input.

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # return_sequences=True yields an output for every timestep;
    # stateful=True keeps the recurrent state between calls.
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    # One output per vocabulary entry.
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (128, None, 256)          16640     
                                                                 
 lstm (LSTM)                 (128, None, 1024)         5246976   
                                                                 
 dense (Dense)               (128, None, 65)           66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


### Create Loss Function
We need a loss function, so we'll build our own. This is because our model outputs a (128, sequence_length, 65) shaped tensor, i.e. (batch_size, sequence_length, vocab_size), that represents the probability distribution (as logits) of each character at each timestep for every sequence in the batch.

In [15]:
# Run the untrained model on a single batch to check the output shape.
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch)
    output_shape = example_batch_predictions.shape
    print(output_shape, "  (batch_size, sequence_length, vocab_size)")

(128, 100, 65)   (batch_size, sequence_length, vocab_size)


In [16]:
# Number of sequences in the batch, followed by the raw predictions.
print(len(example_batch_predictions), example_batch_predictions, sep='\n')

128
tf.Tensor(
[[[-1.0092584e-03 -3.7219739e-03  3.6222248e-03 ...  3.1431572e-04
    9.9778106e-04 -5.7588527e-03]
  [-7.0723980e-03 -7.9825129e-03 -8.8412990e-04 ...  2.0120195e-03
    3.1886706e-05 -5.4056882e-03]
  [-7.5174035e-03 -4.9794763e-03 -8.7329478e-04 ... -1.5632802e-03
    1.6230833e-03 -7.3378365e-03]
  ...
  [ 5.3603156e-03  2.5960547e-03  9.9959010e-03 ... -5.9562614e-03
    2.6964322e-03  1.8822802e-02]
  [ 4.8382534e-03  1.2634785e-03  1.1980202e-02 ... -4.9365051e-03
    4.5961039e-03  1.6664194e-02]
  [ 2.8009624e-03 -3.3238721e-03  7.5042182e-05 ... -8.6177196e-03
    7.4084140e-03  1.5664224e-02]]

 [[-1.1890118e-03 -2.4077769e-04  4.4395742e-03 ... -3.9260904e-03
   -5.2606664e-03 -5.0649201e-03]
  [-3.7055933e-03 -6.4772303e-04  7.0612034e-04 ... -2.7532554e-03
   -1.4399553e-03 -9.5414501e-03]
  [-4.6190517e-03 -3.6590232e-03  4.4367774e-03 ... -9.8301144e-04
    3.1589647e-05 -1.3350913e-02]
  ...
  [-7.0186877e-03 -4.5324662e-03 -8.7617716e-04 ...  2.5659630

Let us examine one prediction

In [17]:
# Take the predictions for the first sequence of the batch
pred = example_batch_predictions[0]
# Number of timesteps, followed by the predictions for each of them.
print(len(pred), pred, sep='\n')

100
tf.Tensor(
[[-1.0092584e-03 -3.7219739e-03  3.6222248e-03 ...  3.1431572e-04
   9.9778106e-04 -5.7588527e-03]
 [-7.0723980e-03 -7.9825129e-03 -8.8412990e-04 ...  2.0120195e-03
   3.1886706e-05 -5.4056882e-03]
 [-7.5174035e-03 -4.9794763e-03 -8.7329478e-04 ... -1.5632802e-03
   1.6230833e-03 -7.3378365e-03]
 ...
 [ 5.3603156e-03  2.5960547e-03  9.9959010e-03 ... -5.9562614e-03
   2.6964322e-03  1.8822802e-02]
 [ 4.8382534e-03  1.2634785e-03  1.1980202e-02 ... -4.9365051e-03
   4.5961039e-03  1.6664194e-02]
 [ 2.8009624e-03 -3.3238721e-03  7.5042182e-05 ... -8.6177196e-03
   7.4084140e-03  1.5664224e-02]], shape=(100, 65), dtype=float32)


Let us look at the prediction at the first timestep

In [18]:
# Predictions for the first timestep: one value per vocabulary entry.
time_pred = pred[0]
print(len(time_pred), time_pred, sep='\n')

65
tf.Tensor(
[-1.0092584e-03 -3.7219739e-03  3.6222248e-03 -1.6570165e-03
 -2.4570862e-03  4.7434564e-03  1.0907461e-04  1.6431680e-03
 -3.5936923e-03 -1.7743605e-03 -5.4320302e-03 -3.7677060e-03
  6.2220235e-04 -9.3049987e-04 -8.8832271e-04  6.1103408e-03
  3.1768254e-04 -1.6376799e-03 -1.5795175e-03 -4.7996547e-03
  1.6170249e-03  1.2038435e-03  2.9213848e-03  1.7214839e-03
 -2.6685232e-03 -2.1070004e-03  1.3498655e-03  2.4168047e-03
  4.1967644e-03  4.9583609e-03  7.6959790e-03 -2.6878684e-03
  3.4799748e-03  3.5690356e-04 -1.7997827e-03  2.3036327e-03
  5.7875604e-04  1.9788041e-03 -1.6396760e-03 -6.4158160e-04
 -5.6815532e-04 -4.7555743e-03 -2.1985534e-03  1.5205023e-03
 -3.1362816e-03 -6.1548073e-03  4.0063350e-03  6.5273414e-03
 -3.5824648e-03 -1.6087187e-03 -1.2576975e-03  2.6948140e-03
  5.8152410e-04  6.4627267e-05 -1.1353671e-03  3.5656723e-03
 -1.8107500e-03  2.6086663e-04  1.4049106e-03  3.8930003e-03
  4.8779380e-03  3.8774363e-03  3.1431572e-04  9.9778106e-04
 -5.758852

In [19]:
# To determine the predicted characters we sample from the output distribution,
# drawing one index per row of `pred`.
sampled_indices = tf.random.categorical(pred, num_samples=1)

# Flatten the samples to a 1-D array and convert the ints to the actual characters.
sampled_indices = np.reshape(sampled_indices, -1)
predicted_chars = int_to_text(sampled_indices)

# this is what the model predicted for training sequence 1
predicted_chars

"qJtb.C\n:JHRhv$bnfILnBO,ZoL-xqz!tBZcWezdVh?e\n&m-yVeFuB:xK.TXRYWAMJ,qvzggra.YglrMkH,-q'XY$joa$MJaOUEb'"

So now we need to create a loss function that can compare that output to the expected output and give us some numeric value representing how close the two were.

In [20]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer `labels` and raw `logits`.

    `from_logits=True` tells Keras that `logits` are unnormalized scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return crossentropy

### Compiling the Model

In [21]:
# Train with the Adam optimizer and the custom loss defined in this notebook.
model.compile(loss=loss, optimizer='adam')

Now we'll set up checkpoints that are saved as the model trains, which will allow us to reload the model later and continue training if need be.

In [22]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files. `{epoch}` (curly braces) is the placeholder
# that ModelCheckpoint fills in with the epoch number, so each epoch gets its
# own file. With parentheses the name was a constant and every epoch
# overwrote the same checkpoint.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Save only the weights (not the full model) each time the callback fires.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

### Train our Model

In [23]:
# Train for 40 epochs, writing a checkpoint through the callback.
history = model.fit(
    x=data,
    epochs=40,
    callbacks=[checkpoint_callback],
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


### Loading the Model
We'll rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one piece of text to the model and have it make a prediction.

In [24]:
# Same architecture as the training model, but with a batch size of 1.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=1,
)

In [25]:
# find the latest saved checkpoint
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model first.".format(checkpoint_dir))
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1,None]))

In [26]:
# Alternative: load the checkpoint of one specific epoch instead of the latest.
# `load_weights` expects the checkpoint path itself; `tf.train.load_checkpoint`
# returns a reader object rather than a path, so it is not used here.
# checkpoint_num = 10
# model.load_weights("./training_checkpoints/ckpt_" + str(checkpoint_num))
# model.build(tf.TensorShape([1,None]))

### Generating the Data

In [27]:
def generate_text(model, start_string, num_generate=800, temperature=1.0):
    """Generate text with the model, starting from any starting string.

    Args:
        model: Model called on a (1, length) batch of character indices.
            Its recurrent state is reset before generation starts.
        start_string: Non-empty seed text. Every character must be a key
            of `char2idx`.
        num_generate: Number of characters to generate (default 800).
        temperature: Positive value the predictions are divided by before
            sampling (default 1.0). Lower values make the sampling more
            predictable, higher values more random.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
        KeyError: If `start_string` contains a character missing from `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed and add a batch dimension: shape (1, len(start_string)).
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        pred = model(input_eval)
        # Drop the batch dimension.
        pred = tf.squeeze(pred, 0)

        pred = pred / temperature
        # Sample one index per timestep and keep the one for the last timestep.
        pred_id = tf.random.categorical(pred, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in as the next input.
        input_eval = tf.expand_dims([pred_id], 0)

        text_generated.append(idx2char[pred_id])

    return start_string + "".join(text_generated)


In [28]:
# Ask for a seed string and print the text generated from it.
inp = input("Type a starting string: ")
generated_text = generate_text(model, inp)
print(generated_text)

Type a starting string: His arms were as cold as 
His arms were as cold as chequest;
And thou, and death will have you quake,
Which she hath praised and fighters from the king,
Who hath cambifian'd with a spirit to die.
Where is your crat-moner? when did I beg them not
In the hile own window, like power incourse.'
Come, Warwick, through the country of your eye,
But he is own repair, and dread none othor
To teach her kinsman and well committed to them, if he wear fellows from right,
But, as I can, it do change, sir, and the little tricks of
conscience says we stand and need to Romeo's hand dustices, that they are sharl
Can when thou art d, well known to the pun my heart. I beg the world s after to.
Hark: the sleep deeds did return to do
And mine own lineal with honest peace
And not on his sword, Lord Anbelo, to you:
And were the reis of them and clear love?

JOHN 
