In [1]:
from keras.preprocessing import sequence
import keras 
import tensorflow as tf 
import os 
import numpy as np 

In [10]:
# Read the whole corpus as bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done;
# a bare open(...).read() leaves the handle open until garbage collection.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of shakespeare.txt before running.
with open('/Users/nuthankumar/Downloads/shakespeare.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(f'length of text : {len(text)} characters')

length of text : 1115394 characters


In [11]:
# Sanity check: show the first 250 characters of the loaded corpus.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



Encoding - map each unique character in the text to a distinct integer

In [12]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
# Forward lookup (character -> integer id); ids follow the sorted order.
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup (integer id -> character), kept as an array so that a whole
# array of ids can be decoded with a single indexing operation.
idx2char = np.array(vocab)

def text_to_int(text):
    """Encode a string as a NumPy array of integer character ids.

    Every character of ``text`` is looked up in the module-level ``char2idx``
    table; a character that is missing from the table raises ``KeyError``.
    """
    encoded = []
    for character in text:
        encoded.append(char2idx[character])
    return np.array(encoded)

# Encode the entire corpus once; the training dataset is built from this array.
text_as_int = text_to_int(text)

In [13]:
# Show a short snippet next to its integer encoding.
sample = text[:13]
print("Text:", sample)
print("Encoded:", text_to_int(sample))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [21]:
def int_to_text(ints):
    """Decode a sequence of integer character ids back into a string.

    Args:
        ints: Integer ids used to index the module-level ``idx2char`` array.
            If the object exposes a ``.numpy()`` method (the notebook passes
            TensorFlow tensors), it is converted with that method first;
            otherwise it is used for indexing as-is.

    Returns:
        The characters selected by ``ints`` joined into a single ``str``.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # No usable .numpy() on this object: index with it directly.
        # Only this case is ignored — a bare ``except`` would also hide real
        # failures (and KeyboardInterrupt) raised during the conversion.
        pass
    return ''.join(idx2char[ints])

# Round-trip check: decoding the first 13 ids should reproduce the start of the text.
print(int_to_text(text_as_int[:13]))

First Citizen


In [22]:
# Number of input characters in a single training example.
seq_length = 100
# Each example consumes seq_length + 1 characters (the input plus one extra
# character so the target can be the input shifted by one position).
chars_per_example = seq_length + 1
examples_per_epoch = len(text) // chars_per_example

# Create training examples / targets: start from a stream of single character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

2022-04-17 18:05:42.107443: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
# Group the character stream into chunks of seq_length + 1 ids; the trailing
# partial chunk is dropped so every chunk has the same length.
# NOTE(review): this rebinds the name `sequence`, shadowing the `sequence`
# module imported from keras.preprocessing in the first cell. The import is
# not used elsewhere in the visible notebook, but a distinct name (e.g.
# `sequences`) would be clearer if the later cells are updated together.
sequence = char_dataset.batch(seq_length+1, drop_remainder=True)

In [26]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "hello" -> ("hell", "ello").
    """
    return chunk[:-1], chunk[1:]

# Apply split_input_target to every chunk so each dataset element becomes an
# (input, target) pair.
dataset = sequence.map(split_input_target)

In [29]:
# Decode and print the first two (input, target) pairs to verify the one-step shift.
for input_ids, target_ids in dataset.take(2):
    decoded_input = int_to_text(input_ids)
    decoded_target = int_to_text(target_ids)
    print("\n\nEXAMPLE\n")
    print("INPUT")
    print(decoded_input)
    print("\nOUTPUT")
    print(decoded_target)



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [33]:
# Training hyperparameters.
BATCH_SIZE = 64
EMBEDDING_DIM = 256
RNN_UNITS = 1024
# One class per distinct character in the corpus.
VOCAB_SIZE = len(vocab)

# Size of the shuffle buffer. tf.data is designed to handle possibly infinite
# streams, so it does not shuffle the whole dataset in memory; it keeps a
# buffer of this many elements and shuffles within it.
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into fixed-size batches; the final
# partial batch is dropped so every batch has exactly BATCH_SIZE examples.
shuffled_examples = dataset.shuffle(BUFFER_SIZE)
data = shuffled_examples.batch(BATCH_SIZE, drop_remainder=True)

In [34]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense character model.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding
            input size and as the number of units of the final Dense layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension of the input; the sequence
            dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True yields an output for every timestep; stateful=True
    # is why the batch size has to be fixed when the model is built.
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation is passed, so the layer emits one raw score per character id.
    scores = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, scores])

# Build the training model with the configured sizes and show its layer summary.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Run the (still untrained) model on the first training batch to check the
# output shape. The loop variables stay defined after the loop and are
# inspected by the following cells.
for input_example_batch, target_example_batch in data.take(1):
    # Ask the model for a prediction on the first batch of training data.
    example_batch_predictions = model(input_example_batch)
    # Print the output shape next to a label naming each dimension.
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [40]:
# we can see that the predicition is an array of 64 arrays, one for each entry in the batch
# The prediction holds one entry per example in the batch.
# NOTE(review): printing the full tensor produces a very long output; printing
# only its shape would keep the notebook easier to read.
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[-0.00397122 -0.00216983 -0.00493209 ...  0.00280518 -0.00367669
    0.000792  ]
  [-0.00989217  0.00246343 -0.01006163 ... -0.00427083 -0.00791966
   -0.0040151 ]
  [-0.00552519  0.0015785  -0.00503073 ... -0.00324098 -0.00217815
   -0.00085359]
  ...
  [-0.01104917  0.00126988 -0.0010531  ...  0.0030817  -0.00741801
   -0.00073603]
  [-0.01010646  0.00177636 -0.00120853 ... -0.00050446 -0.00389778
   -0.00403074]
  [-0.01227846 -0.00303128 -0.00068513 ... -0.00051176  0.00140545
   -0.0011688 ]]

 [[ 0.0068974  -0.00958298  0.0019778  ...  0.0019543  -0.00201078
   -0.00101586]
  [ 0.00283226 -0.01336535  0.00306271 ... -0.00222185  0.00580967
   -0.00173371]
  [ 0.00198469 -0.01240167  0.00709158 ... -0.01033729  0.00670723
   -0.00393187]
  ...
  [ 0.00816919 -0.00983437  0.00949153 ... -0.00544435  0.003636
   -0.00026337]
  [ 0.00998421 -0.01071401  0.0132242  ... -0.00130441  0.00121285
    0.00575912]
  [ 0.00212639 -0.00972493  0.00629773 ...  0.00196777 -0.001

In [43]:
# Look at the prediction for the first example of the batch: one row of
# scores per timestep of the input sequence.
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[-0.00397122 -0.00216983 -0.00493209 ...  0.00280518 -0.00367669
   0.000792  ]
 [-0.00989217  0.00246343 -0.01006163 ... -0.00427083 -0.00791966
  -0.0040151 ]
 [-0.00552519  0.0015785  -0.00503073 ... -0.00324098 -0.00217815
  -0.00085359]
 ...
 [-0.01104917  0.00126988 -0.0010531  ...  0.0030817  -0.00741801
  -0.00073603]
 [-0.01010646  0.00177636 -0.00120853 ... -0.00050446 -0.00389778
  -0.00403074]
 [-0.01227846 -0.00303128 -0.00068513 ... -0.00051176  0.00140545
  -0.0011688 ]], shape=(100, 65), dtype=float32)


In [44]:
# Look at the prediction for the first timestep only: one score per
# character in the vocabulary.
time_pred = pred[0]
print(len(time_pred))
print(time_pred)

65
tf.Tensor(
[-3.9712153e-03 -2.1698270e-03 -4.9320916e-03 -1.9819795e-03
  2.2533813e-03 -3.3941749e-04 -4.5591267e-04 -3.2212120e-05
  7.7486812e-04 -4.2894552e-03 -2.0506112e-03  1.5532154e-03
  4.1250940e-03 -1.6846259e-03  3.0075274e-03  5.5464031e-03
  1.4910118e-03 -5.9946077e-03  4.5219613e-03  2.5072307e-03
  2.0315126e-03  7.4398848e-03  2.5311545e-03  3.5749669e-03
 -9.5072773e-04  9.3878561e-04 -2.1321627e-03  3.4006652e-03
  1.2647902e-03  3.8029568e-04 -5.1423330e-03  1.2024879e-03
 -4.9098767e-03  7.4330843e-03 -6.1387010e-03  3.1025421e-03
  7.6438943e-03 -4.1202735e-03 -6.3133305e-03  3.4522247e-03
 -6.9967128e-04 -3.9437497e-03  2.3948601e-03 -3.1940367e-03
 -2.5195237e-03  5.2880519e-04 -2.7365698e-03 -4.2357300e-03
  3.0184372e-03 -1.1633228e-03 -4.8523988e-03  3.3918421e-03
 -2.1313690e-04  2.5480660e-03  2.1730457e-03 -2.3472332e-03
 -2.3203266e-03 -2.4848862e-04  3.7957991e-03 -4.9253274e-03
 -1.5062341e-03 -5.8049466e-03  2.8051815e-03 -3.6766857e-03
  7.920002

In [45]:
# If we want to determine the predicted character we need to sample the output distribution (pick a value based on probabillity)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now we can reshape that array and convert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars  # and this is what the model predicted for training sequence 1

"-MZFiPExVfoTawZIp'kh!blYA'AhJ\nZssUqbDgH3PFuBA-:zU?yl\nXnZE3IGXE?:RkyEv.-&:h3!GXnzzKB-K3KeIMaYWRYQCPoR"

In [51]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalized scores
    rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [53]:
# Configure training: Adam optimizer with the custom `loss` function defined above.
model.compile(optimizer='adam', loss=loss)

In [54]:
# Directory where the checkpoints will be saved
# Directory where the checkpoints will be saved (relative to the working directory).
checkpoint_dir = './training_checkpoints'
# Checkpoint file name pattern; "{epoch}" is a placeholder left in the string
# for the checkpoint callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores the weights
# only, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Train for 50 epochs on the shuffled, batched dataset, with the checkpoint
# callback attached. NOTE(review): this is the expensive cell of the notebook.
history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

In [None]:
# Rebuild the same architecture with a batch size of 1 for text generation.
# NOTE: this rebinds `model`, so the training model (batch size BATCH_SIZE)
# is no longer reachable under that name after this cell runs.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [None]:
# Restore the most recent training checkpoint into the batch-size-1 model.
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of passing None on to
# load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; run the training cell first."
    )
model.load_weights(latest_checkpoint)
# Fix the input shape to (batch=1, any sequence length).
model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, start_string, num_generate=800, temperature=1.0):
    """Generate text one character at a time from a trained model.

    Args:
        model: Character model called with a batch of size 1; its states are
            reset before generation starts.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` table.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the model output before
            sampling. Low temperatures give more predictable text, higher
            temperatures more surprising text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or contains a character
            outside the vocabulary, or if ``temperature`` is not positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    unknown_chars = sorted(set(start_string) - set(char2idx))
    if unknown_chars:
        raise ValueError(
            f"start_string contains characters outside the vocabulary: {unknown_chars!r}"
        )

    # Vectorize the seed and add the batch dimension (batch size == 1).
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters are collected here and joined once at the end.
    text_generated = []

    # Start from a clean recurrent state.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Scale by the temperature, then sample from the categorical
        # distribution; only the sample for the last timestep is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the model is stateful,
        # so it carries its hidden state over between calls.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Ask for a seed string and print the text generated from it.
# NOTE(review): input() blocks waiting for the user, so this cell cannot run
# unattended under "Restart & Run All".
inp = input("Type a starting string: ")
print(generate_text(model, inp))