<a href="https://colab.research.google.com/github/phanigundubogula/dl_projects/blob/master/pg_dinosaur_name_generation_with_RNN_01_24_2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import io
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
print(tf.__version__)

2.4.0


In [2]:
import pathlib
# Directory holding the text corpora (presumably a mounted Google Drive, given
# the 'drive/MyDrive' prefix -- the mount step is not part of this notebook).
text_dir = pathlib.Path('drive/MyDrive/Text')
# The loop variable is deliberately not called `dir`: a top-level `dir` would
# shadow the built-in dir() for every later cell in the kernel session.
for entry in text_dir.iterdir():
  print(str(entry))

drive/MyDrive/Text/dinos.txt
drive/MyDrive/Text/text_generation_gutenberg.txt


In [3]:
text_path = os.path.join(text_dir, "dinos.txt")
print(text_path)

drive/MyDrive/Text/dinos.txt


In [4]:
# Read the whole corpus. The context manager guarantees the file handle is
# closed; a bare open(...).read() leaves it open until garbage collection.
with open(text_path, encoding='UTF-8') as corpus_file:
  text = corpus_file.read().strip()
print(text[:20])
# One list entry per line of the file.
text_list = text.split("\n")

Aachenosaurus
Aardon


In [5]:
num_examples = len(text_list)
print("Number of examples : %d"%num_examples)

Number of examples : 1536


In [6]:
# Sorted list of the distinct characters in the raw corpus string (this
# includes '\n' and both letter cases, as the printed output shows).
# NOTE(review): the model built further down sizes its layers from the Keras
# tokenizer's vocabulary (inp_lang.word_index), not from this list -- confirm
# whether `vocab` and the lookups derived from it are still needed.
vocab = sorted(set(text))
print(vocab)

['\n', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
print("{} characters in vocabulary".format(len(vocab)))

53 characters in vocabulary


In [8]:
# Map every vocabulary character to its position in the sorted `vocab` list.
char2idx = {ch: idx for idx, ch in enumerate(vocab)}

# Preview the first 20 entries. Slicing the key list (instead of zipping with a
# throw-away `_`) avoids binding `_` at notebook top level, which would stop
# IPython from updating its automatic "last output" variable.
for char in list(char2idx)[:20]:
  print("{:4s} : {:3d}".format(repr(char), char2idx[char]))

'\n' :   0
'A'  :   1
'B'  :   2
'C'  :   3
'D'  :   4
'E'  :   5
'F'  :   6
'G'  :   7
'H'  :   8
'I'  :   9
'J'  :  10
'K'  :  11
'L'  :  12
'M'  :  13
'N'  :  14
'O'  :  15
'P'  :  16
'Q'  :  17
'R'  :  18
'S'  :  19


In [9]:
print(char2idx)

{'\n': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26, 'a': 27, 'b': 28, 'c': 29, 'd': 30, 'e': 31, 'f': 32, 'g': 33, 'h': 34, 'i': 35, 'j': 36, 'k': 37, 'l': 38, 'm': 39, 'n': 40, 'o': 41, 'p': 42, 'q': 43, 'r': 44, 's': 45, 't': 46, 'u': 47, 'v': 48, 'w': 49, 'x': 50, 'y': 51, 'z': 52}


In [10]:
idx2char = np.array(vocab)
print("char at 10th index : %s"%idx2char[10])

char at 10th index : J


In [11]:
print(idx2char)

['\n' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q'
 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i'
 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [12]:
def create_dataset(text_list):
  """Split every entry of `text_list` into a list of its individual characters.

  Returns a new list with one inner list per entry, in the original order.
  """
  return [list(name) for name in text_list]


In [14]:
def tokenize(lang):
  """Fit a character-level Keras Tokenizer on `lang` and encode it.

  Returns:
    A tuple (tensor, tokenizer): the integer-encoded sequences padded at the
    end (padding="post") to a common length, and the fitted tokenizer.
  """
  tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, filters='')
  tokenizer.fit_on_texts(lang)
  sequences = tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")
  return padded, tokenizer

In [15]:
def load_dataset(text_list):
  """Turn raw names into a padded integer tensor plus the fitted tokenizer.

  Chains create_dataset (split into characters) and tokenize (encode + pad)
  and returns tokenize's (tensor, tokenizer) pair.
  """
  char_lists = create_dataset(text_list)
  return tokenize(char_lists)

In [16]:
input_tensor, inp_lang = load_dataset(text_list)
max_length_input = input_tensor.shape[1]

In [17]:
print("Max input sequence length : {}".format(max_length_input))
print("Total number of examples : %d"%input_tensor.shape[0])

Max input sequence length : 26
Total number of examples : 1536


In [19]:
def convert(lang, tensor):
  """Print "<index> -- > <token>" for every non-zero index in `tensor`.

  `lang` must expose an `index_word` mapping from index to token; zeros in
  `tensor` are skipped.
  """
  for idx in tensor:
    if idx == 0:
      continue
    print("{} -- > {}".format(idx, lang.index_word[idx]))

In [20]:
print("Input language; index to char mapping")
convert(inp_lang, input_tensor[0])

Input language; index to char mapping
1 -- > a
1 -- > a
13 -- > c
12 -- > h
8 -- > e
6 -- > n
4 -- > o
2 -- > s
1 -- > a
3 -- > u
5 -- > r
3 -- > u
2 -- > s


In [21]:
def split_input_target(seq):
  """Return (seq without its last element, seq without its first element).

  The second item is the first shifted one step ahead, so each input position
  is paired with the element that follows it.
  """
  return seq[:-1], seq[1:]

In [24]:
dataset = tf.data.Dataset.from_tensor_slices(input_tensor)

# Sanity check: decode the first five encoded names back to text, skipping the
# zeros added by padding. The loop variable is not named `input`, so the
# built-in input() is not shadowed for later cells.
for encoded_name in dataset.take(5):
  name = [inp_lang.index_word[i] for i in encoded_name.numpy() if i != 0]
  print(''.join(name))


aachenosaurus
aardonyx
abdallahsaurus
abelisaurus
abrictosaurus


In [25]:
dataset = dataset.map(split_input_target)

In [26]:
def display(input_batch):
   """Decode a tensor of token ids into a string, skipping zeros.

   Ids are looked up in the notebook-level tokenizer `inp_lang`.
   NOTE(review): inside a notebook this name shadows IPython's own display()
   helper -- confirm that is acceptable.
   """
   return ''.join(inp_lang.index_word[i] for i in input_batch.numpy() if i != 0)

In [27]:
# Training hyper-parameters.
BATCH_SIZE=64
# Shuffle buffer covers the whole dataset (one slot per example).
BUFFER_SIZE = input_tensor.shape[0]
# Width of the GRU layer and of the embedding vectors passed to build_model.
units = 1024
embedding_dim=256
# +1 leaves room for index 0, which the decoded samples above treat as padding
# (tokenizer indices in the printed mapping start at 1).
vocab_size = len(inp_lang.word_index) + 1

# NOTE: `dataset` is rebound in place, so re-running this cell on its own
# would batch an already batched dataset; re-run from the cell that creates
# `dataset` instead.
dataset = dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 25), (64, 25)), types: (tf.int32, tf.int32)>

In [28]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 25]), TensorShape([64, 25]))

In [29]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> stateful GRU -> Dense character model.

  Args:
    vocab_size: number of distinct token ids; used for both the embedding
      table and the width of the final Dense layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of GRU units.
    batch_size: fixed batch dimension baked into the input shape (the GRU is
      stateful, so the batch size cannot vary between calls).

  Returns:
    An uncompiled Sequential model whose Dense head has no activation.
  """
  embedding = layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  head = layers.Dense(vocab_size)
  return Sequential([embedding, recurrent, head])

In [30]:
model = build_model(vocab_size,
                    embedding_dim,
                    units,
                    BATCH_SIZE)

In [31]:
example_batch_predictions = model(example_input_batch)
print(example_batch_predictions.shape)

(64, 25, 27)


In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           6912      
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 27)            27675     
Total params: 3,972,891
Trainable params: 3,972,891
Non-trainable params: 0
_________________________________________________________________


In [66]:
sample_indices=tf.random.categorical(example_batch_predictions[0], num_samples=1)
print(sample_indices.shape)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices.shape)

(25, 1)
(25,)


In [34]:
print("Sample Input : ",display(example_input_batch[0]))
print("Predictions : ",display(sample_indices))

Sample Input :  kuszholia
Predictions :  ghrfhfujnxsebznubaefdnfk


In [35]:
example_input_batch[0].shape, sample_indices.shape

(TensorShape([25]), TensorShape([25]))

In [36]:
def loss(label, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` tells Keras that `logits` have not been passed through a
  softmax.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=label,
      y_pred=logits,
      from_logits=True)

example_batch_loss = loss(example_target_batch, example_batch_predictions)
print("Predictions Shape : ", example_batch_predictions.shape)
print("Scalar losss : ", example_batch_loss.numpy().mean())

Predictions Shape :  (64, 25, 27)
Scalar losss :  3.302452


In [37]:
# Checkpoints go under ./checkpoints; the "{epoch}" placeholder in the prefix
# gives every epoch its own checkpoint name (e.g. ckpt_10 after epoch 10).
checkpoint_dir = "./checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Callback handed to model.fit; stores only the weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

In [38]:
model.compile(optimizer='adam', loss=loss)

In [39]:
EPOCHS=10
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
tf.train.latest_checkpoint(checkpoint_dir)

'./checkpoints/ckpt_10'

In [42]:
# Rebuild the same architecture with batch_size=1 so that a single sequence
# can be fed at generation time, then restore the weights from the most recent
# checkpoint in checkpoint_dir.
# NOTE: this rebinds `model`; the model trained with BATCH_SIZE is no longer
# reachable under that name after this cell runs.
model = build_model(vocab_size, embedding_dim, units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None]))



In [43]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            6912      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_2 (Dense)              (1, None, 27)             27675     
Total params: 3,972,891
Trainable params: 3,972,891
Non-trainable params: 0
_________________________________________________________________


In [44]:
def generate(model, start_char, temperature=1.0):
  """Sample one name from the trained model, beginning with `start_char`.

  Args:
    model: the stateful, batch-size-1 model; its recurrent state is reset
      before sampling and then carried from one character to the next.
    start_char: first character of the name. It is looked up in
      `inp_lang.word_index`; because the Keras tokenizer lower-cases text by
      default, an upper-case letter falls back to its lower-case entry.
    temperature: positive number the logits are divided by before sampling.
      Values below 1 make the output more conservative, values above 1 more
      random. Defaults to 1.0 (logits left unchanged).

  Returns:
    `start_char` followed by the sampled characters. At most
    `max_length_input - 1` characters are sampled.

  Raises:
    KeyError: if `start_char` is not in the tokenizer vocabulary.
    ValueError: if `temperature` is not positive.
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  num_generate = max_length_input - 1

  word_index = inp_lang.word_index
  start_key = start_char if start_char in word_index else start_char.lower()
  input_eval = tf.expand_dims([word_index[start_key]], 0)

  text_generated = []

  # Start every name from a clean recurrent state.
  model.reset_states()

  for _ in range(num_generate):
    predictions = model(input_eval)
    # Drop the batch dimension so tf.random.categorical gets a 2-D tensor.
    predictions = tf.squeeze(predictions, 0)
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Id 0 is the padding that follows every training name, i.e. the model's
    # end-of-name signal. Stop here: sampling past it (as the previous
    # version did) can append characters drawn after the name has ended.
    if predicted_id == 0:
      break

    text_generated.append(inp_lang.index_word[predicted_id])
    # Feed the sampled character back in as the next input.
    input_eval = tf.expand_dims([predicted_id], 0)

  return start_char + ''.join(text_generated)


In [62]:
print(generate(model, start_char=u'd'))

dadfysaurus
