<a href="https://colab.research.google.com/github/musleho/NLPwithRNN/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
tf.__version__

In [None]:
path_to_file = 'shakespeare.txt'

In [None]:
text = open(path_to_file, 'r').read()

In [None]:
print(text[:500])

In [None]:
# The unique characters in the file
vocab = sorted(set(text))
print(vocab)
len(vocab)

In [None]:
char_to_ind = {u:i for i, u in enumerate(vocab)}

In [None]:
char_to_ind

In [None]:
ind_to_char = np.array(vocab)

In [None]:
ind_to_char

In [None]:
encoded_text = np.array([char_to_ind[c] for c in text])

In [None]:
encoded_text

In [None]:
# First 20 characters of the corpus, to compare against their encoding below
sample = text[:20]
sample

In [None]:
# The same 20 characters as integer ids
encoded_text[:20]

In [None]:
# Look at the start of the corpus again to judge how long a line is
print(text[:500])

In [None]:
# A single line of verse, used to measure a typical line length
line = "From fairest creatures we desire increase"

In [None]:
len(line)

In [None]:
# Three consecutive lines, used to measure the length of a longer passage
part_stanza = """From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,"""

In [None]:
len(part_stanza)

In [None]:
# Number of characters in each training input sequence
# NOTE(review): presumably picked from the lengths measured above - confirm
seq_len = 120

In [None]:
# How many whole chunks of seq_len+1 characters fit in the corpus; the extra
# character is needed because each chunk is later split into an input and a
# target that is offset by one position
total_num_seq = len(text)//(seq_len+1)

In [None]:
total_num_seq

In [None]:
# Create Training Sequences
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

for i in char_dataset.take(500):
     print(ind_to_char[i.numpy()])

In [None]:
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [None]:
def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element of ``seq`` except the last, and the target is
    every element except the first, so ``target[k]`` is the element that
    follows ``input[k]`` in the original chunk.
    """
    return seq[:-1], seq[1:]

In [None]:
dataset = sequences.map(create_seq_targets)

In [None]:
for input_txt, target_txt in  dataset.take(1):
    print(input_txt.numpy())
    print(''.join(ind_to_char[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    # There is an extra whitespace!
    print(''.join(ind_to_char[target_txt.numpy()]))

In [None]:
# Batch size
batch_size = 128

# Buffer size to shuffle the dataset so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in which it shuffles elements
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [None]:
dataset

In [None]:
# Number of distinct characters; used as both the embedding input size and
# the width of the final Dense layer in create_model
vocab_size = len(vocab)

# Size of the vector each character id is embedded into
embed_dim = 64

# Number of units in the GRU layer
# NOTE(review): the weights file loaded later in this notebook was presumably
# trained with this exact value - confirm before changing it
rnn_neurons = 1026

In [None]:
from keras.models import Sequential
from keras.layers import LSTM,Dense,Embedding,Dropout,GRU

In [None]:
from keras.losses import sparse_categorical_crossentropy

In [None]:
# Print the documentation of the imported Keras loss function
help(sparse_categorical_crossentropy)

In [None]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy computed on raw logits.

    Wraps ``sparse_categorical_crossentropy`` with ``from_logits=True`` so it
    can be passed to ``model.compile`` as a two-argument loss.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [None]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the character-level model.

    The network is an Embedding layer, a stateful GRU that returns its full
    output sequence, and a Dense layer with ``vocab_size`` outputs and no
    activation. The batch size is fixed through ``batch_input_shape``, while
    the sequence length is left open (``None``). The model is compiled with
    the Adam optimizer and ``sparse_cat_loss``.

    Args:
        vocab_size: Number of distinct characters (embedding input size and
            Dense output width).
        embed_dim: Dimension of the character embeddings.
        rnn_neurons: Number of GRU units.
        batch_size: Fixed batch size the model is built for.

    Returns:
        The compiled ``Sequential`` model.
    """
    embedding = Embedding(
        vocab_size,
        embed_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = GRU(
        rnn_neurons,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # Final Dense layer producing one score per vocabulary entry
    readout = Dense(vocab_size)

    model = Sequential()
    for layer in (embedding, recurrent, readout):
        model.add(layer)

    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [None]:
model = create_model(
  vocab_size = vocab_size,
  embed_dim=embed_dim,
  rnn_neurons=rnn_neurons,
  batch_size=batch_size)

In [None]:
model.summary()

In [None]:
# Run a single batch through the model and print the shape of its output
for input_example_batch, target_example_batch in dataset.take(1):

  # Call the model directly on the input batch
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, " <=== (batch_size, sequence_length, vocab_size)")


In [None]:
example_batch_predictions

In [None]:
# Sample one character id per position for the first sequence in the batch,
# treating the model outputs as logits
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [None]:
sampled_indices

In [None]:
# Drop the trailing axis so the result is a flat array rather than a list of lists
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [None]:
sampled_indices

In [None]:
# Number of passes over the dataset to use when training
epochs = 30

In [None]:
# Training is left disabled here; further down the notebook loads saved
# weights from 'shakespeare_gen.h5' instead. Uncomment to train from scratch.
# model.fit(dataset,epochs=epochs)

In [None]:
from keras.models import load_model

In [None]:
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)

model.load_weights('shakespeare_gen.h5')

model.build(tf.TensorShape([1, None]))


In [None]:
model.summary()

In [None]:
def generate_text(model, start_seed,gen_size=100,temp=1.0):
    """Generate text one character at a time, starting from ``start_seed``.

    The seed is encoded with ``char_to_ind`` and fed to ``model``. After that,
    each sampled character is fed back as the next input, and the stateful
    model carries the context between steps.

    Args:
        model: Model built for a batch size of 1. It is called on a
            ``(1, length)`` tensor of character ids and must provide
            ``reset_states()``.
        start_seed: Non-empty string to start from. Every character must be a
            key of ``char_to_ind``.
        gen_size: Number of characters to generate.
        temp: Sampling temperature. The model outputs are divided by it
            before sampling, so it must be strictly positive.

    Returns:
        ``start_seed`` followed by the ``gen_size`` generated characters.

    Raises:
        ValueError: If ``temp`` is not strictly positive or ``start_seed`` is
            empty.
        KeyError: If ``start_seed`` contains a character that is missing from
            ``char_to_ind``.
    """
    # Validate up front: a zero temperature would divide the logits by zero,
    # and an empty seed would give the model a zero-length sequence.
    if temp <= 0:
        raise ValueError(f"temp must be strictly positive, got {temp!r}")
    if not start_seed:
        raise ValueError("start_seed must contain at least one character")

    # Encode the seed and add a leading batch axis of size 1
    input_eval = tf.expand_dims([char_to_ind[s] for s in start_seed], 0)
    text_generated = []

    # Here batch size == 1; clear any state left over from earlier calls
    model.reset_states()

    for _ in range(gen_size):
        # Generate predictions and drop the batch axis
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temp

        # Sample for every position but keep only the last one, which is the
        # prediction for the character that follows the current input
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the new character back in
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(ind_to_char[predicted_id])

    return start_seed + ''.join(text_generated)

In [None]:
# Generate and print 1000 characters from the loaded model, seeded with "flower"
print(generate_text(model,"flower",gen_size=1000))