<a href="https://colab.research.google.com/github/pdevall/TextGeneration/blob/master/CharacterLevelLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

Load the data with NumPy and compute the length of the longest word in the file. This maximum length is used later to pad the tokenized sequences to a common length.



In [0]:
# Read the words from dinos.txt into a NumPy array of strings.
datasetNPARRAY = np.loadtxt(
    "/content/drive/My Drive/Colab Notebooks/CharacterLevelLanguageModel/dinos.txt",
    dtype="str",
)
print(datasetNPARRAY)

# Length of the longest word; used further down as the padded sequence length.
max_length = len(max(datasetNPARRAY, key=len))
print(max_length)

Use the Keras Tokenizer to tokenize the words loaded from the file at the character level.

In [0]:
# Character-level tokenizer: every (lower-cased) character becomes one token.
#
# num_words caps the VOCABULARY size (only indices below num_words are kept by
# texts_to_sequences); it is unrelated to the number of training examples.
# Passing len(datasetNPARRAY) would silently drop characters whenever the
# dataset has fewer names than distinct characters, so no cap is applied.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    char_level=True,
    document_count=0
)

The tokenizer's `fit_on_texts` builds the character vocabulary. `texts_to_sequences` then converts each word into the sequence of vocabulary indices of its characters.

In [0]:
# Build the character vocabulary from the loaded words.
tokenizer.fit_on_texts(datasetNPARRAY)
print(tokenizer.word_index)

# One extra slot on top of the learned characters: value 0 is used as the
# padding value when the sequences are padded later on.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

# Encode every word as a list of character indices.
sequences = tokenizer.texts_to_sequences(datasetNPARRAY)


Pad the sequences to a common length, shuffle the inputs, and build Y_sequences as the inputs shifted by one position.

In [0]:
# Pad every encoded word with trailing zeros up to max_length.
sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length,  padding='post', value=0)

# Shuffle the rows into a fresh array. np.random.permutation returns a copy,
# so `sequences` itself is left untouched (an in-place shuffle of a reshape
# view would reorder it as a side effect).
x_sequences = np.random.permutation(sequences)

# Next-character targets: y[t] must be the character that FOLLOWS x[t], i.e.
# the input shifted one step to the LEFT. (Shifting right would make
# y[t] == x[t-1], which only teaches the network to echo its previous input.)
y_sequences = np.roll(x_sequences, -1, axis=1)
# np.roll wraps the first character around to the last column; replace it with
# the padding value 0 so every row ends with an end-of-word target.
y_sequences[:, -1] = 0

print(x_sequences.shape)
print(y_sequences[:5])

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, input_length):
  """Assemble the Embedding -> LSTM -> Dense(softmax) network.

  Args:
    vocab_size: Size of the embedding input and of the output layer.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of LSTM units.
    input_length: Length of the input sequences given to the embedding.

  Returns:
    An uncompiled `keras.models.Sequential` model. The LSTM returns the full
    sequence, so the softmax Dense layer is applied at every time step.
  """
  embedding = keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length)
  recurrent = keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform')
  classifier = keras.layers.Dense(vocab_size,  activation='softmax')
  return keras.models.Sequential([embedding, recurrent, classifier])

In [0]:
# Instantiate the network; the embedding width is set equal to the vocabulary size.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=vocab_size,
    rnn_units=128,
    input_length=max_length,
)
model.summary()

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and model outputs.

  Args:
    labels: Integer class indices (the target character indices).
    logits: Model outputs. Despite the parameter name (kept for backward
      compatibility), these are probabilities: the model's final Dense layer
      already applies a softmax activation.

  Returns:
    The per-element cross-entropy loss tensor.
  """
  # from_logits must be False because the outputs are already softmax-normalised;
  # from_logits=True would apply a second softmax and distort the loss.
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=False)

In [0]:
# Configure training: Adam optimizer with the custom `loss` function defined above.
model.compile(
    optimizer='adam',
    loss=loss,
)

In [0]:
def generate_name(model, temperature=1.0, max_chars=13):
  """Sample one name from the model, character by character, and print it.

  Uses the notebook-level `tokenizer` (for the character vocabulary) and
  `max_length` (for the model's input length).

  Args:
    model: Object whose `predict` maps a (1, max_length) array of character
      indices to an array of shape (1, max_length, vocab) holding, for every
      position, a probability distribution over characters (the softmax
      output produced by `build_model`).
    temperature: Sampling temperature applied to the log-probabilities.
      Values below 1.0 make sampling more conservative, values above 1.0 more
      random. Defaults to 1.0.
    max_chars: Maximum number of characters sampled after the seed character.
      Defaults to 13.

  Returns:
    None. The generated name is printed.
  """
  # Reverse lookup built once instead of scanning word_index on every step.
  index_to_char = {char_index: char for char, char_index in tokenizer.word_index.items()}

  # Seed with a random character from the whole vocabulary (0 is padding and
  # is therefore not a key of index_to_char).
  seed = int(np.random.choice(sorted(index_to_char)))
  x = np.zeros((1, max_length))
  x[0][0] = seed
  # The seed is the first letter of the name, so it is part of the output.
  name = [index_to_char[seed]]

  # Position i + 1 is written on every step; never run past the input buffer.
  steps = min(max_chars, max_length - 1)
  for i in range(steps):
    # Distribution for the character following position i (the last one filled in).
    probabilities = np.asarray(model.predict(x, verbose=0)[0, i], dtype=np.float64)

    # Temperature acts on log-probabilities; re-normalise with a numerically
    # stable softmax before sampling.
    scaled = np.log(probabilities + 1e-12) / temperature
    scaled = np.exp(scaled - scaled.max())
    scaled /= scaled.sum()
    index = int(np.random.choice(len(scaled), p=scaled))

    # Index 0 (padding) has no character: treat it as the end of the name.
    if index not in index_to_char:
      break
    x[0][i + 1] = index
    name.append(index_to_char[index])
  print(''.join(name))

In [0]:
def generate_name_loop(epoch, _):
  """Epoch-end hook: print three sampled names whenever `epoch` is a multiple of 25.

  Args:
    epoch: Epoch index passed in by the callback.
    _: Second callback argument; unused.
  """
  # Nothing to do on epochs that are not a multiple of 25.
  if epoch % 25 != 0:
    return

  print('Names generated after epoch %d:' % epoch)
  samples_done = 0
  while samples_done < 3:
    generate_name(model)
    samples_done += 1

  print()

# Wrap the sampling hook in a Keras callback so it runs at the end of every epoch.
name_generator = keras.callbacks.LambdaCallback(
    on_epoch_end=generate_name_loop,
)

# Train for 100 epochs, printing sample names through the callback.
model.fit(
    x_sequences,
    y_sequences,
    epochs=100,
    callbacks=[name_generator],
    verbose=1,
)
