<a href="https://colab.research.google.com/github/pdevall/TextGeneration/blob/master/CharacterLevelLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

Load Data using numpy. Get the Max length of the word in the file. The Max length will be used to pad the sequences after tokenization



In [3]:
datasetNPARRAY = np.loadtxt("/content/drive/My Drive/Colab Notebooks/CharacterLevelLanguageModel/dinos.txt", dtype="str")
print(datasetNPARRAY)
max_length = max(len(x) for x in datasetNPARRAY )
print(max_length)

['Aachenosaurus' 'Aardonyx' 'Abdallahsaurus' ... 'Zuoyunlong'
 'Zupaysaurus' 'Zuul']
26


Using Keras Tokenizer to tokenize the words loaded from file at the chracter level.

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=len(datasetNPARRAY),
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    char_level=True,
    document_count=0
)    

Tokenizer fit_on_texts will create a vocabulary. texts_to_sequences will generate the char to sequence in the vocabulary.

In [5]:
tokenizer.fit_on_texts(datasetNPARRAY)
sequences = tokenizer.texts_to_sequences(datasetNPARRAY)
vocab_size = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(vocab_size)


{'a': 1, 's': 2, 'u': 3, 'o': 4, 'r': 5, 'n': 6, 'i': 7, 'e': 8, 't': 9, 'l': 10, 'p': 11, 'h': 12, 'c': 13, 'g': 14, 'd': 15, 'm': 16, 'y': 17, 'b': 18, 'k': 19, 'v': 20, 'x': 21, 'z': 22, 'j': 23, 'w': 24, 'f': 25, 'q': 26}
27


Shuffle input, shift 1 place right for Y_sequences.

In [6]:
sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length,  padding='post', value=0)
x_sequences = np.reshape(sequences, (sequences.shape[0], sequences.shape[1], 1))
np.random.shuffle(x_sequences)
y_sequences = np.reshape(sequences, (x_sequences.shape[0], x_sequences.shape[1]))
y_sequences = np.roll(y_sequences, 1, axis=1)
y_sequences[:,0] = 0
print(x_sequences)
print(y_sequences)

[[[ 2]
  [ 7]
  [ 1]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[ 2]
  [13]
  [ 1]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[19]
  [ 3]
  [10]
  ...
  [ 0]
  [ 0]
  [ 0]]

 ...

 [[11]
  [ 1]
  [10]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[18]
  [ 1]
  [ 5]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[ 9]
  [ 7]
  [ 9]
  ...
  [ 0]
  [ 0]
  [ 0]]]
[[ 0  2  7 ...  0  0  0]
 [ 0  2 13 ...  0  0  0]
 [ 0 19  3 ...  0  0  0]
 ...
 [ 0 11  1 ...  0  0  0]
 [ 0 18  1 ...  0  0  0]
 [ 0  9  7 ...  0  0  0]]


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units):
  model = keras.models.Sequential()
  #model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=batch_size))
  model.add(keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform', input_shape=(x_sequences.shape[1], x_sequences.shape[2])))
  model.add(keras.layers.Dense(vocab_size,  activation='softmax'))
  return model

In [8]:
model = build_model(vocab_size, embedding_dim=4, rnn_units=128)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 26, 128)           66560     
_________________________________________________________________
dense (Dense)                (None, 26, 27)            3483      
Total params: 70,043
Trainable params: 70,043
Non-trainable params: 0
_________________________________________________________________


In [0]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [0]:
model.compile(optimizer='adam', loss=loss)

In [11]:
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/drive/My Drive/Colab Notebooks/CharacterLevelLanguageModel/'
# Name of the checkpoint files
checkpoint_prefix = checkpoint_dir + "weights-improvement.hdf5"

checkpoint_callback=keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

history = model.fit(x_sequences, y_sequences, epochs=200, callbacks=[checkpoint_callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200

KeyboardInterrupt: ignored

In [0]:
model = build_model(vocab_size, 4, 128)

model.load_weights(checkpoint_prefix)

model.build(tf.TensorShape([1, ]))

def generate_text(model, start_char):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 15

  # Converting our start string to numbers (vectorizing)
  input_eval = tokenizer.texts_to_sequences(start_char)
  input_eval = tf.expand_dims(input_eval, 0)
  print(input_eval)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)
      predictedChar = ''.join([str(elem) for elem in tokenizer.sequences_to_texts([[predicted_id]])])
      text_generated.append(predictedChar)
     # text_generated.append(str(tokenizer.sequences_to_texts([[predicted_id]])))
  return (start_char + ''.join(text_generated))

In [20]:
for i in range(12):
  randomNum = np.random.randint(1, 27)
  character=''
  for char, sequence in tokenizer.word_index.items(): 
    if sequence == randomNum:
      character = char
  print(generate_text(model, start_char=character))


5
tf.Tensor([[[5]]], shape=(1, 1, 1), dtype=int32)
rkrhebnlmrgvl
1
tf.Tensor([[[1]]], shape=(1, 1, 1), dtype=int32)
afzqoncmrdmbbtqt
4
tf.Tensor([[[4]]], shape=(1, 1, 1), dtype=int32)
oqoabvjynzgrfvw
7
tf.Tensor([[[7]]], shape=(1, 1, 1), dtype=int32)
itdeyclyivmvxek
23
tf.Tensor([[[23]]], shape=(1, 1, 1), dtype=int32)
jjzlhhbhvfvenh
18
tf.Tensor([[[18]]], shape=(1, 1, 1), dtype=int32)
bojcnlnlwctptg
6
tf.Tensor([[[6]]], shape=(1, 1, 1), dtype=int32)
nytdkogndkyzm
3
tf.Tensor([[[3]]], shape=(1, 1, 1), dtype=int32)
uijohhzfedpjdyyk
21
tf.Tensor([[[21]]], shape=(1, 1, 1), dtype=int32)
xuffuxjzrzeyl
6
tf.Tensor([[[6]]], shape=(1, 1, 1), dtype=int32)
nlsnohjmxfvlneh
10
tf.Tensor([[[10]]], shape=(1, 1, 1), dtype=int32)
lbtyhuovjmivqo
25
tf.Tensor([[[25]]], shape=(1, 1, 1), dtype=int32)
fjmtguomzhdujbu
