We'll take an LSTM layer, feed it strings of N characters extracted from a text corpus, and train it to predict character N + 1. The output of the model will be a softmax over all possible characters: a probability distribution for the next character. This LSTM is called a ***character-level neural language model***.

Building a language model of the Bible (KJV) writing style (dataset from https://www.kaggle.com/phyred23/bibleverses).

In [None]:
# Colab-only step: prompts for a file upload from the local machine.
# The recorded run saved bible_data_set.csv, which is the file read by the CSV-loading cell.
from google.colab import files
uploaded = files.upload()

Saving bible_data_set.csv to bible_data_set.csv


In [None]:
import pandas as pd
# Load the verses CSV into a DataFrame; later cells read its 'text' column.
# NOTE(review): the absolute /content path presumably matches Colab's working
# directory, where the upload was saved -- confirm before running elsewhere.
bible = pd.read_csv('/content/bible_data_set.csv')

In [None]:
#Parsing the CSV to extract all the verses as one continuous string.
#Each verse drops its last two characters before being concatenated.
#NOTE(review): presumably those two characters are a trailing delimiter in the dataset -- confirm.
#str.join builds the corpus in a single pass instead of growing a string with += once per verse.
text = "".join(verse[:-2] for verse in bible['text'])

In [None]:
#Vectorizing sequences of characters through one-hot encoding
import numpy as np  #Imported here so this cell also runs on a fresh kernel

maxlen = 60  #We'll extract sequences of 60 characters
step = 3  #We'll sample a new sequence every three characters

sentences = []  #Holds the extracted sequences
next_chars = []  #Holds the targets (the follow-up characters)

#Slide a maxlen-wide window over the corpus; the character right after each window is its target
for i in range(0, len(text) - maxlen, step):
  sentences.append(text[i: i+maxlen])
  next_chars.append(text[i+maxlen])

print('Number of sequences:', len(sentences))

chars = sorted(set(text))  #List of unique characters in the corpus
print('Unique characters:', len(chars))
#Dictionary that maps unique characters to their index in the list "chars";
#enumerate yields the index directly instead of a linear chars.index() scan per character
char_indices = {char: index for index, char in enumerate(chars)}

print('Vectorization...')
#The np.bool alias was removed in NumPy 1.24; the builtin bool selects the same boolean dtype
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)))
for i, sentence in enumerate(sentences):
  for t, char in enumerate(sentence):
    x[i, t, char_indices[char]] = 1    #One-hot encoding of the input character at position t
  y[i, char_indices[next_chars[i]]] = 1  #One-hot encoding of the target character


Number of sequences: 1368744
Unique characters: 63
Vectorization...


In [None]:
#Single-layer LSTM model for next-character prediction
from tensorflow import keras  #Needed for keras.optimizers below; it was referenced without being imported
from tensorflow.keras import layers
from tensorflow.keras import models

#One LSTM layer reading (maxlen, len(chars)) one-hot windows, followed by a
#softmax over the character vocabulary
model = models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

#Model compilation configuration: targets are one-hot rows, hence categorical cross-entropy
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
#Function to sample the next character given the model's prediction
import numpy as np  #Imported here so this cell also runs on a fresh kernel

np.seterr(divide = 'ignore')  #np.log(0) yields -inf for zero-probability entries; silence that warning
def sample(preds, temperature=0.1):
  """Randomly draw an index from `preds` after reweighting it by `temperature`.

  Args:
    preds: 1-D array-like of probabilities (the model's softmax output).
    temperature: divisor applied to the log-probabilities before they are
      re-normalized. Values below 1 sharpen the distribution towards its
      largest entry; values above 1 flatten it.

  Returns:
    The integer index selected by a single multinomial draw from the
    reweighted distribution.
  """
  preds = np.asarray(preds).astype('float64')

  #Reweighting probability distribution to a different temperature
  log_preds = np.log(preds) / temperature
  #Shift so the largest term is exp(0) == 1. The normalized result is unchanged,
  #but very small temperatures no longer underflow every term to 0 (0/0 -> NaN).
  log_preds = log_preds - np.max(log_preds)
  exp_preds = np.exp(log_preds)
  preds = exp_preds / np.sum(exp_preds)

  probas = np.random.multinomial(1, preds, 1)  #One random draw: a one-hot row marking the sampled index
  return np.argmax(probas)  #Position of the 1 in that row, i.e. the index of the sampled character

In [None]:
#Fit the network on the one-hot encoded windows (x) and their next-character targets (y).
#verbose=0 keeps the cell output free of per-epoch progress logs.
model.fit(
    x,
    y,
    epochs=60,
    batch_size=2048,
    verbose=0,
)

<tensorflow.python.keras.callbacks.History at 0x7f6bfb00a990>

In [None]:
# Seed text for generation, written one sentence per line; adjacent string
# literals are concatenated into a single string.
pulp_quote = (
    "The path of the righteous man is beset on all sides by the inequities of the selfish and the tyranny of evil men. "
    "Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of the darkness, for he is truly his brother's keeper and the finder of lost children. "
    "And I will strike down upon thee with great vengeance and furious anger those who attempt to poison and destroy my brothers. "
    "And you will know I am the Lord when I lay My vengeance upon you."
)
# Shorter alternative seed (first sentence only):
#pulp_quote = "The path of the righteous man is beset on all sides by the inequities of the selfish and the tyranny of evil men."

In [None]:
# Text-generation loop
import random
import sys

# Seed window: the last maxlen characters of the quote
generated_text = pulp_quote[-maxlen:]

temperature = 0.3

#sys.stdout.write(pulp_quote.lower())
print()
for _ in range(1000): # Emits 1000 characters, continuing from the seed window
  # One-hot encode the current window of characters
  window = np.zeros((1, maxlen, len(chars)))
  for pos, ch in enumerate(generated_text):
    window[0, pos, char_indices[ch]] = 1

  # Predict the next-character distribution and sample one index from it
  distribution = model.predict(window, verbose=0)[0]
  next_char = chars[sample(distribution, temperature)]

  # Slide the window: append the new character, then drop the oldest one
  generated_text = (generated_text + next_char)[1:]

  sys.stdout.write(next_char)


 

0.0001