# Import libraries

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Building the Word Vocabulary

In [None]:
# Define the lyrics of the song
data="In the town of Athy one Jeremy Lanigan \n Battered away til he hadnt a pound. \nHis father died and made him a man again \n Left him a farm and ten acres of ground. \nHe gave a grand party for friends and relations \nWho didnt forget him when come to the wall, \nAnd if youll but listen Ill make your eyes glisten \nOf the rows and the ructions of Lanigans Ball. \nMyself to be sure got free invitation, \nFor all the nice girls and boys I might ask, \nAnd just in a minute both friends and relations \nWere dancing round merry as bees round a cask. \nJudy ODaly, that nice little milliner, \nShe tipped me a wink for to give her a call, \nAnd I soon arrived with Peggy McGilligan \nJust in time for Lanigans Ball. \nThere were lashings of punch and wine for the ladies, \nPotatoes and cakes; there was bacon and tea, \nThere were the Nolans, Dolans, OGradys \nCourting the girls and dancing away. \nSongs they went round as plenty as water, \nThe harp that once sounded in Taras old hall,\nSweet Nelly Gray and The Rat Catchers Daughter,\nAll singing together at Lanigans Ball. \nThey were doing all kinds of nonsensical polkas \nAll round the room in a whirligig. \nJulia and I, we banished their nonsense \nAnd tipped them the twist of a reel and a jig. \nAch mavrone, how the girls got all mad at me \nDanced til youd think the ceiling would fall. \nFor I spent three weeks at Brooks Academy \nLearning new steps for Lanigans Ball. \nThree long weeks I spent up in Dublin, \nThree long weeks to learn nothing at all,\n Three long weeks I spent up in Dublin, \nLearning new steps for Lanigans Ball. \nShe stepped out and I stepped in again, \nI stepped out and she stepped in again, \nShe stepped out and I stepped in again, \nLearning new steps for Lanigans Ball. \nBoys were all merry and the girls they were hearty \nAnd danced all around in couples and groups, \nTil an accident happened, young Terrance McCarthy \nPut his right leg through miss Finnertys hoops. 
\nPoor creature fainted and cried Meelia murther, \nCalled for her brothers and gathered them all. \nCarmody swore that hed go no further \nTil he had satisfaction at Lanigans Ball. \nIn the midst of the row miss Kerrigan fainted, \nHer cheeks at the same time as red as a rose. \nSome of the lads declared she was painted, \nShe took a small drop too much, I suppose. \nHer sweetheart, Ned Morgan, so powerful and able, \nWhen he saw his fair colleen stretched out by the wall, \nTore the left leg from under the table \nAnd smashed all the Chaneys at Lanigans Ball. \nBoys, oh boys, twas then there were runctions. \nMyself got a lick from big Phelim McHugh. \nI soon replied to his introduction \nAnd kicked up a terrible hullabaloo. \nOld Casey, the piper, was near being strangled. \nThey squeezed up his pipes, bellows, chanters and all. \nThe girls, in their ribbons, they got all entangled \nAnd that put an end to Lanigans Ball."

# Lowercase the lyrics, then break the single long string on newline
# characters so that each lyric line becomes one list element.
corpus = data.lower().split(sep="\n")

In [None]:
# Build the word-level vocabulary from the lyric lines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size (not a sequence length): one slot per entry in word_index,
# plus one extra slot for index 0
total_words = 1 + len(tokenizer.word_index)

# Preprocessing the Dataset

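Next, we turn this corpus into training data. First we **tokenize** it: each line becomes a list of word indices, and every prefix of that list containing at least two tokens becomes one training sequence.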

In [None]:
# Turn every lyric line into its n-gram prefixes: a line tokenized as
# [t0, t1, ..., tk] contributes [t0, t1], [t0, t1, t2], ..., [t0, ..., tk].
input_sequences = []

for line in corpus:
  # texts_to_sequences expects a list of texts, so wrap the line and
  # take the single result back out
  token_list = tokenizer.texts_to_sequences([line])[0]

  # Prefixes start at length 2 so every sequence has at least one context
  # token followed by one token to predict
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)

# Length of the longest n-gram sequence. Computed once, after the loop:
# recomputing it per line rescans the whole list each time and raises
# ValueError while the list is still empty (e.g. a blank first line).
max_sequence_len = max(len(x) for x in input_sequences)

The next step: Padding

In [None]:
# Left-pad every n-gram sequence to max_sequence_len so they stack into one
# 2-D array whose last column holds the token to predict. The result is bound
# to `input_sequences` because the following cells slice that name.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

Now that we have our sequences, we turn them into x’s and y’s: our input values and their labels. All we have to do is take every token (word) except the last as x, and use the last token as the label y.

We do this by **splitting our sequences into our x’s and y’s.**

In [None]:
# Every column except the last is the model input; the last column is the
# token the model should predict.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

We one-hot encode the labels as this is a classification problem.

In [None]:
# One-hot encode the label indices into vectors of width total_words
ys = tf.keras.utils.to_categorical(
    labels,
    num_classes=total_words,
)

# Building the Model

In [None]:
# Network: embedding -> bidirectional LSTM -> softmax over the vocabulary.
# The input length is max_sequence_len - 1 because the last token of every
# padded sequence was split off as the label.
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len - 1))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(total_words, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded labels
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Training the Model

In [None]:
# Fit on the (input, one-hot label) pairs; the returned History object is
# kept in `history`.
history = model.fit(
    x=xs,
    y=ys,
    epochs=500,
)

# Generating Text

Process:
1. Feed a seed text to initiate the process.
2. Model predicts the index of the most probable next word.
3. Look up the index in the reverse word index dictionary.
4. Append the next word to the seed text.
5. Feed the result to the model again.

In [None]:
# Define the seed text: the starting phrase that the generation loop below
# tokenizes and then extends one word at a time
seed_text = "Laurence went to Dublin"

In [None]:
# Number of words to append to the seed text
next_words = 100

for _ in range(next_words):
  # Tokenize the text generated so far and left-pad it to the model's
  # input length (max_sequence_len - 1)
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen = max_sequence_len-1, padding='pre')

  # Feed to the model and get probabilities for each index.
  # verbose=0 suppresses the progress bar Keras would print on every call.
  probabilities = model.predict(token_list, verbose=0)

  # Greedy choice: get the index with the highest probability
  predicted = np.argmax(probabilities, axis = -1)[0]

  # Index 0 is only the padding value, so there is no word to append for it
  if predicted != 0:
    output_word = tokenizer.index_word[predicted] #look up word associated w/ index
    seed_text += " " + output_word #combine with seed text

# Print the finished text once, after all words have been generated
print(seed_text)

This produces lots of repeated words, so instead of getting the index with max probability, you can just get the top three indices and choose one at random.

In [None]:
# Start again from the original seed phrase
seed_text = "Laurence went to Dublin"

# How many words to generate
next_words = 100

for _ in range(next_words):
  # Encode the text so far and left-pad it to the model's input length
  token_list = pad_sequences(
      tokenizer.texts_to_sequences([seed_text]),
      maxlen=max_sequence_len-1,
      padding='pre',
  )

  # Model output: one probability per vocabulary index
  probabilities = model.predict(token_list)

  # Draw a rank from {1, 2, 3}
  choice = np.random.choice([1,2,3])

  # argsort orders indices by ascending probability, so counting `choice`
  # places back from the end selects one of the three most probable indices
  predicted = np.argsort(probabilities)[0][-choice]

  # Index 0 is just the padding, so nothing is appended for it
  if predicted == 0:
    continue

  # Look up the word for the chosen index and extend the running text
  output_word = tokenizer.index_word[predicted]
  seed_text = seed_text + " " + output_word

# Print the result
print(seed_text)