# Importing the Libraries

In [None]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Building the Word Vocabulary

In [None]:
# Download the dataset
# Shell escape: runs the `gdown` command-line tool with the given file ID.
# Presumably this fetches `irish-lyrics-eof.txt`, which the next cell opens
# from the working directory — verify the downloaded file name.
!gdown --id 15UqmiIm0xwh9mt0IYq2z3jHaauxQSTQT

We then load the dataset.

In [None]:
# Load the dataset. The context manager closes the file handle as soon as the
# text has been read, instead of leaving it open for the life of the kernel.
with open('./irish-lyrics-eof.txt') as lyrics_file:
    data = lyrics_file.read()

# Lowercase the text and split it into one corpus entry per line
corpus = data.lower().split("\n")

We initialize the `Tokenizer` class and generate the word index dictionary.

In [None]:
# Build the vocabulary: fitting on the corpus populates `tokenizer.word_index`
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot (index 0 is treated as padding
# by the generation cells further down)
total_words = 1 + len(tokenizer.word_index)

# Preprocessing the Dataset

This is the same as in the previous lab.

In [None]:
# Expand every lyric line into its n-gram prefixes: a line tokenized as
# [a, b, c] contributes [a, b] and [a, b, c].
input_sequences = []

for line in corpus:
    # Tokenize the current line
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Generate the subphrases (lines with fewer than two tokens add nothing)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# The steps below must run once, AFTER the loop has collected every subphrase.
# Running them inside the loop would turn `input_sequences` into an array on
# the first pass and break the next `.append` call.

# Get the length of the longest subphrase
max_sequence_len = max(len(x) for x in input_sequences)

# Pad all sequences on the left so they share the same length
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Create inputs and labels: every token but the last is the input,
# the last token is the word to predict
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# Convert the labels into one-hot arrays over the whole vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Build and compile the model

In [None]:
# Set the hyperparameters
embedding_dim = 100
lstm_units = 150
learning_rate = 0.01

# Build the model: embedding -> bidirectional LSTM -> softmax over the vocabulary.
# The input length is `max_sequence_len - 1` because the last token of every
# padded sequence is split off as the label.
# NOTE(review): newer Keras releases may deprecate `input_length`; confirm
# against the installed version.
model = Sequential([
    Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1),
    Bidirectional(LSTM(lstm_units)),
    Dense(total_words, activation='softmax')
])

# Compile the model; categorical cross-entropy matches the one-hot labels
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy']
)

# Train the model

In [None]:
# Number of passes over the training data
epochs = 100

# Fit the model on the padded inputs and one-hot labels; the return value is
# kept because its `.history` attribute is what gets plotted afterwards
history = model.fit(x=xs, y=ys, epochs=epochs)

# Visualize the Model

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  """Plot one recorded training metric and show the figure.

  Args:
    history: object whose `history` attribute maps metric names to
      sequences of values (here, the value returned by `model.fit`).
    string: name of the metric to plot; also used as the y-axis label.
  """
  metric_values = history.history[string]
  plt.plot(metric_values)
  plt.ylabel(string)
  plt.xlabel("Epochs")
  plt.show()

# Visualize the accuracy
plot_graphs(history=history, string='accuracy')

# Generating text

Now you can let the model make its own songs or poetry! Because it is trained on a much larger corpus, the results below should contain fewer repetitions than before. The code below picks the next word based on the highest probability output.

In [None]:
# Define seed text
seed_text = "help me obi-wan kinobi youre my only hope"

# Define total words to predict
next_words = 100

# Loop until the desired length is reached; each pass appends at most one word
for _ in range(next_words):
  # Convert the current text to a token sequence
  token_list = tokenizer.texts_to_sequences([seed_text])[0]

  # Left-pad the sequence to the input length the model was built with
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

  # Feed to the model and get the probabilities for each index
  # (`verbose=0` keeps predict from logging on every iteration)
  probabilities = model.predict(token_list, verbose=0)

  # Get the index with the highest probability
  predicted = np.argmax(probabilities, axis=-1)[0]

  # Ignore index 0 because that is just the padding
  if predicted != 0:
    # Look up the word for the index and append it to the running text
    output_word = tokenizer.index_word[predicted]
    seed_text += " " + output_word

# Print the result
print(seed_text)

This code gets the top 3 predictions and picks one at random.

In [None]:
# Define seed text
seed_text = "help me obi-wan kinobi youre my only hope"

# Define total words to predict
next_words = 100

# Loop until the desired length is reached; each pass appends at most one word.
# NOTE(review): no random seed is set in this cell, so the generated text
# will differ between runs.
for _ in range(next_words):

  # Convert the seed text to a token sequence
  token_list = tokenizer.texts_to_sequences([seed_text])[0]

  # Left-pad the sequence to the input length the model was built with
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

  # Feed to the model and get the probabilities for each index
  probabilities = model.predict(token_list, verbose=0)

  # Pick a random number from [1,2,3]
  choice = np.random.choice([1,2,3])

  # Sort the probabilities in ascending order
  # and get the random choice from the end of the array,
  # i.e. one of the three most probable indices
  predicted = np.argsort(probabilities)[0][-choice]

  # Ignore if index is 0 because that is just the padding.
  if predicted != 0:

    # Look up the word associated with the index.
    output_word = tokenizer.index_word[predicted]

    # Combine with the seed text
    seed_text += " " + output_word

# Print the result
print(seed_text)