In [1]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Remote location of the plain-text lyrics file that this notebook downloads and tokenizes.
DATA_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt"

In [3]:
# Fetch the lyrics file via Keras; with cache_dir='./' and an empty
# cache_subdir the file is stored under the current working directory,
# named after the last path component of the URL.
corpus_filename = os.path.basename(DATA_URL)
data_path = keras.utils.get_file(
    fname=corpus_filename,
    origin=DATA_URL,
    cache_dir='./',
    cache_subdir='',
)

In [4]:
# Read the whole lyrics file into memory. The context manager closes the
# file handle as soon as the text is loaded, instead of leaving it open
# until garbage collection.
with open(data_path) as lyrics_file:
    data = lyrics_file.read()

# One lowercased lyric line per corpus entry.
corpus = data.lower().split("\n")
print(f"len(corpus) = {len(corpus)}")

len(corpus) = 1693


In [5]:
# Peek at the first five lyric lines to sanity-check the lowercasing and line split.
first_lines = corpus[0:5]
print(first_lines)

['come all ye maidens young and fair', 'and you that are blooming in your prime', 'always beware and keep your garden fair', 'let no man steal away your thyme', 'for thyme it is a precious thing']


In [6]:
# Build the vocabulary from the lyric lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Inverse mapping (integer id -> word), used to turn id sequences back into text.
reverse_word_index = {token_id: token for token, token_id in word_index.items()}

print(f"first 5 items in word_index = {list(word_index.items())[:5]}")

first 5 items in word_index = [('the', 1), ('and', 2), ('i', 3), ('to', 4), ('a', 5)]


In [7]:
# Word ids start at 1 and id 0 is what the padded sequences are filled with,
# so the number of distinct ids (used as the one-hot width) is the
# vocabulary size plus one.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
total_words

2690

In [8]:
def decode_sequence(sequence):
    """Convert an iterable of token ids into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids
    with no entry there are rendered as ``'?'``.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in sequence)
    return ' '.join(words)

In [9]:
# Expand every lyric line into its growing prefixes of at least two tokens:
# [w1, w2], [w1, w2, w3], ... so that the last token of each prefix can
# serve as the word to predict from the tokens before it.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_ids) + 1)
]

# The longest prefix determines the common padded width.
max_sequence_len = max(len(prefix) for prefix in input_sequences)
print(f"max_sequence_len = {max_sequence_len}")

# Left-pad so that the final column always holds the last real token.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
print(f"input_sequences.shape = {input_sequences.shape}")

max_sequence_len = 16
input_sequences.shape = (12038, 16)


In [10]:
# All columns but the last form the model input; the last column is the
# id of the word to predict.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the target ids over all `total_words` possible ids.
ys = keras.utils.to_categorical(labels, num_classes=total_words)
print(f"xs.shape = {xs.shape}, labels.shape = {labels.shape}, ys.shape = {ys.shape}")

xs.shape = (12038, 15), labels.shape = (12038,), ys.shape = (12038, 2690)


In [11]:
# Show one padded n-gram both as raw token ids and as decoded text.
example_sequence = input_sequences[2]
print(example_sequence)
print(decode_sequence(example_sequence))

[   0    0    0    0    0    0    0    0    0    0    0    0   51   12
   96 1217]
? ? ? ? ? ? ? ? ? ? ? ? come all ye maidens


In [12]:
# The matching model input: the same example without its final (target) token.
example_input = xs[2]
print(example_input)
print(decode_sequence(example_input))

[ 0  0  0  0  0  0  0  0  0  0  0  0 51 12 96]
? ? ? ? ? ? ? ? ? ? ? ? come all ye


In [13]:
# The matching one-hot target row (NumPy abbreviates the long vector when printing).
example_target = ys[2]
print(example_target)

[0. 0. 0. ... 0. 0. 0.]


In [14]:
# Recover the target word id from the one-hot row, then decode it to text.
target_id = np.argmax(ys[2])
print(target_id)
print(decode_sequence([target_id]))

1217
maidens
