In [None]:
import os
import re

import numpy as np
from matplotlib import pyplot as plt
from keras.utils import pad_sequences, to_categorical
from keras.preprocessing.text import Tokenizer

from models import LSTMModel

In [None]:
# Punctuation that should survive as standalone tokens. The loading cell pads
# each of these characters with spaces so it is split off as its own "word".
to_tokenize = '.,:;!?\n'
# Characters the tokenizer strips from the text. Anything listed in
# `to_tokenize` is deliberately kept out of this set: ':', ';' and '!' used to
# appear in both strings, so they were filtered away and never made it into the
# vocabulary even though they were padded for tokenization.
to_exclude = ''.join(
    ch for ch in '!"#$%&()*+-/:;<=>@[\\]^_`{|}~\t' if ch not in to_tokenize
)
tokenizer = Tokenizer(filters=to_exclude)


In [None]:
poem_dir = "data/poems/forms/abc"

# Matches any single character listed in `to_tokenize`. re.escape keeps the
# character class valid even if `to_tokenize` later gains a regex-special
# character such as ']', '^', '-' or a backslash. Compiled once, outside the loop.
punctuation_pattern = re.compile('[' + re.escape(to_tokenize) + ']')

# One lower-cased string per poem file, read in sorted filename order.
text = []
for file in sorted(os.listdir(poem_dir)):
    full_path = os.path.join(poem_dir, file)
    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(full_path):
        continue
    # 'utf-8-sig' drops a leading byte-order mark if the file has one.
    with open(full_path, encoding='utf-8-sig') as f:
        data = f.read()
    # Surround each punctuation character with spaces so it becomes its own token.
    data = punctuation_pattern.sub(r' \g<0> ', data)
    text.append(data.lower())

# Fail with a readable message instead of a bare IndexError on text[0].
if not text:
    raise FileNotFoundError("no poem files found in " + repr(poem_dir))

print(text[0])

In [None]:
# Build the vocabulary from every loaded poem.
tokenizer.fit_on_texts(text)

# Vocabulary size used by the later cells: one slot per entry in word_index
# plus one extra (Keras never assigns index 0 to a word).
total_words = 1 + len(tokenizer.word_index)
print(total_words)

In [None]:
max_sequence_len = 30
input_sequences = []

for poem in text:
    poem_tokens = tokenizer.texts_to_sequences([poem])[0]
    # One training window per position `last` (starting at the second token):
    # the tokens up to and including `last`, capped at the most recent
    # `max_sequence_len` tokens.
    input_sequences.extend(
        poem_tokens[max(0, last - max_sequence_len + 1):last + 1]
        for last in range(1, len(poem_tokens))
    )

# Left-pad shorter windows so every row has exactly `max_sequence_len` entries.
pad_input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Quick look: one un-padded window next to the first few raw words of poem 0.
print(input_sequences[10])
print(text[0].split(' ')[:10])

In [None]:
# Every column except the last is the model input ...
X = pad_input_sequences[:, :-1]
# ... and the last column is the token id the model should predict.
labels = pad_input_sequences[:, -1]
# Expand the target ids into categorical vectors with one class per vocabulary slot.
y = to_categorical(labels, num_classes=total_words)

In [None]:
# Build the model through the project's LSTMModel wrapper, sized by the
# vocabulary and by the window length used when building the sequences.
# NOTE(review): output_dim=1000 is presumably the embedding width — confirm
# against models.LSTMModel.create.
model = LSTMModel().create(
    total_words=total_words,
    output_dim=1000,
    max_sequence_len=max_sequence_len,
)

In [None]:
# Train on the (input window, categorical next-token) pairs and keep the
# returned object; the plotting cell reads its `.history` mapping.
# NOTE(review): train() comes from the project's LSTMModel wrapper —
# presumably it forwards to Keras fit(); confirm in models.py.
history = model.train(X, y, epochs=10, batch_size=64)

In [None]:
# Accuracy and loss are different quantities, so each gets its own panel and
# y-axis label instead of sharing a single axis titled and labelled "accuracy".
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

acc_ax.plot(history.history['accuracy'], label="accuracy")
acc_ax.set_title('model accuracy')
acc_ax.set_xlabel('epoch')
acc_ax.set_ylabel('accuracy')
acc_ax.legend()

loss_ax.plot(history.history['loss'], label="loss")
loss_ax.set_title('model loss')
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Persist the trained model. No path is passed here, so the destination is
# whatever LSTMModel.save() uses by default.
# NOTE(review): confirm the output location in models.py.
model.save()

In [None]:
seed = "Oh dear"
next_words = 100

# The fitted tokenizer already stores the inverse vocabulary (index -> word),
# so each generated word is a dictionary lookup rather than a scan over every
# entry of word_index.
index_to_word = tokenizer.index_word

for _ in range(next_words):
    # Re-encode the text generated so far and left-pad it to the input width
    # the model was trained on (one less than the training window, because the
    # last column was split off as the target).
    token_list = tokenizer.texts_to_sequences([seed])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # NOTE(review): assumes model.predict(...)[0] is a single vocabulary index,
    # which is what the previous equality scan against word_index values relied on.
    predicted = model.predict(token_list)[0]
    # .item() turns a NumPy scalar / one-element array into a plain Python
    # number usable as a dict key; unknown indices fall back to "" as before.
    output_word = index_to_word.get(np.asarray(predicted).item(), "")
    seed = " ".join([seed, output_word])
print(seed)