In [28]:
import os
import re

import numpy as np
from matplotlib import pyplot as plt
from keras.utils import pad_sequences, to_categorical
from keras.preprocessing.text import Tokenizer

from models import LSTMModel

In [33]:
to_exclude = '!"#$%&()*+-/:;<=>@[\\]^_`{|}~\t'
to_tokenize = '.,:;!?'
tokenizer = Tokenizer(filters=to_exclude)


In [43]:
poem_dir = "data/poems/forms/abc"
text = []
for file in os.listdir(poem_dir):
    full_path = os.path.join(poem_dir, file)
    with open(full_path, encoding='utf-8-sig') as f:
        data = f.read()
    data = re.sub(r'(['+to_tokenize+'])', r' \1 ', data)
    data = data.replace('\n', '\n!stop!')
    text.extend(data.lower().split('!stop!'))

print(text[:10])

['2 abc of h . k .  and china revised vision . \n', 'barrels tears are wines and salts . \n', 'with a whisk on goody tails ! \n', 'wiggle maces to fix the heads . \n', 'heads in jack on boxes are ceased . \n', 'cry to paranoid truly bosses . \n', 'bosses are jokers take your boys . \n', 'studs are bogs with fire apples . \n', 'true predicates worth cases . ’\n', 'descents wash in badly bands . \n']


In [44]:
tokenizer.fit_on_texts(text)
total_words = len(tokenizer.word_index) + 1
print(total_words)

6828


In [53]:
input_sequences = []
for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    # print(token_list)
    seq = []
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        seq.append(n_gram_sequence)
    input_sequences.extend(seq)


print(input_sequences[0])
print(text[0])

[898, 67]
2 abc of h . k .  and china revised vision . 



In [54]:
max_sequence_len = max([len(x) for x in input_sequences])
pad_input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [55]:
X, labels = pad_input_sequences[:, :-1], pad_input_sequences[:, -1]
y = to_categorical(labels, num_classes=total_words)

In [57]:
model = LSTMModel(total_words=total_words, max_sequence_len=max_sequence_len, load=False)

In [None]:
history = model.train(X, y, epochs=1000, batch_size=64)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
  2/277 [..............................] - ETA: 15s - loss: 5.6183 - accuracy: 0.1016

In [None]:
fig = plt.figure(figsize=(6, 6))
plt.plot((history.history['accuracy']))
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()


In [None]:
model.save()


In [None]:
seed = "Oh dear"
next_words = 40

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list)[0]
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    # output_word = tokenizer.word_index[predicted]
    seed = " ".join([seed, output_word])
print(seed)