In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
# Fetch the corpus from Google Drive by file ID using the gdown CLI (IPython shell escape).
# Presumably the download is saved as ./sonnets.txt, which the loading cell opens — TODO confirm.
# NOTE(review): recent gdown releases accept a bare file ID and warn that `--id` is deprecated — confirm against the installed version.
!gdown --id 108jAePKK4R3BVYBbYJZ32JWUwxeMg20K

In [None]:
SONNETS_FILE = './sonnets.txt'

# Open the file through the SONNETS_FILE constant (previously the path was
# duplicated as a literal), so changing the constant is enough to point the
# notebook at a different corpus.
with open(SONNETS_FILE) as f:
    data = f.read()

# Lower-case the whole text and treat every newline-separated line as one sentence.
corpus = data.lower().split("\n")

print(f"There are {len(corpus)} lines of sonnets\n")
print("The first 5 lines look like this:\n")
# Slice instead of indexing 0..4 so a file with fewer than five lines
# prints what it has rather than raising IndexError.
print(*corpus[:5], sep="\n")

In [None]:
# Learn the word -> integer index mapping from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words: the Keras Tokenizer never assigns
# index 0 to a word, so that slot has to be counted as well.
total_words = 1 + len(tokenizer.word_index)

In [None]:
def n_gram_seqs(corpus, tokenizer):
    """Expand each line of text into all of its leading n-gram token sequences.

    Every line is converted to a list of tokens with
    ``tokenizer.texts_to_sequences``; for a line with tokens ``[t0, t1, ..., tk]``
    the prefixes ``[t0, t1]``, ``[t0, t1, t2]``, ..., ``[t0, ..., tk]`` are
    emitted in that order. Lines that yield fewer than two tokens contribute
    nothing.

    Args:
        corpus: Iterable of text lines.
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)`` that
            returns one token list per input text.

    Returns:
        A flat list of token lists, grouped by line in corpus order.
    """
    # Tokenize lazily, one line at a time, in corpus order.
    tokenized_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)

    # A prefix needs at least two tokens, so the shortest slice ends at index 2.
    return [
        tokens[:end]
        for tokens in tokenized_lines
        for end in range(2, len(tokens) + 1)
    ]

In [None]:
# Expand every corpus line into its n-gram prefixes.
input_sequences = n_gram_seqs(corpus, tokenizer)

# Length of the longest n-gram; the padding cell uses it as the common width.
max_sequence_len = max(map(len, input_sequences))

In [None]:
def pad_seqs(input_sequences, maxlen):
    """Pad a collection of token sequences to a common length.

    Args:
        input_sequences: Iterable of token lists, possibly of differing lengths.
        maxlen: Target length of every returned row.

    Returns:
        A NumPy array with one row per input sequence, each of length ``maxlen``.
    """
    # "pre" is already the Keras default; it is spelled out because the
    # label-splitting step reads the LAST column as the target word, which only
    # works when the padding goes in front of the tokens.
    return np.array(pad_sequences(input_sequences, maxlen=maxlen, padding="pre"))

In [None]:
# Turn the ragged list of n-grams into one array whose rows all have length max_sequence_len.
input_sequences = pad_seqs(input_sequences, maxlen=max_sequence_len)

In [1]:
def features_and_labels(input_sequences, total_words):
    """Split n-gram rows into model inputs and one-hot encoded targets.

    Args:
        input_sequences: Array indexed as ``[row, column]``; the final column of
            each row is taken as the label and the remaining columns as features.
        total_words: Number of classes used for the one-hot encoding.

    Returns:
        A ``(features, one_hot_labels)`` tuple, where ``features`` is
        ``input_sequences`` without its last column and ``one_hot_labels`` is the
        last column passed through ``to_categorical``.
    """
    # Everything but the final token is context; the final token is the word to predict.
    features, labels = input_sequences[:, :-1], input_sequences[:, -1]
    return features, to_categorical(labels, num_classes=total_words)

In [None]:
# Separate the padded n-grams into model inputs and one-hot next-word targets.
features, labels = features_and_labels(
    input_sequences=input_sequences,
    total_words=total_words,
)

In [None]:
def create_model(total_words, max_sequence_len):
    model = Sequential()
    model.add(Embedding(total_words, 100, input_length = max_sequence_len))
    model.add(Bidirectional(LSTM(20)))
    model.add(Dense(total_words, activation="softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])