In [1]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

ModuleNotFoundError: No module named 'tensorflow'

#### Load and preprocess the text

In [None]:
path = '1661-0.txt'
# Read through a context manager so the file handle is closed as soon as the
# corpus is in memory, and pin the encoding so the result does not depend on
# the platform default (assumes the corpus file is UTF-8 -- confirm if the
# input file changes).
with open(path, encoding='utf-8') as corpus_file:
    # Lower-case everything so the vocabulary is case-insensitive.
    text = corpus_file.read().lower()
print('corpus length:', len(text))

#### Tokenize the text

In [None]:
# A token is any maximal run of word characters (the regex \w+); whatever
# lies between such runs (punctuation, whitespace) is not part of any token.
word_pattern = r'\w+'
tokenizer = RegexpTokenizer(word_pattern)
# Flat list of every token in the corpus, in reading order.
words = tokenizer.tokenize(text)

#### Get unique words and create word index

In [None]:
# Vocabulary: the distinct tokens of the corpus, as returned by np.unique.
unique_words = np.unique(words)
# Map every vocabulary word to its position in `unique_words`; that position
# is the word's column in the one-hot encodings built later.
unique_word_index = {word: position for position, word in enumerate(unique_words)}

#### Define sequence length

In [None]:
# Number of consecutive words used as the context for one prediction.
WORD_LENGTH = 5
# Parallel lists filled by the sequence-preparation cell: a context window
# and the word that follows it.
prev_words, next_words = [], []

#### Prepare input and output sequences

In [None]:
# Rebuild both lists from scratch instead of appending to lists created in an
# earlier cell: re-running this cell then produces the same data instead of
# duplicating every training sample.
n_windows = len(words) - WORD_LENGTH
# Each sample is a window of WORD_LENGTH consecutive words ...
prev_words = [words[start:start + WORD_LENGTH] for start in range(n_windows)]
# ... and its label is the word immediately after that window.
next_words = [words[start + WORD_LENGTH] for start in range(n_windows)]

# Show the first (context, target) pair as a sanity check.
print(prev_words[0])
print(next_words[0])

#### One-hot encoding the sequences

In [None]:
# X[sample, position, column] is True when the word at `position` of the
# sample's context window is the vocabulary word stored at `column`.
X = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
# Y[sample, column] is True for the vocabulary column of the sample's target word.
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
for sample_idx, context in enumerate(prev_words):
    for position, context_word in enumerate(context):
        X[sample_idx, position, unique_word_index[context_word]] = True
    target_word = next_words[sample_idx]
    Y[sample_idx, unique_word_index[target_word]] = True

# One-hot row of the first word of the first sample.
print(X[0][0])

#### Define the LSTM model

In [None]:
# Network: one 128-unit LSTM reading the (WORD_LENGTH, vocabulary) one-hot
# window, then a dense layer with one output per vocabulary word, turned
# into a probability distribution by softmax.
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))),
    Dense(len(unique_words)),
    Activation('softmax'),
])

#### Compile the model

In [None]:
# RMSprop with a fixed learning rate of 0.01.
optimizer = RMSprop(learning_rate=0.01)
# Categorical cross-entropy matches the one-hot targets in Y; accuracy is
# tracked so it can be plotted after training.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

#### Train the model

In [None]:
# Train for 2 epochs in shuffled batches of 128 samples, holding out 5% of
# the samples for validation.
training_run = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=2,
    validation_split=0.05,
    shuffle=True,
)
# Keep only the metrics record from the fit result; this is what gets
# pickled and plotted below.
history = training_run.history

#### Save the model and history

In [None]:
# Persist the trained network.
model.save('keras_next_word_model.h5')
# Write the training history through a context manager so the file is
# flushed and closed before any later cell reads it back.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

#### Load the model and history for future use

In [None]:
# Restore the network saved above.
model = load_model('keras_next_word_model.h5')
# NOTE: pickle.load can execute arbitrary code -- only load a history file
# that this notebook wrote itself, never one from an untrusted source.
# The context manager closes the file handle once the history is loaded.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

#### Plot model accuracy

In [None]:

# Training and validation accuracy curves from `history`, drawn in the same
# order as the legend entries below.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history[metric_name])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

#### Plot model loss

In [None]:
# Training and validation loss curves from `history`, drawn in the same
# order as the legend entries below.
for metric_name in ('loss', 'val_loss'):
    plt.plot(history[metric_name])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

#### Helper function to prepare the input for the model

In [None]:
def prepare_input(text):
    """One-hot encode the trailing words of `text` as a single model input.

    The text is lower-cased and tokenized with the notebook's `tokenizer`
    (the same one that produced `words`), so punctuation attached to a word
    does not make it miss the vocabulary. Only the last WORD_LENGTH tokens
    are encoded.

    Args:
        text: Prompt string.

    Returns:
        A float array of shape (1, WORD_LENGTH, len(unique_words)). Position
        t holds the one-hot vector of the t-th kept token. Tokens missing
        from `unique_word_index` leave their position all zeros, as do the
        trailing positions when the prompt has fewer than WORD_LENGTH tokens.
    """
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    # Keep only the most recent WORD_LENGTH tokens: a longer prompt would
    # otherwise index past the second axis of `x` and raise IndexError.
    context = tokenizer.tokenize(text.lower())[-WORD_LENGTH:]
    for t, word in enumerate(context):
        column = unique_word_index.get(word)
        if column is not None:
            x[0, t, column] = 1
    return x

#### Sampling function to choose top predictions

In [None]:
def sample(preds, top_n=3):
    """Return the indices of the `top_n` highest-scoring predictions.

    The scores are converted to float64, passed through log (with a small
    offset so a zero score does not hit log(0)) and exp, and renormalised
    to sum to one before ranking.

    Args:
        preds: Sequence or array of prediction scores.
        top_n: How many indices to return.

    Returns:
        A list of integer indices into `preds`, highest score first.
    """
    scores = np.asarray(preds).astype('float64')
    log_scores = np.log(scores + 1e-10)  # offset guards against log(0)
    weights = np.exp(log_scores)
    distribution = weights / np.sum(weights)
    candidates = range(len(distribution))
    return heapq.nlargest(top_n, candidates, key=distribution.take)

#### Dictionary to map indices back to words

In [None]:
# Inverse of `unique_word_index`: vocabulary column -> word. Despite the
# name, the values are whole words, not characters.
indices_char = {column: word for word, column in unique_word_index.items()}

#### Function to predict the completion of a text sequence

In [None]:
def predict_completion(text, max_words=5):
    """Greedily extend `text` with the model's most likely next words.

    At each step the current context is encoded with `prepare_input`, the
    single best-scoring word is chosen via `sample(..., top_n=1)`, and the
    context window slides forward by one word.

    Generation is bounded by `max_words`. The vocabulary is built from word
    tokens, so the model can never emit a ' ' token and a stop condition
    waiting for one would loop forever.

    Args:
        text: Whitespace-separated context words.
        max_words: Maximum number of words to generate (default 5).

    Returns:
        The generated words joined by single spaces; '' when
        `max_words` is not positive.
    """
    # Only the trailing WORD_LENGTH words fit the model input, so track just those.
    context = text.split()[-WORD_LENGTH:]
    generated = []
    for _ in range(max_words):
        x = prepare_input(' '.join(context))
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_word = indices_char[next_index]
        generated.append(next_word)
        # Slide the window: append the new word, keep the last WORD_LENGTH.
        context = (context + [next_word])[-WORD_LENGTH:]
    return ' '.join(generated)

#### Function to predict multiple completions

In [None]:
def predict_completions(text, n=3):
    """Return `n` alternative continuations of `text`.

    The `n` best-scoring next words are taken from the model's prediction
    for `text`. Each candidate is appended to the context, the context is
    passed to `predict_completion`, and the candidate word plus that
    continuation forms one result.

    Args:
        text: Whitespace-separated context words.
        n: Number of alternative continuations to return (default 3).

    Returns:
        A list of strings, best-scoring first word first, each being the
        candidate word followed by its predicted continuation.
    """
    # Only the trailing WORD_LENGTH words fit the model input.
    context_words = text.split()[-WORD_LENGTH:]
    x = prepare_input(' '.join(context_words))
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)

    completions = []
    for idx in next_indices:
        next_word = indices_char[idx]
        # Slide the context by one *word* (not one character) and keep the
        # words space-separated so the new word does not fuse with the
        # previous one.
        extended = ' '.join((context_words + [next_word])[-WORD_LENGTH:])
        continuation = predict_completion(extended)
        completions.append((next_word + ' ' + continuation).strip())
    return completions

#### Example quotes to test the model

In [None]:
# Sample sentences used as prompts for the trained model; the test cell
# below derives a lower-cased prompt from the start of each one.
quotes = [
    "It is not a lack of love, but a lack of friendship that makes unhappy marriages.",
    "That which does not kill us makes us stronger.",
    "I'm not upset that you lied to me, I'm upset that from now on I can't believe you.",
    "And those who were seen dancing were thought to be insane by those who could not hear the music.",
    "It is hard enough to remember my opinions, without also remembering my reasons for them!"
]

#### Test the model predictions

In [None]:
for q in quotes:
    # Build the prompt from whole words: tokenize the lower-cased quote the
    # same way as the corpus and keep the first WORD_LENGTH tokens. Slicing
    # by characters can cut a word in half and yield more words than the
    # (1, WORD_LENGTH, vocabulary) model input has positions for.
    seq = ' '.join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
    print(seq)
    print(predict_completions(seq, 5))
    print()