In [1]:
## Import Module Requirements:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential ## !! The Model being used
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
import numpy as np
import itertools
import random

In [2]:
## Step 1: Define Problem
#########################
## Problem: Build a next word predictor
## Goal: Predict the next word in a sequence
#####################################################
## Outputs for Measuring Quality of Model Validation:
### TRUE Positive: Validation Data Label = TRUE;  Machine Learning Label Output = TRUE 
## FALSE Positive: Validation Data Label = FALSE; Machine Learning Label Output = TRUE 
### TRUE Negative: Validation Data Label = FALSE; Machine Learning Label Output = FALSE 
## FALSE Negative: Validation Data Label = TRUE;  Machine Learning Label Output = FALSE 

In [3]:
## Step 2: Collect & Split Dataset
##################################
import text_preprocessor # Import Variables via Files from Directory
from text_preprocessor import training_text # Import Variables via Files from Directory
##################################
## Example text from Training Data
print(training_text[0:100])

['theory', 'and', 'practice', 'of', 'piano', 'construction', 'by', 'william', 'b', 'white', '--', 'a', 'project', 'gutenberg', 'ebook', 'the', 'project', 'gutenberg', 'ebook', 'of', 'theory', 'and', 'practice', 'of', 'piano', 'construction', 'by', 'william', 'b', 'white', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restriction', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'term', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorglicense', 'title', 'theory', 'and', 'practice', 'of', 'piano', 'construction', 'with', 'a', 'detailed', 'practical', 'method', 'for', 'tuning', 'author', 'william', 'b', 'white', 'release', 'date', 'june', 'ebook', 'language', 'english', 'character', 'set']


In [4]:
## Get {keyword:id} Pairs
keyword_id_dict = {}
keyword_id = 1
for keyword in range(len(training_text)):
    if training_text[keyword] not in keyword_id_dict:
        keyword_id_dict[training_text[keyword]] = keyword_id
        keyword_id += 1

print(dict(itertools.islice(keyword_id_dict.items(), 30)))

{'theory': 1, 'and': 2, 'practice': 3, 'of': 4, 'piano': 5, 'construction': 6, 'by': 7, 'william': 8, 'b': 9, 'white': 10, '--': 11, 'a': 12, 'project': 13, 'gutenberg': 14, 'ebook': 15, 'the': 16, 'this': 17, 'is': 18, 'for': 19, 'use': 20, 'anyone': 21, 'anywhere': 22, 'at': 23, 'no': 24, 'cost': 25, 'with': 26, 'almost': 27, 'restriction': 28, 'whatsoever': 29, 'you': 30}


In [5]:
## Get a list of Text Sequences
text_sequences = []
sequence_training_len = 4
for keyword in range(sequence_training_len, len(training_text)):
    sequence = training_text[keyword - sequence_training_len:keyword]
    text_sequences.append(sequence)

print(text_sequences[0:30])

[['theory', 'and', 'practice', 'of'], ['and', 'practice', 'of', 'piano'], ['practice', 'of', 'piano', 'construction'], ['of', 'piano', 'construction', 'by'], ['piano', 'construction', 'by', 'william'], ['construction', 'by', 'william', 'b'], ['by', 'william', 'b', 'white'], ['william', 'b', 'white', '--'], ['b', 'white', '--', 'a'], ['white', '--', 'a', 'project'], ['--', 'a', 'project', 'gutenberg'], ['a', 'project', 'gutenberg', 'ebook'], ['project', 'gutenberg', 'ebook', 'the'], ['gutenberg', 'ebook', 'the', 'project'], ['ebook', 'the', 'project', 'gutenberg'], ['the', 'project', 'gutenberg', 'ebook'], ['project', 'gutenberg', 'ebook', 'of'], ['gutenberg', 'ebook', 'of', 'theory'], ['ebook', 'of', 'theory', 'and'], ['of', 'theory', 'and', 'practice'], ['theory', 'and', 'practice', 'of'], ['and', 'practice', 'of', 'piano'], ['practice', 'of', 'piano', 'construction'], ['of', 'piano', 'construction', 'by'], ['piano', 'construction', 'by', 'william'], ['construction', 'by', 'william', 

In [6]:
## Convert list of Text Sequences into its numerical keyword,id Pair
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences) # Updates internal vocabulary based on a list of texts.
sequences = tokenizer.texts_to_sequences(text_sequences) # Transforms each text in texts to a sequence of integers.
print(sequences[0:50])

[[453, 4, 152, 2], [4, 152, 2, 168], [152, 2, 168, 77], [2, 168, 77, 14], [168, 77, 14, 992], [77, 14, 992, 121], [14, 992, 121, 746], [992, 121, 746, 162], [121, 746, 162, 5], [746, 162, 5, 80], [162, 5, 80, 273], [5, 80, 273, 644], [80, 273, 644, 1], [273, 644, 1, 80], [644, 1, 80, 273], [1, 80, 273, 644], [80, 273, 644, 2], [273, 644, 2, 453], [644, 2, 453, 4], [2, 453, 4, 152], [453, 4, 152, 2], [4, 152, 2, 168], [152, 2, 168, 77], [2, 168, 77, 14], [168, 77, 14, 992], [77, 14, 992, 121], [14, 992, 121, 746], [992, 121, 746, 12], [121, 746, 12, 644], [746, 12, 644, 6], [12, 644, 6, 16], [644, 6, 16, 1], [6, 16, 1, 163], [16, 1, 163, 2], [1, 163, 2, 747], [163, 2, 747, 1452], [2, 747, 1452, 24], [747, 1452, 24, 65], [1452, 24, 65, 748], [24, 65, 748, 4], [65, 748, 4, 15], [748, 4, 15, 645], [4, 15, 645, 65], [15, 645, 65, 1733], [645, 65, 1733, 2098], [65, 1733, 2098, 111], [1733, 2098, 111, 29], [2098, 111, 29, 432], [111, 29, 432, 9], [29, 432, 9, 122]]


In [7]:
## Get a list of empty sequences
n_sequences = np.empty([len(sequences), sequence_training_len], dtype='int32')
print(n_sequences)

[[          0           0    23689536           1]
 [     999081           0          -1          -1]
 [ 2002475924  2123511579  -394174497  1218405217]
 ...
 [-1711472021 -1339695325  -309812460 -1236975977]
 [ 1449989897  1643624775 -1309731435 -1063851751]
 [ 1099052232  -884821824  1580753708   858819792]]


In [8]:
## Prep the training data
for sequence in range(len(sequences)):
    n_sequences[sequence] = sequences[sequence]
train_inputs = n_sequences[:,:-1] # Gets every keyword in a sequence except the last one (the keywords leading up to the TARGET)
train_targets = n_sequences[:,-1] # Gets the last keyword in a sequence (the TARGET)
vocabulary_size = len(tokenizer.word_counts) + 1 # vocabulary_size increased by 1 because of Padding
train_targets = to_categorical(train_targets, num_classes = vocabulary_size) # Converts a class vector (integers) to binary class matrix
sequence_len = train_inputs.shape[1] # The number of keywords in a sequence from the training data

In [9]:
## Step 3: Train Model
#################
## Prep the Model
model = Sequential()
model.add(Embedding(vocabulary_size, sequence_len, input_length=sequence_len))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50, activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))

In [10]:
## Train the Model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy','mse']) # Compile defines the loss function, the optimizer and the metrics
model.fit(train_inputs, train_targets, epochs=50, verbose=1) ## Fit is used for training the model with the provided inputs and targets

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff081acd730>

In [11]:
## Step 4: Debug & Tune Model
## Debug & Tune Model: Validate Model using the Debugging Dataset
#### Review Machine Learning Label Output vs Debugging Dataset Label
#### IF inspired THEN fix issues (dataset, hyperparameters, etc.)
validation_text = 'if the tuned'
validation_text = validation_text.strip().lower()
print('First 3 words in a sequence:')
print(validation_text)
print('\n')

encoded_text = tokenizer.texts_to_sequences([validation_text])[0] ## Converts validation text to keyword_id
pad_encoded = pad_sequences([encoded_text], maxlen=sequence_len, truncating='pre') ## Converts keyword_ids to array

for keyword in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    predicted_next_word = tokenizer.index_word[keyword]
    print("Suggested next word: '{0}'".format(predicted_next_word))

First 3 words in a sequence:
if the tuned


Suggested next word: 'prop'
Suggested next word: 'describes'
Suggested next word: 'boring'
