In [None]:
# Part 1
# - Process Text
# - Clean Text
# - Tokenize the text and create sequences with Keras
def read_files(filepath, encoding=None):
    """Read a whole text file into a single string.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (platform default encoding); pass
            e.g. ``'utf-8'`` to make the read independent of the platform.

    Returns:
        The complete file contents as one ``str``.
    """
    # The context manager closes the file even if reading raises.
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
import spacy

In [None]:
# Load the small English pipeline by its full package name: the 'en' shortcut
# link only exists in spaCy 2.x and was removed in spaCy 3. The parser,
# tagger and NER components are disabled because this notebook only reads
# token.text from the pipeline output.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [None]:
# Raise spaCy's per-document size limit so the whole input file can be passed
# to nlp() in one call. NOTE(review): spaCy documents max_length as a
# character count, not a word count — confirm 1198623 covers the file.
nlp.max_length = 1198623

In [None]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with spaCy and return the lower-cased tokens.

    A token is dropped when its text occurs as a substring of the
    punctuation/whitespace string below; every other token is kept,
    lower-cased, in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        # Substring test against the whole string, not a per-character check.
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [None]:
# Load the raw corpus as one string (four chapters of Moby Dick, judging by
# the file name).
d = read_files('moby_dick_four_chapters.txt')

In [None]:
# Tokenize the corpus, lower-casing tokens and dropping punctuation/whitespace.
tokens = separate_punc(d)

In [None]:
# Number of tokens kept after filtering.
len(tokens)

In [None]:
# Feed 25 words of context --> the network predicts word #26

In [None]:
# Each training example is a window of 25 context tokens plus the token that
# follows them (the prediction target), i.e. 26 tokens in total.
train_len = 25 + 1

# Slide the window one token at a time over the corpus. The upper bound is
# len(tokens) + 1 so that the final window, the one ending on the last token,
# is included; a bound of len(tokens) would stop one window short.
text_sequences = [
    tokens[end - train_len : end]
    for end in range(train_len, len(tokens) + 1)
]

In [None]:
# First training window, joined back into readable text.
' '.join(text_sequences[0])

In [None]:
# Second window: notice it is shifted by just one token relative to the first
' '.join(text_sequences[1])

In [None]:
# Third window: again shifted by just one token relative to the previous one
' '.join(text_sequences[2])

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer on the token windows to build the word <-> integer id
# vocabulary used below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [None]:
# Encode every window of word tokens as a list of integer ids
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Print each integer id of the first encoded window next to the word it maps
# back to, as a sanity check of the tokenizer's vocabulary.
for token_id in sequences[0]:
    print(f'{token_id}: {tokenizer.index_word[token_id]}')

In [None]:
# Number of distinct words the tokenizer counted while fitting.
vocabulary_size = len(tokenizer.word_counts)

In [None]:
# Display the vocabulary size.
vocabulary_size

In [None]:
import numpy as np

In [None]:
# Convert the list of encoded windows into a NumPy array so it can be sliced
# by column when splitting features from labels.
sequences = np.array(sequences)

In [None]:
# Inspect the encoded array.
sequences

In [None]:
# Part 2
# - Create the LSTM Based Model
# - Split the Data into Features and Labels
# - X Features (First n words of sequence)
# - Y Label (Next word after the sequence)
# - Fit the Model

In [None]:
from keras.utils import to_categorical

In [None]:
# Features: all columns except the last one, i.e. the context words of each
# window
X = sequences[:,:-1]

In [None]:
# Labels: only the last column, i.e. the word that follows the context
y = sequences[:, -1]

In [None]:
# One-hot encode the labels. num_classes is vocabulary_size + 1 — presumably
# because Keras Tokenizer ids start at 1, leaving index 0 unused (TODO confirm).
y = to_categorical(y, num_classes=vocabulary_size + 1)

In [None]:
# Number of context tokens per example (the column count of X).
seq_len = X.shape[1]

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [None]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarize the next-word prediction network.

    Args:
        vocabulary_size: Size of the embedding's input vocabulary and width
            of the softmax output layer.
        seq_len: Number of tokens in each input sequence; also used as the
            embedding output dimension.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Embedding -> two stacked LSTMs -> dense hidden layer -> softmax over
    # the vocabulary.
    layer_stack = (
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # return_sequences=True so the second LSTM receives the full sequence.
        LSTM(50, return_sequences=True),
        LSTM(50),
        Dense(50, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Printing the layer overview is a side effect of building the model.
    model.summary()

    return model

In [None]:
# Build the network; the + 1 matches the num_classes used for the one-hot
# labels.
model = create_model(vocabulary_size + 1, seq_len)

In [None]:
from pickle import dump, load

In [None]:
# Train for 2 epochs in mini-batches of 128 with verbose=1 logging.
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

In [None]:
# Save the trained model to 'my_mobydick_model.h5'.
model.save('my_mobydick_model.h5')

In [None]:
# Pickle the fitted tokenizer so the same word <-> id mapping can be restored
# later. The context manager guarantees the file is flushed and closed once
# the pickle has been written.
with open('my_simpletokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)