In [2]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
file = open("file.txt").read()

In [5]:
def tokenize_words(s):
    # lowercase everything to standardize it
    s = s.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(s)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [8]:
processed_inputs = tokenize_words(file)

In [14]:
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [22]:
input_len = len(processed_inputs)
vocab_len = len(chars)
print('Total number of words:',len(processed_inputs.split()))
print ('Total number of characters:', len(processed_inputs))
print ('Total vocab:', len(chars))

Total number of words: 829
Total number of characters: 6257
Total vocab: 28


In [24]:
seq_length = 100
x_data = []
y_data = []
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [27]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 6157


In [30]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)
y = utils.to_categorical(y_data)

In [31]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [32]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [33]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [35]:
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks,verbose=-1)

Train on 6157 samples
Epoch 1/4

Epoch 00001: loss improved from inf to 2.98857, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.98857 to 2.93243, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.93243 to 2.92104, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.92104 to 2.92103, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f56ad283e10>

In [36]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [37]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [104]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")
# pattern

Random Seed:
" shared pride delight evening previous brought home mother said playfully pretty present victor tomor "


In [106]:
s=''
for i in range(20):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    print(index)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
#     print(result)
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [107]:
pattern

[0,
 6,
 23,
 6,
 15,
 10,
 15,
 8,
 0,
 17,
 19,
 6,
 23,
 10,
 16,
 22,
 20,
 0,
 3,
 19,
 16,
 22,
 8,
 9,
 21,
 0,
 9,
 16,
 14,
 6,
 0,
 14,
 16,
 21,
 9,
 6,
 19,
 0,
 20,
 2,
 10,
 5,
 0,
 17,
 13,
 2,
 26,
 7,
 22,
 13,
 13,
 26,
 0,
 17,
 19,
 6,
 21,
 21,
 26,
 0,
 17,
 19,
 6,
 20,
 6,
 15,
 21,
 0,
 23,
 10,
 4,
 21,
 16,
 19,
 0,
 21,
 16,
 14,
 16,
 19,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]