## RNN Word Prediction using LSTM model in Keras

Background on how LSTMs work: https://www.youtube.com/watch?v=xPotjBiIFFA

In [0]:
import numpy as np

# source text: the tiny two-sentence corpus that the tokenizer is fitted on
# and from which the next-word training pairs are built
data = """ We are learning RNNs in Data Mining and Analytics class. This class is informative but learning is an art on its own. \n """

In [2]:
# Fit a word -> integer vocabulary on the corpus, then encode the corpus with it.
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# one text goes in, so exactly one integer sequence comes back; unpack it
[encoded] = tokenizer.texts_to_sequences([data])

Using TensorFlow backend.


In [0]:
# display the integer-encoded corpus (one id per word, in text order)
encoded

[4, 5, 1, 6, 7, 8, 9, 10, 11, 2, 12, 2, 3, 13, 14, 1, 3, 15, 16, 17, 18, 19]

In [0]:
# determine the vocabulary size
# the word ids seen in `encoded` start at 1, so add 1 to leave room for id 0;
# this value sizes both the embedding table and the softmax output
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 20


In [0]:
# Build the training pairs: each entry is [word id, id of the word that follows it].
sequences = [encoded[i - 1:i + 1] for i in range(1, len(encoded))]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 21


In [0]:
# display the [current word id, next word id] training pairs
sequences

[[4, 5],
 [5, 1],
 [1, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 2],
 [2, 12],
 [12, 2],
 [2, 3],
 [3, 13],
 [13, 14],
 [14, 1],
 [1, 3],
 [3, 15],
 [15, 16],
 [16, 17],
 [17, 18],
 [18, 19]]

In [0]:
# Turn the pairs into a 2-column array: column 0 is the input word id,
# column 1 is the word id the model should predict.
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [0]:
# one hot encode outputs
from keras.utils import to_categorical

# Encode from the integer target column rather than from `y` itself:
# `y = to_categorical(y, ...)` is not idempotent, so re-running the cell would
# one-hot encode an array that is already one-hot.
y = to_categorical(np.asarray(sequences)[:, 1], num_classes=vocab_size)

In [0]:
# display the one-hot targets: one row per training pair, vocab_size columns
y

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.

In [0]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Next-word classifier: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# map each word id to a 10-dimensional vector; every sample is a single word (input_length=1)
model.add(Embedding(vocab_size, 10, input_length=1))
# 50 LSTM units
model.add(LSTM(50))
# one output probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             200       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1020      
Total params: 13,420
Trainable params: 13,420
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network; keep the returned History so the per-epoch loss/accuracy can be
# inspected later and the cell does not end by echoing the History object's repr
history = model.fit(X, y, epochs=500, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/500
 - 1s - loss: 2.9954 - acc: 0.0000e+00
Epoch 2/500
 - 0s - loss: 2.9948 - acc: 0.0952
Epoch 3/500
 - 0s - loss: 2.9941 - acc: 0.1429
Epoch 4/500
 - 0s - loss: 2.9933 - acc: 0.1429
Epoch 5/500
 - 0s - loss: 2.9925 - acc: 0.1429
Epoch 6/500
 - 0s - loss: 2.9917 - acc: 0.1905
Epoch 7/500
 - 0s - loss: 2.9908 - acc: 0.1905
Epoch 8/500
 - 0s - loss: 2.9900 - acc: 0.1905
Epoch 9/500
 - 0s - loss: 2.9891 - acc: 0.1905
Epoch 10/500
 - 0s - loss: 2.9882 - acc: 0.1905
Epoch 11/500
 - 0s - loss: 2.9873 - acc: 0.1905
Epoch 12/500
 - 0s - loss: 2.9864 - acc: 0.2381
Epoch 13/500
 - 0s - loss: 2.9855 - acc: 0.2381
Epoch 14/500
 - 0s - loss: 2.9846 - acc: 0.2381
Epoch 15/500
 - 0s - loss: 2.9836 - acc: 0.2381
Epoch 16/500
 - 0s - loss: 2.9826 - acc: 0.2381
Epoch 17/500
 - 0s - loss: 2.9816 - acc: 0.2381
Epoch 18/500
 - 0s - loss: 2.9806 - acc: 0.2381
Epoch 19/500
 - 0s - loss: 2.9796 - acc: 0.2381
Epoch 20/500
 - 0s - loss: 2.9785 - acc: 0.23

<keras.callbacks.History at 0x7fce7f6f0c88>

In [0]:
# evaluate: predict the word that follows a single seed word
in_text = 'We'
print('Current word: ', in_text)
# Use a dedicated name here: assigning to `encoded` would overwrite the encoded
# corpus that the sequence-building cell reads, breaking a later re-run of it.
seed_ids = np.array(tokenizer.texts_to_sequences([in_text])[0]).reshape(-1, 1)
# predict_classes() is gone from recent Keras releases; the argmax over the
# softmax output is the same class id, taken here as a plain int
yhat = int(np.argmax(model.predict(seed_ids, verbose=0), axis=-1)[0])
# invert word -> id so the predicted id can be looked up directly
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
if yhat in index_to_word:
	print('Next word: ', index_to_word[yhat])

Current word:  We
Next word:  are


In [0]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
	"""Extend ``seed_text`` by repeatedly predicting the next word.

	Each step encodes the current word, asks the model for the most probable
	next word id, maps that id back to a word, and then uses that word as the
	input of the following step.

	Args:
		model: object with ``predict(x, verbose=0)`` returning one row of class
			probabilities per input sample. Assumed to take one word id per
			sample, i.e. input of shape (1, 1) -- confirm against the model.
		tokenizer: object with ``texts_to_sequences(list_of_texts)`` and a
			``word_index`` mapping of word -> integer id.
		seed_text: text to start from. If it encodes to several ids, only the
			last one is fed to the model.
		n_words: maximum number of words to append.

	Returns:
		``seed_text`` followed by the generated words, separated by single
		spaces. Generation stops early when the current text contains no word
		known to the tokenizer, or when the predicted id has no word.
	"""
	# invert word -> id once instead of scanning word_index on every step
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integers
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		if len(encoded) == 0:
			# nothing the tokenizer knows, so there is nothing to predict from
			break
		# feed only the most recent known word, as a batch of one single-step sample
		model_input = np.array([[encoded[-1]]])
		# argmax over the class probabilities replaces the removed predict_classes()
		probabilities = model.predict(model_input, verbose=0)
		yhat = int(np.argmax(probabilities, axis=-1)[0])
		# map predicted word index to word
		out_word = index_to_word.get(yhat)
		if out_word is None:
			# an id with no word cannot seed the next step
			break
		# append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [0]:
# generate 12 more words starting from the seed word 'We'
print(generate_seq(model, tokenizer, 'We', 12))

We are learning is informative but learning is informative but learning is informative
