# Implementing LSTM Based Next Word Prediction Using Keras

In [93]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
import random

import matplotlib.pyplot as plt
%matplotlib inline

In [94]:
maxlen = 10
no_word = 'NO_WORD'
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
with open('reddit-comments-2015-08.csv') as f:
    reader = csv.reader(f,skipinitialspace=True)
    ## Split comments into sentences
    sentences  = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    sentences = ["%s %s %s"%(sentence_start_token,x,sentence_end_token) for x in sentences]
print "Parsed %d sentences"%(len(sentences))

Parsed 79171 sentences


In [95]:
## Tokenize the sentences into words
count = 0
tokenized_words = [nltk.word_tokenize(word) for word in sentences]
for l in tokenized_words:
    count += len(l)
print "Found %d number of words"%(count)

Found 1716192 number of words


In [96]:
## Counting the word frequencies in the word_tokenize
word_freq = nltk.FreqDist(itertools.chain(*tokenized_words))
print "Found %d unique word tokens"%(len(word_freq))

Found 65751 unique word tokens


In [97]:
## Hashing the most frequent words into the vocabulary
vocab = word_freq.most_common(vocabulary_size - 1)
print len(vocab)
vocab.insert(0,(u'NO_WORD',100000))
print len(vocab)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([w,i] for i,w in enumerate(index_to_word))
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times"%(vocab[-1][0],vocab[-1][1])

7999
8000
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times


In [98]:
len(vocab)
print vocab[0],vocab[7999]

(u'NO_WORD', 100000) (u'devoted', 10)


In [99]:
print max(word_to_index.values())

8000


In [100]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_words):
    tokenized_words[i] = [w if w in word_to_index else unknown_token for w in sent]

In [None]:
## Creating each sentence in the collection to be 10 words long
## I have split the sentences unknowingly thus making sentence incomplete
## This methos needs to be checked
text,next_word,maxlen = [],[],
for sent in tokenized_words:
    if len(sent) < maxlen:
        continue
    if len(sent) >= maxlen:
        val = sent[0:maxlen-1]
        val.append(sentence_end_token)
        text.append(val)
        next_word.append(sent[maxlen-1])
print text[0:10]
print "--------------------"
print next_word[0:10]

In [None]:
## Vectorizing each of the sentence into the matrix X
## Matrix y contains the next word prediction for the whole sentence (LSTM)
X = np.zeros((len(text),maxlen,vocabulary_size),dtype = np.bool)
y = np.zeros((len(text),vocabulary_size),dtype = np.bool)
for i,sent in enumerate(text):
    for t,word in enumerate(sent):
        X[i,t,word_to_index[word]] = 1
    y[i,word_to_index[next_word[i]]] = 1

In [None]:
print X[0:1]
print "---------------------------------------------------"
print y[0:1]

In [None]:
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,vocabulary_size)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
model.fit(X, y, batch_size=128, nb_epoch=10)

In [None]:
## Model needs to be trained on a GPU
## The output needs to be predicted by using a mixture of sentences

## Using Masking Rather Than To Slice The Input

In [101]:
## Run above 4 Scripts to run the script below
print tokenized_words[0:10]

[[u'SENTENCE_START', u'body', u'SENTENCE_END'], [u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END'], [u'SENTENCE_START', u'it', u"'s", u'a', u'slight', u'ppr', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', u'ppr', u'.', u'SENTENCE_END'], [u'SENTENCE_START', u'standard', u'besides', u'1', u'points', u'for', u'15', u'yards', u'receiving', u',', 'UNKNOWN_TOKEN', u'points', u'per', u'completion', u',', u'6', u'points', u'per', u'td', u'thrown', u',', u'and', u'some', u'bonuses', u'for', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', u'.', u'SENTENCE_END'], [u'SENTENCE_START', u'my', u'question', u'is', u',', u'is', u'it', u'wildly', u'clear', u'that', u'qb', u'has', u'the', u'highest', u'potential', u'for', u'points', u'?', u'SENTENCE_END'], [u'SENTENCE_START', u'i', u'put', u'in', u'the', u'rules', u'at', u'a', u'ranking', u'site', u'and', u'noticed', u'that', u'to

In [102]:
next_word = []
def mask(string):
    if len(string) < maxlen:
        ## Pad the string with no_word type of string
        next_word.append(string[-2])
        l = [no_word for i in range(maxlen - len(string) + 1)]
        string.remove(sentence_end_token)
        string[-1] = sentence_end_token
        string = l + string
    else:
        string = string[0:maxlen]
        next_word.append(string.pop(maxlen-1))
        string.append(sentence_end_token)
    return string

In [103]:
for i,sent in enumerate(tokenized_words):
    if len(sent) <= 3:
        tokenized_words.pop(i)
for i,sent in enumerate(tokenized_words):
    tokenized_words[i] = mask(tokenized_words[i])

In [104]:
print tokenized_words[0:10]
print next_word[0:10]

[[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', 'SENTENCE_END'], [u'SENTENCE_START', u'it', u"'s", u'a', u'slight', u'ppr', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', u'ppr', 'SENTENCE_END'], [u'SENTENCE_START', u'standard', u'besides', u'1', u'points', u'for', u'15', u'yards', u'receiving', 'SENTENCE_END'], [u'SENTENCE_START', u'my', u'question', u'is', u',', u'is', u'it', u'wildly', u'clear', 'SENTENCE_END'], [u'SENTENCE_START', u'i', u'put', u'in', u'the', u'rules', u'at', u'a', u'ranking', 'SENTENCE_END'], [u'SENTENCE_START', u'would', u'it', u'be', u'dumb', u'not', u'to', u'grab', u'a', 'SENTENCE_END'], [u'SENTENCE_START', u'in', u'your', u'scenario', u',', u'a', u'person', u'could', u'just', 'SENTENCE_END'], [u'SENTENCE_START', u'there', u"'s", u'no', u'way', u'to', u'enforce', u'it', u'.', 'SENTENCE_END'], [u'SENTENCE_START', u'an', u'honest', u'seller', u'is', u'going', u'to', u'not', u'sell', 'SENTENCE_END'], [u'SENTENCE_START', u'a', u'dishones

In [105]:
## Checking for the length of each word
print 'Maximum length of the word: '+str(maxlen)
for word in tokenized_words:
    if len(word) != 10:
        print len(word)

Maximum length of the word: 10


In [107]:
## Vectorizing each of the sentence into the matrix X
## Matrix y contains the next word prediction for the whole sentence (LSTM)
X = np.zeros((len(tokenized_words),maxlen,vocabulary_size+1),dtype = np.bool)
y = np.zeros((len(tokenized_words),vocabulary_size+1),dtype = np.bool)
for i,sent in enumerate(tokenized_words):
    for t,word in enumerate(sent):
        X[i,t,word_to_index[word]] = 1
    y[i,word_to_index[next_word[i]]] = 1

In [108]:
print X[0:1]
print "---------------------------------------------------"
print y[0:1]

[[[False  True False ..., False False False]
  [False False False ..., False False False]
  [False False False ..., False False False]
  ..., 
  [False False False ..., False False False]
  [False False False ..., False False False]
  [False False  True ..., False False False]]]
---------------------------------------------------
[[False False False ..., False False False]]


In [113]:
print np.shape(X)
print np.shape(y)

(78526, 10, 8001)
(78526, 8001)


In [117]:
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,vocabulary_size+1)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size+1))
model.add(Activation('softmax'))

Build model...


In [118]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [119]:
model.fit(X, y, batch_size=128, nb_epoch=10)

Epoch 1/10
 1152/78526 [..............................] - ETA: 1351s - loss: 7.0833

KeyboardInterrupt: 