In [1]:
!pip install nltk --upgrade

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
[K     |████████████████████████████████| 1.5MB 2.8MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-cp36-none-any.whl size=1449908 sha256=e104bdaf44c704464bf19f63a1daf24fe93b7d180bd7a8041705090c16535b02
  Stored in directory: /root/.cache/pip/wheels/96/86/f6/68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.4.5


In [0]:
#loading dataset
import pickle 
trainers=open('/content/traindump.txt','rb')
trainlist=pickle.load(trainers)
testers=open('/content/testdump.txt','rb')
testlist=pickle.load(testers)

In [0]:
#creating training tokens
tokens=[]
for i in trainlist:
  tokens.extend(i.split())
for i in testlist:
  tokens.extend(i.split())

In [0]:
#the training data will have a sequence of words followed by target word. We create sequences here of length 60 and 1 target word.
length = 51
sentences = []
for i in range(length, len(tokens)):
  sequences = tokens[i-length:i]
  line = ' '.join(sequences)
  sentences.append(line)
  #print(line)


In [0]:
import keras

In [0]:
#Class for vectorizing texts, or/and turning texts into sequences (=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
tokenizer=keras.preprocessing.text.Tokenizer()

In [22]:
sentences

["<s> speech thank you so much </s> <s> that's so nice </s> <s> isn't he a great guy </s> <s> he doesn't get a fair press he doesn't get it </s> <s> it's just not fair </s> <s> and i have to tell you i'm here and very strongly here because",
 "speech thank you so much </s> <s> that's so nice </s> <s> isn't he a great guy </s> <s> he doesn't get a fair press he doesn't get it </s> <s> it's just not fair </s> <s> and i have to tell you i'm here and very strongly here because i",
 "thank you so much </s> <s> that's so nice </s> <s> isn't he a great guy </s> <s> he doesn't get a fair press he doesn't get it </s> <s> it's just not fair </s> <s> and i have to tell you i'm here and very strongly here because i have",
 "you so much </s> <s> that's so nice </s> <s> isn't he a great guy </s> <s> he doesn't get a fair press he doesn't get it </s> <s> it's just not fair </s> <s> and i have to tell you i'm here and very strongly here because i have great",
 "so much </s> <s> that's so nice </s> <s>

In [0]:
#creates a sort of word to index mapping. Then maps back the words to sequence indexes
tokenizer.fit_on_texts(sentences)
sentences = tokenizer.texts_to_sequences(sentences)

In [0]:
#0 is reserved for padding. So we need to add 1 for the embedding to map from 1 to vocab_size.
vocab_size = len(tokenizer.word_index) + 1

In [0]:
#creating the input sequence and the target word.
from numpy import array
from keras.utils import to_categorical
sentences = array(sentences)
X, y = sentences[:,:-1], sentences[:,-1]  # X is input sequences of words, y is target word
y = to_categorical(y, num_classes=vocab_size) #allocate index to word 
sent_length = X.shape[1]       

In [26]:
#creating the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Embedding
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=sent_length))  
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            303500    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 50, 100)           15100     
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 6070)              613070    
Total params: 961,870
Trainable params: 961,870
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #categorical crossentropy as there are multiple classes of words to predict
# fit model
model.fit(X, y, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f84fad085f8>

In [0]:
model.save('/content/model.h5')
# save the tokenizer
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [0]:
#function for generating output
def create_output(model, tokenizer, seqlength, seed, number_words):
	gen = []
	text_input = seed
	for i in range(number_words):
		encoded = tokenizer.texts_to_sequences([text_input])[0]
		encoded = pad_sequences([encoded], maxlen=seqlength, truncating='pre')
		yhat = model.predict_classes(encoded, verbose=0)
		output = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				output = word
				break
		text_input += ' ' + output
		gen.append(output)
	return ' '.join(gen)

In [0]:
from keras.preprocessing.sequence import pad_sequences

In [0]:
length = 51
sent = []
for i in range(length, len(tokens)):
  sequences = tokens[i-length:i]
  line = ' '.join(sequences)
  sent.append(line)
  #print(line)

In [0]:
#omitting last word for prediction task
seqlength = len(sent[0].split()) - 1

In [0]:
#selecting random sentence for prediction
from random import randint
seed = sent[randint(0,len(sent))]

In [45]:
output_sentences = create_output(model, tokenizer,seqlength, seed, 12)
print(output_sentences)

s s i mean i dont know s s i


In [0]:
#The output is not very coherent, but better than ngrams. This is because the number of epochs is less, and the dataset size also is less. Increasing the number of epochs can solve the issue of coherency.  Also, RNNs are not very efficient in handling long term contexts due to vanishing gradients. 