In [46]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.layers import Embedding
from nltk.tokenize import word_tokenize
import re

In [None]:
"""txt = re.sub('\,','',txt)
txt = re.sub('\?','',txt)
txt = re.sub('\(','',txt)
txt = re.sub('\)','',txt)
txt = re.sub('\:','',txt)
txt = re.sub('\;','',txt)
txt = re.sub('\'','',txt)
txt = re.sub('\!','',txt)
txt = re.sub('-','',txt)
txt = re.sub('\n','',txt)
txt = re.sub(' +',' ',txt)
"""


In [111]:
filename = "/home/user/Downloads/rnn-assignments/alice.txt"
txt = open(filename).read()
txt = txt.lower()
#txt = txt.replace(',.?!:;*',"")
txt = re.sub(r'[^\w\.]',' ',txt)

In [112]:
#tokenizing the input text
word_token = word_tokenize(txt)

In [113]:
# converting words to int
words = sorted(list(set(word_token)))
word_to_int = dict((w,i) for i,w in enumerate(words))

# reversed dict
reverse_dict = dict((word_to_int[i],i) for i in word_to_int)


In [114]:
# word and vocabulary count
n_words = len(word_token)
n_vocab = len(word_to_int)
print("Number of words : ",n_words)
print("Size of Vocabulary : ",n_vocab)

Number of words :  28326
Size of Vocabulary :  2575


In [115]:
# parameters
window = 100
epoch = 1
batch_size = 256

In [116]:
# intialize arrays
x=[]
y=[]

In [117]:
# generating dataset
for i in range(0, n_words - window, 1):
    seq_in = word_token[i:i + window]
    seq_out = word_token[i + window]
    x.append([word_to_int[word] for word in seq_in])
    y.append(word_to_int[seq_out])
n_patterns = len(x)
print ("Dataset size : ", n_patterns)

Dataset size :  28226


In [118]:
# Reshaping input
x = np.reshape(x,(n_patterns, window, 1))
y = np_utils.to_categorical(y)

In [119]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(n_vocab,100,input_length=window ))
model.add(LSTM(256, input_shape=(x.shape[1], x.shape[2]),return_sequences=True,implementation=2))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          257500    
_________________________________________________________________
lstm_10 (LSTM)               (None, 100, 256)          365568    
_________________________________________________________________
dropout_9 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2575)              332175    
Total params: 1,152,363
Trainable params: 1,152,363
Non-trainable params: 0
_________________________________________________________________


In [120]:
x = np.reshape(x,(n_patterns, window))
x.shape

(28226, 100)

In [121]:
model.fit(x[:15000], y[:15000], epochs=epoch, batch_size=batch_size, validation_split=0.2)

Train on 12000 samples, validate on 3000 samples
Epoch 1/1


<keras.callbacks.History at 0x7fcd853259e8>

In [122]:
ans = model.predict_classes(x[0:300])



In [123]:
# printing the original and predicted words
for i,j in zip(ans,range(0,200,1)):
    print("original sentence : ",word_token[j+95:j+100])
    print("predicted word : ",word_token[j+95:j+99]) 
    print(reverse_dict[i])
    print()

original sentence :  ['very', 'sleepy', 'and', 'stupid', 'whether']
predicted word :  ['very', 'sleepy', 'and', 'stupid']
the

original sentence :  ['sleepy', 'and', 'stupid', 'whether', 'the']
predicted word :  ['sleepy', 'and', 'stupid', 'whether']
the

original sentence :  ['and', 'stupid', 'whether', 'the', 'pleasure']
predicted word :  ['and', 'stupid', 'whether', 'the']
the

original sentence :  ['stupid', 'whether', 'the', 'pleasure', 'of']
predicted word :  ['stupid', 'whether', 'the', 'pleasure']
the

original sentence :  ['whether', 'the', 'pleasure', 'of', 'making']
predicted word :  ['whether', 'the', 'pleasure', 'of']
the

original sentence :  ['the', 'pleasure', 'of', 'making', 'a']
predicted word :  ['the', 'pleasure', 'of', 'making']
the

original sentence :  ['pleasure', 'of', 'making', 'a', 'daisy']
predicted word :  ['pleasure', 'of', 'making', 'a']
the

original sentence :  ['of', 'making', 'a', 'daisy', 'chain']
predicted word :  ['of', 'making', 'a', 'daisy']
the
