In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Convolution1D, Flatten, MaxPooling1D, Embedding
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from nltk.tokenize import sent_tokenize
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
# load the text file into memory
file = open('../../Data/Books/sherlock.txt')
MAX_NB_WORDS = 8000
textSource = ['BOL ' + line + ' EOL' for line in sent_tokenize(file.read().lower().replace('\n', ' '))]
 

file.close()
print('corpus length:', len(textSource))

corpus length: 6925


In [3]:
textSource[0]

"BOL \ufeffproject gutenberg's the adventures of sherlock holmes, by arthur conan doyle  this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. EOL"

In [4]:
# tokenize words and convert word sequence to digit sequence
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(textSource)

sequences = tokenizer.texts_to_sequences(textSource)
word_index = tokenizer.word_index
vocab_size = len(word_index) +1

# build a reverse look up: index -> word
index_word = dict()
for k,v in word_index.items():
    if not v in index_word:
        index_word[v] = k
        
print('sentences count:', len(sequences))
print('vocab size:', vocab_size)

sentences count: 6925
vocab size: 8467


In [5]:
# build the training data by creating context-> next word paris. by shiftting x number of word per pair

min_length = 1
sentences = []
next_words = []
for i in range(0, len(sequences)):
    for j in range(min_length, len(sequences[i])-1): 
        sentences.append(sequences[i][0: j])
        next_words.append(sequences[i][j+1])

print('training sequences count:', len(sentences))
print('longest training sentence', len( max(sentences, key=len)))

training sequences count: 108593
longest training sentence 105


In [6]:
# build training data
X = pad_sequences(sentences)
y = np.zeros((len(sentences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sentences):
    y[i, next_words[i]] = 1

sent_length = len(X[0])

In [12]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=sent_length))
model.add(Convolution1D(filters=20, kernel_size=5, activation='relu', padding='valid'))
model.add(MaxPooling1D(3))
model.add(Convolution1D(filters=20, kernel_size=5, activation='relu', padding='valid'))
model.add(MaxPooling1D(3))
model.add(Convolution1D(filters=20, kernel_size=5, activation='relu', padding='valid'))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))
optimizer = RMSprop(lr=0.01)

model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [8]:
def sample(preds):
    preds = np.asarray(preds).astype('float64')
    exp_preds = np.exp(np.log(preds))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [9]:
def append_word(input, count, word_num):
    input[0][count] = word_num
    count += 1

In [10]:
def generate_sent(model):
    input = np.zeros((1, sent_length), dtype=np.float32)
    message = 'sherlock holmes'
    count =0
    append_word(input, count, word_index['bol'])
    append_word(input, count, word_index['sherlock'])
    append_word(input, count, word_index['holmes'])
    
    for i in range(0, 10):
        predict = model.predict(input)[0]
        next = sample(predict)

        input[0][count] = next
        count += 1
        
        message += ' '
        message += index_word[next]
        
        # start a new sentence when the current one ends
        if index_word[next] == 'eol':
            append_word(input, count, word_index['bol'])
            message += ' bol'

        print(message)

In [13]:
for iteration in range(1, 50):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=256, epochs=10)
    
    generate_sent(model)
model.save('StoryModelCNN.model')    


--------------------------------------------------
Iteration 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sherlock holmes it
sherlock holmes it will
sherlock holmes it will for
sherlock holmes it will for a
sherlock holmes it will for a faced
sherlock holmes it will for a faced he
sherlock holmes it will for a faced he follow
sherlock holmes it will for a faced he follow i
sherlock holmes it will for a faced he follow i in
sherlock holmes it will for a faced he follow i in looked

--------------------------------------------------
Iteration 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
sherlock holmes card
sherlock holmes card eol bol
sherlock holmes card eol bol way
sherlock holmes card eol bol way the
sherlock holmes card eol bol way the face
sherlock holmes card eol bol way the face into
sherlock holmes card eol bol way the face into really
sherlock