In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import numpy as np

# Seed NumPy's global RNG so that draws made through np.random (such as the
# multinomial sampling used during text generation) repeat across runs.
# NOTE(review): whether this also fixes Keras weight initialisation depends on
# the backend/version in use -- confirm before relying on it.
np.random.seed(13)

Using Theano backend.


In [2]:
# Fetch the text (get_file is given the file name and the origin URL).
path = get_file('alice.txt', origin="http://www.gutenberg.org/cache/epub/11/pg11.txt")

# Keep only the first 50 lines as a tiny training corpus. The `with` block
# guarantees the file handle is closed instead of being leaked.
with open(path) as f:
    raw_lines = f.readlines()[0:50]

# Fit the vocabulary on the raw lines, then turn each line into a list of
# integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_lines)
doc = tokenizer.texts_to_sequences(raw_lines)

# A line needs at least two tokens to form one (prefix -> next word) pair.
doc = [l for l in doc if len(l) > 1]

# Total number of (prefix -> next word) training pairs across all lines.
words_size = sum([len(words) - 1 for words in doc])

In [3]:
# Size of the output layer / embedding table: one slot per known word plus one.
# NOTE(review): the extra slot is presumably for id 0 (padding) -- confirm
# against the Tokenizer's indexing.
vocab_size = len(tokenizer.word_index) + 1

# Longest prefix that can be fed to the model: the last word of a line is
# only ever a target, hence the -1.
maxlen = max(len(tokens) - 1 for tokens in doc)

In [4]:
def generate_data(X, maxlen, V):
    """Yield one (inputs, targets) training batch per sentence in ``X``.

    For a sentence ``[w0, w1, ..., wn]`` the batch contains every proper
    prefix ``[w0]``, ``[w0, w1]``, ... as an input row and the word that
    follows that prefix as the corresponding target.

    Args:
        X: iterable of sentences, each a list of integer word ids.
        maxlen: length passed to ``sequence.pad_sequences`` for every prefix.
        V: number of classes passed to ``np_utils.to_categorical``.

    Yields:
        tuple: ``(padded_prefixes, one_hot_targets)`` for one sentence.
    """
    for sentence in X:
        # Position `end` splits the sentence into prefix sentence[:end]
        # and the next word sentence[end].
        positions = range(1, len(sentence))
        prefixes = [sentence[:end] for end in positions]
        next_words = [sentence[end] for end in positions]

        padded_prefixes = sequence.pad_sequences(prefixes, maxlen=maxlen)
        one_hot_targets = np_utils.to_categorical(next_words, V)
        yield (padded_prefixes, one_hot_targets)


In [5]:
def sample(p):
    """Draw one index at random, with probability proportional to ``p``.

    Args:
        p: 1-D array-like of non-negative weights (for example a softmax
            output). It does not have to sum to exactly one; it is
            renormalised here. The caller's object is never modified.

    Returns:
        int: the sampled index into ``p``.

    Raises:
        ValueError: if the weights do not sum to a positive, finite number.
    """
    # Convert to float64 before normalising: this accepts lists and integer
    # arrays, and avoids float32 rounding pushing the total slightly above 1,
    # which np.random.multinomial rejects.
    probs = np.asarray(p, dtype=np.float64)
    total = probs.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("weights must sum to a positive, finite number")

    # Out-of-place division, so the array passed in by the caller is untouched.
    probs = probs / total

    # A single multinomial trial yields a one-hot vector; its argmax is the draw.
    return int(np.argmax(np.random.multinomial(1, probs)))

In [6]:
# Width shared by the embedding vectors and the LSTM state.
nb_units = 128

# Next-word predictor: word ids -> embeddings -> a single LSTM summary vector
# (return_sequences=False) -> softmax distribution over the vocabulary.
model = Sequential([
    Embedding(vocab_size, nb_units, input_length=maxlen),
    LSTM(nb_units, return_sequences=False),
    Dense(vocab_size),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [7]:
# Reverse vocabulary (word id -> word), built once so that each generated
# word is a dictionary lookup instead of a scan over the whole vocabulary.
index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

for i in range(30):
    # One pass over the corpus: every sentence contributes one batch.
    for x, y in generate_data(doc, maxlen, vocab_size):
        model.train_on_batch(x, y)

    # After each pass, generate text from a fixed seed word by repeatedly
    # sampling the next word from the model's predicted distribution.
    in_words = "alice's"
    for _ in range(maxlen):
        in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
        wordid = sample(model.predict(in_sequence)[0])
        # Ids with no vocabulary entry are skipped, exactly as before.
        word = index_to_word.get(wordid)
        if word is not None:
            in_words += " " + word

    print(i, in_words)

0 alice's on thought ' gutenberg's gutenberg's date release so reading well made up millennium was
1 alice's peeped what lewis 0 cost get by very 11 online this in cost by
2 alice's whatsoever 'without no included online www feel as of into but i license is
3 alice's making last worth fulcrum what whether whether whatsoever what ' use for may millennium
4 alice's book beginning well 3 considering thought and gutenberg's had her stupid 0 is 25
5 alice's march ' org a for the pictures 'without for i of english beginning down
6 alice's 2008 in lewis mind you no gutenberg's carroll she sleepy the mind would may
7 alice's in fulcrum rabbit the it updated chain alice's 2011 ' 25 considering carroll december
8 alice's restrictions lewis the author sleepy lewis had for as into english release had '
9 alice's the adventures december her i to get carroll 2008 of was sister peeped chain
10 alice's her terms the well by chain ebook she get be it updated it had
11 alice's the beginning use ' ' by w

In [8]:
# Reverse vocabulary (word id -> word), built once so that each generated
# word is a dictionary lookup instead of a scan over the whole vocabulary.
index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

in_words = "alice's"
for _ in range(maxlen):
    in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
    # Greedy decoding: always take the single most probable next word.
    # argmax over predict() replaces Sequential.predict_classes, which is
    # deprecated and not available in later Keras releases.
    wordid = int(np.argmax(model.predict(in_sequence, verbose=0)[0]))
    # Ids with no vocabulary entry are skipped, exactly as before.
    word = index_to_word.get(wordid)
    if word is not None:
        in_words += " " + word

print(in_words)

alice's adventures in wonderland by lewis carroll ' as ' the trouble of the the
