In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def load_text_to_id(filename):
    """Read a UTF-8 text file and encode it as a list of integer word ids.

    Every ``"\\n"`` is replaced by the sentence-boundary token ``<eos>`` and
    the text is then split on whitespace. The vocabulary is the sorted set of
    distinct tokens, numbered from 0.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(word_ids, vocab)`` tuple. ``word_ids`` holds the id of every
        token in file order; ``vocab`` is a dict mapping token -> id. Ids run
        from 0 to ``len(vocab) - 1``; the value ``len(vocab)`` is never
        produced here, so callers may reserve it for out-of-vocabulary words.
    """
    # Local import keeps this cell self-contained. io.open decodes while
    # reading and behaves the same on Python 2 and 3, whereas calling
    # ``.decode('utf8')`` on the result of ``f.read()`` only works on
    # Python 2 byte strings.
    import io

    # newline='' disables newline translation, so only a literal "\n" marks a
    # sentence end (a stray "\r" is just whitespace for split()).
    with io.open(filename, encoding='utf8', newline='') as f:
        words = f.read().replace("\n", " <eos> ").split()

    # Sorting makes the token -> id assignment deterministic across runs.
    vocab = {word: idx for idx, word in enumerate(sorted(set(words)))}

    # Every token is in the vocabulary by construction, so no fallback id is
    # needed when encoding the same text.
    return [vocab[w] for w in words], vocab

In [3]:
# Encode the corpus once: `word_ids` is the token-id sequence in file order and
# `vocab` maps each token to its id.
word_ids, vocab = load_text_to_id('raw_sentences.txt')

In [4]:
def get_model(n_steps, dim_input, dim_hidden, batch_size, vocab_size):
    """Build the TensorFlow graph of a single-layer LSTM language model.

    Word ids are embedded, fed through one ``BasicLSTMCell`` unrolled for
    ``n_steps`` time steps, and projected to ``vocab_size`` logits per
    position. The cost is the per-position loss summed over every position
    and divided by ``batch_size``; it is minimised with RMSProp.

    Args:
        n_steps: Number of time steps the network is unrolled for (second
            dimension of the input/target placeholders).
        dim_input: NOTE(review): not referenced anywhere in this function --
            the embedding width is ``dim_hidden``. Confirm whether it was
            meant to size the embedding.
        dim_hidden: Number of LSTM units; also used as the embedding width.
        batch_size: Fixed number of sequences per batch (first dimension of
            the placeholders).
        vocab_size: Number of distinct word ids: rows of the embedding matrix
            and width of the logits.

    Returns:
        A dict of graph handles: ``'train'`` (the RMSProp update op),
        ``'final_state'`` (state returned by ``tf.nn.rnn``), ``'cost'``
        (scalar), ``'logits'`` (``[batch_size * n_steps, vocab_size]``), and
        ``'input'`` / ``'target'`` (int32 placeholders of shape
        ``[batch_size, n_steps]``).
    """
    # Both placeholders have a fully static shape, so every fed batch must be
    # exactly [batch_size, n_steps].
    input_data = tf.placeholder('int32', [batch_size, n_steps])
    targets = tf.placeholder('int32', [batch_size, n_steps])

    # NOTE(review): '/gpu:0' is hard-coded. The cell's weights are presumably
    # created when tf.nn.rnn calls the cell further down, i.e. outside this
    # device scope -- confirm where they actually get placed.
    with tf.device('/gpu:0'):
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(dim_hidden)
        # Disabled experiments: output dropout and a 3-layer stack.
        # lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=0.5)
        # lstm_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * 3)
        # All-zero starting state; it is baked into the graph, so every run
        # starts from zeros unless the state tensors are fed explicitly.
        initial_state = lstm_cell.zero_state(batch_size, 'float32')

    # The embedding matrix is pinned to the CPU; one row per word id.
    with tf.device("/cpu:0"):
        embedding = tf.Variable(tf.random_normal([vocab_size, dim_hidden]))
    # [batch_size, n_steps] ids -> [batch_size, n_steps, dim_hidden] vectors.
    inputs = tf.nn.embedding_lookup(embedding, input_data)
    # Disabled experiment: dropout on the embedded inputs.
    # inputs = tf.nn.dropout(inputs, 0.5)
    # Pre-1.0 argument order (axis first): split along the time axis into
    # n_steps pieces and drop that axis, giving a list of n_steps tensors of
    # shape [batch_size, dim_hidden], as tf.nn.rnn takes a list of inputs.
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, n_steps, inputs)]

    outputs, state = tf.nn.rnn(lstm_cell, inputs, initial_state=initial_state)
    # Concatenating the per-step outputs along axis 1 and reshaping yields one
    # row per (sequence, step) in the same sequence-major order that
    # tf.reshape(targets, [-1]) produces below.
    output = tf.reshape(tf.concat(1, outputs), [-1, dim_hidden])
    with tf.device('/gpu:0'):
        # Output projection from hidden state to vocabulary logits.
        Wy = tf.Variable(tf.random_normal([dim_hidden, vocab_size]))
        by = tf.Variable(tf.random_normal([vocab_size]))
        logits = tf.matmul(output, Wy) + by
        # Per-position loss with a weight of 1.0 for every position.
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(targets, [-1])],
            [tf.ones([batch_size * n_steps], dtype='float32')])
        # Summed over all positions, then divided by the number of sequences
        # (not by the number of positions).
        cost = tf.reduce_sum(loss) / batch_size

    final_state = state
    # RMSProp with learning rate 0.001 and decay 0.9.
    train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
    return {'train': train_op, 'final_state': final_state, 'cost': cost,
            'logits': logits, 'input': input_data, 'target': targets}

In [5]:
def batch(word_ids, batch_size, n_steps):
    """Yield ``(x, y)`` training windows cut from a flat sequence of word ids.

    The sequence is truncated to a multiple of ``batch_size`` and laid out as
    ``batch_size`` rows of consecutive tokens. Each yielded ``x`` is a
    ``[batch_size, n_steps]`` slice of that layout and ``y`` is the same slice
    shifted one column to the right (the next-token targets).

    Args:
        word_ids: Flat sequence of integer word ids.
        batch_size: Number of rows the sequence is folded into.
        n_steps: Width of every window, in tokens.

    Yields:
        Tuples ``(x, y)`` of numpy arrays, both ``[batch_size, n_steps]``.
        Only windows whose shifted targets fit completely are produced, so a
        window ending exactly on the last column is skipped.
    """
    tokens = np.array(word_ids)
    n_cols = len(tokens) // batch_size
    # Drop the tail that does not fill a whole column, then fold row-wise.
    grid = tokens[:n_cols * batch_size].reshape([batch_size, n_cols])
    # A window starting at `left` needs columns up to left + n_steps for the
    # targets, hence the strict upper bound n_cols - n_steps.
    for left in range(0, n_cols - n_steps, n_steps):
        right = left + n_steps
        yield grid[:, left:right], grid[:, left + 1:right + 1]

In [6]:
# Build the training graph. vocab_size is len(vocab) + 1 so that one id beyond
# the known vocabulary (index len(vocab)) stays available for unknown words.
model = get_model(
    n_steps=30,
    dim_input=20,
    dim_hidden=30,
    batch_size=128,
    vocab_size=len(vocab) + 1,
)

In [7]:
# Train for 20 epochs, print the cost of the last batch after each epoch, then
# checkpoint all variables.
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    X = model['input']
    Y = model['target']
    # Read the batch geometry from the placeholder's static shape instead of
    # repeating the literals used when the model was built, so the feeds
    # cannot drift out of sync with the graph.
    batch_size, n_steps = X.get_shape().as_list()
    for epoch in range(20):
        last_feed = None
        for x, y in batch(word_ids, batch_size=batch_size, n_steps=n_steps):
            last_feed = {X: x, Y: y}
            sess.run(model['train'], feed_dict=last_feed)
        if last_feed is None:
            # Previously this surfaced as a NameError on `x` one line below.
            raise ValueError('corpus is too short to form a single batch')
        # Cost of the final batch of the epoch, evaluated after its update
        # (not an average over the epoch).
        c = sess.run(model['cost'], feed_dict=last_feed)
        # %-formatting prints the same "epoch cost" text on Python 2 and 3.
        print('%s %s' % (epoch, c))
    saver = tf.train.Saver(tf.global_variables())
    # NOTE(review): './' gives the checkpoint an empty file-name prefix;
    # consider a named prefix such as './model.ckpt'.
    saver.save(sess, './')

0 120.467
1 98.9053
2 91.9758
3 88.4087
4 86.2257
5 84.6999
6 83.5605
7 82.6666
8 81.9324
9 81.3187
10 80.796
11 80.3458
12 79.9511
13 79.6024
14 79.2947
15 79.0207
16 78.7766
17 78.5595
18 78.3646
19 78.1871
