In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def load_text_to_id(filename):
    with open(filename) as f:
        text = f.read().decode('utf8').replace(u'　', '').replace(u'\n', '')
        words = list(text)
    vocab = ['<unk>'] + sorted(list(set(words)))
    vocab = dict(zip(vocab, range(len(vocab))))
    word_ids = [vocab[w] if w in vocab else 0 for w in words]
    inv_vocab = np.array([x[1] for x in sorted(zip(vocab.values(), vocab.keys()))])
    return word_ids, vocab, inv_vocab

word_ids, vocab, inv_vocab = load_text_to_id('raw_novel.txt')

In [3]:
def batch(word_ids, batch_size, n_steps):
    word_ids = np.array(word_ids)
    batch_count = len(word_ids) // batch_size
    data = word_ids[:batch_count*batch_size].reshape([batch_size, batch_count])
    for end in range(n_steps, batch_count, 1):
        start = end - n_steps
        x = data[:, start:end]
        y = data[:, (start+1):(end+1)]
        yield x, y

In [4]:
def get_model(scope_name, n_steps, dim_input, dim_hidden, batch_size, vocab_size, n_layer=1):
    g = tf.Graph()
    with g.as_default():
        input_data = tf.placeholder('int32', [batch_size, n_steps])
        targets = tf.placeholder('int32', [batch_size, n_steps])
        p_keep = tf.placeholder_with_default(tf.constant(1.0), [])
        xavier = tf.contrib.layers.xavier_initializer()

        with tf.variable_scope(scope_name) as scope:
            with tf.device("/cpu:0"):
                try:
                    embedding = tf.get_variable('embedding', [vocab_size, dim_input], initializer=xavier)
                except ValueError:
                    scope.reuse_variables()
                    embedding = tf.get_variable('embedding', [vocab_size, dim_input], initializer=xavier)
                inputs = tf.nn.embedding_lookup(embedding, input_data)
                inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, n_steps, inputs)]

            with tf.device('/gpu:0'):
                cell = tf.nn.rnn_cell.GRUCell(dim_hidden)
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=p_keep)
                cell = tf.nn.rnn_cell.MultiRNNCell([cell] * n_layer)
                initial_state = cell.zero_state(batch_size, 'float32')

            outputs, state = tf.nn.rnn(cell, inputs, initial_state=initial_state)
            output = tf.reshape(tf.concat(1, outputs), [-1, dim_hidden])
            with tf.device('/gpu:0'):
                Wy = tf.get_variable('Wy', [dim_hidden, vocab_size], initializer=xavier)
                by = tf.get_variable('by', [vocab_size], initializer=xavier)
                logits = tf.matmul(output, Wy) + by
                probs = tf.nn.softmax(logits)
                loss = tf.nn.seq2seq.sequence_loss_by_example(
                    [logits], [tf.reshape(targets, [-1])],
                    [tf.ones([batch_size * n_steps], dtype='float32')], vocab_size)
                cost = tf.reduce_sum(loss) / batch_size / n_steps
                final_state = state
                train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)

    return {'train': train_op, 'final_state': final_state, 'cost': cost,
            'logits': logits, 'input': input_data, 'target': targets,
            'init_state': initial_state, 'cell': cell, 'p_keep': p_keep,
            'embedding': embedding, 'probs': probs, 'graph': g}

In [5]:
def train(n_steps, dim_input, dim_hidden, batch_size=128, n_layer=1, p_keep=1.0, final_train=False):
    batch_size = 128
    vocab_size = len(vocab)
    if final_train:
        scope_name = 'default'
    else:
        scope_name = 'rnn_{}_{}_{}_{}_{}'.format(n_layer, n_steps, dim_input, dim_hidden, batch_size)
    model = get_model(scope_name, n_steps, dim_input, dim_hidden, batch_size, vocab_size, n_layer)
    with model['graph'].as_default():
        with tf.Session() as sess:
            tf.global_variables_initializer().run()
            X = model['input']
            Y = model['target']
            dp = model['p_keep']
            last_cost = 100.0
            for epoch in range(100):
                for x, y in batch(word_ids[:100000], batch_size, n_steps):
                    sess.run(model['train'], feed_dict={X: x, Y: y, dp: p_keep})
                cost = []
                for x, y in batch(word_ids[100000:], batch_size, n_steps):
                    cost.append(sess.run(model['cost'], feed_dict={X: x, Y: y, dp: 1.0}))
                curr_cost = np.mean(cost)
                # print epoch, curr_cost
                if curr_cost > last_cost or abs(curr_cost - last_cost) < 0.01:
                    break
                last_cost = curr_cost
            saver = tf.train.Saver(tf.global_variables())
            saver.save(sess, './lstm_zh.checkpoint')
            return curr_cost

## Grid Search (dim_input, dim_hidden, n_steps)

1. number of dimensions of input
2. number of hidden units
3. number of RNN cells

```python
for dim_input in (5, 10, 15, 30, 50, 100):
    for dim_hidden in (5, 10, 15, 30, 50, 100):
        for n_steps in (20, 60, 100):
            c = train(n_steps, dim_input, dim_hidden)
            print 'dim_input={}, dim_hidden={}, n_steps={}, cost={}'.format(dim_input, dim_hidden, n_steps, c)
```
```
dim_input=5, dim_hidden=5, n_steps=20, cost=5.04910373688
dim_input=5, dim_hidden=5, n_steps=60, cost=5.00340223312
dim_input=5, dim_hidden=5, n_steps=100, cost=4.9361448288
dim_input=5, dim_hidden=10, n_steps=20, cost=4.67049598694
dim_input=5, dim_hidden=10, n_steps=60, cost=4.60688018799
dim_input=5, dim_hidden=10, n_steps=100, cost=4.68523263931
dim_input=5, dim_hidden=15, n_steps=20, cost=4.48330354691
dim_input=5, dim_hidden=15, n_steps=60, cost=4.536277771
dim_input=5, dim_hidden=15, n_steps=100, cost=4.51421165466
dim_input=5, dim_hidden=30, n_steps=20, cost=4.40719175339
dim_input=5, dim_hidden=30, n_steps=60, cost=4.37579870224
dim_input=5, dim_hidden=30, n_steps=100, cost=4.38251447678
dim_input=5, dim_hidden=50, n_steps=20, cost=4.39374637604
dim_input=5, dim_hidden=50, n_steps=60, cost=4.39796352386
dim_input=5, dim_hidden=50, n_steps=100, cost=4.43117761612
dim_input=5, dim_hidden=100, n_steps=20, cost=4.41315174103
dim_input=5, dim_hidden=100, n_steps=60, cost=4.48978328705
dim_input=5, dim_hidden=100, n_steps=100, cost=4.48827600479
dim_input=10, dim_hidden=5, n_steps=20, cost=4.9605050087
dim_input=10, dim_hidden=5, n_steps=60, cost=5.15566396713
dim_input=10, dim_hidden=5, n_steps=100, cost=5.3531627655
dim_input=10, dim_hidden=10, n_steps=20, cost=4.64183187485
dim_input=10, dim_hidden=10, n_steps=60, cost=4.64913988113
dim_input=10, dim_hidden=10, n_steps=100, cost=4.61222314835
dim_input=10, dim_hidden=15, n_steps=20, cost=4.41376447678
dim_input=10, dim_hidden=15, n_steps=60, cost=4.25506
dim_input=10, dim_hidden=15, n_steps=100, cost=4.28174
dim_input=10, dim_hidden=30, n_steps=20, cost=4.30813550949
dim_input=10, dim_hidden=30, n_steps=60, cost=4.34905338287
dim_input=10, dim_hidden=30, n_steps=100, cost=4.34371805191
dim_input=10, dim_hidden=50, n_steps=20, cost=4.47641849518
dim_input=10, dim_hidden=50, n_steps=60, cost=4.32932662964
dim_input=10, dim_hidden=50, n_steps=100, cost=4.32593250275
dim_input=10, dim_hidden=100, n_steps=20, cost=4.34707021713
dim_input=10, dim_hidden=100, n_steps=60, cost=4.38002204895
dim_input=10, dim_hidden=100, n_steps=100, cost=4.39789485931
dim_input=15, dim_hidden=5, n_steps=20, cost=4.97037410736
dim_input=15, dim_hidden=5, n_steps=60, cost=5.17431783676
dim_input=15, dim_hidden=5, n_steps=100, cost=5.06800937653
dim_input=15, dim_hidden=10, n_steps=20, cost=4.65017032623
dim_input=15, dim_hidden=10, n_steps=60, cost=4.64081430435
dim_input=15, dim_hidden=10, n_steps=100, cost=4.66324615479
dim_input=15, dim_hidden=15, n_steps=20, cost=4.46678543091
dim_input=15, dim_hidden=15, n_steps=60, cost=4.46504926682
dim_input=15, dim_hidden=15, n_steps=100, cost=4.48022937775
dim_input=15, dim_hidden=30, n_steps=20, cost=4.27884626389
dim_input=15, dim_hidden=30, n_steps=60, cost=4.30791282654
dim_input=15, dim_hidden=30, n_steps=100, cost=4.33599472046
dim_input=15, dim_hidden=50, n_steps=20, cost=4.28652429581
dim_input=15, dim_hidden=50, n_steps=60, cost=4.24734210968 <------- 1st best
dim_input=15, dim_hidden=50, n_steps=100, cost=4.29611968994
dim_input=15, dim_hidden=100, n_steps=20, cost=4.31619215012
dim_input=15, dim_hidden=100, n_steps=60, cost=4.37390041351
dim_input=15, dim_hidden=100, n_steps=100, cost=4.3488779068
dim_input=30, dim_hidden=15, n_steps=20, cost=4.48592424393
dim_input=30, dim_hidden=15, n_steps=60, cost=4.49895906448
dim_input=30, dim_hidden=15, n_steps=100, cost=4.50793647766
dim_input=30, dim_hidden=30, n_steps=20, cost=4.27560329437
dim_input=30, dim_hidden=30, n_steps=60, cost=4.33786582947
dim_input=30, dim_hidden=30, n_steps=100, cost=4.35293722153
dim_input=30, dim_hidden=50, n_steps=20, cost=4.32201719284
dim_input=30, dim_hidden=50, n_steps=60, cost=4.29839229584
dim_input=30, dim_hidden=50, n_steps=100, cost=4.31720638275
dim_input=30, dim_hidden=100, n_steps=20, cost=4.28649139404
dim_input=30, dim_hidden=100, n_steps=60, cost=4.31956863403
dim_input=30, dim_hidden=100, n_steps=100, cost=4.37884473801
dim_input=50, dim_hidden=5, n_steps=20, cost=5.22038412094
dim_input=50, dim_hidden=5, n_steps=60, cost=5.20120239258
dim_input=50, dim_hidden=5, n_steps=100, cost=5.26643514633
dim_input=50, dim_hidden=10, n_steps=20, cost=4.66747570038
dim_input=50, dim_hidden=10, n_steps=60, cost=4.66478967667
dim_input=50, dim_hidden=10, n_steps=100, cost=4.89870882034
dim_input=50, dim_hidden=15, n_steps=20, cost=4.53129148483
dim_input=50, dim_hidden=15, n_steps=60, cost=4.53673696518
dim_input=50, dim_hidden=15, n_steps=100, cost=4.58493995667
dim_input=50, dim_hidden=30, n_steps=20, cost=4.31741189957
dim_input=50, dim_hidden=30, n_steps=60, cost=4.31924819946
dim_input=50, dim_hidden=30, n_steps=100, cost=4.37507772446
dim_input=50, dim_hidden=50, n_steps=20, cost=4.26547574997
dim_input=50, dim_hidden=50, n_steps=60, cost=4.31258153915
dim_input=50, dim_hidden=50, n_steps=100, cost=4.33610963821
dim_input=50, dim_hidden=100, n_steps=20, cost=4.25190830231 <------- 2nd best
dim_input=50, dim_hidden=100, n_steps=60, cost=4.30959272385
dim_input=50, dim_hidden=100, n_steps=100, cost=4.45529270172
```

## Grid Search (n_layers, p_keep, n_batch)

1. number of RNN layers
2. probabilities of Dropout
3. number of batches

```python
for n_layer in (1, 2, 3):
    for p_keep in (0.5, 0.75, 1.0):
        c = train(20, 50, 100, batch_size=64, n_layer=n_layer, p_keep=p_keep)
        print '[20,50,100] n_layer={}, p_keep={}, cost={}'.format(n_layer, p_keep, c)
        c = train(60, 15, 50, batch_size=128, n_layer=n_layer, p_keep=p_keep)
        print '[60,15,50]  n_layer={}, p_keep={}, cost={}'.format(n_layer, p_keep, c)
```
```
[20,50,100] n_layer=1, p_keep=0.5, cost=4.1572804451
[60,15,50]  n_layer=1, p_keep=0.5, cost=4.17022037506
[20,50,100] n_layer=1, p_keep=0.75, cost=4.12266349792 <-------- best
[60,15,50]  n_layer=1, p_keep=0.75, cost=4.14526844025
[20,50,100] n_layer=1, p_keep=1.0, cost=4.26864719391
[60,15,50]  n_layer=1, p_keep=1.0, cost=4.24512195587
[20,50,100] n_layer=2, p_keep=0.5, cost=5.92346096039
[60,15,50]  n_layer=2, p_keep=0.5, cost=5.85321617126
[20,50,100] n_layer=2, p_keep=0.75, cost=5.9269361496
[60,15,50]  n_layer=2, p_keep=0.75, cost=5.8477640152
[20,50,100] n_layer=2, p_keep=1.0, cost=4.47007656097
[60,15,50]  n_layer=2, p_keep=1.0, cost=5.83855199814
[20,50,100] n_layer=3, p_keep=0.5, cost=5.91980266571
[60,15,50]  n_layer=3, p_keep=0.5, cost=5.85325241089
[20,50,100] n_layer=3, p_keep=0.75, cost=5.90755319595
[60,15,50]  n_layer=3, p_keep=0.75, cost=5.84755277634
[20,50,100] n_layer=3, p_keep=1.0, cost=4.5511302948
[60,15,50]  n_layer=3, p_keep=1.0, cost=5.83842325211
```

## Final Training by Best Hyper-parameters

In [6]:
# 當設 final_train=True 時，會將 variable scope 設為 'default' 以便後續使用
train(20, 50, 100, 64, 1, 0.75, final_train=True)

4.1330943