In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def load_text_to_id(filename):
    with open(filename) as f:
        text = f.read().decode('utf8').replace(u'　', '').replace(u'\n', '')
        words = list(text)
    vocab = ['<unk>'] + sorted(list(set(words)))
    vocab = dict(zip(vocab, range(len(vocab))))
    word_ids = [vocab[w] if w in vocab else 0 for w in words]
    inv_vocab = np.array([x[1] for x in sorted(zip(vocab.values(), vocab.keys()))])
    return word_ids, vocab, inv_vocab

word_ids, vocab, inv_vocab = load_text_to_id('raw_novel.txt')

In [3]:
def batch(word_ids, batch_size, n_steps):
    word_ids = np.array(word_ids)
    batch_count = len(word_ids) // batch_size
    data = word_ids[:batch_count*batch_size].reshape([batch_size, batch_count])
    for end in range(n_steps, batch_count, 1):
        start = end - n_steps
        x = data[:, start:end]
        y = data[:, (start+1):(end+1)]
        yield x, y

In [4]:
def get_model(scope_name, n_steps, dim_input, dim_hidden, batch_size, vocab_size, n_layer=1):
    input_data = tf.placeholder('int32', [batch_size, n_steps])
    targets = tf.placeholder('int32', [batch_size, n_steps])
    p_keep = tf.placeholder_with_default(tf.constant(1.0), [])

    with tf.variable_scope(scope_name) as scope:
        with tf.device("/cpu:0"):
            try:
                embedding = tf.get_variable('embedding', [vocab_size, dim_input],
                    initializer=tf.contrib.layers.xavier_initializer())
            except ValueError:
                scope.reuse_variables()
                embedding = tf.get_variable('embedding', [vocab_size, dim_input],
                                            initializer=tf.contrib.layers.xavier_initializer())
            inputs = tf.nn.embedding_lookup(embedding, input_data)
            inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, n_steps, inputs)]

        with tf.device('/gpu:0'):
            cell = tf.nn.rnn_cell.GRUCell(dim_hidden)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, p_keep, p_keep)
            cell = tf.nn.rnn_cell.MultiRNNCell([cell] * n_layer)
            initial_state = cell.zero_state(batch_size, 'float32')

        outputs, state = tf.nn.rnn(cell, inputs, initial_state=initial_state)
        output = tf.reshape(tf.concat(1, outputs), [-1, dim_hidden])
        with tf.device('/gpu:0'):
            Wy = tf.get_variable('Wy', [dim_hidden, vocab_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            by = tf.get_variable('by', [vocab_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            logits = tf.matmul(output, Wy) + by
            probs = tf.nn.softmax(logits)
            loss = tf.nn.seq2seq.sequence_loss_by_example(
                [logits], [tf.reshape(targets, [-1])],
                [tf.ones([batch_size * n_steps], dtype='float32')], vocab_size)
            cost = tf.reduce_sum(loss) / batch_size / n_steps
            final_state = state
            train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)

    return {'train': train_op, 'final_state': final_state, 'cost': cost,
            'logits': logits, 'input': input_data, 'target': targets,
            'init_state': initial_state, 'cell': cell, 'p_keep': p_keep,
            'embedding': embedding, 'probs': probs}

In [7]:
def train(n_steps, dim_input, dim_hidden, batch_size=128, n_layer=1, p_keep=1.0, final_train=False):
    batch_size = 128
    vocab_size = len(vocab)
    if final_train:
        scope_name = 'default'
    else:
        scope_name = 'rnn_{}_{}_{}_{}_{}'.format(n_layer, n_steps, dim_input, dim_hidden, batch_size)
    model = get_model(scope_name, n_steps, dim_input, dim_hidden, batch_size, vocab_size, n_layer)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        X = model['input']
        Y = model['target']
        dp = model['p_keep']
        last_cost = 100.0
        for epoch in range(100):
            for x, y in batch(word_ids[:100000], batch_size, n_steps):
                sess.run(model['train'], feed_dict={X: x, Y: y, dp: p_keep})
            cost = []
            for x, y in batch(word_ids[100000:], batch_size, n_steps):
                cost.append(sess.run(model['cost'], feed_dict={X: x, Y: y, dp: 1.0}))
            curr_cost = np.mean(cost)
            print epoch, curr_cost
            if curr_cost > last_cost or abs(curr_cost - last_cost) < 0.01:
                break
            last_cost = curr_cost
        saver = tf.train.Saver(tf.global_variables())
        saver.save(sess, './lstm_zh.checkpoint')
        return curr_cost

## Grid Search

1. number of dimensions of input
2. number of hidden units
3. number of RNN cells

```python
cost = {}
for dim_input in (5, 10, 15, 30, 50, 100):
    for dim_hidden in (5, 10, 15, 30, 50, 100):
        for n_steps in (20, 60, 100):
            print 'dim_input={}, dim_hidden={}, n_steps={}'.format(dim_input, dim_hidden, n_steps)
            cost[(n_steps, dim_input, dim_hidden)] = train(n_steps, dim_input, dim_hidden)
            print
print cost
```

In [None]:
cost = {}
for dim_input in (5, 10, 15, 30, 50, 100):
    for dim_hidden in (5, 10, 15, 30, 50, 100):
        for n_steps in (20, 60, 100):
            print 'dim_input={}, dim_hidden={}, n_steps={}'.format(dim_input, dim_hidden, n_steps)
            cost[(n_steps, dim_input, dim_hidden)] = train(n_steps, dim_input, dim_hidden)
            print
print cost

dim_input=5, dim_hidden=5, n_steps=20
0 5.74456
1 5.55805
2 5.43175


## Result of Grid Search

```
dim_input=5, dim_hidden=5, n_steps=20, 12 4.81909  
dim_input=5, dim_hidden=5, n_steps=60, 10 4.82728  
dim_input=5, dim_hidden=5, n_steps=100, 8 4.82829  
dim_input=5, dim_hidden=10, n_steps=20, 11 4.54927  
dim_input=5, dim_hidden=10, n_steps=60, 8 4.50915  
dim_input=5, dim_hidden=10, n_steps=100, 8 4.54042  
dim_input=5, dim_hidden=15, n_steps=20, 10 4.41731  
dim_input=5, dim_hidden=15, n_steps=60, 8 4.40338  
dim_input=5, dim_hidden=15, n_steps=100, 6 4.39306  
dim_input=5, dim_hidden=30, n_steps=20, 8 4.3128
dim_input=5, dim_hidden=30, n_steps=60, 6 4.34912  
dim_input=5, dim_hidden=30, n_steps=100, 4 4.34219  
dim_input=5, dim_hidden=50, n_steps=20, 4 4.46443  
dim_input=5, dim_hidden=50, n_steps=60, 14 5.31131  
dim_input=5, dim_hidden=50, n_steps=100, 17 5.7858  
dim_input=5, dim_hidden=100, n_steps=20, 10 5.79669  
dim_input=5, dim_hidden=100, n_steps=60, 42 8.45632  
dim_input=5, dim_hidden=100, n_steps=100, 41 10.0958  
dim_input=10, dim_hidden=5, n_steps=20, 11 4.79327  
dim_input=10, dim_hidden=5, n_steps=60, 10 4.78631  
dim_input=10, dim_hidden=5, n_steps=100, 10 4.81798  
dim_input=10, dim_hidden=10, n_steps=20, 11 4.53468  
dim_input=10, dim_hidden=10, n_steps=60, 8 4.49229  
dim_input=10, dim_hidden=10, n_steps=100, 7 4.48285  
dim_input=10, dim_hidden=15, n_steps=20, 10 4.42211  
dim_input=10, dim_hidden=15, n_steps=60, 8 4.35828  
dim_input=10, dim_hidden=15, n_steps=100, 7 4.36294  
dim_input=10, dim_hidden=30, n_steps=20, 6 4.36829  
dim_input=10, dim_hidden=30, n_steps=60, 5 4.31422
dim_input=10, dim_hidden=30, n_steps=100, 4 4.31992  
dim_input=10, dim_hidden=50, n_steps=20, 5 4.41931  
dim_input=10, dim_hidden=50, n_steps=60, 15 5.30443  
dim_input=10, dim_hidden=50, n_steps=100, 2 4.49032  
dim_input=10, dim_hidden=100, n_steps=20, 25 5.4159  
dim_input=10, dim_hidden=100, n_steps=60, 47 8.59541  
dim_input=10, dim_hidden=100, n_steps=100, 42 10.0263  
dim_input=15, dim_hidden=5, n_steps=20, 11 4.82542  
dim_input=15, dim_hidden=5, n_steps=60, 10 4.70664  
dim_input=15, dim_hidden=5, n_steps=100, 9 4.79271  
dim_input=15, dim_hidden=10, n_steps=20, 10 4.57568  
dim_input=15, dim_hidden=10, n_steps=60, 10 4.5129  
dim_input=15, dim_hidden=10, n_steps=100, 8 4.51734  
dim_input=15, dim_hidden=15, n_steps=20, 9 4.43064  
dim_input=15, dim_hidden=15, n_steps=60, 9 4.343  
dim_input=15, dim_hidden=15, n_steps=100, 7 4.3824  
dim_input=15, dim_hidden=30, n_steps=20, 8 4.32061  
dim_input=15, dim_hidden=30, n_steps=60, 5 4.30017  <======== BEST
dim_input=15, dim_hidden=30, n_steps=100, 4 4.31141  
dim_input=15, dim_hidden=50, n_steps=20, 4 4.44926  
dim_input=15, dim_hidden=50, n_steps=60, 13 5.28622  
dim_input=15, dim_hidden=50, n_steps=100, 2 4.46069  
dim_input=15, dim_hidden=100, n_steps=20, 26 5.39629  
dim_input=15, dim_hidden=100, n_steps=60, 63 8.76479  
dim_input=15, dim_hidden=100, n_steps=100, 47 10.1521  
dim_input=30, dim_hidden=5, n_steps=20, 12 4.834  
dim_input=30, dim_hidden=5, n_steps=60, 10 4.80542  
dim_input=30, dim_hidden=5, n_steps=100, 9 4.82141  
dim_input=30, dim_hidden=10, n_steps=20, 11 4.5351  
dim_input=30, dim_hidden=10, n_steps=60, 9 4.45272  
dim_input=30, dim_hidden=10, n_steps=100, 7 4.51826  
dim_input=30, dim_hidden=15, n_steps=20, 11 4.40718  
dim_input=30, dim_hidden=15, n_steps=60, 8 4.36056  
dim_input=30, dim_hidden=15, n_steps=100, 8 4.41101  
dim_input=30, dim_hidden=30, n_steps=20, 8 4.32665  
dim_input=30, dim_hidden=30, n_steps=60, 5 4.34097  
dim_input=30, dim_hidden=30, n_steps=100, 4 4.34122  
dim_input=30, dim_hidden=50, n_steps=20, 5 4.44636  
dim_input=30, dim_hidden=50, n_steps=60, 3 4.4633  
dim_input=30, dim_hidden=50, n_steps=100, 2 4.46157  
dim_input=30, dim_hidden=100, n_steps=20, 10 5.78953  
dim_input=30, dim_hidden=100, n_steps=60, 55 8.77992  
dim_input=30, dim_hidden=100, n_steps=100, 41 10.0491
dim_input=50, dim_hidden=5, n_steps=20, 12 4.80319
dim_input=50, dim_hidden=5, n_steps=60, 8 4.77495
dim_input=50, dim_hidden=5, n_steps=100, 9 4.79949
dim_input=50, dim_hidden=10, n_steps=20, 12 4.55258
dim_input=50, dim_hidden=10, n_steps=60, 10 4.48609
dim_input=50, dim_hidden=10, n_steps=100, 8 4.4701
dim_input=50, dim_hidden=15, n_steps=20, 11 4.41331
dim_input=50, dim_hidden=15, n_steps=60, 9 4.34871
dim_input=50, dim_hidden=15, n_steps=100, 6 4.41298
dim_input=50, dim_hidden=30, n_steps=20, 8 4.31822
dim_input=50, dim_hidden=30, n_steps=60, 5 4.33932
dim_input=50, dim_hidden=30, n_steps=100, 24 4.93073
dim_input=50, dim_hidden=50, n_steps=20, 5 4.42341
dim_input=50, dim_hidden=50, n_steps=60, 17 5.30831
dim_input=50, dim_hidden=50, n_steps=100, 15 5.76036
dim_input=50, dim_hidden=100, n_steps=20, 10 5.82358
dim_input=50, dim_hidden=100, n_steps=60, 50 8.65317
dim_input=50, dim_hidden=100, n_steps=100, 34 9.84145
dim_input=100, dim_hidden=5, n_steps=20, 12 4.80052
dim_input=100, dim_hidden=5, n_steps=60, 10 4.7754
dim_input=100, dim_hidden=5, n_steps=100, 10 4.80939
dim_input=100, dim_hidden=10, n_steps=20, 11 4.55337
dim_input=100, dim_hidden=10, n_steps=60, 9 4.46794
dim_input=100, dim_hidden=10, n_steps=100, 8 4.50755
dim_input=100, dim_hidden=15, n_steps=20, 11 4.41782
dim_input=100, dim_hidden=15, n_steps=60, 8 4.36567
dim_input=100, dim_hidden=15, n_steps=100, 6 4.39921
dim_input=100, dim_hidden=30, n_steps=20, 7 4.34298
dim_input=100, dim_hidden=30, n_steps=60, 5 4.33793
dim_input=100, dim_hidden=30, n_steps=100, 4 4.35769
dim_input=100, dim_hidden=50, n_steps=20, 5 4.46569
dim_input=100, dim_hidden=50, n_steps=60, 13 5.27012
dim_input=100, dim_hidden=50, n_steps=100, 24 6.0024
dim_input=100, dim_hidden=100, n_steps=20, 11 5.76113
dim_input=100, dim_hidden=100, n_steps=60, 45 8.4888
dim_input=100, dim_hidden=100, n_steps=100, 36 9.97792
```

## Another Grid Search

1. number of RNN layers
2. probabilities of Dropout

```python
cost = {}
for n_layer in (1, 2, 3):
    for p_keep in (0.5, 0.75, 1.0):
        print 'n_layer={}, p_keep={}'.format(n_layer, p_keep)
        cost[(n_layer, p_keep)] = train(60, 15, 30, n_layer=n_layer, p_keep=p_keep)
        print
print cost
```

## Results of Grid Search

```
n_layer=1, p_keep=0.5, 4.2462716
n_layer=1, p_keep=0.75, 4.0205622 <====== BEST
n_layer=1, p_keep=1.0, 4.3006234
n_layer=2, p_keep=0.5, 5.8313818
n_layer=2, p_keep=0.75, 5.8209763
n_layer=2, p_keep=1.0, 5.812592
n_layer=3, p_keep=0.5, 4.8201237
n_layer=3, p_keep=0.75, 5.8212233
n_layer=3, p_keep=1.0, 5.8125467
```

## Another Grid Search

1. number of batches

```python
cost = {}
for b in (32, 64, 96, 128, 192, 256):
    print 'batch={}'.format(b)
    cost[(n_layer, p_keep)] = train(60, 15, 30, batch_size=b, p_keep=p_keep)
    print
print cost
```

In [10]:
cost = {}
for b in (32, 64, 96, 128, 192, 256):
    print 'batch={}'.format(b)
    cost[b] = train(60, 15, 30, batch_size=b, p_keep=0.75)
    print
print cost

batch=32
0 5.18427
1 4.63354
2 4.41146


KeyboardInterrupt: 

In [6]:
train(60, 15, 30, 1, 0.75, final_train=True)

0 5.18676
1 4.6646
2 4.41315
3 4.29019
4 4.21891
5 4.17458
6 4.14202
7 4.12147
8 4.10267
9 4.09177
10 4.08239
11 4.07144
12 4.06633
13 4.06056
14 4.05623
15 4.05456
16 4.04804
17 4.04518
18 4.04212
19 4.0411
20 4.04118


4.0411777