# 第二节课，实践1

## 1B. 优化上一节课的RNN模型


###  在第二个版本里面，我们将实现cross-validation，并添加总结功能

In [1]:
class BatchGenerator(object):
    
    def __init__(self, tensor_in, tensor_out, batch_size, seq_length):
        """初始化mini-batch产生器，BaseClass
        Input:
            batch_size: 每一个mini-batch里面有多少样本。
            seq_length: 每一个样本的长度，和batch_size一起决定了每个minibatch的数据量。
        """
        self.batch_size = batch_size
        self.seq_length = seq_length
        
        self.tensor_in = tensor_in
        self.tensor_out = tensor_out
        
        self.create_batches()
        self.reset_batch_pointer()

    def reset_batch_pointer(self):
        self.pointer = 0

    def create_batches(self):
        self.num_batches = int(self.tensor_in.size / (self.batch_size * self.seq_length))
        self.tensor_in = self.tensor_in[:self.num_batches * self.batch_size * self.seq_length]
        self.tensor_out = self.tensor_out[:self.num_batches * self.batch_size * self.seq_length]
        
        # When the data (tesor) is too small, let's give them a better error message
        if self.num_batches==0:
            assert False, "Not enough data. Make seq_length and batch_size small."
        
        self.x_batches = np.split(self.tensor_in.reshape(self.batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(self.tensor_out.reshape(self.batch_size, -1), self.num_batches, 1)

    def next_batch(self):
        x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
        self.pointer += 1
        return x, y

class CopyBatchGenerator(BatchGenerator):
    
    def __init__(self, data, batch_size, seq_length):
        """初始化mini-batch产生器
        
        输入一个长度为T的sequence，sequence的前T-1个元素为input，
          sequence的后面T-1个元素为output。用来训练RNNLM。
        
        Input:
            batch_size: 每一个mini-batch里面有多少样本。
            seq_length: 每一个样本的长度，和batch_size一起决定了每个minibatch的数据量。
        """
        self.batch_size = batch_size
        self.seq_length = seq_length
        
        tensor_in = np.array(data)
        tensor_out = np.copy(tensor_in)
        tensor_out[:-1] = tensor_in[1:]
        tensor_out[-1] = tensor_in[0]
        
        super(CopyBatchGenerator, self).__init__(
            tensor_in, tensor_out, batch_size, seq_length)

class PredBatchGenerator(BatchGenerator):
    
    def __init__(self, data_in, data_out, batch_size, seq_length):
        """初始化mini-batch产生器
        
        输入两个长度为T的sequence，其中一个是输入sequence，另一个是输出sequence。
        
        Input:
            batch_size: 每一个mini-batch里面有多少样本。
            seq_length: 每一个样本的长度，和batch_size一起决定了每个minibatch的数据量。
        """
        self.batch_size = batch_size
        self.seq_length = seq_length
        
        tensor_in = np.array(data_in)
        tensor_out = np.array(data_out)
        super(PredBatchGenerator, self).__init__(
            tensor_in, tensor_out, batch_size, seq_length)

### 定义CharRNN 模型

* 和上一节课一样，这一节课里，我们的RNN模型的输入和输出是同样长度的序列，我们叫做char-level-RNN模型
* 下周我们将研究以句子为单位输入输出

In [2]:
import time
import numpy as np
import tensorflow as tf


class CharRNNLM(object):
    def __init__(self, is_training, batch_size, num_unrollings, vocab_size,
                 hidden_size, embedding_size, learning_rate):
        """
        New arguments:
            is_training: 是否在训练阶段
        """
        self.batch_size = batch_size
        self.num_unrollings = num_unrollings
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.input_data = tf.placeholder(tf.int64, [self.batch_size, self.num_unrollings], name='inputs')
        self.targets =  tf.placeholder(tf.int64, [self.batch_size, self.num_unrollings], name='targets')

        cell_fn = tf.nn.rnn_cell.BasicRNNCell

        params = dict()
        cell = cell_fn(self.hidden_size, **params)

        with tf.name_scope('initial_state'):
            self.zero_state = cell.zero_state(self.batch_size, tf.float32)
            
            self.initial_state = tf.placeholder(tf.float32,
                                                [self.batch_size, cell.state_size],
                                                'initial_state')
        
        with tf.name_scope('embedding_layer'):
            ## 定义词向量参数，并通过查询将输入的整数序列每一个元素转换为embedding向量
            # 如果提供了embedding的维度，我们声明一个embedding参数，即词向量参数矩阵
            # 否则，我们使用Identity矩阵作为词向量参数矩阵
            if embedding_size > 0:
                self.embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_size])
            else:
                self.embedding = tf.constant(np.eye(self.vocab_size), dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)

        with tf.name_scope('slice_inputs'):
            # 我们将要使用static_rnn方法，需要将长度为num_unrolling的序列切割成
            # num_unrolling个单位，存在一个list里面,
            # 即，输入格式为：
            # [ num_unrollings, (batch_size, embedding_size)]
            sliced_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(
                axis = 1, num_or_size_splits = self.num_unrollings, value = inputs)]

        # 调用static_rnn方法，作forward propagation
        # 为方便阅读，我们将static_rnn的注释贴到这里
        # 输入：
        #     inputs: A length T list of inputs, each a Tensor of shape [batch_size, input_size]
        #     initial_state: An initial state for the RNN.
        #                If cell.state_size is an integer, this must be a Tensor of appropriate
        #                type and shape [batch_size, cell.state_size]
        # 输出：
        #     outputs: a length T list of outputs (one for each input), or a nested tuple of such elements.
        #     state: the final state
        outputs, final_state = tf.nn.static_rnn(
                cell = cell,
                inputs = sliced_inputs,
                initial_state=self.initial_state)
        self.final_state = final_state

        with tf.name_scope('flatten_outputs'):
            flat_outputs = tf.reshape(tf.concat(axis = 1, values = outputs), [-1, hidden_size])

        with tf.name_scope('flatten_targets'):
            flat_targets = tf.reshape(tf.concat(axis = 1, values = self.targets), [-1])

        with tf.variable_scope('softmax') as sm_vs:
            softmax_w = tf.get_variable('softmax_w', [hidden_size, vocab_size])
            softmax_b = tf.get_variable('softmax_b', [vocab_size])
            self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
            self.probs = tf.nn.softmax(self.logits)

        with tf.name_scope('loss'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits = self.logits, labels = flat_targets)
            self.mean_loss = tf.reduce_mean(loss)

        with tf.name_scope('loss_montor'):
            count = tf.Variable(1.0, name='count')
            sum_mean_loss = tf.Variable(1.0, name='sum_mean_loss')

            self.reset_loss_monitor = tf.group(sum_mean_loss.assign(0.0),
                                               count.assign(0.0), name='reset_loss_monitor')
            self.update_loss_monitor = tf.group(sum_mean_loss.assign(sum_mean_loss+self.mean_loss),
                                                count.assign(count+1), name='update_loss_monitor')

            with tf.control_dependencies([self.update_loss_monitor]):
                self.average_loss = sum_mean_loss / count
                self.ppl = tf.exp(self.average_loss)

            # mark: version 1 --> version 2
            # 增加总结summary，方便通过tensorboard观察训练过程
            average_loss_summary = tf.summary.scalar(name = 'average_loss', tensor = self.average_loss)
            ppl_summary = tf.summary.scalar(name = 'perplexity', tensor = self.ppl)
                                                 
        self.summaries = tf.summary.merge(
            inputs = [average_loss_summary, ppl_summary], name='loss_monitor')

        
        self.global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0.0))

        # self.learning_rate = tf.constant(learning_rate)
        self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate')

        # mark: 从version1到version2的更新：
        if is_training:
            tvars = tf.trainable_variables()
            grads = tf.gradients(self.mean_loss, tvars)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step)


    # 运行一个epoch
    def run_epoch(self, session, batch_generator,
                  is_training, learning_rate, freq=10):
        epoch_size = batch_generator.num_batches
        # mark:
        if is_training:
            extra_op = self.train_op
        else:
            extra_op = tf.no_op()
        
        state = self.zero_state.eval()
        
        self.reset_loss_monitor.run()
        batch_generator.reset_batch_pointer()
        start_time = time.time()
        for step in range(epoch_size):
            x, y = batch_generator.next_batch()
            
            ops = [self.average_loss, self.ppl, self.final_state, extra_op,
                   self.summaries, self.global_step]
            
            feed_dict = {self.input_data: x, self.targets: y, self.initial_state: state,
                         self.learning_rate: learning_rate}
                        
            results = session.run(ops, feed_dict)
            average_loss, ppl, final_state, _, summary_str, global_step = results
            
        return ppl, global_step

### 调用产生合成数据的module

In [3]:
from data.synthetic.synthetic_binary import gen_data

### 演示variable scope的冲突
如果下面的code cell被连续调用两次，则会有下述错误（注意reuse)：
```bash
ValueError: Variable embedding already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "<ipython-input-1-2c5b9a1002a7>", line 36, in __init__
    self.embedding = tf.get_variable('embedding', [self.vocab_size, self.embedding_size])
  File "<ipython-input-2-44e044623871>", line 9, in <module>
    vocab_size, hidden_size, embedding_size, learning_rate)
  File "/home/dong/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
```


In [4]:
batch_size = 16
num_unrollings = 20
vocab_size = 2
hidden_size = 16
embedding_size = 16
learning_rate = 0.01

model = CharRNNLM(True, batch_size, num_unrollings,
                  vocab_size, hidden_size, embedding_size, learning_rate)

tf.get_variable_scope().reuse_variables()
valid_model = CharRNNLM(False, batch_size, num_unrollings,
                  vocab_size, hidden_size, embedding_size, learning_rate)

In [5]:
total_size = 1000000
dataset = gen_data(size = total_size)
dataset_val = gen_data(size = total_size // 20)
batch_size = 16
seq_length = num_unrollings
batch_generator = PredBatchGenerator(data_in = dataset[0],
                                     data_out = dataset[1],
                                     batch_size = batch_size,
                                     seq_length = seq_length)

batch_generator_valid = PredBatchGenerator(data_in = dataset_val[0],
                                           data_out = dataset_val[1],
                                           batch_size = batch_size,
                                           seq_length = seq_length)

In [6]:
session = tf.Session()

In [9]:
with session.as_default():
    for epoch in range(1):
        session.run(tf.global_variables_initializer())
        ppl, global_step = model.run_epoch(
            session, batch_generator, True, learning_rate, freq=10)
        print("training perplexity after one epoch %f" % ppl)
        ppl_valid, global_step = model.run_epoch(
            session, batch_generator, False, learning_rate, freq=10)
        print("validation perplexity after one epoch %f" % ppl)

training perplexity after one epoch 1.671384
validation perplexity after one epoch 1.671384
