In [1]:
# Copyright (C) 2018 Zhixian Ma <zx@mazhixian.me>
# Recurrent neural network notebook
# Samples are from the Penn Tree Bank (PTB) database
# Download from http://fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

In [2]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
import time
import numpy as np
import tensorflow as tf
import reader

In [4]:
# Define PTBInput class
class PTBInput(object):
    def __init__(self, config, data, name=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, batch_size, num_steps, name=name)

In [5]:
# Define model
class PTBModel(object):
    def __init__(self, is_training, config, input_):
        self._input = input_
        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
    
        # Use tf.contrib.rnn.BasicLSTMCell define the cell unit
        def rnn_cell():
            return tf.contrib.rnn.BasicLSTMCell(
                num_units=size,  
                activation=tf.nn.tanh,
                reuse=not is_training)
    
        attn_cell = rnn_cell
        if is_training and config.keep_prob < 1:
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    rnn_cell(), output_keep_prob=config.keep_prob)
    
        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)
    
        self._initial_state = cell.zero_state(batch_size, tf.float32)
    
        # word embedding
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, size], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)
    
        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)
    
        # reusable variables
        outputs = []
        state = self._initial_state 
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
    
        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable(
            "softmax_w", [size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state
    
        if not is_training:
            return
    
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars),
            config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step = tf.contrib.framework.get_or_create_global_step())
    
        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
    
    def assign_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
    
    
    # Define readonly properties
    # Property returns variables as read-only.
    @property
    def input(self):
        return self._input
    
    @property
    def initial_state(self):
        return self._initial_state
    
    @property
    def cost(self):
        return self._cost
    
    @property
    def final_state(self):
        return self._final_state
    
    @property
    def lr(self):
        return self._lr
    
    @property
    def train_op(self):
        return self._train_op

In [6]:
class SmallConfig(object):
    init_scale = 0.1 # 初始变量的边界 [-0.1, 0.1]
    learning_rate = 1.0 # 初始学习率 
    max_grad_norm = 5 # 梯度的最大范数，gradient clippint to avoid gradient explosion
    num_layers = 2 # LSTM cell层数
    num_steps = 20 # 
    hidden_size = 200 # h state的长度
    max_epoch = 4
    max_max_epoch = 13
    keep_prob = 1.0 
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000 # 词汇表大小

In [7]:
def run_epoch(session, model, eval_op=None, verbose=False):
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    
    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    
    if eval_op is not None:
        fetches["eval_op"] = eval_op
    
    for step in range(model.input.epoch_size):
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[h] = state[i].h
        
        vals = session.run(fetches, feed_dict)
        cost = vals["cost"]
        state = vals["final_state"]
        
        costs += cost
        iters += model.input.num_steps
      
        if verbose and step % (model.input.epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" % 
                 (step * 1.0 / model.input.epoch_size, np.exp(costs/iters),
                 iters * model.input.batch_size / (time.time() - start_time)))
    
    return np.exp(costs/iters)

In [8]:
datapath = '../data/simple-examples/data'
raw_data = reader.ptb_raw_data(datapath)
train_data, valid_data, test_data, _ = raw_data

config = SmallConfig()
eval_config = SmallConfig()
eval_config.batch_size = 1
eval_config.num_steps = 1

In [9]:
# Use the default Graph
# init with tf.random_uniform_initializer
# http://blog.csdn.net/lujiandong1/article/details/53448012
with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,config.init_scale)
    
    with tf.name_scope("Train"):
        train_input = PTBInput(config=config, data=train_data, name="TrainInput")
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config, input_=train_input)
    
    with tf.name_scope("Valid"):
        valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
    
    with tf.name_scope("Test"):
        test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            mtest = PTBModel(is_training=False, config=eval_config, input_=test_input)
    
    sv = tf.train.Supervisor()
    with sv.managed_session() as session:
    # with tf.InteractiveSession() as sess:
    #    sess.run(tf.initialize_variables)
        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i+1-config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)
        
            print("Epoch: %d Learning rate: %.3f" % (i+1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, eval_op=m.train_op, verbose=True)
        
            print("Epoch: %d Training Perplexity: %.3f" % (i+1, train_perplexity))
        
            valid_perplexity = run_epoch(session, mvalid)
            print("Epoch: %d Validation Perplexity: %.3f" % (i+1, valid_perplexity))
        
        test_perplexity = run_epoch(session, mtest)
        print("Epoch: %d Test Perplexity: %.3f" % (i+1, test_perplexity))

Epoch: 1 Learning rate: 1.000
0.004 perplexity: 7422.857 speed: 7713 wps
0.104 perplexity: 1037.062 speed: 28141 wps
0.204 perplexity: 779.209 speed: 30229 wps
0.304 perplexity: 642.252 speed: 31052 wps
0.404 perplexity: 554.060 speed: 31343 wps
0.504 perplexity: 493.019 speed: 31445 wps
0.604 perplexity: 441.517 speed: 31541 wps
0.703 perplexity: 404.960 speed: 31615 wps
0.803 perplexity: 376.058 speed: 31672 wps
0.903 perplexity: 350.548 speed: 31734 wps
Epoch: 1 Training Perplexity: 331.169
Epoch: 1 Validation Perplexity: 206.332
Epoch: 2 Learning rate: 1.000
0.004 perplexity: 245.239 speed: 34075 wps
0.104 perplexity: 179.075 speed: 30239 wps
0.204 perplexity: 186.765 speed: 30522 wps
0.304 perplexity: 181.802 speed: 30650 wps
0.404 perplexity: 178.625 speed: 30867 wps
0.504 perplexity: 175.994 speed: 30980 wps
0.604 perplexity: 170.691 speed: 31200 wps
0.703 perplexity: 167.880 speed: 31300 wps
0.803 perplexity: 165.150 speed: 31304 wps
0.903 perplexity: 161.050 speed: 31422 wps
E