数据预处理

In [None]:
# Raw parallel TED corpus: *.en holds the English side, *.zh the Chinese side.
train1 = "TED_data/train.en"
train2 = "TED_data/train.zh"
eval1 = "TED_data/valid.en"
eval2 = "TED_data/valid.zh"
test1 = "TED_data/test.en"
test2 = "TED_data/test.zh"


def _prepro(fpath):
    """Read a text file and return its lines with surrounding whitespace removed.

    The content is split on the newline character, so a file that ends with a
    newline yields one trailing empty string (kept for backward compatibility
    with the cells that consume these lists).

    fpath: string. path of the file to read.

    Returns
    list of strings, one per line.
    """
    # The corpus contains Chinese text: decode explicitly as UTF-8 instead of
    # relying on the platform default, and close the handle deterministically.
    with open(fpath, 'r', encoding='utf-8') as fin:
        return [line.strip() for line in fin.read().split("\n")]


prepro_train1, prepro_train2 = _prepro(train1), _prepro(train2)
prepro_eval1, prepro_eval2 = _prepro(eval1), _prepro(eval2)
prepro_test1, prepro_test2 = _prepro(test1), _prepro(test2)

# Show the first sentence of every split as a quick sanity check.
print(prepro_train1[0])
print(prepro_train2[0])
print(prepro_eval1[0])
print(prepro_eval2[0])
print(prepro_test1[0])
print(prepro_test2[0])

将预处理的数据写入文件中

In [None]:
import os
# Create the output directory for the cleaned corpus (no error if it already exists).
os.makedirs("TED_data/prepro", exist_ok=True)

def _write(sents, fname):
    """Write sentences to a file, one per line.

    sents: list of strings.
    fname: string. output file path (overwritten if it exists).

    The sentences are joined with newlines, so no newline is appended after
    the last one.
    """
    # Explicit UTF-8 so Chinese text is written identically on every platform.
    with open(fname, 'w', encoding='utf-8') as fout:
        fout.write("\n".join(sents))

# Dump every cleaned split to TED_data/prepro. The suffix-less "train" file
# holds the English sentences followed by the Chinese ones.
for _sents, _path in (
    (prepro_train1, "TED_data/prepro/train.en"),
    (prepro_train2, "TED_data/prepro/train.zh"),
    (prepro_train1 + prepro_train2, "TED_data/prepro/train"),
    (prepro_eval1, "TED_data/prepro/eval.en"),
    (prepro_eval2, "TED_data/prepro/eval.zh"),
    (prepro_test1, "TED_data/prepro/test.en"),
    (prepro_test2, "TED_data/prepro/test.zh"),
):
    _write(_sents, _path)

训练BPE

In [None]:
import sentencepiece as spm
# Output directory for the BPE model, its vocabulary and the segmented corpus.
os.makedirs("TED_data/segmented", exist_ok=True)

vocab_size = 32000  # number of BPE pieces to learn

# SentencePiece flags. The special ids are fixed here (pad=0, unk=1, bos=2,
# eos=3). The backslash continuations keep the leading spaces of the following
# lines inside the string.
train = '--input=TED_data/prepro/train --pad_id=0 --unk_id=1 \
    --bos_id=2 --eos_id=3 --model_prefix=TED_data/segmented/bpe \
    --vocab_size={} --model_type=bpe'
train = train.format(vocab_size)
spm.SentencePieceTrainer.Train(train)

使用BPE进行分词

In [None]:
# Load the BPE model stored under the TED_data/segmented/bpe prefix.
sp = spm.SentencePieceProcessor()
sp.Load("TED_data/segmented/bpe.model")


def _segment_and_write(sents, fname):
    """Segment sentences into BPE pieces and write them to a file.

    sents: list of strings. raw sentences.
    fname: string. output file path (overwritten if it exists).

    Each output line holds the pieces of one sentence separated by single
    spaces and is terminated by a newline.
    """
    # BPE pieces contain non-ASCII characters, so write explicitly as UTF-8
    # rather than with the platform default encoding.
    with open(fname, "w", encoding="utf-8") as fout:
        for sent in sents:
            pieces = sp.EncodeAsPieces(sent)
            fout.write(" ".join(pieces) + "\n")
                
_segment_and_write(prepro_train1, "TED_data/segmented/train.en.bpe")
_segment_and_write(prepro_train2, "TED_data/segmented/train.zh.bpe")
_segment_and_write(prepro_eval1, "TED_data/segmented/eval.en.bpe")
_segment_and_write(prepro_eval2, "TED_data/segmented/eval.zh.bpe")
# NOTE(review): only the source side of the test set is segmented here;
# presumably test.zh is kept raw as the reference translation -- confirm.
_segment_and_write(prepro_test1, "TED_data/segmented/test.en.bpe")


def _first_line(fname):
    """Return the first line of a UTF-8 text file, closing the file afterwards."""
    with open(fname, 'r', encoding='utf-8') as fin:
        return fin.readline()


# Show the first segmented sentence of every file as a sanity check.
print("train1:", _first_line("TED_data/segmented/train.en.bpe"))
print("train2:", _first_line("TED_data/segmented/train.zh.bpe"))
print("eval1:", _first_line("TED_data/segmented/eval.en.bpe"))
print("eval2:", _first_line("TED_data/segmented/eval.zh.bpe"))
print("test1:", _first_line("TED_data/segmented/test.en.bpe"))

构建模型

导入词表

In [None]:
vocab_fpath = 'TED_data/segmented/bpe.vocab'  # vocabulary file of the BPE model

# The first whitespace-separated field of every line is the token; its line
# number is used as the token id. Read as UTF-8 and close the file right away.
with open(vocab_fpath, 'r', encoding='utf-8') as fin:
    vocab = [line.split()[0] for line in fin.read().splitlines()]
token2idx = {token: idx for idx, token in enumerate(vocab)}
idx2token = dict(enumerate(vocab))

# Peek at the first few entries of both mappings.
print(list(token2idx.items())[:5])
print(list(idx2token.items())[:5])

构建训练和测试用的数据，以iteration形式保存

In [None]:
maxlen1 = 100  # maximum source length in tokens, counting the appended </s>
maxlen2 = 100  # maximum target length in tokens, counting the appended </s>
batch_size = 64


def _load_parallel(fpath1, fpath2):
    """Load aligned sentence pairs, dropping empty and over-long ones.

    fpath1: string. source file, one sentence per line.
    fpath2: string. target file, line-aligned with fpath1.

    Returns
    (sents1, sents2): two lists of stripped sentences of equal length.
    """
    sents1, sents2 = [], []
    with open(fpath1, 'r', encoding='utf-8') as f1, \
            open(fpath2, 'r', encoding='utf-8') as f2:
        for sent1, sent2 in zip(f1, f2):
            if len(sent1.split()) + 1 > maxlen1: continue  # 1: </s>
            if len(sent2.split()) + 1 > maxlen2: continue  # 1: </s>
            sent1, sent2 = sent1.strip(), sent2.strip()
            # The segmented files can end with an empty line; an empty pair
            # carries no training signal.
            if not sent1 or not sent2: continue
            sents1.append(sent1)
            sents2.append(sent2)
    return sents1, sents2


# The model looks whitespace-separated tokens up in the BPE vocabulary, so it
# has to be fed the BPE-segmented files. The raw "prepro" files used before
# have no BPE pieces (and raw Chinese has no spaces at all), which mapped
# almost every token to <unk>.
train_sents1, train_sents2 = _load_parallel(
    "TED_data/segmented/train.en.bpe", "TED_data/segmented/train.zh.bpe")

eval_sents1, eval_sents2 = _load_parallel(
    "TED_data/segmented/eval.en.bpe", "TED_data/segmented/eval.zh.bpe")

# NOTE(review): no segmented test.zh is produced in this notebook, so the
# target side of the test set is still the raw reference text -- confirm this
# matches how the test cell uses test_sents2.
test_sents1, test_sents2 = _load_parallel(
    "TED_data/segmented/test.en.bpe", "TED_data/prepro/test.zh")

对数据进行batch处理

In [None]:
import tensorflow as tf

def encode(inp, types):
    '''Convert one sentence into a list of token ids.

    inp: bytes. UTF-8 encoded sentence whose tokens are separated by whitespace.
    types: "x" marks a source sentence: </s> is appended.
        Any other value is treated as a target sentence: <s> is prepended
        and </s> is appended.

    Returns
    list of ints. Tokens missing from token2idx map to the id of <unk>.
    '''
    words = inp.decode("utf-8").split()
    if types == "x":
        tokens = words + ["</s>"]
    else:
        tokens = ["<s>"] + words + ["</s>"]

    unk_id = token2idx["<unk>"]
    return [token2idx.get(tok, unk_id) for tok in tokens]

def generator_fn(sents1, sents2):
    '''Yield training examples for aligned source/target sentences.

    sents1: iterable of source sentences (bytes).
    sents2: iterable of target sentences (bytes).

    Yields
    xs: tuple of
        source token ids, source length, source sentence
    ys: tuple of
        decoder input ids (target ids without the last one),
        label ids (target ids without the first one),
        label length, target sentence
    '''
    for src_sent, tgt_sent in zip(sents1, sents2):
        src_ids = encode(src_sent, "x")
        tgt_ids = encode(tgt_sent, "y")
        # Shift by one position: the decoder sees <s> ... and must predict ... </s>.
        decoder_input = tgt_ids[:-1]
        labels = tgt_ids[1:]

        yield (src_ids, len(src_ids), src_sent), (decoder_input, labels, len(labels), tgt_sent)

# Structure of one example as produced by generator_fn:
# (ids, length, sentence) for the source and
# (decoder input ids, label ids, length, sentence) for the target.
shapes = (([None], (), ()), ([None], [None], (), ()))
types = ((tf.int32, tf.int32, tf.string), (tf.int32, tf.int32, tf.int32, tf.string))
paddings = ((0, 0, ''), (0, 0, 0, ''))  # id sequences are padded with 0


def _make_batches(sents1, sents2, shuffle):
    '''Build an endlessly repeating, padded and batched dataset.

    sents1: list of source sentences.
    sents2: list of target sentences.
    shuffle: boolean. If True, examples are shuffled before repeating.

    Returns
    a tf.data.Dataset of padded batches with a prefetch of 1.
    '''
    dataset = tf.data.Dataset.from_generator(
        generator_fn,
        output_shapes=shapes,
        output_types=types,
        args=(sents1, sents2))
    if shuffle:
        dataset = dataset.shuffle(128 * batch_size)
    dataset = dataset.repeat()  # iterate forever
    return dataset.padded_batch(batch_size, shapes, paddings).prefetch(1)


train_batches = _make_batches(train_sents1, train_sents2, shuffle=True)
num_train_samples = len(train_sents1)
# Number of batches per epoch, rounded up.
num_train_batches = (num_train_samples + batch_size - 1) // batch_size

eval_batches = _make_batches(eval_sents1, eval_sents2, shuffle=False)
num_eval_samples = len(eval_sents1)
num_eval_batches = (num_eval_samples + batch_size - 1) // batch_size

创建迭代器（iterator）

In [None]:
# One iterator, defined only by the element structure of the batches, so it can
# be pointed at either the training or the evaluation dataset.
# NOTE(review): the name `iter` shadows the Python builtin; it is left unchanged
# here because cells outside this view may refer to it.
iter = tf.data.Iterator.from_structure(train_batches.output_types, train_batches.output_shapes)
xs, ys = iter.get_next()
# Initializer ops: running one of them makes the iterator read from that dataset.
train_init_op = iter.make_initializer(train_batches)
eval_init_op = iter.make_initializer(eval_batches)

模型定义

词向量初始化

In [None]:
import tensorflow as tf

d_model = 512  # embedding dimension

with tf.variable_scope("shared_weight_matrix"):
    _weight_mat = tf.get_variable(
        'weight_mat',
        shape=(vocab_size, d_model),
        dtype=tf.float32,
        initializer=tf.contrib.layers.xavier_initializer())
    # Replace row 0 with zeros so that token id 0 always embeds to the zero vector.
    _zero_row = tf.zeros(shape=[1, d_model])
    embeddings = tf.concat((_zero_row, _weight_mat[1:, :]), 0)  # (vocab_size, d_model)

print(embeddings.shape)

前向计算过程

编码

In [None]:
import numpy as np

# Model hyper-parameters.
num_blocks = 6            # number of stacked blocks
num_heads = 8             # number of attention heads
d_ff = 2048               # hidden size of the position-wise feed-forward network
dropout_rate = 0.3
epsilon = 1e-8            # added to the variance in layer normalization
padding_num = 1 - 2 ** 32  # large negative score written to masked attention positions

def encoding(xs_input, is_training):
    '''Encode a batch of source sentences with the Transformer encoder.

    xs_input: tuple (x, seqlens, sents1).
        x: int32 tensor of source token ids, (batch_size, T1), where T1 is the
            padded source length of the batch.
        seqlens: source lengths (not used here).
        sents1: source sentences; passed through untouched.
    is_training: boolean. Dropout is only applied when True.

    Returns
    memory: encoder outputs, (batch_size, T1, d_model).
    sents1: the source sentences from xs_input.
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs_input

        # Token embedding
        # x->(batch_size, T1), embeddings->(vocab_size, d_model), enc->(batch_size, T1, d_model)
        enc = tf.nn.embedding_lookup(embeddings, x)
        enc *= d_model ** 0.5  # scale the embeddings by sqrt(d_model)

        # Positional encoding
        E = enc.get_shape().as_list()[-1]  # static, equals d_model
        N, T = tf.shape(enc)[0], tf.shape(enc)[1]  # dynamic: batch_size, T1
        with tf.variable_scope("positional_encoding", reuse=tf.AUTO_REUSE):
            # position indices
            # [[0,1,...,T1-1],[0,1,...,T1-1],...[0,1,...,T1-1]]
            position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])  # (batch_size, T1)

            # First compute pos / 10000^(2i/d_model)
            position_enc = np.array([
                [pos / np.power(10000, (i-i%2)/E) for i in range(E)]
                for pos in range(maxlen1)])  # (maxlen1, d_model)

            # Then apply
            # PE(pos,2i) = sin(pos/10000^(2i/d_model))
            # PE(pos,2i+1) = cos(pos/10000^(2i/d_model))
            position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # even dimensions
            position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # odd dimensions
            # tf.nn.embedding_lookup expects a tensor
            position_enc = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen1, d_model)

            # lookup
            position_embeddings = tf.nn.embedding_lookup(position_enc, position_ind)  # (batch_size, T1, d_model)

            # masking: keep zeros wherever the token embedding is zero (padding)
            position_embeddings = tf.where(tf.equal(enc, 0), enc, position_embeddings)

            position_embeddings = tf.to_float(position_embeddings)

        enc += position_embeddings

        # embedding dropout
        enc = tf.layers.dropout(enc, dropout_rate, training=is_training)  # (batch_size, T1, d_model)

        # Encoder blocks: multi-head self-attention followed by a feed-forward network
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                with tf.variable_scope("multihead_attention", reuse=tf.AUTO_REUSE):
                    # Linear projections
                    Q = tf.layers.dense(enc, d_model, use_bias=False)  # (batch_size, T1, d_model)
                    K = tf.layers.dense(enc, d_model, use_bias=False)
                    V = tf.layers.dense(enc, d_model, use_bias=False)

                    # Split and concat
                    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T1, d_model/num_heads)
                    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T1, d_model/num_heads)
                    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T1, d_model/num_heads)

                    # Attention
                    with tf.variable_scope("scaled_dot_product_attention", reuse=tf.AUTO_REUSE):
                        d_k = Q_.get_shape().as_list()[-1]  # d_model/num_heads

                        # dot product
                        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (num_heads*batch_size, T1, T1)

                        # scale
                        outputs /= d_k ** 0.5

                        # key masking: padded key positions get a large negative score
                        masks = tf.sign(tf.reduce_sum(tf.abs(K_), axis=-1))  # (num_heads*batch_size, T1)
                        masks = tf.expand_dims(masks, 1)  # (num_heads*batch_size, 1, T1)
                        masks = tf.tile(masks, [1, tf.shape(Q_)[1], 1])  # (num_heads*batch_size, T1, T1)
                        paddings = tf.ones_like(outputs) * padding_num  # (num_heads*batch_size, T1, T1)
                        outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (num_heads*batch_size, T1, T1)

                        # softmax over the keys
                        outputs = tf.nn.softmax(outputs)
                        attention = tf.transpose(outputs, [0, 2, 1])
                        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

                        # query masking: zero the rows of padded query positions
                        masks = tf.sign(tf.reduce_sum(tf.abs(Q_), axis=-1))  # (num_heads*batch_size, T1)
                        masks = tf.expand_dims(masks, -1)  # (num_heads*batch_size, T1, 1)
                        masks = tf.tile(masks, [1, 1, tf.shape(K_)[1]])  # (num_heads*batch_size, T1, T1)
                        outputs = outputs * masks

                        # dropout
                        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training)

                        # weighted sum (context vectors)
                        outputs = tf.matmul(outputs, V_)  # (num_heads*batch_size, T1, d_model/num_heads)

                    # Restore shape
                    # (batch_size, T1, d_model)
                    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)

                    # Residual connection
                    outputs += enc

                    # Layer normalization over the last dimension
                    with tf.variable_scope("ln", reuse=tf.AUTO_REUSE):
                        outputs_shape = outputs.get_shape()
                        params_shape = outputs_shape[-1:]

                        mean, variance = tf.nn.moments(outputs, [-1], keep_dims=True)
                        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
                        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
                        normalized = (outputs - mean) / ((variance + epsilon) ** (.5))
                        outputs = gamma * normalized + beta

                # feed forwards
                with tf.variable_scope("positionwise_feedforward", reuse=tf.AUTO_REUSE):
                    # Inner layer
                    outputs_ff = tf.layers.dense(outputs, d_ff, activation=tf.nn.relu)

                    # Outer layer. It must consume the inner layer's activations;
                    # it used to be applied to `outputs`, which silently dropped
                    # the d_ff hidden layer. This changes the kernel shape of
                    # this layer, so checkpoints saved before the fix do not match.
                    outputs_ff = tf.layers.dense(outputs_ff, d_model)

                    # Residual connection
                    outputs_ff += outputs

                    # Layer normalization over the last dimension
                    with tf.variable_scope("ln", reuse=tf.AUTO_REUSE):
                        outputs_shape = outputs_ff.get_shape()
                        params_shape = outputs_shape[-1:]

                        mean, variance = tf.nn.moments(outputs_ff, [-1], keep_dims=True)
                        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
                        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
                        normalized = (outputs_ff - mean) / ((variance + epsilon) ** (.5))
                        outputs_ff = gamma * normalized + beta

                enc = outputs_ff  # (batch_size, T1, d_model)

    memory = enc
    return memory, sents1

# Build the training-time encoder on the iterator's source batch.
encoding_memory, sents1 = encoding(xs, True)

解码

In [None]:
def decoding(ys_input, memory, is_training):
    '''Decode a batch of target sentences with the Transformer decoder.

    ys_input: tuple (decoder_inputs, y, seqlens, sents2).
        decoder_inputs: int32 tensor of decoder input ids, (batch_size, T2),
            where T2 is the padded decoder input length of the batch.
        y: label ids; passed through untouched.
        seqlens: target lengths (not used here).
        sents2: target sentences; passed through untouched.
    memory: encoder outputs, (batch_size, T1, d_model).
    is_training: boolean. Dropout is only applied when True.

    Returns
    logits: float32 tensor, (batch_size, T2, vocab_size).
    preds: int32 tensor of argmax token ids, (batch_size, T2).
    y: the label ids from ys_input.
    sents2: the target sentences from ys_input.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys_input

        # embedding
        dec = tf.nn.embedding_lookup(embeddings, decoder_inputs)  # (batch_size, T2, d_model)
        dec *= d_model ** 0.5  # scale

        # Positional encoding
        E = dec.get_shape().as_list()[-1]  # static, equals d_model
        N, T = tf.shape(dec)[0], tf.shape(dec)[1]  # dynamic: batch_size, T2
        with tf.variable_scope("positional_encoding", reuse=tf.AUTO_REUSE):
            position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])  # (batch_size, T2)

            # pos / 10000^(2i/d_model), then sin on even and cos on odd dimensions
            position_enc = np.array([
                [pos / np.power(10000, (i-i%2)/E) for i in range(E)]
                for pos in range(maxlen2)])

            position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
            position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])
            position_enc = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen2, d_model)

            # lookup
            position_embeddings = tf.nn.embedding_lookup(position_enc, position_ind)  # (batch_size, T2, d_model)

            # masking: keep zeros wherever the token embedding is zero (padding)
            position_embeddings = tf.where(tf.equal(dec, 0), dec, position_embeddings)

            position_embeddings = tf.to_float(position_embeddings)

        dec += position_embeddings
        dec = tf.layers.dropout(dec, dropout_rate, training=is_training)

        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention
                with tf.variable_scope("self_attention", reuse=tf.AUTO_REUSE):
                    Q = tf.layers.dense(dec, d_model, use_bias=False)  # (batch_size, T2, d_model)
                    K = tf.layers.dense(dec, d_model, use_bias=False)
                    V = tf.layers.dense(dec, d_model, use_bias=False)

                    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T2, d_model/num_heads)
                    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T2, d_model/num_heads)
                    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T2, d_model/num_heads)

                    with tf.variable_scope("scaled_dot_product_attention", reuse=tf.AUTO_REUSE):
                        d_k = Q_.get_shape().as_list()[-1]  # d_model/num_heads

                        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (num_heads*batch_size, T2, T2)

                        outputs /= d_k ** 0.5

                        # key masking: padded key positions get a large negative score
                        masks = tf.sign(tf.reduce_sum(tf.abs(K_), axis=-1))  # (num_heads*batch_size, T2)
                        masks = tf.expand_dims(masks, 1)  # (num_heads*batch_size, 1, T2)
                        masks = tf.tile(masks, [1, tf.shape(Q_)[1], 1])  # (num_heads*batch_size, T2, T2)
                        paddings = tf.ones_like(outputs) * padding_num  # (num_heads*batch_size, T2, T2)
                        outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (num_heads*batch_size, T2, T2)

                        # Unlike the encoder: positions that reference the future are masked
                        diag_vals = tf.ones_like(outputs[0, :, :])  # (T2, T2)
                        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T2, T2), zeros above the diagonal
                        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (num_heads*batch_size, T2, T2)
                        paddings = tf.ones_like(masks) * padding_num
                        outputs = tf.where(tf.equal(masks, 0), paddings, outputs)

                        # softmax over the keys
                        outputs = tf.nn.softmax(outputs)
                        attention = tf.transpose(outputs, [0, 2, 1])
                        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

                        # query masking: zero the rows of padded query positions
                        masks = tf.sign(tf.reduce_sum(tf.abs(Q_), axis=-1))  # (num_heads*batch_size, T2)
                        masks = tf.expand_dims(masks, -1)  # (num_heads*batch_size, T2, 1)
                        masks = tf.tile(masks, [1, 1, tf.shape(K_)[1]])  # (num_heads*batch_size, T2, T2)
                        outputs = outputs * masks

                        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training)

                        outputs = tf.matmul(outputs, V_)  # (num_heads*batch_size, T2, d_model/num_heads)

                    # Restore shape: (batch_size, T2, d_model)
                    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)

                    # Residual connection
                    outputs += dec

                    # Layer normalization over the last dimension
                    with tf.variable_scope("ln", reuse=tf.AUTO_REUSE):
                        outputs_shape = outputs.get_shape()
                        params_shape = outputs_shape[-1:]

                        mean, variance = tf.nn.moments(outputs, [-1], keep_dims=True)
                        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
                        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
                        normalized = (outputs - mean) / ((variance + epsilon) ** (.5))
                        outputs = gamma * normalized + beta

                dec = outputs

                # Vanilla attention: queries from the decoder, keys/values from the encoder memory
                with tf.variable_scope("vanilla_attention", reuse=tf.AUTO_REUSE):
                    # Linear projections
                    Q = tf.layers.dense(dec, d_model, use_bias=False)  # (batch_size, T2, d_model)
                    K = tf.layers.dense(memory, d_model, use_bias=False)  # (batch_size, T1, d_model)
                    V = tf.layers.dense(memory, d_model, use_bias=False)  # (batch_size, T1, d_model)

                    # Split and concat
                    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T2, d_model/num_heads)
                    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T1, d_model/num_heads)
                    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (num_heads*batch_size, T1, d_model/num_heads)

                    # Attention
                    with tf.variable_scope("scaled_dot_product_attention", reuse=tf.AUTO_REUSE):
                        d_k = Q_.get_shape().as_list()[-1]

                        # dot product
                        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (num_heads*batch_size, T2, T1)

                        # scale
                        outputs /= d_k ** 0.5

                        # key masking
                        masks = tf.sign(tf.reduce_sum(tf.abs(K_), axis=-1))  # (num_heads*batch_size, T1)
                        masks = tf.expand_dims(masks, 1)  # (num_heads*batch_size, 1, T1)
                        masks = tf.tile(masks, [1, tf.shape(Q_)[1], 1])  # (num_heads*batch_size, T2, T1)
                        paddings = tf.ones_like(outputs) * padding_num
                        outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (num_heads*batch_size, T2, T1)

                        # softmax over the keys
                        outputs = tf.nn.softmax(outputs)
                        attention = tf.transpose(outputs, [0, 2, 1])
                        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

                        # query masking
                        masks = tf.sign(tf.reduce_sum(tf.abs(Q_), axis=-1))  # (num_heads*batch_size, T2)
                        masks = tf.expand_dims(masks, -1)  # (num_heads*batch_size, T2, 1)
                        masks = tf.tile(masks, [1, 1, tf.shape(K_)[1]])  # (num_heads*batch_size, T2, T1)
                        outputs = outputs * masks

                        # dropout
                        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=is_training)

                        # weighted sum (context vectors)
                        outputs = tf.matmul(outputs, V_)  # (num_heads*batch_size, T2, d_model/num_heads)

                    # Restore shape
                    outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (batch_size, T2, d_model)

                    # Residual connection
                    outputs += dec

                    # Layer normalization over the last dimension
                    with tf.variable_scope("ln", reuse=tf.AUTO_REUSE):
                        outputs_shape = outputs.get_shape()
                        params_shape = outputs_shape[-1:]

                        mean, variance = tf.nn.moments(outputs, [-1], keep_dims=True)
                        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
                        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
                        normalized = (outputs - mean) / ((variance + epsilon) ** (.5))
                        outputs = gamma * normalized + beta

                # feed forwards
                with tf.variable_scope("positionwise_feedforward", reuse=tf.AUTO_REUSE):
                    # Inner layer
                    outputs_ff = tf.layers.dense(outputs, d_ff, activation=tf.nn.relu)

                    # Outer layer. It must consume the inner layer's activations;
                    # it used to be applied to `outputs`, which silently dropped
                    # the d_ff hidden layer. This changes the kernel shape of
                    # this layer, so checkpoints saved before the fix do not match.
                    outputs_ff = tf.layers.dense(outputs_ff, d_model)

                    # Residual connection
                    outputs_ff += outputs

                    # Layer normalization over the last dimension
                    with tf.variable_scope("ln", reuse=tf.AUTO_REUSE):
                        outputs_shape = outputs_ff.get_shape()
                        params_shape = outputs_shape[-1:]

                        mean, variance = tf.nn.moments(outputs_ff, [-1], keep_dims=True)
                        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
                        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
                        normalized = (outputs_ff - mean) / ((variance + epsilon) ** (.5))
                        outputs_ff = gamma * normalized + beta

                dec = outputs_ff  # (batch_size, T2, d_model)

    # Final linear projection (embedding weights are shared)
    weights = tf.transpose(embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (batch_size, T2, vocab_size)
    preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (batch_size, T2)

    return logits, preds, y, sents2

# Build the training-time decoder on the iterator's target batch.
logits, preds, y, sents2 = decoding(ys, encoding_memory, True)

定义train scheme

label smooth

In [None]:
# Label smoothing: move a small share of the probability mass from the gold
# token to a uniform distribution over the vocabulary.
# The smoothing strength needs its own constant. Reusing the layer-norm
# `epsilon` (1e-8) left the one-hot labels practically unchanged, i.e. no
# smoothing at all. 0.1 is the value used in the original Transformer paper.
label_smoothing = 0.1
label_inputs = tf.one_hot(y, depth=vocab_size)  # one-hot over the vocabulary
label_V = label_inputs.get_shape().as_list()[-1]  # number of classes (vocab_size)
y_ = ((1 - label_smoothing) * label_inputs) + (label_smoothing / label_V)

loss

In [None]:
# Cross-entropy of every target position against the smoothed labels.
ce = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=logits)
# 1.0 for real tokens, 0.0 for <pad> positions.
nonpadding = tf.cast(tf.not_equal(y, token2idx["<pad>"]), tf.float32)
# Average over the non-padding positions; the small constant guards against
# a division by zero.
_masked_ce_sum = tf.reduce_sum(ce * nonpadding)
_num_tokens = tf.reduce_sum(nonpadding) + 1e-7
loss = _masked_ce_sum / _num_tokens

train_op

In [None]:
lr = 0.0003  # peak learning rate, reached at the end of the warmup
warmup_steps = 4000


def noam_scheme(init_lr, global_step, warmup_steps=4000.):
    '''Learning rate schedule with warmup followed by inverse square root decay.

    init_lr: scalar. Learning rate reached once warmup_steps steps are done.
    global_step: scalar. Current training step.
    warmup_steps: scalar. Number of steps over which the rate grows linearly.

    Returns
    scalar float32 tensor holding the learning rate for global_step.
    '''
    step = tf.cast(global_step + 1, dtype=tf.float32)
    peak_scale = init_lr * warmup_steps ** 0.5
    linear_warmup = step * warmup_steps ** -1.5
    inverse_sqrt_decay = step ** -0.5
    return peak_scale * tf.minimum(linear_warmup, inverse_sqrt_decay)


global_step = tf.train.get_or_create_global_step()
# From here on `lr` is the scheduled learning-rate tensor, not the float above.
lr = noam_scheme(lr, global_step, warmup_steps)
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.minimize(loss, global_step=global_step)

summary

In [None]:
# Scalar summaries written during training.
for _tag, _tensor in (("lr", lr), ("loss", loss), ("global_step", global_step)):
    tf.summary.scalar(_tag, _tensor)

# Merge every summary registered in the graph so far into one op.
train_summaries = tf.summary.merge_all()

定义评测和推理

In [None]:
import logging
from tqdm import tqdm

def convert_idx_to_token_tensor(inputs, idx2token):
    '''Turn a tensor of token ids into a string tensor of tokens.

    inputs: 1d int32 tensor. token ids.
    idx2token: dictionary mapping an id to its token.

    Returns
    string tensor holding the tokens joined by single spaces.
    '''
    def _ids_to_text(ids):
        tokens = [idx2token[idx] for idx in ids]
        return " ".join(tokens)

    # The dictionary lookup runs in Python, wrapped as a graph op.
    return tf.py_func(_ids_to_text, [inputs], tf.string)

def evaluation(xs, ys):
    '''Build the inference graph: greedy autoregressive decoding.

    xs: tuple (x, x_seqlen, sents1) as produced by the iterator.
    ys: tuple (decoder_inputs, y, y_seqlen, sents2) as produced by the
        iterator. The decoder inputs are ignored; decoding starts from <s>.

    Returns
    y_hat: int32 tensor of predicted token ids, (batch_size, maxlen2).
    summaries: merged summary op, including one randomly picked
        source / prediction / target triple.
    '''
    decoder_inputs, y, y_seqlen, sents2 = ys
    # Start every sentence of the batch with a single <s> token.
    decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * token2idx["<s>"]
    # Rebind ys so that the loop below really starts from <s>. Before, the
    # first step was fed the ground-truth decoder inputs from the iterator.
    ys = (decoder_inputs, y, y_seqlen, sents2)

    memory, sents1 = encoding(xs, False)
    logging.info("Inference graph is being built. Please be patient.")
    for _ in tqdm(range(maxlen2)):
        logits, y_hat, y, sents2 = decoding(ys, memory, False)
        # No early exit here: the graph is only being constructed, so whether
        # all predictions are <pad> is unknown at this point. The previous
        # `if tensor == int: break` compared a Tensor object with an int and
        # could never trigger.

        # Feed <s> plus everything predicted so far back into the decoder.
        _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
        ys = (_decoder_inputs, y, y_seqlen, sents2)

    # monitor a random sample
    # maxval is exclusive, so the batch size itself is the right upper bound;
    # batch_size - 1 never picked the last sample and failed for a batch of 1.
    n = tf.random_uniform((), 0, tf.shape(y_hat)[0], tf.int32)
    sent1 = sents1[n]
    pred = convert_idx_to_token_tensor(y_hat[n], idx2token)
    sent2 = sents2[n]

    tf.summary.text("sent1", sent1)
    tf.summary.text("pred", pred)
    tf.summary.text("sent2", sent2)
    summaries = tf.summary.merge_all()

    return y_hat, summaries

y_hat, eval_summaries = evaluation(xs, ys)

训练过程

In [None]:
# Settings of the training run.
num_epochs = 20  # number of passes over the training batches
logdir = 'log'  # directory for checkpoints, summaries and the variable specs file
evaldir = 'eval'  # directory for the translations written after each epoch

def save_variable_specs(fpath):
    '''Saves information about variables such as
    their name, shape, and total parameter number
    fpath: string. output file path

    Writes
    a text file named fpath. Its parent directory is created if missing.
    '''
    def _get_size(shp):
        '''Gets size of tensor shape
        shp: TensorShape

        Returns
        size (product of all dimensions)
        '''
        size = 1
        for dim in shp:
            size *= dim
        return size

    params, num_params = [], 0
    for v in tf.global_variables():
        params.append("{}==={}".format(v.name, v.shape))
        num_params += _get_size(v.shape)
    print("num_params: ", num_params)

    # On a fresh run the target directory (the log directory) does not exist
    # yet, and open() would fail with FileNotFoundError.
    parent = os.path.dirname(fpath)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(fpath, 'w', encoding='utf-8') as fout:
        fout.write("num_params: {}\n".format(num_params))
        fout.write("\n".join(params))
    logging.info("Variables info has been saved.")
    
# math.ceil is used below but `math` was never imported in this notebook.
import math

# NOTE(review): get_hypotheses, calc_bleu and hp are not defined in the cells
# visible here; presumably they come from a helper module -- confirm they are
# in scope before running this cell.

logging.info("# Session")
saver = tf.train.Saver(max_to_keep=num_epochs)
with tf.Session() as sess:
    # The specs file, the summaries and the checkpoints all go to logdir.
    os.makedirs(logdir, exist_ok=True)

    # Resume from the latest checkpoint if there is one.
    ckpt = tf.train.latest_checkpoint(logdir)
    if ckpt is None:
        logging.info("Initializing from scratch")
        sess.run(tf.global_variables_initializer())
        save_variable_specs(os.path.join(logdir, "specs"))
    else:
        saver.restore(sess, ckpt)

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)

    sess.run(train_init_op)
    total_steps = num_epochs * num_train_batches
    _gs = sess.run(global_step)
    for i in tqdm(range(_gs, total_steps+1)):
        _, _gs, _summary = sess.run([train_op, global_step, train_summaries])
        epoch = math.ceil(_gs / num_train_batches)
        summary_writer.add_summary(_summary, _gs)

        # End of an epoch: evaluate, write translations, save a checkpoint.
        if _gs and _gs % num_train_batches == 0:
            logging.info("epoch {} is done".format(epoch))
            _loss = sess.run(loss) # train loss

            logging.info("# test evaluation")
            # Switch the iterator to the evaluation data first. Within one
            # sess.run call the order of the initializer and the summaries
            # is not defined.
            sess.run(eval_init_op)
            _eval_summaries = sess.run(eval_summaries)
            summary_writer.add_summary(_eval_summaries, _gs)

            logging.info("# get hypotheses")
            hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, idx2token)

            logging.info("# write results")
            model_output = "en2zh_E%02dL%.2f" % (epoch, _loss)
            os.makedirs(evaldir, exist_ok=True)
            translation = os.path.join(evaldir, model_output)
            with open(translation, 'w', encoding='utf-8') as fout:
                fout.write("\n".join(hypotheses))

            logging.info("# calc bleu score and append it to translation")
            calc_bleu(hp.eval3, translation)

            logging.info("# save models")
            ckpt_name = os.path.join(logdir, model_output)
            saver.save(sess, ckpt_name, global_step=_gs)
            logging.info("after training of {} epochs, {} has been saved.".format(epoch, ckpt_name))

            logging.info("# fall back to train mode")
            sess.run(train_init_op)
    summary_writer.close()

logging.info("Done")

测试