#### Layer Normalization部分

In [None]:
def ln(inputs, epsilon = 1e-8, scope="ln"):
    """Apply layer normalization over the last axis of `inputs`.

    The mean and variance are computed with `tf.nn.moments` along the last
    axis (kept as a size-1 axis so they broadcast), the input is standardized
    with them, and the result is scaled by a trainable `gamma` and shifted by
    a trainable `beta`.

    :param inputs: tensor whose last dimension is statically known; the
        original notes describe it as having two or more dimensions with
        batch_size first.
    :param epsilon: small constant added to the variance before taking the
        square root, so the division never hits zero.
    :param scope: variable scope name; variables are reused via AUTO_REUSE.
    :return: the normalized tensor `gamma * normalized + beta`.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # gamma/beta have one entry per feature, i.e. the last dimension only.
        feature_shape = inputs.get_shape()[-1:]

        mu, sigma_sq = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", feature_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", feature_shape, initializer=tf.ones_initializer())

        centered = inputs - mu
        std = (sigma_sq + epsilon) ** (.5)
        return gamma * (centered / std) + beta

#### Mask部分
 Padding Mask 与 Sequence Mask

`tf.reduce_sum()`: 计算张量tensor沿着某一维度的和，可以在求和后降维

`tf.sign()`: 返回符号(-1, 0, 1), y= sign(x) = -1 if x<0; 0 if x==0; 1 if x>0.

`tf.expand_dims()`: 通过axis参数指定位置。给定一个张量输入，该操作在其形状的axis索引处插入一个大小为1的维度

`tf.tile()`: 平铺之意，用于在同一维度上的复制，multiples参数维度与input维度应一致

`tf.where(input, a, b)`: a，b均为尺寸一致的tensor；input中为true的位置取a中对应位置的元素，其余位置取b中对应位置的元素
 

In [None]:
def mask(inputs, queries=None, keys=None, type=None):
    """Mask `inputs` for padded keys, padded queries, or future positions.

    A key/query position counts as padding when the sum of absolute values
    over its last axis is zero.

    :param inputs: tensor to mask, documented as (N, T_q, T_k)
    :param queries: documented as (N, T_q, d); used by "key" and "query" masking
    :param keys: documented as (N, T_k, d); used by "key" and "query" masking
    :param type: which mask to apply (the name shadows the builtin but is
        kept for caller compatibility):
        - "k" / "key" / "keys": entries whose key is padding are replaced
          with a large negative number (-2**32 + 1).
        - "q" / "query" / "queries": entries whose query is padding are
          multiplied by zero.
        - "f" / "future" / "right": entries where the lower-triangular mask
          is zero are replaced with the large negative number.
    :return: tensor with the same shape as `inputs`
    :raises ValueError: if `type` is not one of the values listed above

    e.g.,
    >> queries = tf.constant([[[1.],
                               [2.],
                               [0.]]], tf.float32) # (1, 3, 1)
    >> keys = tf.constant([[[4.],
                            [0.]]], tf.float32)  # (1, 2, 1)
    >> inputs = tf.constant([[[4., 0.],
                              [8., 0.],
                              [0., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "key")
    array([[[ 4.0000000e+00, -4.2949673e+09],
            [ 8.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09]]], dtype=float32)

    >> inputs = tf.constant([[[1., 0.],
                              [1., 0.],
                              [1., 0.]]], tf.float32)
    >> mask(inputs, queries, keys, "query")
    array([[[1., 0.],
            [1., 0.],
            [0., 0.]]], dtype=float32)
    """

    # Large negative fill value for masked-out entries.
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        # Generate masks: 0 where the key vector is all zeros, 1 otherwise
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)

        # Apply masks to inputs
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        # Generate masks: 0 where the query vector is all zeros, 1 otherwise
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)

        # Apply masks to inputs
        outputs = inputs*masks
    elif type in ("f", "future", "right"):
        # Lower-triangular matrix of ones, replicated for every batch item.
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)

        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        # Previously this branch only printed a hint and then crashed with an
        # UnboundLocalError on `return outputs`; fail with a clear error instead.
        raise ValueError(
            "Check if you entered type correctly! Got {!r}; expected one of "
            "'k'/'key'/'keys', 'q'/'query'/'queries', "
            "'f'/'future'/'right'.".format(type))

    return outputs

#### Encoder-Decoder Attention
此处是两个不同序列之间的attention，与来源于自身的 self-attention 相区别。context-attention 的实现方式有很多，这里使用的是 scaled dot-product：通过 query 和 key 的相似性程度来确定 value 的权重分布。

实际上这部分代码就是self attention用到的QKV的公式的核心代码，不管是Encoder-Decoder Attention还是Self Attention都是用的这里的scaled dot-product方法。

`tf.transpose(X, perm=None)`: 按照新维度序列转置矩阵

In [None]:
def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    """Scaled dot-product attention.

    See section 3.2.1 of the original paper:
    Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V

    :param Q: queries, 3-D tensor [N, T_q, d_k].
    :param K: keys, 3-D tensor [N, T_k, d_k] (it is multiplied against Q, so
        its last dimension must match Q's).
    :param V: values, 3-D tensor [N, T_k, d_v].
    :param causality: if True, future positions are masked before softmax.
    :param dropout_rate: dropout rate applied to the attention weights.
    :param training: boolean passed to dropout to switch it on or off.
    :param scope: variable scope name.
    :return: context vectors, (N, T_q, d_v).
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Static size of the query's last dimension, used for scaling.
        depth = Q.get_shape().as_list()[-1]

        # Raw similarity scores between every query and every key.
        scores = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)
        scores = scores / depth ** 0.5

        # Padded keys get a large negative score so softmax ignores them.
        scores = mask(scores, Q, K, type="key")

        # Optionally hide future positions as well.
        if causality:
            scores = mask(scores, type="future")

        weights = tf.nn.softmax(scores)

        # Log the first example's attention map as an image summary.
        attention = tf.transpose(weights, [0, 2, 1])
        tf.summary.image("attention", tf.expand_dims(attention[:1], -1))

        # Zero out rows that belong to padded queries.
        weights = mask(weights, Q, K, type="query")

        weights = tf.layers.dropout(weights, rate=dropout_rate, training=training)

        # Weighted sum of the values (context vectors).
        context = tf.matmul(weights, V)  # (N, T_q, d_v)

    return context

#### Multi-head Attention

In [None]:
def multihead_attention(queries, keys, values,
                        num_heads=8,
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    """Multi-head attention (section 3.2.2 of the original paper).

    Queries, keys and values are each linearly projected (without bias),
    split into `num_heads` slices along the feature axis, attended over in
    parallel with scaled dot-product attention, and concatenated back. The
    result gets a residual connection to `queries` followed by layer
    normalization.

    :param queries: 3-D tensor [N, T_q, d_model]
    :param keys: 3-D tensor [N, T_k, d_model]
    :param values: 3-D tensor [N, T_k, d_model]
    :param num_heads: number of attention heads
    :param dropout_rate: dropout rate forwarded to the attention function
    :param training: controls whether dropout is active
    :param causality: whether future positions are masked
    :param scope: variable scope name
    :return: 3-D tensor (N, T_q, d_model)
    """
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections, created in Q, K, V order so that the
        # automatically generated layer names stay the same.
        Q, K, V = [tf.layers.dense(tensor, d_model, use_bias=False)
                   for tensor in (queries, keys, values)]  # (N, T, d_model) each

        # Move the heads from the feature axis onto the batch axis.
        Q_, K_, V_ = [tf.concat(tf.split(projected, num_heads, axis=2), axis=0)
                      for projected in (Q, K, V)]  # (h*N, T, d_model/h) each

        attended = scaled_dot_product_attention(Q_, K_, V_, causality, dropout_rate, training)

        # Undo the head split: batch axis back to the feature axis.
        merged = tf.concat(tf.split(attended, num_heads, axis=0), axis=2)  # (N, T_q, d_model)

        # Residual connection, then layer normalization.
        return ln(merged + queries)

#### Positional Embedding
就目前而言，Transformer 架构还没有提取序列顺序的信息，这个信息对于序列而言非常重要，如果缺失了这个信息，可能我们的结果就是：所有词语都对了，但是无法组成有意义的语句。因此模型对序列中的词语出现的位置进行编码。论文中使用的方法是在编码向量的偶数维度（2i）使用正弦函数，在奇数维度（2i+1）使用余弦函数。

`tf.nn.embedding_lookup(params, ids)`: params可以是张量也可以是数组等，id就是对应的索引

In [None]:
def positional_encoding(inputs,
                        maxlen,
                        masking=True,
                        scope="positional_encoding"):
    """Build sinusoidal positional encodings shaped like `inputs`.

    The model has no recurrence or convolution, so absolute position
    information has to be injected explicitly. The encoding has the same
    last dimension as the embeddings so that the two can be added.

    :param inputs: tensor whose first two dimensions are read dynamically
        (as N and T) and whose last dimension E must be known statically.
    :param maxlen: number of positions in the precomputed encoding table;
        positions 0..T-1 are looked up in it.
    :param masking: if True, entries where `inputs` equals 0 are taken from
        `inputs` (i.e. stay zero) instead of the encoding.
    :param scope: variable scope name.
    :return: float32 tensor of positional encodings.
    """
    depth = inputs.get_shape().as_list()[-1]  # static
    batch_size, seq_len = tf.shape(inputs)[0], tf.shape(inputs)[1]  # dynamic
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Position index 0..T-1 repeated for every example in the batch.
        positions = tf.expand_dims(tf.range(seq_len), 0)
        position_ids = tf.tile(positions, [batch_size, 1])  # (N, T)

        # Argument of the sin/cos functions: pos / 10000^(2i/E). Columns 2i
        # and 2i+1 share the same divisor, hence the `col - col % 2`.
        divisors = [np.power(10000, (col - col % 2) / depth) for col in range(depth)]
        table = np.array([
            [pos / divisor for divisor in divisors]
            for pos in range(maxlen)])

        # Sine on the even columns, cosine on the odd columns.
        table[:, 0::2] = np.sin(table[:, 0::2])  # dim 2i
        table[:, 1::2] = np.cos(table[:, 1::2])  # dim 2i+1
        table = tf.convert_to_tensor(table, tf.float32)  # (maxlen, E)

        encoded = tf.nn.embedding_lookup(table, position_ids)

        if masking:
            # Keep zeros wherever the input itself is zero.
            encoded = tf.where(tf.equal(inputs, 0), inputs, encoded)

        return tf.to_float(encoded)

#### 整合Encoder与Decoder

In [None]:
def encode(self, xs, training=True):
    """Run the encoder stack over the source batch.

    `xs` is unpacked into three items; the first is used as token ids for
    the embedding lookup and the third is returned untouched.

    Returns
    memory: encoder outputs. (N, T1, d_model)
    sents1: the third element of `xs`, passed through.
    """
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, _, sents1 = xs

        # Token embeddings, scaled by sqrt(d_model).
        hidden = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        hidden = hidden * self.hp.d_model ** 0.5

        # Add position information, then apply dropout.
        hidden = hidden + positional_encoding(hidden, self.hp.maxlen1)
        hidden = tf.layers.dropout(hidden, self.hp.dropout_rate, training=training)

        # Stack of identical blocks: self-attention followed by feed forward.
        for block_idx in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(block_idx), reuse=tf.AUTO_REUSE):
                hidden = multihead_attention(queries=hidden,
                                             keys=hidden,
                                             values=hidden,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training,
                                             causality=False)
                hidden = ff(hidden, num_units=[self.hp.d_ff, self.hp.d_model])
    return hidden, sents1

def decode(self, ys, memory, training=True):
    """Run the decoder stack and project its output onto the vocabulary.

    `ys` is unpacked into four items; the first is used as decoder input
    token ids, the second and fourth are returned untouched.

    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    """
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, _, sents2 = ys

        # Token embeddings, scaled by sqrt(d_model).
        hidden = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        hidden = hidden * self.hp.d_model ** 0.5

        # Add position information, then apply dropout.
        hidden = hidden + positional_encoding(hidden, self.hp.maxlen2)
        hidden = tf.layers.dropout(hidden, self.hp.dropout_rate, training=training)

        for block_idx in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(block_idx), reuse=tf.AUTO_REUSE):
                # Masked self-attention: causality is on so a position cannot
                # attend to later positions.
                hidden = multihead_attention(queries=hidden,
                                             keys=hidden,
                                             values=hidden,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training,
                                             causality=True,
                                             scope="self_attention")

                # Attention over the encoder outputs (keys/values = memory).
                hidden = multihead_attention(queries=hidden,
                                             keys=memory,
                                             values=memory,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training,
                                             causality=False,
                                             scope="vanilla_attention")

                # Position-wise feed forward.
                hidden = ff(hidden, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection; the embedding matrix is reused (transposed)
    # as the output weights.
    output_weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', hidden, output_weights)  # (N, T2, vocab_size)
    predictions = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, predictions, y, sents2