In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import matplotlib.pyplot as plt
import numpy as np
import copy
from torch.autograd import Variable
import numpy as np
%matplotlib inline

# torch中变量封装函数Variable.
# 扩展资料: https://www.jb51.net/article/177996.htm
# 源码来自：https://github.com/harvardnlp/annotated-transformer/blob/master/AnnotatedTransformer.ipynb
# 解释来自：https://blog.csdn.net/m0_56192771/article/details/118087175        https://blog.csdn.net/JamesX666/article/details/126454270
# 知乎解释： https://zhuanlan.zhihu.com/p/398039366

In [6]:
x= torch.FloatTensor([[1,2],[3,4]])
var = Variable(tensor, requires_grad=True) # tensor不能反向传播，variable可以反向传播。
# y = torch.mean(x)
# y.backward()  ERROR
y = torch.mean(variable)
y.backward()
print(var.grad, var.data)  # var.data 转化为tensor形式


# requires_grad: True tensor需要计算梯度
# grad_fn：记录tensor如何得到，方便计算梯度
# grad：backward() 后可以查看tensor的梯度
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
# y.backward()  # RuntimeError: grad can be implicitly created only for scalar outputs  
# y不是一个标量，若scalar对tensor求导直接backward()；若tensor对tensor求导，先求出Jacobian矩阵中每个元素的梯度值，然后将这个Jacobian矩阵与grad_tensors参数对应的矩阵进行对应点乘
## grad_tensors=None
## retain_graph=None
## create_graph=False
## grad_variables=None
y.backward(torch.ones_like(y))
print(x.grad)

z = y*4
loss1 = z.mean()
loss2 = z.sum()
print(loss1, loss2)
loss1.backward() # 加上 retain_graph 保存backward（）后的中间参数，否则执行完backward（）后中间参数全部释放掉了，影响下一次的backward（）
print(loss1, loss2)
print(x.grad)
# loss2.backward() # RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed)

NameError: name 'tensor' is not defined

# 一、输入

## 1.1 词嵌入

In [7]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        """
        :param d_model: 词嵌入维度
        :param vocab: 词表大小
        """
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x): 
        """
        :param x: [batch, seq_len]
        :return: [batch, seq_len, d_model]
        """
#         w = self.lut.weight
#         print(w)
#         print(w.shape)
        return self.lut(x) * math.sqrt(self.d_model)  

In [8]:
d_model = 512 
vocab = 1000  

x = Variable(torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]]))  # [2, 4] [batch, seq_len]

# 模型实例化
emb = Embeddings(d_model, vocab)  # padding_idx=

# 输入输出
out_emb = emb(x)

out_emb.shape, out_emb # [2, 4, 512]  [batch, seq_len, d_model]

(torch.Size([2, 4, 512]),
 tensor([[[ -6.3135,  16.6665, -26.1533,  ..., -26.1079,  16.0799,  49.0220],
          [ -1.6383, -23.3749, -34.3804,  ...,  -3.0579,   1.4318, -41.3992],
          [ -9.6149,  32.7269,   9.7242,  ...,  36.6066,   7.1971,  25.6940],
          [ 13.2073,  -7.5062,  12.6934,  ..., -23.6169,  32.8903, -26.9443]],
 
         [[ -3.7118,   6.7728,  -2.1114,  ..., -14.4089, -34.6631,   1.5404],
          [-15.5792, -12.9442,  -0.7474,  ...,   9.8891, -15.4912,  19.8836],
          [-41.0385,  22.0167, -30.9122,  ..., -32.6049,   6.3138,  11.4358],
          [-45.0965,  15.8813, -11.8618,  ...,  20.6125, -28.9372, -14.4448]]],
        grad_fn=<MulBackward0>))

## 1.2 位置编码

In [9]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        """
        :param d_model: 词嵌入维度
        :param dropout: 置0比率，让部分神经元失效
        :param max_len: 语料库中最长句子的长度
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model) # [max_len, d_model]
        position = torch.arange(0, max_len).unsqueeze(1)  # [0~max_len-1, 1]
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)    # [1, max_len, d_model] 
        self.register_buffer("pe", pe)

    def forward(self, x):  
        """
        :param x:  [batch, seq_len, d_model]
        :return: [batch, seq_len, d_model]    max_len—>seq_len
        """
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)  # 默认 max_len=5000，适配 pe.size(1) == x.size(1)，不需要梯度计算
        return self.dropout(x) 

In [10]:
d_model = 512
dropout = 0.1  # 置0比率为0.1

# 模型实例化
pe = PositionalEncoding(d_model, dropout)

# 输入输出
out_pe = pe(out_emb)

out_pe.shape, out_pe  # [2, 4, 512] [batch, seq_len, d_model]

(torch.Size([2, 4, 512]),
 tensor([[[ -7.0150,  19.6294, -29.0592,  ..., -27.8977,  17.8666,  55.5800],
          [ -0.8853, -25.3718, -37.2873,  ...,  -0.0000,   1.5910, -44.8880],
          [ -9.6729,  35.9008,  11.8451,  ...,  41.7851,   7.9970,  29.6600],
          [ 14.8316,  -9.4402,   0.0000,  ...,  -0.0000,  36.5452, -28.8270]],
 
         [[ -4.1242,   8.6364,  -2.3460,  ..., -14.8987, -38.5145,   2.8227],
          [-16.3753, -13.7821,   0.0827,  ...,  12.0990, -17.2123,  23.2040],
          [-44.5880,  24.0006, -33.3065,  ..., -35.1165,   7.0156,  13.8175],
          [ -0.0000,  16.5459, -12.9075,  ...,  24.0139, -32.1521, -14.9386]]],
        grad_fn=<MulBackward0>))

In [11]:
# ## 绘制向量中特征的分布曲线         内核直接挂掉
# plt.figure(figsize=(15, 5))  # 创建一张 15*5 大小的画布
# pe = PositionalEncoding(20, 0)  # 词嵌入维度20， dropout=0
# y = pe(Variable(torch.zeros(1, 100, 20)))  # [batch, seq_len, d_model]
# plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
# plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
# plt.show()

# 二、注意力机制

## 2.1 掩码张量 函数

In [12]:
def subsequent_mask(size):
    """"
    :param size: 掩码张量最后两维度大小（方阵）
    :return: [1, size, size]
    """
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
        torch.uint8
    )
     # torch.triu 返回上三角阵，diagonal=1 对角线不是掩码
    return subsequent_mask == 0  # 下三角矩阵

In [13]:
size = 5 

# 函数输入输出
sm = subsequent_mask(size) # [1, size, size]
sm.shape, sm

(torch.Size([1, 5, 5]),
 tensor([[[ True, False, False, False, False],
          [ True,  True, False, False, False],
          [ True,  True,  True, False, False],
          [ True,  True,  True,  True, False],
          [ True,  True,  True,  True,  True]]]))

In [1]:
# plt.figure(figsize=(5, 5))
# plt.imshow(subsequent_mask(20)[0])
# plt.show()

NameError: name 'plt' is not defined

## 2.2 注意力机制 函数

In [78]:
"Compute 'Scaled Dot Product Attention'"
def attention(query, key, value, mask=None, dropout=None):
    """
    :param query: [batch, seq_len, d_model]
    :param key: [batch, seq_len, d_model]
    :param value: [batch, seq_len, d_model]
    :param mask: 
    :param dropout: 
    :return: [batch, seq_len, seq_len]*[batch, seq_len, d_model]=[batch, seq_len, d_model]  [batch, seq_len, seq_len]
    """
    d_k = query.size(-1)  # d_model 一般为词嵌入维度
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(
        d_k)  # [batch, seq_len, d_model]*[batch, d_model, seq_len] = [batch, seq_len, seq_len]
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)  # 0 处填充-1e9（-inf）, mask采用广播方式
#     print("scores:", scores.shape, scores)
    
    p_attn = scores.softmax(dim=-1) # [batch, seq_len, seq_len] 每个词语其他词的关系度
    
    if dropout is not None:
        p_attn = dropout(p_attn) 
    return torch.matmul(p_attn, value), p_attn

In [79]:
# 无掩码
query = key = value = out_pe
attn, p_attn = attention(query, key, value) 
print(p_attn.shape, p_attn)  
print(attn.shape, attn)  

torch.Size([2, 4, 4]) tensor([[[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]],

        [[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]], grad_fn=<SoftmaxBackward0>)
torch.Size([2, 4, 512]) tensor([[[ -7.0150,  19.6294, -29.0592,  ..., -27.8977,  17.8666,  55.5800],
         [ -0.8853, -25.3718, -37.2873,  ...,   0.0000,   1.5910, -44.8880],
         [ -9.6729,  35.9008,  11.8451,  ...,  41.7851,   7.9970,  29.6600],
         [ 14.8316,  -9.4402,   0.0000,  ...,   0.0000,  36.5452, -28.8270]],

        [[ -4.1242,   8.6364,  -2.3460,  ..., -14.8987, -38.5145,   2.8227],
         [-16.3753, -13.7821,   0.0827,  ...,  12.0990, -17.2123,  23.2040],
         [-44.5880,  24.0006, -33.3065,  ..., -35.1165,   7.0156,  13.8175],
         [  0.0000,  16.5459, -12.9075,  ...,  24.0139, -32.1521, -14.9386]]],
       grad_fn=<UnsafeViewBackward0>)


In [80]:
# 有/无 掩码
query = key = value = out_pe  # [2,4,512]  [batch, seq_len, d_model]

# mask = subsequent_mask(4) # [1,4,4]  [1, seq_len, seq_len]
# print(mask.shape, mask)
mask = None

attn, p_attn = attention(query, key, value, mask=mask)
print(p_attn.shape, p_attn) # [2,4,4]   [batch, seq_len, seq_len]
print(attn.shape, attn)   # [2,4,512]   [batch, seq_len, d_model]

torch.Size([2, 4, 4]) tensor([[[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]],

        [[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]], grad_fn=<SoftmaxBackward0>)
torch.Size([2, 4, 512]) tensor([[[ -7.0150,  19.6294, -29.0592,  ..., -27.8977,  17.8666,  55.5800],
         [ -0.8853, -25.3718, -37.2873,  ...,   0.0000,   1.5910, -44.8880],
         [ -9.6729,  35.9008,  11.8451,  ...,  41.7851,   7.9970,  29.6600],
         [ 14.8316,  -9.4402,   0.0000,  ...,   0.0000,  36.5452, -28.8270]],

        [[ -4.1242,   8.6364,  -2.3460,  ..., -14.8987, -38.5145,   2.8227],
         [-16.3753, -13.7821,   0.0827,  ...,  12.0990, -17.2123,  23.2040],
         [-44.5880,  24.0006, -33.3065,  ..., -35.1165,   7.0156,  13.8175],
         [  0.0000,  16.5459, -12.9075,  ...,  24.0139, -32.1521, -14.9386]]],
       grad_fn=<UnsafeViewBackward0>)


## 2.3 多头注意力机制

In [81]:
# 克隆函数
def clones(module, N):
    """
    :param module:
    :param N:
    :return:
    """
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) # 对model进行N次深度拷贝        
# nn.Sequential, nn.ModuleList 

In [82]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        """
        :param h: 头数
        :param d_model: 词嵌入维度
        :param dropout: 置0率
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0  
        self.d_k = d_model // h  # 每个头获得等量的词特征
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4) # 输入和输出维数相同
        # 4个线性层，Q，K，V个需要一个，最后concat需要一个
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """
        :param query: [batch, seq_len, d_model]
        :param key: [batch, seq_len, d_model]
        :param value: [batch, seq_len, d_model]
        :param mask: [head, seq_len, seq_len]
        :return: [batch, seq_len, d_model]
        """
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(0)  # [head, 1, seq_len, seq_len]
        nbatches = query.size(0)  # batch_size

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ] # [batch, h, seq_len, d_k]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
#         print(x.shape, self.attn.shape)  # [batch, h, seq_len, d_k]   [batch, h, seq_len, seq_len]

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) # [batch, seq_len, d_model]

        del query
        del key
        del value
        return self.linears[-1](x)  # 最后经过一个线性层

In [83]:
# 有/无 掩码
head = 8
d_model = 512
dropout = 0.2

query = value = key = out_pe  # [2, 4, 512] => [batch, seq_len, d_model]

# mask = subsequent_mask(4)  # [1, seq_len, seq_len]
mask = None

# 模型实例化
mha = MultiHeadedAttention(head, d_model, dropout)

# 输入输出
out_mha = mha(query, key, value, mask)
out_mha.shape, out_mha  # [2, 4, 512] [batch, seq_len, d_model]

(torch.Size([2, 4, 512]),
 tensor([[[  0.1201,  -6.7654,   0.5670,  ...,   6.3632,  -1.5364,   4.1432],
          [ -1.9374, -13.7956,  18.1052,  ..., -23.3191,  10.7850, -10.7807],
          [ 13.7793,  -6.2434,   2.9834,  ...,  -1.1365,  -2.2245,  -0.5206],
          [ -2.5482,   3.5856,   8.4269,  ...,  -0.2525,  -2.3977,   3.4267]],
 
         [[  3.0670,   7.3750, -10.7303,  ...,  -4.7194,  -4.7140, -10.2006],
          [ -1.4688,   3.3906,  24.8131,  ...,  22.3113,   2.1377,  21.2604],
          [  5.9435,  -5.2225,  -8.5219,  ...,  -9.9623,  -4.8698,  -4.9577],
          [  8.9981,  -6.1869,   7.4167,  ...,  -3.9225, -10.8719,  -9.9060]]],
        grad_fn=<ViewBackward0>))

# 三、 前馈全连接层

In [84]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        :param d_model: 线性层的输入维度
        :param d_ff: 线性层的输出维度
        :param dropout:
        """
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: [batch, seq_len, d_model]
        :return: [batch, seq_len, d_model]
        """
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [52]:
d_model = 512
d_ff = 64
dropput = 0.2

# 模型实例化
ff = PositionwiseFeedForward(d_model, d_ff, dropout)

#  输入输出
out_ff = ff(out_mha)

out_ff.shape, out_ff  # [2, 4, 512] => [batch, seq_len, d_model]

(torch.Size([2, 4, 512]),
 tensor([[[-2.2931,  3.1031,  6.6184,  ...,  0.0638,  1.8999,  2.3175],
          [ 0.8127, -0.1430,  0.1979,  ...,  1.8851, -1.7223,  1.4125],
          [-4.8497, -0.8918,  1.1326,  ..., -1.7741, -0.4349,  2.6070],
          [-1.9955, -0.6763,  3.5122,  ..., -0.7023,  0.6840, -0.3097]],
 
         [[-0.7978, -0.2064,  3.0988,  ...,  2.5182,  0.2775, -3.0580],
          [ 3.0043, -0.2703, -0.8180,  ...,  3.1509,  0.5949,  1.4028],
          [ 3.0991, -2.1678,  2.5744,  ...,  1.0916,  2.3163, -0.4694],
          [ 0.4183, -2.4270, -1.1791,  ..., -2.3256, -5.4808, -0.0842]]],
        grad_fn=<ViewBackward0>))

# 四、子层

## 4.1 规范化层

In [54]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        """
        :param features: 词嵌入维度
        :param eps: 足够小的数，防止分母为0
        """
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """
        :param x: [batch, seq_len, d_model]
        :return: [batch, seq_len, d_model]
        """
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [56]:
features = d_model = 512 
eps = 1e-6

# 模型实例化
ln = LayerNorm(features, eps)

# 输入输出
out_ln = ln(out_ff) # [2, 4, 512] => [batch, seq_len, d_model]

out_ln.shape, out_ln 

(torch.Size([2, 4, 512]),
 tensor([[[-9.3551e-01,  1.2052e+00,  2.5997e+00,  ..., -5.4127e-04,
            7.2786e-01,  8.9352e-01],
          [ 4.1133e-01, -4.0373e-02,  1.2075e-01,  ...,  9.1817e-01,
           -7.8682e-01,  6.9482e-01],
          [-1.6158e+00, -3.0775e-01,  3.6129e-01,  ..., -5.9933e-01,
           -1.5677e-01,  8.4854e-01],
          [-5.8981e-01, -1.7527e-01,  1.1409e+00,  ..., -1.8346e-01,
            2.5218e-01, -6.0097e-02]],
 
         [[-2.6271e-01, -5.5632e-02,  1.1018e+00,  ...,  8.9846e-01,
            1.1383e-01, -1.0542e+00],
          [ 1.4759e+00, -1.4704e-01, -4.1849e-01,  ...,  1.5485e+00,
            2.8177e-01,  6.8216e-01],
          [ 1.1042e+00, -7.5180e-01,  9.1925e-01,  ...,  3.9674e-01,
            8.2830e-01, -1.5333e-01],
          [ 2.2812e-01, -9.2426e-01, -4.1887e-01,  ..., -8.8322e-01,
           -2.1611e+00,  2.4591e-02]]], grad_fn=<AddBackward0>))

## 4.2 正则+残差

In [127]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        """
        :param size: 词嵌入维度
        :param dropout: 
        """
        super(SublayerConnection, self).__init__()
        self.layer_norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """
        :param x: [batch, seq_len, d_model]
        :param sub_mha: 实例化模型对象
        :return: [batch, seq_len, d_model]
        """
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.layer_norm(x)))

In [128]:
head = 8
size = d_model = 512
dropout = 0.2

# mask = subsequent_mask(4)
mask = None # 无掩码

# 实例化mutil-head 层
mha = MultiHeadedAttention(head, d_model)

# 函数类型子层
sub_mha = lambda x: mha(x, x, x, mask)   # Q,K,V

# 模型实例化
sc = SublayerConnection(size, dropout)

# 输入输出
out_sc = sc(out_pe, sub_mha)
out_sc.shape, out_sc  # [2, 4, 512] => [batch, seq_len, d_model]

(torch.Size([2, 4, 512]),
 tensor([[[-7.3245e+00,  1.9915e+01, -2.8705e+01,  ..., -2.8111e+01,
            1.7867e+01,  5.5224e+01],
          [-6.6200e-01, -2.5147e+01, -3.6596e+01,  ...,  0.0000e+00,
            2.0100e+00, -4.4888e+01],
          [-9.4958e+00,  3.5901e+01,  1.1775e+01,  ...,  4.1708e+01,
            7.3989e+00,  2.9574e+01],
          [ 1.4832e+01, -8.9624e+00,  0.0000e+00,  ..., -0.0000e+00,
            3.6257e+01, -2.8961e+01]],
 
         [[-3.9705e+00,  8.6364e+00, -2.3460e+00,  ..., -1.5398e+01,
           -3.8058e+01,  2.6414e+00],
          [-1.6705e+01, -1.3782e+01,  1.7031e-01,  ...,  1.1656e+01,
           -1.6830e+01,  2.3204e+01],
          [-4.4710e+01,  2.4001e+01, -3.2875e+01,  ..., -3.4772e+01,
            6.4465e+00,  1.4650e+01],
          [-1.7462e-02,  1.5975e+01, -1.3520e+01,  ...,  2.4157e+01,
           -3.2594e+01, -1.4680e+01]]], grad_fn=<AddBackward0>))

# 五、编码器

## 5.1 编码器层

In [129]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        """
        :param size: 词嵌入 维度
        :param self_attn: 多头 自注意力子层 实例化对象
        :param feed_forward: 前馈 全连接子层 实例化对象
        :param dropout: 
        """
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.layer_norm = clones(SublayerConnection(size, dropout), 2)
        self.size = size
        print(self.layer_norm)

    def forward(self, x, mask):
        """
        :param x: [batch, seq_len, d_model]
        :param mask: None 或 [1, seq_len, seq_len]
        :return: [batch, seq_len, d_model]
        """
        x = self.layer_norm[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.layer_norm[1](x, self.feed_forward)

In [107]:
size = 512
head = 8
d_model = 512
d_ff = 64
dropout = 0.2

# mask = subsequent_mask(4) 
mask = None


# 子层实例化
mha = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
 
# 模型实例化
el = EncoderLayer(size, mha, ff, dropout)

# 输入输出
out_el = el(out_pe, mask)

out_el.shape, out_el  # [2, 4, 512] => [batch, seq_len, d_model]

ModuleList(
  (0): SublayerConnection(
    (norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (1): SublayerConnection(
    (norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
)


(torch.Size([2, 4, 512]),
 tensor([[[ -6.9081,  18.1500, -29.2522,  ..., -28.0629,  18.5636,  55.2811],
          [ -0.9034, -25.8110, -36.8332,  ...,  -0.1725,   1.2486, -45.4309],
          [-10.1193,  35.5325,  10.8556,  ...,  41.3106,   8.7118,  29.2144],
          [ 15.3499,  -9.7299,   0.9824,  ...,   0.1809,  37.0082, -29.8782]],
 
         [[ -4.2244,   8.5951,  -1.1939,  ..., -15.5036, -38.4399,   2.3525],
          [-16.5874, -13.9688,  -0.0790,  ...,  12.0542, -17.6419,  22.5610],
          [-44.1769,  24.4744, -32.0079,  ..., -34.9427,   7.3760,  13.6904],
          [ -0.0963,  16.8771, -12.3203,  ...,  23.8834, -31.6645, -15.1174]]],
        grad_fn=<AddBackward0>))

## 5.2 编码器

In [133]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        """
        :param layer: 编码器层
        :param N: 层数
        """
        super(Encoder, self).__init__()
        self.encoder_layers = clones(layer, N)  # 深度拷贝
        self.layer_norm = LayerNorm(layer.size)  # d_model, size

    "Pass the input (and mask) through each layer in turn."
    def forward(self, x, mask):
        """
        :param x: [batch, seq_len, d_model]
        :param mask: [1, seq_len, seq_len]
        :return: [batch, seq_len, d_model]
        """
        for layer in self.encoder_layers:
            x = layer(x, mask)  # 输入输出
        return self.layer_norm(x)

In [134]:
head = 8
d_model = size = 512
d_ff = 64
dropout = 0.2
N = 8

c = copy.deepcopy
# mask = subsequent_mask(4) 
mask = None


# 子层实例化
mha = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)

# 编码层实例化
layer = EncoderLayer(size, c(mha), c(ff), dropout)
print(layer.size, layer)

# 编码器实例化
en = Encoder(layer, N)

# 输入输出 
out_en = en(out_pe, mask)  # [2, 4, 512] => [batch, seq_len, d_model]
out_en.shape, out_en

ModuleList(
  (0): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (1): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
)
512 EncoderLayer(
  (self_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=512, bias=True)
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): Linear(in_features=512, out_features=512, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (feed_forward): PositionwiseFeedForward(
    (w_1): Linear(in_features=512, out_features=64, bias=True)
    (w_2): Linear(in_features=64, out_features=512, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (layer_norm): ModuleList(
    (0): SublayerConnection(
      (layer_norm): LayerNorm()
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (1): SublayerConn

(torch.Size([2, 4, 512]),
 tensor([[[-5.2359e-01,  9.0716e-01, -1.1908e+00,  ..., -1.2454e+00,
            5.9877e-01,  1.9967e+00],
          [ 8.9224e-02, -8.8665e-01, -1.3271e+00,  ..., -9.3186e-02,
            1.5487e-01, -1.6146e+00],
          [-4.5447e-01,  1.5440e+00,  4.5354e-01,  ...,  1.5994e+00,
            4.5623e-01,  1.1132e+00],
          [ 8.0053e-01, -6.5956e-02, -1.1237e-01,  ..., -9.6918e-02,
            1.5565e+00, -9.3574e-01]],
 
         [[-2.7099e-01,  5.3902e-01, -2.6727e-01,  ..., -5.6800e-01,
           -1.5055e+00,  1.8255e-01],
          [-7.4601e-01, -3.5349e-01,  1.5411e-03,  ...,  5.8791e-01,
           -5.9251e-01,  7.4314e-01],
          [-1.9300e+00,  1.0558e+00, -1.2125e+00,  ..., -1.4497e+00,
            2.6200e-01,  5.1966e-01],
          [-3.0690e-01,  7.6097e-01, -6.0397e-01,  ...,  1.0218e+00,
           -1.3339e+00, -4.8538e-01]]], grad_fn=<AddBackward0>))

# 六、解码器

## 6.1 解码器层

In [137]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        """
        :param size: 词嵌入 维度
        :param self_attn: 多头 自注意力 实例化对象，Q=K=V
        :param src_attn: 掩码 多头 注意力 实例化对象，Q!=K=V
        :param feed_forward: 前馈 全连接层 实例化对象
        :param dropout: 置0比率，让部分神经元失效
        """
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.layer_norm = clones(SublayerConnection(size, dropout), 3)
        print(self.layer_norm)

    def forward(self, x, memory, src_mask, tgt_mask):
        """
        :param x: 上一层的输出  [batch, seq_len, d_model]
        :param memory: [batch, seq_len, d_model]
        :param src_mask: [1, seq_len, seq_len]
        :param tgt_mask: [1, seq_len, seq_len]
        :return: [batch, seq_len, d_model]
        """
        m = memory  # 来自encoder 而作为 decoder的 Key-Value
        x = self.layer_norm[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))  # 自注意力，需要掩码，防止信息泄漏
        x = self.layer_norm[1](x, lambda x: self.src_attn(x, m, m, src_mask))  # 常规注意力，需要掩码，遮掉没有意义的注意力值
        return self.layer_norm[2](x, self.feed_forward)

In [139]:
head = 8
size = d_mode = 512
d_ff = 64 
dropout = 0.2

# 模型设计
self_attn = src_attn = MultiHeadedAttention(head, d_model, dropout)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)

# 编码层的输出
memory = out_en # [batch, seq_len, d_model] 
x = out_pe

# decoder 掩码
mask = subsequent_mask(4) # seq_len
source_mask = target_mask = mask

# 模型实例化
dl = DecoderLayer(size, self_attn, src_attn, ff, dropout)

# 输入输出   decoder输入“目标数据”的词嵌入表示，形式上与encoder的“源数据”的词嵌入表示相同，这里使用out_pe来充当
out_dl = dl(x, memory, source_mask, target_mask) # [2, 4, 512] => [batch, seq_len, d_model]


out_dl.shape, out_dl

ModuleList(
  (0): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (1): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (2): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
)


(torch.Size([2, 4, 512]),
 tensor([[[ -6.8838,  19.6294, -28.9570,  ..., -27.7259,  17.3384,  55.9836],
          [  0.2049, -25.5464, -37.1251,  ...,   0.2234,   1.0651, -44.7951],
          [ -9.3173,  35.1236,  12.3871,  ...,  40.4247,   7.6465,  29.7795],
          [ 14.7570, -10.7893,   0.2505,  ...,  -0.0690,  36.4935, -27.8200]],
 
         [[ -3.2919,   8.8902,  -2.3629,  ..., -15.3256, -38.0702,   2.2155],
          [-16.0966, -14.3701,  -0.4338,  ...,  11.3824, -17.4821,  23.2981],
          [-44.4678,  24.2947, -33.6117,  ..., -35.9683,   6.9066,  13.2562],
          [  0.7189,  15.1474, -12.3291,  ...,  24.0046, -30.5892, -15.2597]]],
        grad_fn=<AddBackward0>))

## 6.2 解码器

In [143]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."

    def __init__(self, layer, N):
        """
        :param layer: 解码器层
        :param N: 解码器层的个数
        """
        super(Decoder, self).__init__()
        self.decoder_layers = clones(layer, N)
        self.layer_norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """
        :param x: [batch, seq_len, d_model]
        :param memory: [batch, seq_len, d_model]
        :param src_mask: [1, seq_len, seq_len]
        :param tgt_mask: [1, seq_len, seq_len]
        :return: [batch, seq_len, d_model]
        """
        for layer in self.decoder_layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.layer_norm(x)

In [144]:
size = 512
d_model = 512
head = 8
d_ff = 64
dropout = 0.2

c = copy.deepcopy

attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
print(layer.size, layer)

N = 8

x = out_pe
memory = out_en  # [2, 4, 512] => [batch, seq_len, d_model]

mask = subsequent_mask(4) # seq_len
source_mask = target_mask = mask

# 模型实例化
dl = Decoder(layer, N)

# 输入输出
out_de = dl(x, memory, source_mask, target_mask)

out_de.shape, out_de # [2, 4, 512] => [batch, seq_len, d_model]

ModuleList(
  (0): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (1): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (2): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.2, inplace=False)
  )
)
512 DecoderLayer(
  (self_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=512, bias=True)
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): Linear(in_features=512, out_features=512, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (src_attn): MultiHeadedAttention(
    (linears): ModuleList(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): Linear(in_features=512, out_features=512, bias=True)
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): Linear(in

(torch.Size([2, 4, 512]),
 tensor([[[-0.4535,  0.3907, -1.2888,  ..., -1.3470,  0.9963,  1.6831],
          [ 0.2102, -0.4513, -1.7127,  ...,  0.1351,  0.1781, -2.0479],
          [ 0.1133,  1.4191,  0.2536,  ...,  1.6202,  0.3326,  1.0890],
          [ 0.4995, -0.2464,  0.2218,  ..., -0.1123,  1.3521, -0.8444]],
 
         [[-0.4430,  0.5039,  0.4447,  ..., -0.5091, -1.6839, -0.2979],
          [-0.7008, -0.8361,  0.1640,  ...,  0.3541, -0.7514,  1.0254],
          [-1.7786,  0.9805, -1.3585,  ..., -1.5161,  0.4262, -0.0173],
          [-0.1903,  0.6061, -0.3315,  ...,  0.8285, -1.5397, -0.7765]]],
        grad_fn=<AddBackward0>))

# 七、输出 linear+softmax

In [156]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        """
        :param d_model: 词嵌入维度
        :param vocab: 词表大小
        """
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """
        :param x: [batch, seq_len, d_model]
        :return: [batch, seq_len, vocab_size]
        """
        return F.log_softmax(self.proj(x), dim=-1)

In [160]:
d_model = 512
vocab_size = 1000

x = out_de

# 模型实例化
gen = Generator(d_model, vocab_size)

# 输入输出
out_gen = gen(x)

print(torch.argmax(out_gen, dim = -1))
out_gen.shape, out_gen # [2, 4, 1000]

tensor([[ 42, 997, 486, 949],
        [913, 348, 804,  35]])


(torch.Size([2, 4, 1000]),
 tensor([[[-6.9900, -6.3757, -7.1832,  ..., -6.8251, -7.6340, -6.7575],
          [-7.4565, -7.1753, -6.5283,  ..., -5.1829, -6.6898, -7.4696],
          [-6.8189, -6.6836, -7.2612,  ..., -7.7652, -6.9435, -7.0307],
          [-6.6836, -7.2830, -6.7103,  ..., -6.4768, -7.0989, -6.7283]],
 
         [[-7.9600, -7.0918, -6.4814,  ..., -7.2976, -7.8630, -6.8312],
          [-6.7006, -7.6107, -5.6702,  ..., -6.8366, -7.1698, -7.2947],
          [-7.1123, -7.1748, -6.4731,  ..., -6.6584, -7.8360, -7.2967],
          [-8.0112, -6.4194, -6.8697,  ..., -6.8192, -6.9049, -7.2557]]],
        grad_fn=<LogSoftmaxBackward0>))

# 八、编码器-解码器 

In [169]:

class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        """
        :param encoder: 
        :param decoder: 
        :param source_embed: 源数据嵌入
        :param target_embed: 目标函数嵌入
        :param generator: 类别生成器
        """
        super(EncoderDecoder, self).__init__()
        # 将参数传入到类中
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = source_embed
        self.tgt_embed = target_embed
        self.generator = generator

    def forward(self, source, target, source_mask, target_mask):
        """
        :param source: 源数据
        :param target: 目标数据
        :param source_mask: 对应掩码张量
        :param target_mask: 对应掩码张量 
        :return: 
        """
        return self.decode(self.encode(source, source_mask), source_mask,
                            target, target_mask)

    def encode(self, source, source_mask):
        """
        :param source: 
        :param source_mask: 
        :return: 
        """
        return self.encoder(self.src_embed(source), source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        """
        :param memory: 
        :param source_mask: 
        :param target: 
        :param target_mask: 
        :return: 
        """
        return self.decoder(self.tgt_embed(target), memory, source_mask, target_mask)

In [170]:
vocab_size = 1000
d_model = 512

encoder = en
decoder = dl
source_embed = nn.Embedding(vocab_size, d_model)
target_embed = nn.Embedding(vocab_size, d_model)
generator = gen

# 假设源数据与目标数据相同，实际中并不相同
source = target = Variable(torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]]))

# 假设src_mask 与 tgt_mask相同，实际并不相容
source_mask = target_mask = subsequent_mask(4) # seq_len

# 模型实例化
en_de = EncoderDecoder(encoder, decoder, source_embed, target_embed, generator)

# 输入输出
out_ende = en_de(source, target, source_mask, target_mask)


out_ende.shape, out_ende

(torch.Size([2, 4, 512]),
 tensor([[[-0.3687,  0.6515, -1.0932,  ..., -0.2589,  0.4345, -1.9250],
          [ 0.2143,  1.3857, -1.0545,  ...,  0.3011,  0.4664, -2.2906],
          [ 0.1405,  1.9436, -0.4520,  ..., -0.4401, -0.5760, -2.0684],
          [-1.1548,  1.3899,  0.3455,  ..., -0.0339,  0.5120, -2.3459]],
 
         [[-2.7630,  2.1301,  0.9778,  ...,  0.5441,  0.7729, -2.5819],
          [-0.5491,  2.5325,  0.7174,  ..., -0.5978, -1.1866, -1.0995],
          [-0.6238,  1.4725,  1.9062,  ..., -1.3275,  0.9278, -1.7078],
          [-1.6447,  1.6734,  1.0431,  ..., -0.8960, -0.7198, -0.8516]]],
        grad_fn=<AddBackward0>))

# 九、transformer

In [178]:
def make_model(src_vocab, tgt_vocab, N=6,
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    :param src_vocab: 源数据特征（词汇）总数
    :param tgt_vocab: 目标数据特征（词汇）总数
    :param N: 编码器和解码器堆叠数
    :param d_model: 词嵌入维度
    :param d_ff: 前馈全连接网络中 变换矩阵的维度
    :param h: 头数
    :param dropout: 
    :return: 
    """
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn),
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [181]:
tmp_model = make_model(10, 10, 2)
print(tmp_model)

ModuleList(
  (0): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (1): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
ModuleList(
  (0): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (1): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (2): SublayerConnection(
    (layer_norm): LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
EncoderDecoder(
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, 

In [173]:
%who_ls  # 显示当前所有变量名称

[]