In [1]:
import math
import random
import torch
import torch.nn as nn


In [2]:

max_length=16

class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # 初始化Shape为(max_len, d_model)的PE (positional encoding)
        pe = torch.zeros(max_len, d_model)
        # 初始化一个tensor [[0, 1, 2, 3, ...]]
        position = torch.arange(0, max_len).unsqueeze(1)
        # 这里就是sin和cos括号中的内容，通过e和ln进行了变换
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        # 计算PE(pos, 2i)
        pe[:, 0::2] = torch.sin(position * div_term)
        # 计算PE(pos, 2i+1)
        pe[:, 1::2] = torch.cos(position * div_term)
        # 为了方便计算，在最外面在unsqueeze出一个batch
        pe = pe.unsqueeze(0)
        # 如果一个参数不参与梯度下降，但又希望保存model的时候将其保存下来
        # 这个时候就可以用register_buffer
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x 为embedding后的inputs，例如(1,7, 128)，batch size为1,7个单词，单词维度为128
        """
        # 将x和positional encoding相加。
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)


class CopyTaskModel(nn.Module):

    def __init__(self, d_model=128):
        super(CopyTaskModel, self).__init__()

        # 定义词向量，词典数为10。我们不预测两位小数。
        self.embedding = nn.Embedding(num_embeddings=10, embedding_dim=128)
        # 定义Transformer。超参是我拍脑袋想的
        self.transformer = nn.Transformer(d_model=128, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=512, batch_first=True)

        # 定义位置编码器
        self.positional_encoding = PositionalEncoding(d_model, dropout=0)

        # 定义最后的线性层，这里并没有用Softmax，因为没必要。
        # 因为后面的CrossEntropyLoss中自带了
        self.predictor = nn.Linear(128, 10)

    def forward(self, src, tgt):
        # 生成mask
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size()[-1])
        src_key_padding_mask = CopyTaskModel.get_key_padding_mask(src)
        tgt_key_padding_mask = CopyTaskModel.get_key_padding_mask(tgt)

        # 对src和tgt进行编码
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        # 给src和tgt的token增加位置信息
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)

        # 将准备好的数据送给transformer
        out = self.transformer(src, tgt,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask)

        """
        这里直接返回transformer的结果。因为训练和推理时的行为不一样，
        所以在该模型外再进行线性层的预测。
        """
        return out

    @staticmethod
    def get_key_padding_mask(tokens):
        """
        用于key_padding_mask
        """
        key_padding_mask = torch.zeros(tokens.size())
        key_padding_mask[tokens == 2] = -torch.inf
        return key_padding_mask


In [3]:

model = CopyTaskModel()


src = torch.LongTensor([[0, 3, 4, 5, 6, 1, 2, 2]])
tgt = torch.LongTensor([[3, 4, 5, 6, 1, 2, 2]])
out = model(src, tgt)
print(out.size())
print(out)


criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)


def generate_random_batch(batch_size, max_length=16):
    src = []
    for i in range(batch_size):
        # 随机生成句子长度
        random_len = random.randint(1, max_length - 2)
        # 随机生成句子词汇，并在开头和结尾增加<bos>和<eos>
        random_nums = [0] + [random.randint(3, 9) for _ in range(random_len)] + [1]
        # 如果句子长度不足max_length，进行填充
        random_nums = random_nums + [2] * (max_length - random_len - 2)
        src.append(random_nums)
    src = torch.LongTensor(src)
    # tgt不要最后一个token
    tgt = src[:, :-1]
    # tgt_y不要第一个的token
    tgt_y = src[:, 1:]
    # 计算tgt_y，即要预测的有效token的数量
    n_tokens = (tgt_y != 2).sum()

    # 这里的n_tokens指的是我们要预测的tgt_y中有多少有效的token，后面计算loss要用
    return src, tgt, tgt_y, n_tokens

generate_random_batch(batch_size=2, max_length=6)

total_loss = 0

for step in range(2000):
    # 生成数据
    src, tgt, tgt_y, n_tokens = generate_random_batch(batch_size=2, max_length=max_length)

    # 清空梯度
    optimizer.zero_grad()
    # 进行transformer的计算
    out = model(src, tgt)
    # 将结果送给最后的线性层进行预测
    out = model.predictor(out)
    """
    计算损失。由于训练时我们的是对所有的输出都进行预测，所以需要对out进行reshape一下。
            我们的out的Shape为(batch_size, 词数, 词典大小)，view之后变为：
            (batch_size*词数, 词典大小)。
            而在这些预测结果中，我们只需要对非<pad>部分进行，所以需要进行正则化。也就是
            除以n_tokens。
    """
    loss = criteria(out.contiguous().view(-1, out.size(-1)), tgt_y.contiguous().view(-1)) / n_tokens
    # 计算梯度
    loss.backward()
    # 更新参数
    optimizer.step()

    total_loss += loss

    # 每40次打印一下loss
    if step != 0 and step % 40 == 0:
        print("Step {}, total_loss: {}".format(step, total_loss))
        total_loss = 0


torch.Size([1, 7, 128])
tensor([[[ 1.8950e+00,  1.7062e-01,  5.4905e-01, -2.3172e-01,  2.8475e-01,
          -2.8223e-01,  9.4189e-01, -1.1219e+00, -7.8631e-02,  8.3927e-01,
          -3.0845e-01,  1.6447e+00, -3.2682e-01, -1.0948e+00,  2.3635e-01,
          -3.8869e-01, -2.4674e-01, -1.2092e+00,  9.2062e-01, -7.3749e-02,
           2.5967e+00, -1.2252e-01, -7.9248e-01,  7.5257e-01, -8.5718e-02,
          -5.3537e-01, -5.2227e-01, -6.8542e-01,  1.5694e+00, -1.2384e+00,
          -7.8544e-01,  6.3387e-01, -3.8675e-01,  7.4982e-01, -7.7887e-01,
           7.3734e-01, -1.5025e+00, -3.0555e-01, -6.9884e-01,  6.9657e-01,
          -1.0176e-01,  7.8381e-01, -4.8636e-02,  9.3010e-01,  1.2965e+00,
          -1.5736e+00, -1.3617e-01,  1.0115e+00, -1.0702e+00,  4.0657e-02,
          -1.4999e+00, -1.3801e+00,  8.8314e-01, -4.5570e-01,  7.7958e-02,
           3.0774e-01,  1.1265e+00, -1.1450e+00, -9.5520e-01, -1.0418e+00,
           8.2487e-01,  1.9281e-01, -1.1435e+00, -7.0385e-01,  5.7181e-01,
 

  from .autonotebook import tqdm as notebook_tqdm


Step 40, total_loss: 3.7978146076202393
Step 80, total_loss: 2.533191204071045
Step 120, total_loss: 2.181631326675415
Step 160, total_loss: 2.1219639778137207
Step 200, total_loss: 2.059121608734131
Step 240, total_loss: 1.922585129737854
Step 280, total_loss: 1.9290006160736084
Step 320, total_loss: 1.7354027032852173
Step 360, total_loss: 1.6569610834121704
Step 400, total_loss: 1.6535319089889526
Step 440, total_loss: 1.5443171262741089
Step 480, total_loss: 1.409035325050354
Step 520, total_loss: 1.4427990913391113
Step 560, total_loss: 1.239930272102356
Step 600, total_loss: 1.115173101425171
Step 640, total_loss: 1.2584936618804932
Step 680, total_loss: 1.1264455318450928
Step 720, total_loss: 1.136805534362793
Step 760, total_loss: 0.9904664158821106
Step 800, total_loss: 1.0075844526290894
Step 840, total_loss: 0.9066217541694641
Step 880, total_loss: 0.8080472350120544
Step 920, total_loss: 0.7174995541572571
Step 960, total_loss: 0.8348253965377808
Step 1000, total_loss: 0.6

In [7]:

model = model.eval()
# 随便定义一个src
src = torch.LongTensor([[0, 4, 3, 4, 6, 8, 9, 9, 8, 1, 2, 2]])
# tgt从<bos>开始，看看能不能重新输出src中的值
tgt = torch.LongTensor([[0]])

# 一个一个词预测，直到预测为<eos>，或者达到句子最大长度
for i in range(max_length):
    # 进行transformer计算
    out = model(src, tgt)
    print(f"out {i}: {out.size()}")
    # 预测结果，因为只需要看最后一个词，所以取`out[:, -1]`
    predict = model.predictor(out[:, -1])
    # 找出最大值的index
    y = torch.argmax(predict, dim=1)
    # 和之前的预测结果拼接到一起
    # tgt = torch.concat([tgt, y.unsqueeze(0)], dim=1)
    tgt = torch.concat([y.unsqueeze(0)], dim=1)
    print(y)

    # 如果为<eos>，说明预测结束，跳出循环
    if y == 1:
        break
print(tgt)

out 0: torch.Size([1, 1, 128])
tensor([4])
out 1: torch.Size([1, 1, 128])
tensor([6])
out 2: torch.Size([1, 1, 128])
tensor([4])
out 3: torch.Size([1, 1, 128])
tensor([6])
out 4: torch.Size([1, 1, 128])
tensor([4])
out 5: torch.Size([1, 1, 128])
tensor([6])
out 6: torch.Size([1, 1, 128])
tensor([4])
out 7: torch.Size([1, 1, 128])
tensor([6])
out 8: torch.Size([1, 1, 128])
tensor([4])
out 9: torch.Size([1, 1, 128])
tensor([6])
out 10: torch.Size([1, 1, 128])
tensor([4])
out 11: torch.Size([1, 1, 128])
tensor([6])
out 12: torch.Size([1, 1, 128])
tensor([4])
out 13: torch.Size([1, 1, 128])
tensor([6])
out 14: torch.Size([1, 1, 128])
tensor([4])
out 15: torch.Size([1, 1, 128])
tensor([6])
tensor([[6]])
