In [25]:
'''
基于transformer的翻译模型 - pytorch教程
https://pytorch.org/tutorials/beginner/translation_transformer.html
https://pytorch.org/tutorials/beginner/transformer_tutorial.html
'''

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from timeit import default_timer as timer

In [26]:
''' 
第一部分：词表生成
'''

# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
# de和en各自的分词器
token_transform = {}
# de和en各自的词表
vocab_transform = {}

# 分词器
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

# helper function to yield list of tokens
# 句子分词
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        # 对句子分词
        yield token_transform[language](data_sample[language_index[language]])

# 特殊字符（未知,填充,开始,结束)
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# 从训练数据中, 分别生成en和de的词表
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    # 训练集
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    # 构建词表
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

print('分词:', token_transform)
print('词表:', vocab_transform)

分词: {'de': functools.partial(<function _spacy_tokenize at 0x0000015C795C4DC0>, spacy=<spacy.lang.de.German object at 0x0000015C88A18FA0>), 'en': functools.partial(<function _spacy_tokenize at 0x0000015C795C4DC0>, spacy=<spacy.lang.en.English object at 0x0000015C4D064430>)}
词表: {'de': Vocab(), 'en': Vocab()}


In [27]:
'''
 第二部分：翻译模型的定义
'''

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
# 位置编码，会在模型里加到词向量上面
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,         # 每个位置向量宽度
                 dropout: float,    # dropout率
                 maxlen: int = 5000): # 最多5000个词输入
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) # 生成emb_size/2宽的向量
        pos = torch.arange(0, maxlen).reshape(maxlen, 1) # 为每个词生成序号，竖起来
        pos_embedding = torch.zeros((maxlen, emb_size)) # 每个位置对应一个位置向量
        pos_embedding[:, 0::2] = torch.sin(pos * den) # 每个位置向量的偶数位置填充pos序号*sin
        pos_embedding[:, 1::2] = torch.cos(pos * den) # 每个位置向量的奇数位置填充pos序号*cos
        print('pos shape:', pos.size(), 'pos data:', pos)
        print('pos_embedding shape:', pos_embedding.size(), 'pos_embedding data:', pos_embedding)
        pos_embedding = pos_embedding.unsqueeze(-2) # 每个位置加上batch维
        print('pos_embedding unsqueeze -2 shape:', pos_embedding.size())
        
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding) # 固定向量

    def forward(self, token_embedding: Tensor):
        # 输入token序列长度在0维,所以pos emb只加到token序列的长度为止
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :]) # 输入的token向量序列和对应位置的pos向量相加

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
# 词id序列转emb序列
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size) 
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        # embedding对矩阵中每个词ID替换emb vector，不改变其他形状
        # tokens原本是(seq_size,batch_size),处理后是(seq_size,batch_size,emb_size)
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size) # 词id序列分别查emb向量,还elem-wise乘了一下sqrt emb size

# Seq2Seq Network
# 序列生成模型
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # transofmer is all you need 模型，encoder+decoder架构
        self.transformer = Transformer(d_model=emb_size, # 即encoder/decoder的输入词向量宽度
                                       nhead=nhead, # 注意力多头个数
                                       num_encoder_layers=num_encoder_layers, # encoder阶段堆叠(网络结构图中可堆叠的部分)
                                       num_decoder_layers=num_decoder_layers, # decoder阶段堆叠(网络结构图中可堆叠的部分)
                                       dim_feedforward=dim_feedforward, # feedforward结构的神经元个数
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size) # 接在encoder后面预测下一个词, 输入emb pooling, 神经元个数是翻译目标语言的词表大小
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size) # 将输入词id转成emb vector
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size) # 将输出词id转成emb vector
        self.positional_encoding = PositionalEncoding( # 位置向量
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src)) # 输入词Id序列->输入词emb序列->给每个位置的词emb叠加pos emb
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg)) # 输出词id序列->输出词emb序列->给每个位置的词emb叠加pos emb
        # src_emb和tgt_emb都是(seq_size,batch_size,emb_size)形状
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, # tgt_mask限制outputs输入时每个pos可以看到其他pos的范围
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)  # src_padding_mask和tgt_padding_mask告知每个pos上哪些句子是pad的占位符
        # decoder输出emb宽的pooling向量, 再过linear转词概率预测
        return self.generator(outs)

    # 推理时单独调用Encoder阶段
    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    # 推理时单独调用Decoder阶段
    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)


In [28]:
'''
 第三部分：准备训练阶段
'''

# 掩码是加到attention score上面的，这样-inf加上去就导致softmax为0，起到了忽略输入的效果
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1) # Pos=0的词只注意自己，pos=1的词注意pos=0和pos=1，pos=2的词注意pos=0,pos=1,pos=2
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) # 标0的转-inf float，其他标0
    return mask

# 参数是词id序列，返回模型所需的mask(attention mask和padding mask)
def create_mask(src, tgt):
    src_seq_len = src.shape[0] # 输入序列的词个数
    tgt_seq_len = tgt.shape[0] # 输出序列的词个数

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len) # 输出序列的mask，要mask住每个词后面的部分
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool) # 输入序列的mask，是全0的，不做任何mask

    # PAD填充位置填1,其他填0
    src_padding_mask = (src == PAD_IDX).transpose(0, 1) 
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE]) # de输入词表大小
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE]) # en输出词表大小
EMB_SIZE = 512 # 词向量宽度
NHEAD = 8   # 自注意力多头个数
FFN_HID_DIM = 512   # feedforward保持输出emb宽
BATCH_SIZE = 128    
NUM_ENCODER_LAYERS = 3 # encoder堆叠三层encoder
NUM_DECODER_LAYERS = 3 # decoder堆叠三层decoder

# 定义seq2seq模型
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
# 所有模型参数
for p in transformer.parameters():
    if p.dim() > 1: 
        nn.init.xavier_uniform_(p)  # 均匀分布初始化参数初始值

# 模型放到GPU上
transformer = transformer.to(DEVICE)

# linear预测的下一个词概率和真实下一个词求损失
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# 优化器
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# helper function to club together sequential operations
# 组装数据预处理流水线：分词->ID化->添加BOS和EOS id
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
# 准备de和en两种句子的预处理方法, 即构造流水线: 分词->id化->添加[BOS]和[EOS]
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
# 输入1批样本
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    # 对于每一对de句子和en句子
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n"))) # de的句子id序列
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n"))) # en的句子id序列

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)    # 这一batch的输入句子对齐长度,返回(max_seq_size,batch_size)的形状
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)    # 这一batch的输出句子对齐长度,返回(max_seq_size,batch_size)的形状
    return src_batch, tgt_batch

pos shape: torch.Size([5000, 1]) pos data: tensor([[   0],
        [   1],
        [   2],
        ...,
        [4997],
        [4998],
        [4999]])
pos_embedding shape: torch.Size([5000, 512]) pos_embedding data: tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 9.5625e-01, -2.9254e-01,  9.3594e-01,  ...,  8.5926e-01,
          4.9515e-01,  8.6881e-01],
        [ 2.7050e-01, -9.6272e-01,  8.2251e-01,  ...,  8.5920e-01,
          4.9524e-01,  8.6876e-01],
        [-6.6395e-01, -7.4778e-01,  1.4615e-03,  ...,  8.5915e-01,
          4.9533e-01,  8.6871e-01]])
pos_embedding unsqueeze -2 shape: torch.Size([5000, 1, 512])


In [32]:
''' 
 第四部分：开始训练
''' 

def train_epoch(model, optimizer):
    model.train() # 训练状态(dropout生效)
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)) # 数据集
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn) # 数据迭代器

    for src, tgt in train_dataloader:
        # 样本放入GPU
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)
        # src和tgt是(seq_len,batch_size)形状的
        #print('src size:', src.size(), 'tgt size:', tgt.size()) # src size: torch.Size([27, 128]) tgt size: torch.Size([24, 128])

        # decoder只输入[0,N-1)位置的token,以便让它预测[1,N-1]位置的token
        tgt_input = tgt[:-1, :]
        #print('tgt_input size', tgt_input.size()) # tgt_input size torch.Size([23, 128])

        # tgt_mask解决的是decoder不同pos对其他pos可见性问题，pad mask解决的是batch对齐后无效pos问题
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
        #print('src_mask size:', src_mask.size(), 'tgt_mask size:', tgt_mask.size()) # src_mask size: torch.Size([27, 27]) tgt_mask size: torch.Size([23, 23])
        #print('src_padding_mask size:', src_padding_mask.size(), 'tgt_padding_mask size:', tgt_padding_mask.size()) # src_padding_mask size: torch.Size([128, 27]) tgt_padding_mask size: torch.Size([128, 23])
        # print('src_mask:',src_mask)
        # print('tgt_mask:', tgt_mask)
 
        # forward，依据每个样本[0,tgt_size)位置的token，预测出[1,tgt_size]位置的token
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        #print('logits size:', logits.size()) # logits size: torch.Size([23, 128, 10837]), (词序列长度,批大小,词表大小) -> (每个句子23个词,128个句子,每个词有10837种可能)

        optimizer.zero_grad()
        
        # 计算每个样本[1,tgt_size]这些token id和预测出的[1,tgt_size]位置token概率的误差
        tgt_out = tgt[1:, :]
        #print('tgt_out size:', tgt_out.size())
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) # 把batch维直接去掉,这样就是每个样本的每个token的logis和每个样本的每个token id一一对应求loss
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))

NUM_EPOCHS = 16
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    #val_loss = evaluate(transformer)
    val_loss=0
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


src_mask: tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,

KeyboardInterrupt: 

In [30]:
''' 
 第五部分：推理
'''
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0)) # 有几个词就做几个词的attention mask向量
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
# 翻译方法, 输入de句子
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval() # 推理模式(dropout关闭)
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1) # src是(seq_size,batch_size),和train时候的dim顺序一样
    print('tranlate src:',src.shape)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    print('src_mask src:',src_mask.shape)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

tranlate src: torch.Size([11, 1])
src_mask src: torch.Size([11, 11])
 A group of people standing in front of an igloo . 
