# 手撕Transformer-小冬瓜AIGC

![contetn](image/content.png)

## 1 预处理requirements/configure/tokenizer/dataloader

### 1.1 requirements

In [None]:
!pip3 install torchtext==0.6.0
!pip3 install spacy
!pip3 install torch

In [None]:
!python3 -m spacy download de_core_news_sm
!python3 -m spacy download en_core_web_sm

In [3]:
import math
import time
import spacy
import torch

from torch import nn, optim
from torch.optim import Adam
from torch import tensor

### 1.2 configure配置参数

In [16]:
# Transformer 配置参数
# GPU device setting
 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 模型参数
batch_size = 128 # 训练批次 句话
max_len = 256    # 单句最大长度 
##
# padding=10

d_model = 512    # 词嵌入向量维度
n_layers = 6     # encoder/decoder层数量
n_heads = 8      # 注意力头数： 假如有词嵌入维度d_model = 512 / n_heads = 8 => 单头向量维度 512 / 8 = 64，即QKV维度
ffn_hidden = 2048 # 前向传播维度。 512 -> 2048 -> 512, 通常也称作proj
drop_prob = 0.1  # dropout提升鲁棒性，随机失活一些节点
n_hidden = ffn_hidden

# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 100
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

### 1.3 Tokenizer 英德文tokenzier

In [17]:
class Tokenizer:
    def __init__(self):
        self.spacy_de = spacy.load('de_core_news_sm')
        self.spacy_en = spacy.load('en_core_web_sm')

    def tokenize_de(self, text):
        return [tok.text for tok in self.spacy_de.tokenizer(text)]

    def tokenize_en(self, text):
        return [tok.text for tok in self.spacy_en.tokenizer(text)]
        # example
        # doc = nlp('This is an example sentence.')
        # tokens = [token.text for token in doc]
        # print(tokens)
        # ['This', 'is', 'an', 'example', 'sentence', '.']

# 加载Token
tokenizer = Tokenizer()
example = 'This is an example sentence.'
tokens = tokenizer.tokenize_en(example)
# tokenizer将句子按照单词分成list
print(example)
print(tokens)
# ['This', 'is', 'an', 'example', 'sentence', '.']

This is an example sentence.
['This', 'is', 'an', 'example', 'sentence', '.']


In [18]:
example = 'two young, white males are outside near many bushes'
tokens = tokenizer.tokenize_en(example)
print(example)
print(tokens)

two young, white males are outside near many bushes
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes']


### 1.4 Dataloader创建

In [19]:
from torchtext.data import Field, BucketIterator
from torchtext.datasets.translation import Multi30k
class DataLoader:
    source: Field = None
    target: Field = None
    def __init__(self, ext, tokenize_en, tokenize_de, init_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.init_token = init_token
        self.eos_token = eos_token
        print('dataset initializing start')

    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_en, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)

        elif self.ext == ('.en', '.de'):
            # Field() 函数返回一个 Field 类的实例，该实例有以下常用方法
            # build_vocab：根据数据集构建词汇表。
            self.source = Field(tokenize=self.tokenize_en, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_de, init_token=self.init_token, eos_token=self.eos_token,
                                lower=True, batch_first=True)
        # 拆分数据集
        train_data, valid_data, test_data = Multi30k.splits(exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    def build_vocab(self, train_data, min_freq):
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train, validate, test),
                                                                              batch_size=batch_size,
                                                                              device=device)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator

# 需要对整句加上句头句尾token [<sos>, 'This', 'is', 'an', 'example', 'sentence', '.',  <eos>] 
loader = DataLoader(ext=('.en', '.de'),
                    tokenize_en=tokenizer.tokenize_en,
                    tokenize_de=tokenizer.tokenize_de,
                    init_token='<sos>',
                    eos_token='<eos>')

# 创建 source/target Field实例（包含数据）
print('\n--------0. 根据spacy mutli30k 创建数据集-------')
train, valid, test = loader.make_dataset()
print(train.examples[0].src)
print(train.examples[0].trg)
print(len(train.examples))
print(len(test.examples))
print(len(valid.examples))


dataset initializing start

--------0. 根据spacy mutli30k 创建数据集-------
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
29000
1000
1014


In [20]:
loader.build_vocab(train_data=train, min_freq=2)
print('--------1. 查看词表大小-------')
print('src vocab size:', len(loader.source.vocab.stoi)) 
print('trg vocab size:', len(loader.target.vocab.stoi)) 

--------1. 查看词表大小-------
src vocab size: 5893
trg vocab size: 7853


In [21]:
print('--------2. 建立词表后，如何将单词转成token数值-------')
# print('查看词表:', loader.source.vocab.stoi)
print('word \t -> \t token')
print('<sos> \t \t',loader.source.vocab.stoi['<sos>'])
print('two \t \t',loader.source.vocab.stoi['two'])
print('young \t \t',loader.source.vocab.stoi['young'])
print(', \t \t',loader.source.vocab.stoi[','])
print('<eos> \t \t',loader.source.vocab.stoi['<eos>'])
print('<pad> \t \t',loader.source.vocab.stoi['<pad>'])

--------2. 建立词表后，如何将单词转成token数值-------
word 	 -> 	 token
<sos> 	 	 2
two 	 	 16
young 	 	 24
, 	 	 15
<eos> 	 	 3
<pad> 	 	 1


In [22]:
train_iter, valid_iter, test_iter = loader.make_iter(train, valid, test,
                                                     batch_size=batch_size,
                                                     device=device)
print('----3. 从迭代器中取一对，可见其开头为<sos>2, 结尾<eos>3， 剩余为<pad>1---------------')
print('padding的作用：一个batch中有不同的句子， 句子里最大句长为l, 小于l的句子都填充<pad>1')
for batch in train_iter:
    print(batch.src[0])
    print(batch.trg[0])
    break

dataset initializing done
----3. 从迭代器中取一对，可见其开头为<sos>2, 结尾<eos>3， 剩余为<pad>1---------------
padding的作用：一个batch中有不同的句子， 句子里最大句长为l, 小于l的句子都填充<pad>1
tensor([  2,   4,  14, 346,  20,   4, 153,  28,  21, 115, 154, 107,   8,   5,
          3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])
tensor([   2,    8,   16,  169,    5,  164,   21,    9,   35,    5, 1259,  116,
         441,    4,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1])


In [23]:
# print('----4. 以下词表参数也是模型中重要的部分----')
# src_pad_idx = loader.source.vocab.stoi['<pad>']
# trg_pad_idx = loader.target.vocab.stoi['<pad>']
# trg_sos_idx = loader.target.vocab.stoi['<sos>']
src_pad_idx = 1
trg_pad_idx = 1
trg_sos_idx = 2

In [24]:

enc_voc_size = 5893
dec_voc_size = 7853

# enc_voc_size = len(loader.source.vocab)
# print("嵌入层的输入参数 {} x 维度 {}".format(enc_voc_size,d_model))
# dec_voc_size = len(loader.target.vocab)
# print("全链接层输出维度 {} x 输出词表{}：".format(d_model,dec_voc_size))


In [25]:
# 从data中获取数据
# 仅运行一次，保证测试时使用同一组数据

# for i, batch in enumerate(train_iter):
#     src = batch.src
#     trg = batch.trg
#     print("save src shape:",src.shape)
#     print("save trg shape",trg.shape)
#     torch.save(src, 'tensor_src.pt')
#     torch.save(trg, 'tensor_trg.pt')
#     break

test_src = torch.load('tensor_src.pt')
test_trg = torch.load('tensor_trg.pt')
print("load src shape", test_src.shape)
print("load trg shape", test_trg.shape)

load src shape torch.Size([128, 36])
load trg shape torch.Size([128, 38])


## 1.5 评价指标

In [26]:
import nltk

def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    smoothing_function = nltk.translate.bleu_score.SmoothingFunction()
    bleu_score = nltk.translate.bleu_score.sentence_bleu(reference, candidate, smoothing_function=smoothing_function.method1)
    return bleu_score

# 示例用法
reference_sentence = "The cat is on the mat"
# candidate_sentence = "The cat is sitting on the mat"
candidate_sentence = "The cat is on the mat"
bleu = calculate_bleu(reference_sentence, candidate_sentence)
print("BLEU score:", bleu)

BLEU score: 1.0


In [27]:
from rouge import Rouge

def calculate_rouge(reference, candidate):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    rouge_1 = scores[0]['rouge-1']['f']
    rouge_2 = scores[0]['rouge-2']['f']
    rouge_l = scores[0]['rouge-l']['f']
    return rouge_1, rouge_2, rouge_l

# 示例用法
reference_summary = "The cat is on the mat"
candidate_summary = "The cat is sitting on the mat"
rouge_1, rouge_2, rouge_l = calculate_rouge(reference_summary, candidate_summary)
print("ROUGE-1 score:", rouge_1)
print("ROUGE-2 score:", rouge_2)
print("ROUGE-L score:", rouge_l)

ROUGE-1 score: 0.9230769181065088
ROUGE-2 score: 0.7272727223140496
ROUGE-L score: 0.9230769181065088


In [28]:
import jiwer

def calculate_wer(reference, candidate):
    wer = jiwer.wer(reference, candidate)
    return wer

# 示例用法
reference_transcription = "The cat is on the mat"
candidate_transcription = "The cat is sitting on the mat"
wer = calculate_wer(reference_transcription, candidate_transcription)
print("WER score:", wer)

WER score: 0.16666666666666666


## 2. 手撕Transformer模型

这个章节主要理解模型构造的过程，第3章会自顶向下debug 数据流

### 2.1.1 Token Embedding
目的将1个token转成一串向量
参照Word2Vec算法原理如下图示

Embdding Vec
数据类型流向 word(string) -> 【token(int) -> vec(list(float))】

以下为两个词对应的vec进行比较， 越相近的向量，词性相同
![0](image/embeddings-cosine-personality.png)

Word2Vec embedding

纵轴词表数量， 横轴vec词向量维度， 期望找出当前单词和右边相近的单词向量


![1](image/word2vec-lookup-embeddings.png)

SkipGram: 

假设"我是小冬瓜", 对于"冬"单词与"小"和"瓜"相近positive，与"我"间隔较远
![2](image/skipgram-sliding-window-5.png)

Data and model

则对于"冬"则与"冬-小"和"冬-瓜"相近label则为1， 人为构造负样本"冬-控","冬-龙","冬-抗","冬-狼"设置label为0
![3](image/word2vec-training-example-2.png)

根据所构造的样本，即可训练词表

Train error
![4](image/word2vec-training-update.png)

## embedding 实例

In [54]:
import torch.nn.functional as F
embd_layer = torch.nn.Embedding(14, 512)
print('embedding.weight', embd_layer.weight.shape)
print('embedding.weight:', embd_layer.weight[3,:10])

print(embd_layer.weight[4][:10])

input_id = torch.tensor([[2, 4, 5, 6, 7, 8, 3, 1, 1, 1], 
                      [2, 4, 9, 10,11,12,13,3, 1, 1],
                      [2, 6, 7, 8, 9, 10,11,12,13,3]])


print("输入数据",input_id.shape)
print("输入数据的embedding", embd_layer(input_id).shape)

print(embd_layer(input_id)[0][1][:10])

embedding.weight torch.Size([14, 512])
embedding.weight: tensor([ 0.3077,  1.6821, -0.8445,  1.7122,  0.6515,  0.2656,  0.5346,  0.7102,
        -0.5206,  1.4002], grad_fn=<SliceBackward0>)
tensor([ 1.6492,  0.6710, -0.0295, -0.8489,  1.3947, -0.5767,  0.7576,  0.4782,
         0.4737, -0.3594], grad_fn=<SliceBackward0>)
输入数据 torch.Size([3, 10])
输入数据的embedding torch.Size([3, 10, 512])
tensor([ 1.6492,  0.6710, -0.0295, -0.8489,  1.3947, -0.5767,  0.7576,  0.4782,
         0.4737, -0.3594], grad_fn=<SliceBackward0>)


In [30]:
print("embedding更多直接了解word2vec:")
print("按照以上理论可以直接，通过torch创建embedding表")
a = nn.Embedding(enc_voc_size, d_model)
# embedding_layer = nn.Embedding(14, 128)
print(a.weight.shape) # 14 * 128
print(input_id.shape) # 
x = a(input_id)
print(x.shape)

embedding更多直接了解word2vec:
按照以上理论可以直接，通过torch创建embedding表
torch.Size([5893, 512])
torch.Size([3, 10])
torch.Size([3, 10, 512])


In [31]:
# 创建Token embedding类
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)
        
test_src_token = TokenEmbedding(enc_voc_size, d_model) #对 src：en 进行embedding
test_trg_token = TokenEmbedding(dec_voc_size, d_model) #对 trg：de 进行embedding
print(test_src_token) 
print(test_trg_token)

TokenEmbedding(5893, 512, padding_idx=1)
TokenEmbedding(7853, 512, padding_idx=1)


### 2.1.2 position encoding

Position 编码公式

十进制13  ->  二进制(1,1,0,1) 这是一种位置编码向量: transformer中则使用连续函数描述向量的生成。

可直接记住公式， 也可以尝试通俗理解以下过程

(1,1,0,1)  两两成组 (1,1) (0,1) -> 4维/2=2组： 两组index为 i+1, i 

position encoding后为： (sin(13/(i+1)),cos/(13(i+1))、 ((sin(13/i),cos(13/i)))

则最后 (1,1,0,1) ->  (sin(13/(i+1)),cos(13(i+1))、 ((sin(13/i),cos(13/i)))


![title](image/positional_encoding.jpg)


以下为一种可视化理解如何从p,i变量生成位置编码

![pos](image/position_embeding_pos.png)
![pos_i](image/Fhc4M.png)



In [32]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, device):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False  
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # 512
        # 2x256 cos sin
        
    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

test_pos_encoding = PositionalEncoding(d_model, max_len, device)
print(test_pos_encoding.encoding.shape)
print(test_pos_encoding.encoding[255,:]) # 255 is position 


torch.Size([256, 512])
tensor([-0.5064, -0.8623,  0.8102,  0.5862, -0.9944,  0.1054,  0.4133, -0.9106,
         0.7891,  0.6142, -0.5736,  0.8192, -0.9598, -0.2807, -0.3029, -0.9530,
         0.4024, -0.9155,  0.7761, -0.6306,  0.9018, -0.4321,  0.9040, -0.4274,
         0.7908, -0.6121,  0.4624, -0.8867, -0.1569, -0.9876, -0.8389, -0.5443,
        -0.8985,  0.4391,  0.0994,  0.9950,  0.9971,  0.0763,  0.0795, -0.9968,
        -0.9965,  0.0837,  0.3968,  0.9179,  0.6316, -0.7753, -0.9985, -0.0547,
         0.6582,  0.7528, -0.0600, -0.9982, -0.4476,  0.8942,  0.7570, -0.6534,
        -0.9037,  0.4281,  0.9573, -0.2891, -0.9663,  0.2576,  0.9428, -0.3334,
        -0.8641,  0.5033,  0.6826, -0.7308, -0.3510,  0.9364, -0.1308, -0.9914,
         0.6554,  0.7553, -0.9834, -0.1812,  0.8371, -0.5471, -0.1461,  0.9893,
        -0.7031, -0.7111,  0.9773, -0.2120, -0.2734,  0.9619, -0.7682, -0.6402,
         0.8635, -0.5043,  0.2464,  0.9692, -0.9994,  0.0346,  0.1163, -0.9932,
         0.9787, 

### 2.1.3 LayerNorm

layer norm 公式

原图公式与主要四行代码一一对应

layernorm作用在最后一维进行归一化

![layer](image/layer_norm.jpg)

In [33]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # layernorm作用在(-1) 最后一维进行归一化
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out
    
test_ln = LayerNorm(d_model)
print(test_ln.gamma.shape)
print(test_ln.beta.shape)

torch.Size([512])
torch.Size([512])


### 2.1.4 Scaled-Dot-Production

scaled dot product 图示
class ScaleDotProductAttention(nn.Module)
![attention](image/scale_dot_product_attention.jpg)

In [34]:
# 单头注意力机制
# 图-代码-公式完全对应， 第3章节有详细推导
# 先记住实现
class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        batch_size, head, length, d_tensor = k.size() # /n_embd/8
        k_t = k.transpose(2, 3) 
        score = (q @ k_t) / math.sqrt(d_tensor) #qk^t/dk
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        score = self.softmax(score) #softmax(qk^t/dk)
        v = score @ v #softmax(qk^t/dk)*V
        return v, score

### 2.2.1 position wise feed forward

ffn
![layer](image/positionwise_feed_forward.jpg)

In [35]:
# 前向传播，当成神经网络全链接层 + 隐含层理解
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x
ffw = PositionwiseFeedForward(d_model, ffn_hidden)
print(ffw)

PositionwiseFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)


### 2.2.2 Multi-Head-Attention

multi-head-attention
![multiheadattention](image/multi_head_attention.jpg)

In [36]:
class MultiHeadAttention(nn.Module):
    # dmodel_n_embed; 512 8
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)  # 对应图里liner：先对QKV投影
        q, k, v = self.split(q), self.split(k), self.split(v) # Q->Q0, Q1, ... 
        out, attention = self.attention(q, k, v, mask=mask) # 每一头计算attention，z0, z1, ...
        out = self.concat(out) # 将每一头拼接 z0 z1 .. = z
        out = self.w_concat(out) # z -> linner -> output
        return out

    # 先不用看实现，后面会讲
    def split(self, tensor):
        batch_size, length, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
        return tensor

    # 先不用看实现，后面会讲
    def concat(self, tensor):
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor
    
test_multihead_attention = MultiHeadAttention(d_model, n_heads)
print(test_multihead_attention)
print(d_model, n_heads)

MultiHeadAttention(
  (attention): ScaleDotProductAttention(
    (softmax): Softmax(dim=-1)
  )
  (w_q): Linear(in_features=512, out_features=512, bias=True)
  (w_k): Linear(in_features=512, out_features=512, bias=True)
  (w_v): Linear(in_features=512, out_features=512, bias=True)
  (w_concat): Linear(in_features=512, out_features=512, bias=True)
)
512 8


### 2.2.3 Transformer Embeding

model.png：见input后的操作符token+position

![model](image/model.png)

In [37]:
# Transformer—embedding数据流：【嵌入向量+位置编码 ->  X】 -> QKV -> X
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        # 记住这里还有个Dropout
        return self.drop_out(tok_emb + pos_emb)
    
test_embedding = TransformerEmbedding(enc_voc_size, d_model, max_len, drop_prob, device)
print(test_embedding)

TransformerEmbedding(
  (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
  (pos_emb): PositionalEncoding()
  (drop_out): Dropout(p=0.1, inplace=False)
)


### 2.3.1 Transformer Encode Block

编解码：enc-dec

特别注意【每个 decoder block】都需要接受encoder的输出

![enc-dec](image/enc_dec.jpg)

In [38]:
# 单独一个encoder block
# 多个 encoder block 组成一个 encoder

# 可以叫encoder-layer 也可以叫 encoder-block
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, s_mask):
        # 1. compute self attention
        # print("encoder layer x: ", x.shape)
        _x = x
        x = self.attention(q=x, k=x, v=x, mask=s_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        # 3. positionwise feed forward network
        _x = x
        x = self.ffn(x)
      
        # 4. add and norm
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x
test_encoder_block = EncoderLayer(d_model, ffn_hidden, n_heads, drop_prob)
print(test_encoder_block)

EncoderLayer(
  (attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm1): LayerNorm()
  (dropout1): Dropout(p=0.1, inplace=False)
  (ffn): PositionwiseFeedForward(
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (relu): ReLU()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (norm2): LayerNorm()
  (dropout2): Dropout(p=0.1, inplace=False)
)


### 2.3.2 Transformer Decoder Block

In [39]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        # enc_dec_attention使用encoder的 Q， decoder的 K，V
        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        # 1. compute self attention
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=t_mask)#下三角矩阵
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=s_mask) # 
            
            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)
        
        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x
test_decoder_block = DecoderLayer(d_model, ffn_hidden, n_heads, drop_prob)
print(test_decoder_block)

DecoderLayer(
  (self_attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm1): LayerNorm()
  (dropout1): Dropout(p=0.1, inplace=False)
  (enc_dec_attention): MultiHeadAttention(
    (attention): ScaleDotProductAttention(
      (softmax): Softmax(dim=-1)
    )
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
  )
  (norm2): LayerNorm()
  (dropout2): Dropout(p=0.1, inplace=False)
  (ffn): PositionwiseFeedForward(
    (linear1): Linear(in_features=512, ou

### 2.3.3 Transformer Encoder

In [40]:
class Encoder(nn.Module):

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        max_len=max_len,
                                        vocab_size=enc_voc_size,
                                        drop_prob=drop_prob,
                                        device=device)

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x, s_mask):
        x = self.emb(x)
        # 每个encoder block的输入输出tensor是一致的
        for layer in self.layers:
            x = layer(x, s_mask)
        return x
    

test_encoder = Encoder(enc_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
print("encoder block size : ", len(test_encoder.layers))
print(test_encoder)

encoder block size :  6
Encoder(
  (emb): TransformerEmbedding(
    (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): MultiHeadAttention(
        (attention): ScaleDotProductAttention(
          (softmax): Softmax(dim=-1)
        )
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (w_concat): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm()
      (dropout1): Dropout(p=0.1, inplace=False)
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace

### 2.3.4 Transformer Decoder

In [41]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)

        # 这里的每个layer，都有decoder的enc_src输入
        for layer in self.layers:
            trg = layer(trg, enc_src, trg_mask, src_mask) # src_trg_mask

        # pass to LM head
        output = self.linear(trg)
        return output

test_decoder = Decoder(dec_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
print("decoder block size : ", len(test_decoder.layers))
print(test_encoder)

decoder block size :  6
Encoder(
  (emb): TransformerEmbedding(
    (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): MultiHeadAttention(
        (attention): ScaleDotProductAttention(
          (softmax): Softmax(dim=-1)
        )
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (w_concat): Linear(in_features=512, out_features=512, bias=True)
      )
      (norm1): LayerNorm()
      (dropout1): Dropout(p=0.1, inplace=False)
      (ffn): PositionwiseFeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace

### 2.4 Transformer结构

In [42]:
# 完整的Transfomer 类， 创建encoder / decoder
class Transformer(nn.Module):

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)

        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)

        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) * \
                   self.make_no_peak_mask(trg, trg)
        # encoder计算流程 src -> encoder -> enc_src
        # decoder计算流程 enc_src + trg -> decoder  -> output
        # 关于Mask后面会讲解
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output

    def make_pad_mask(self, q, k, q_pad_idx, k_pad_idx):
        len_q, len_k = q.size(1), k.size(1)

        # batch_size x 1 x 1 x len_k
        k = k.ne(k_pad_idx).unsqueeze(1).unsqueeze(2)
        # batch_size x 1 x len_q x len_k
        k = k.repeat(1, 1, len_q, 1)

        # batch_size x 1 x len_q x 1
        q = q.ne(q_pad_idx).unsqueeze(1).unsqueeze(3)
        # batch_size x 1 x len_q x len_k
        q = q.repeat(1, 1, 1, len_k)

        mask = k & q
        return mask

    def make_no_peak_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        # len_q x len_k
        mask = torch.tril(torch.ones(len_q, len_k)).type(torch.BoolTensor).to(self.device)
        return mask

## 3. 调试

### 3.1 创建Transformer model

In [43]:
# Transformer为./models/transformer.py里的模型类，包含多个对象和方法

model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

# 使用kaiming_uniform对model初始化
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)
        
model.apply(initialize_weights)

  nn.init.kaiming_uniform(m.weight.data)


Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
      (pos_emb): PositionalEncoding()
      (drop_out): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (attention): MultiHeadAttention(
          (attention): ScaleDotProductAttention(
            (softmax): Softmax(dim=-1)
          )
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_concat): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): Re

### 3.2 创建调试数据

In [44]:
# # 从data中获取数据
# for i, batch in enumerate(train_iter):
#     src = batch.src
#     trg = batch.trg
#     print("save src shape:",src.shape)
#     print("save trg shape",trg.shape)
#     torch.save(src, 'tensor_src.pt')
#     torch.save(trg, 'tensor_trg.pt')
#     break

test_src = torch.load('tensor_src.pt')
test_trg = torch.load('tensor_trg.pt')
print("load src shape", test_src.shape)
print("load trg shape", test_trg.shape)


load src shape torch.Size([128, 36])
load trg shape torch.Size([128, 38])


In [45]:
# 加载数据集, 从dataloader中获取
# 接下来所有数据计算，都基于batch(128)

src = torch.load('tensor_src.pt')
trg = torch.load('tensor_trg.pt')
print("load src shape", src.shape)
print("load trg shape", trg.shape)
print('batch size : {} and src length: {} '.format(src.shape[0], src.shape[1]))
print('batch size : {} and trg length: {} '.format(trg.shape[0], trg.shape[1]))
print('src [0]: ', src[0])
print('trg [0]: ', trg[0])
print('src_pad_idx:',src_pad_idx)
print('trg_pad_idx:',trg_pad_idx)
print('trg_sos_idx:',trg_sos_idx)

load src shape torch.Size([128, 36])
load trg shape torch.Size([128, 38])
batch size : 128 and src length: 36 
batch size : 128 and trg length: 38 
src [0]:  tensor([   2,   30, 1622,   58,   16,   30,   17,    6,    4,  565, 1028,    5,
           3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])
trg [0]:  tensor([  2,  30, 185,  23,   9,  35,  18,  30,  20,   0,  52,   4,   3,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1])
src_pad_idx: 1
trg_pad_idx: 1
trg_sos_idx: 2


### 3.3 创建mask

In [46]:
# 根据pad信息，创建mask，先忽略实现细节
src_mask = model.make_pad_mask(src, src, src_pad_idx, src_pad_idx)
src_trg_mask = model.make_pad_mask(trg, src, trg_pad_idx, src_pad_idx)
trg_mask = model.make_pad_mask(trg, trg, trg_pad_idx, trg_pad_idx) * \
            model.make_no_peak_mask(trg, trg)
print("src_mask:", src_mask.shape)
print("src_trg_mask:", src_trg_mask.shape)
print("trg_mask:", trg_mask.shape)

src_mask: torch.Size([128, 1, 36, 36])
src_trg_mask: torch.Size([128, 1, 38, 36])
trg_mask: torch.Size([128, 1, 38, 38])


In [47]:
# print(src_mask[0][0].int())
# print(src_trg_mask[0][0].int())
# # trg.Q.shape() * src.K^T.shape()
# print(trg_mask[0][0].int()) # 下三角

### 3.4 图解Transformer

![all](image/the_transformer_3.png)
![all](image/The_transformer_encoders_decoders.png)
![all](image/The_transformer_encoder_decoder_stack.png)

### 3.4.1 计算src->[encoder->decoder]->target

In [48]:
# # transformer 编码层和解码层计算
# print("查看模型：", model)
enc_src = model.encoder(src, src_mask)
output = model.decoder(trg, enc_src, trg_mask, src_trg_mask)
print(src.shape)
print(enc_src.shape)
print(output.shape)
print("decode voc size:", dec_voc_size)
print("d_model:", d_model)

torch.Size([128, 36])
torch.Size([128, 36, 512])
torch.Size([128, 38, 7853])
decode voc size: 7853
d_model: 512


![embedding](image/transformer_positional_encoding_vectors.png)

In [49]:
# encoder 编码层计算
# encoder包含emb和n_layers层

emb_src = model.encoder.emb(src)
print('src:', emb_src.shape)
print('emb_src:', emb_src.shape)
print('n_layers:', n_layers)
print('encode layers:', len(model.encoder.layers))
# encoder0 -> encoder1
for layer in model.encoder.layers:
    encoder_src = layer(emb_src, src_mask)
    print('encoder_src:', encoder_src.shape)

src: torch.Size([128, 36, 512])
emb_src: torch.Size([128, 36, 512])
n_layers: 6
encode layers: 6
encoder_src: torch.Size([128, 36, 512])
encoder_src: torch.Size([128, 36, 512])
encoder_src: torch.Size([128, 36, 512])
encoder_src: torch.Size([128, 36, 512])
encoder_src: torch.Size([128, 36, 512])
encoder_src: torch.Size([128, 36, 512])


### 3.4.2 计算input->embedding

数值position
![embedding-sample](image/transformer_positional_encoding_example.png)

In [50]:
# embedding 嵌入层计算
# models/embedding/transformer_embedding.py
# class TransformerEmbedding(nn.Module)

emb = model.encoder.emb
print(emb)
tok_emb = emb.tok_emb(src)
pos_emb = emb.pos_emb(src)
emb_out = emb.drop_out(tok_emb + pos_emb)
print('src:', src.shape)
print('tok_emb:', tok_emb.shape)
print('pos_emb:', pos_emb.shape)
print('emb_out:', emb_out.shape)

# tok_emb 使用 nn.embedding
# pos_emb 计算如下
# 512 / 2[cos/sin] -> i 256
print('\n-----------------------手撕position编码-----------------------')
# 位置编码仅计算一次
emb.pos_emb.encoding = torch.zeros(max_len, d_model)
print("位置编码向量tensor: ",emb.pos_emb.encoding.shape)

emb.pos_emb.encoding.requires_grad = False  # we don't need to compute gradient
pos = torch.arange(0, max_len)
print('pos:', pos.shape)
pos = pos.float().unsqueeze(dim=1)
print('pos 增加一个维度后:', pos.shape)

_2i = torch.arange(0, d_model, step=2, device=device).float()
print('_2i ', _2i.shape)
print('_2i[0:10] ', _2i[:10])

print('赋值pos_embeding')
emb.pos_emb.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
emb.pos_emb.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
print('--------------:', emb.pos_emb.encoding.shape)
print("打印前10个数据", emb.pos_emb.encoding[0:5, 0:5])

# 使用时
batch_size, seq_len = src.size()
print(batch_size)
print(seq_len)
print(emb.pos_emb.encoding[:seq_len, :].shape)

TransformerEmbedding(
  (tok_emb): TokenEmbedding(5893, 512, padding_idx=1)
  (pos_emb): PositionalEncoding()
  (drop_out): Dropout(p=0.1, inplace=False)
)
src: torch.Size([128, 36])
tok_emb: torch.Size([128, 36, 512])
pos_emb: torch.Size([36, 512])
emb_out: torch.Size([128, 36, 512])

-----------------------手撕position编码-----------------------
位置编码向量tensor:  torch.Size([256, 512])
pos: torch.Size([256])
pos 增加一个维度后: torch.Size([256, 1])
_2i  torch.Size([256])
_2i[0:10]  tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])
赋值pos_embeding
--------------: torch.Size([256, 512])
打印前10个数据 tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000],
        [ 0.8415,  0.5403,  0.8219,  0.5697,  0.8020],
        [ 0.9093, -0.4161,  0.9364, -0.3509,  0.9581],
        [ 0.1411, -0.9900,  0.2451, -0.9695,  0.3428],
        [-0.7568, -0.6536, -0.6572, -0.7537, -0.5486]])
128
36
torch.Size([36, 512])


In [51]:
import torch
PE = torch.randn(5, 4)

print(PE)
# print(PE[0,:])
# print(PE[1,:])
print(PE[3,:])

tensor([[-0.8943,  0.5794, -0.7674, -0.3635],
        [-1.1186, -0.4357,  0.1521, -2.6122],
        [-0.0393, -0.3453,  0.3196, -0.0532],
        [ 0.2895,  1.9149,  0.6449, -0.2646],
        [ 0.4749,  1.2114,  1.3111,  1.2232]])
tensor([ 0.2895,  1.9149,  0.6449, -0.2646])


### 3.4.3 计算 embedding->[encoder block]->output

encoder block 主要包含multi-head attention 和 feed forward position两个主要模块

![encoder_block](image/Transformer_encoder.png)

----

encoder block更加具体为

![encoder_block detail](image/transformer_resideual_layer_norm.png)

In [52]:
# 请尝试独自debug各层类

# 获取encode中的一个blocks
layer=model.encoder.layers[0]
# print(layer)

# 0. 保留输入向量, 用于short cut
emb_src = emb_out
_emb_src = emb_src 

# 1. 编码层 多头-自注意力机制（后面会详细介绍）
x = layer.attention(q=emb_src, k=emb_src, v=emb_src, mask=src_mask)

# 2. dropout和layer-norm（后面会介绍）
x = layer.dropout1(x)
x = layer.norm1(x + _emb_src) # shorcut连接

# 3. 基于位置的前向传播将维度512->2048->512
_x = x
x = layer.ffn(x)

# 4. dropout + shortcut + layer-norm
x = layer.dropout2(x)
x = layer.norm2(x + _x)



shortcut目的在以保留信息，防止信息损失，见ResNet

![shortcut](image/transformer_resideual_layer_norm_2.png)

### 3.4.4 计算 embeding->[multi-head-attention]->score



输入输出
![multi-head](image/transformer_attention_heads_z.png)


------
输出拼接
![multi-concate](image/transformer_attention_heads_weight_matrix_o.png)

-----
Multi-head-attention计算流程
![multi-head-attention-pipeline](image/transformer_multi-headed_self-attention-recap.png)
---

---

![wq](image/self-attention-matrix-calculation.png)
---


![8头](image/transformer_attention_heads_qkv.png)


In [53]:
# multihead多头注意力计算

# encode multi-attention直接计算多头注意力分数
multi_head_attention = model.encoder.layers[0].attention
print("multi_head_attention层包含:", multi_head_attention)
x_attention_out = multi_head_attention(q=emb_src, k=emb_src, v=emb_src, mask=src_mask)
print("emb_src:", emb_src.shape)
print("x_attention_out:", x_attention_out.shape)
print("以下为多头注意力forward分解步骤：")

# 0. 自注意力向量
q = k = v = emb_src # embdedding+positional = x
print("\n 0. 输入向量emb_src:", emb_src.shape)
print("q.shape:", q.shape)
print("k.shape:", k.shape)
print("v.shape:", v.shape)

# 1. liner转化
print("\n 1. 对qkv liner 转化")
q = multi_head_attention.w_q(q)
k = multi_head_attention.w_k(k)
v = multi_head_attention.w_v(v)
print("q=f(q): ", q.shape)

# 2. 将输入向量拆成n_head
print("\n 2. 将输入向量拆成n_head")
print("n_heads:", n_heads)
print("multi_head_attention.n_head:", multi_head_attention.n_head)
_q = q

# do split multi_head_attention.split()
print('*-------multi_head_attention.split()-------------*')
batch_size, length, d_model = _q.size()
d_tensor = d_model // multi_head_attention.n_head
print("d_model:{} / n_heads:{} = d_tensor:{}".format(d_model, n_heads, d_tensor))
print("单头向量维度为:", d_tensor)
_q_split = _q.view(batch_size, length, multi_head_attention.n_head, d_tensor).transpose(1, 2)
print("_q_split:", _q_split.shape)
print('*-------multi_head_attention.split()-------------*')

q, k, v = multi_head_attention.split(q), multi_head_attention.split(k), multi_head_attention.split(v)
print("shape = [batch_size:128, heads:8, length:29, d_tensor:64]")
print("multi_head_attention.split(q):", q.shape)
print("multi_head_attention.split(k):", k.shape)
print("multi_head_attention.split(v):", v.shape)


# 3. do scale dot product to compute similarity
# 计算每一头的attention（scale and dot attention）
print("\n 3. 计算单头注意力, scale and dot attention")
print("上面将512维度分成8头64维")
print("会独立介绍单头注意力的计算")
_q_single = q
_k_single = k
_v_single = v
out, attention = multi_head_attention.attention(q, k, v, mask=src_mask)
print("对每一头进行自注意力后的结果:", out.shape)

# 4. concat and pass to linear layer
print("\n 4. 将8头64维拼接成512维度向量")
_out = out 
# do concat 

print('*-------multi_head_attention.concat()-------------*')
print("multi_head_attention.concat() 函数示例")
batch_size, head, length, d_tensor = _out.size()
d_model = head * d_tensor
_out_concat = _out.transpose(1, 2).contiguous().view(batch_size, length, d_model)
print("concat 操作后", _out_concat.shape)
print('*-------multi_head_attention.concat()-------------*')

out = multi_head_attention.concat(out)
print("after concat out shape:", out.shape)
out = multi_head_attention.w_concat(out)
print("对多头注意力输出再进行前向传播", out.shape)

multi_head_attention层包含: MultiHeadAttention(
  (attention): ScaleDotProductAttention(
    (softmax): Softmax(dim=-1)
  )
  (w_q): Linear(in_features=512, out_features=512, bias=True)
  (w_k): Linear(in_features=512, out_features=512, bias=True)
  (w_v): Linear(in_features=512, out_features=512, bias=True)
  (w_concat): Linear(in_features=512, out_features=512, bias=True)
)
emb_src: torch.Size([128, 36, 512])
x_attention_out: torch.Size([128, 36, 512])
以下为多头注意力forward分解步骤：

 0. 输入向量emb_src: torch.Size([128, 36, 512])
q.shape: torch.Size([128, 36, 512])
k.shape: torch.Size([128, 36, 512])
v.shape: torch.Size([128, 36, 512])

 1. 对qkv liner 转化
q=f(q):  torch.Size([128, 36, 512])

 2. 将输入向量拆成n_head
n_heads: 8
multi_head_attention.n_head: 8
*-------multi_head_attention.split()-------------*
d_model:512 / n_heads:8 = d_tensor:64
单头向量维度为: 64
_q_split: torch.Size([128, 8, 36, 64])
*-------multi_head_attention.split()-------------*
shape = [batch_size:128, heads:8, length:29, d_tensor:64]
multi

### 3.4.5 计算 [scale-dot-production] :  mask(q@k^t/scaled)@v

![pipeline](image/self-attention-matrix-calculation-2.png)
![pipeline_qkv2](image/self-attention-output.png)

In [74]:
# attention, 单头注意力计算
# models/layer/scale_dot_product_attention.py
# class ScaleDotProductAttention(nn.Module)

attention = multi_head_attention.attention
print(attention)

# input is 4 dimension tensor
# [batch_size, head, length, d_tensor]
k = _k_single
q = _q_single
v = _v_single
batch_size, head, length, d_tensor = k.size()

print('tensor中的格式： 只关注length句长， d_tensor向量长度')
print('[batch_size:{}, head:{}, length:{}, d_tensor:{}]'.format(batch_size,head,length,d_tensor))

# 1. dot product Query with Key^T to compute similarity
k_t = k.transpose(2, 3)  # transpose

print("q:", q.shape)
print("k_t:", k_t.shape)
score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
########   dot   ####### scaled #######
print("通过计算两个向量的点积dot操作:score=q@k_t: ", score.shape)
print("每个词与词之间计算相关性")
print("score代表注意力分数 ")
print("src_mask:", src_mask.shape)
# 2. apply masking (opt)
if src_mask is not None:
    score = score.masked_fill(src_mask == 0, -10000)
# 3. pass them softmax to make [0, 1] range
score = attention.softmax(score)
# 4. multiply with Value
print("v:", v.shape)
v = score @ v
print("score * v:", v.shape)
print("score * v: 代表注意力特征向量，即每个词在当前这个句子中的特征表达")

ScaleDotProductAttention(
  (softmax): Softmax(dim=-1)
)
tensor中的格式： 只关注length句长， d_tensor向量长度
[batch_size:128, head:8, length:36, d_tensor:64]
q: torch.Size([128, 8, 36, 64])
k_t: torch.Size([128, 8, 64, 36])
通过计算两个向量的点积dot操作:score=q@k_t:  torch.Size([128, 8, 36, 36])
每个词与词之间计算相关性
score代表注意力分数 
src_mask: torch.Size([128, 1, 36, 36])
v: torch.Size([128, 8, 36, 64])
score * v: torch.Size([128, 8, 36, 64])
score * v: 代表注意力特征向量，即每个词在当前这个句子中的特征表达



一个句子中：关于'it'单词的 单头自注意力score 30个词 [1,30,1] 'it'
![vis-1](image/transformer_self-attention_visualization.png)

一个句子中：关于'it'单词的 两头自注意力score 30个词 [2, 30,1] 'it'
![vis-2](image/transformer_self-attention_visualization_2.png)


一个句子中：关于'it'单词的 八头自注意力score 30个词 [8, 30,1] 'it'
![vis-3](image/transformer_self-attention_visualization_3.png)


一个句子中：关于30个单词的 八头自注意力score 30个词 [8, 30,30] 

128个句子中：关于30个单词的 八头自注意力score 30个词 [128, 8, 30,30] 



QK可视化 score
![vis-gpt](image/gpt2-self-attention-scoring-2.png)

### 3.4.6 计算 emb_src->[layer normaliztion] ->multihead attention

layer norm 公式
class LayerNorm(nn.Module)
![layer](image/layer_norm.jpg)

In [75]:
# Layer Normalization, 层归一化
# models/layer/layer_norm.py
# class LayerNorm(nn.Module)

norm = model.encoder.layers[0].norm1
print(norm)

x = emb_src
print("==============LayerNorm===========")
print("LayerNorm gamma: ", norm.gamma.shape)
print("LayerNorm beta: ", norm.beta.shape)
print("LayerNorm eps: ", norm.eps)

mean = x.mean(-1, keepdim=True)
print("LayerNorm mean: ", mean.shape)

var = x.var(-1, unbiased=False, keepdim=True)
print("LayerNorm var: ", var.shape)
# '-1' means last dimension. 

out = (x - mean) / torch.sqrt(var + norm.eps)
print("LayerNorm norm out: ", out.shape)

out = norm.gamma * out + norm.beta
print("LayerNorm norm out offset: ", out.shape)


LayerNorm()
LayerNorm gamma:  torch.Size([512])
LayerNorm beta:  torch.Size([512])
LayerNorm eps:  1e-12
LayerNorm mean:  torch.Size([128, 36, 1])
LayerNorm var:  torch.Size([128, 36, 1])
LayerNorm norm out:  torch.Size([128, 36, 512])
LayerNorm norm out offset:  torch.Size([128, 36, 512])


### 3.4.7 计算attention-> [position-wise-feed-forward]->layernorm

In [76]:
# PositionwiseFeedForward, 位置前向传播
# models/layer/position_wise_feed_forward.py
# class PositionwiseFeedForward(nn.Module)
print("PositionwiseFeedForward, 位置前向传播")
ffn = model.encoder.layers[0].ffn
print(ffn)
print("n_hidden: ", ffn_hidden)

_x = emb_src
print("1. before linear:", _x.shape)

_x = ffn.linear1(_x)
print("2. after linear1:", _x.shape)

_x = ffn.relu(_x)
_x = ffn.dropout(_x)
_x = ffn.linear2(_x)
print("3. after linear2:", _x.shape)

PositionwiseFeedForward, 位置前向传播
PositionwiseFeedForward(
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)
n_hidden:  2048
1. before linear: torch.Size([128, 36, 512])
2. after linear1: torch.Size([128, 36, 2048])
3. after linear2: torch.Size([128, 36, 512])


### 3.4.8 计算enc_src+emb_trg->[decoder]->output

![decoder_dataflow_block](image/The_transformer_encoder_decoder_stack.png)
![decoder_dataflow](image/transformer_resideual_layer_norm_3.png)
![decoder_pipeline_single](image/transformer_decoding_2.gif)

In [77]:
# Decoder, 解码结构
# models/model/decoder.py
# class Decoder(nn.Module)
print("解码层结构：")
# print(model.decoder)

print("解码层输入target和编码层一样做embeding")
print("trg输入", trg.shape)
emb_trg = model.decoder.emb(trg) # target -> Label mask
print("trg embding", emb_trg.shape)
print("解码层数:", len(model.decoder.layers))

# encoder - > encoder K encoder V
# decoder Q

for layer in model.decoder.layers:
    # 注意这里需要有编码层的输入
    decode_trg = layer(emb_trg, enc_src, trg_mask, src_trg_mask)
print("编码层输出：", decode_trg.shape)
# pass to LM head
output_decode = model.decoder.linear(decode_trg)
print("编码层liner处理：", output_decode.shape)



解码层结构：
解码层输入target和编码层一样做embeding
trg输入 torch.Size([128, 38])
trg embding torch.Size([128, 38, 512])
解码层数: 6
编码层输出： torch.Size([128, 38, 512])
编码层liner处理： torch.Size([128, 38, 7853])


### 3.4.9 计算[decoder block]: decode-self-attention -> enc-dec-attention ->ffn

![encoder-decoder](image/transformer_resideual_layer_norm_3.png)

In [78]:
# DecoderLayer, 解码层
# models/blocks/decoder_layer.py
# class DecoderLayer(nn.Module)

# decode layer
layer = model.decoder.layers[0]
# print("decode layer结构：")
# print(layer)

dec = emb_trg
enc = enc_src
_x = dec

x = layer.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
x = layer.dropout1(x)
x = layer.norm1(x + _x)

if enc is not None:
    # 3. compute encoder - decoder attention
    _x = x
    # 多头注意力机制
    print('q: trg_x:', x.shape)
    print('k: enc:', enc.shape)
    print('v: enc:', enc.shape)
    print('mask: src_trg_mask:', src_trg_mask.shape)
    x = layer.enc_dec_attention(q=x, k=enc, v=enc, mask=src_trg_mask)
    print("enc->dec 注意力后: ", x.shape)
    # 4. add and norm
    x = layer.dropout2(x)
    x = layer.norm2(x + _x)

# 5. positionwise feed forward network
_x = x
x = layer.ffn(x)

# 6. add and norm
x = layer.dropout3(x)
x = layer.norm3(x + _x)


q: trg_x: torch.Size([128, 38, 512])
k: enc: torch.Size([128, 36, 512])
v: enc: torch.Size([128, 36, 512])
mask: src_trg_mask: torch.Size([128, 1, 38, 36])
enc->dec 注意力后:  torch.Size([128, 38, 512])


### 3.4.10 计算loss : output->[Cross Entropy loss]->logits->loss

![loss](image/transformer_decoder_output_softmax.png)
![loss_vocab](image/output_trained_model_probability_distributions.png)


In [79]:
## 损失计算
print('损失计算，使用交叉损失：')
criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
print('src:', src.shape)
print('trg:', trg.shape)
print('trg[:, :-1]:', trg[:, :-1].shape)
output = model(src, trg[:, :-1])
print('output:', output.shape)
output_reshape = output.contiguous().view(-1, output.shape[-1])
print('output_reshape:', output_reshape.shape)

trg_view = trg[:, 1:].contiguous().view(-1)
print('trg.view(-1):', trg_view.shape)
loss = criterion(output_reshape, trg_view)
print('loss:', loss)
loss.backward()

损失计算，使用交叉损失：
src: torch.Size([128, 36])
trg: torch.Size([128, 38])
trg[:, :-1]: torch.Size([128, 37])
output: torch.Size([128, 37, 7853])
output_reshape: torch.Size([4736, 7853])
trg.view(-1): torch.Size([4736])
loss: tensor(10.3012, grad_fn=<NllLossBackward0>)


### 3.4.11 编解码Mask计算原理enc-dec-mask

In [64]:
## mask机制
print("src_mask:", src_mask.shape)
print("src_trg_mask:", src_trg_mask.shape)
print("trg_mask:", trg_mask.shape)
# print(src_mask[0][0].int())
# print(src_trg_mask[0][0].int())
print(src_mask[0,0,:5,:5].int())
print(trg_mask[0,0,:5,:5].int())
print(src_trg_mask[0,0,:20,:20].int())


src_mask: torch.Size([128, 1, 36, 36])
src_trg_mask: torch.Size([128, 1, 38, 36])
trg_mask: torch.Size([128, 1, 38, 38])
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1,

In [81]:
# encode-decode-mask
layer = model.decoder.layers[0]
# print("decode layer结构：")
# print(layer)

dec = emb_trg
enc = enc_src

# _x = dec
# 1. decode self attention for target 
# x = layer.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
# x = layer.dropout1(x)
# x = layer.norm1(x + _x)

# 2. ecode-decode-attention + mask
# if enc is not None:
#     # 3. compute encoder - decoder attention
#     _x = x
#     # 多头注意力机制
#     print('q: trg_x:', x.shape)
#     print('k: enc:', enc.shape)
#     print('v: enc:', enc.shape)
#     print('mask: src_trg_mask:', src_trg_mask.shape)
#     x = layer.enc_dec_attention(q=x, k=enc, v=enc, mask=src_trg_mask)

# layer.enc_dec_attention 多头
# layer.enc_dec_attention.attention() 单头


q_dec = dec
q = q_dec = layer.enc_dec_attention.split(dec)
k = k_enc = _k_single
v = v_enc = _v_single

batch_size, head, length, d_tensor = k.size()

# 1. dot product Query with Key^T to compute similarity
k_t = k.transpose(2, 3)  # transpose


print("q:", q.shape)
print("k_t:", k_t.shape)
score = (q @ k_t) / math.sqrt(d_tensor)  # scaled dot product
print("score:", score.shape)
# 2. apply masking (opt)
if src_trg_mask is not None: # 实际预测时，没有mask，会预测出终止标志符号
    print("enc-dec-mask:",src_trg_mask.shape)
    score = score.masked_fill(src_trg_mask == 0, -10000)
# 3. pass them softmax to make [0, 1] range
score = attention.softmax(score)
# 4. multiply with Value
print("v:", v.shape)
v = score @ v
print("score * v:", v.shape)


q: torch.Size([128, 8, 38, 64])
k_t: torch.Size([128, 8, 64, 36])
score: torch.Size([128, 8, 38, 36])
enc-dec-mask: torch.Size([128, 1, 38, 36])
v: torch.Size([128, 8, 36, 64])
score * v: torch.Size([128, 8, 38, 64])


## 4. 训练

In [82]:
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src 
        trg = batch.trg 

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())
    return epoch_loss / len(iterator)

In [83]:
iter_max = 100
# iter_max = 1000
train_losses = []
for step in range(iter_max):
        train_loss = train(model, train_iter, optimizer, criterion, clip)

        if step > warmup:
            scheduler.step(valid_loss)
        train_losses.append(train_loss)
        f = open('result/train_loss.txt', 'w')
        f.write(str(train_losses))
        f.close()
        print(f'\tTrain Loss: {train_loss:.3f}')
torch.save(model.state_dict(), 'model-final.pt')

NameError: name 'train_iter' is not defined

In [None]:
import matplotlib.pyplot as plt
import re
print(train_losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(train_losses, 'r', label='train')
plt.title('training result')
plt.grid(True, which='both', axis='both')
plt.show()