In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [3]:
random_torch = torch.rand(4,4)
print(random_torch)

tensor([[0.4528, 0.3942, 0.3466, 0.6886],
        [0.5443, 0.6389, 0.7202, 0.4414],
        [0.4870, 0.6657, 0.8296, 0.2937],
        [0.8218, 0.1417, 0.5371, 0.7254]])


In [4]:
dropout_module = nn.Dropout(p=0.1)
tensor_test = torch.ones(5,5)
print(tensor_test)
print(dropout_module(tensor_test))

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([[0.0000, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111]])


In [5]:
encoding = torch.zeros(10, 10)
range_tensor = torch.arange(0, 10)
print(range_tensor)
range_tensor = range_tensor.float().unsqueeze(dim=1)
print(range_tensor)
_2i = torch.arange(0, 10, step=2).float()
print(_2i)
encoding[:,0::2] = torch.sin(range_tensor/(10000**(_2i/10)))
encoding[:,1::2] = torch.cos(range_tensor/(10000**(_2i/10)))
print(encoding)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])
tensor([0., 2., 4., 6., 8.])
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  1.5783e-01,  9.8747e-01,  2.5116e-02,
          9.9968e-01,  3.9811e-03,  9.9999e-01,  6.3096e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  3.1170e-01,  9.5018e-01,  5.0217e-02,
          9.9874e-01,  7.9621e-03,  9.9997e-01,  1.2619e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  4.5775e-01,  8.8908e-01,  7.5285e-02,
          9.9716e-01,  1.1943e-02,  9.9993e-01,  1.8929e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  5.9234e-01,  8.0569e-01,  1.0031e-01,
          9.9496e-01,  1.5924e-02,  9.9987e-01,  2.5238e-03,  1.0000e+00],
        [-9.5892e-01,  2.8366e-01,  7.1207e-01

In [6]:
# 将输入的词表索引转化为制定维度的embedding向量
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)


class PositionalEmbeddinh(nn.Module):
    def __init__(self, d_model, max_len, device):
        super(PositionalEmbeddinh, self).__init__()
        self.encoding = torch.zeros(max_len, d_model, device=device)
        # 位置编码部分无需进行梯度计算
        self.encoding.requires_grad = False
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))

    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]
    
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.token_emb = TokenEmbedding(vocab_size, d_model)
        self.positional_emb = PositionalEmbeddinh(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)
    
    def forward(self, x):
        token_emb = self.token_emb(x)
        positional_emb = self.positional_emb(x)
        return self.dropout(token_emb + positional_emb)
        

In [7]:
d_model=512
n_head=8
seq_len=100
batch_size=32
x= torch.rand(batch_size,seq_len,d_model)
print(x)

tensor([[[0.8602, 0.5952, 0.1883,  ..., 0.0347, 0.8538, 0.3209],
         [0.7795, 0.3800, 0.4935,  ..., 0.2866, 0.4380, 0.2638],
         [0.0680, 0.6881, 0.8974,  ..., 0.3727, 0.3069, 0.1443],
         ...,
         [0.9565, 0.8905, 0.7109,  ..., 0.4038, 0.2495, 0.5170],
         [0.9452, 0.6876, 0.6367,  ..., 0.0945, 0.9855, 0.9936],
         [0.6842, 0.9269, 0.0556,  ..., 0.3792, 0.4920, 0.8326]],

        [[0.4641, 0.6400, 0.7785,  ..., 0.9932, 0.6236, 0.7659],
         [0.8308, 0.9978, 0.3542,  ..., 0.6326, 0.6208, 0.4169],
         [0.6817, 0.8458, 0.7835,  ..., 0.8941, 0.0218, 0.4044],
         ...,
         [0.4366, 0.7274, 0.2430,  ..., 0.6565, 0.9185, 0.0116],
         [0.6958, 0.6547, 0.2299,  ..., 0.9701, 0.6695, 0.0684],
         [0.5824, 0.7570, 0.9232,  ..., 0.8923, 0.8773, 0.3020]],

        [[0.8023, 0.0818, 0.2763,  ..., 0.7570, 0.8218, 0.1875],
         [0.5443, 0.8096, 0.5142,  ..., 0.3124, 0.4487, 0.8635],
         [0.5366, 0.0873, 0.1727,  ..., 0.8470, 0.2620, 0.

In [8]:
softmax_module = nn.Softmax(dim=-1)
tensor_test = torch.rand(5,5,4)
print(tensor_test)
print(softmax_module(tensor_test))

tensor([[[0.6652, 0.9868, 0.6111, 0.1540],
         [0.1370, 0.5841, 0.9452, 0.7400],
         [0.3623, 0.7231, 0.1108, 0.2180],
         [0.4648, 0.8470, 0.0261, 0.7433],
         [0.9295, 0.9085, 0.0969, 0.5468]],

        [[0.6938, 0.1161, 0.5000, 0.2882],
         [0.3176, 0.2435, 0.4225, 0.9892],
         [0.8092, 0.5260, 0.0355, 0.7638],
         [0.5860, 0.4700, 0.4145, 0.9026],
         [0.8303, 0.7315, 0.6855, 0.0562]],

        [[0.4518, 0.1536, 0.0952, 0.0645],
         [0.4997, 0.6820, 0.6177, 0.6402],
         [0.3996, 0.2353, 0.1188, 0.7965],
         [0.3877, 0.5161, 0.0594, 0.0432],
         [0.7465, 0.8587, 0.5447, 0.6269]],

        [[0.1015, 0.5006, 0.2319, 0.2783],
         [0.5291, 0.6734, 0.0285, 0.6407],
         [0.6795, 0.9358, 0.0575, 0.0138],
         [0.0529, 0.1513, 0.8129, 0.8101],
         [0.1747, 0.1867, 0.0597, 0.9402]],

        [[0.2303, 0.6764, 0.4425, 0.1132],
         [0.1073, 0.1737, 0.8024, 0.1269],
         [0.2553, 0.0803, 0.6521, 0.8847],
   

In [9]:
class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, query, key, value, mask=None, e=1e-12):
        batch_size, head, seq_len, d_tensor = key.size()
        
        key_transpose = key.transpose(2, 3)
        attn_score = (query@key_transpose)/math.sqrt(d_tensor)
        
        if mask is not None:
            attn_score = attn_score.masked_fill(mask == 0, -10000)
        
        attn_score = self.softmax(attn_score)
        
        value = attn_score@value
        
        return value, attn_score

In [10]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.attention = ScaleDotProductAttention()

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask=None):
        query, key, value = self.w_q(query), self.w_k(key), self.w_v(value)
        
        query, key, value = self.split(query), self.split(key), self.split(value)
        
        out, attention = self.attention(query, key, value, mask)
        
        out = self.concat(out)
        out = self.w_concat(out)
        
        return out
        
    def split(self, tensor):
        batch_size, seq_len, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)
        
        return tensor
    
    def concat(self, tensor):
        batch_size, head, seq_len, d_tensor = tensor.size()
        d_model = head * d_tensor
        tensor = tensor.transpose(1,2).contiguous().view(batch_size, seq_len, d_model)

        return tensor

mark website 
https://github.com/hyunwoongko/transformer/blob/master/train.py

In [None]:
# 正则化层
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
        
    def forward(self,x):
        mean = x.mean(-1, keepdim=True)
        var = x.std(-1, unbiased=False, keepdim=True)
        # print(mean)
        # print(var)
        # 广播机制
        out = (x-mean)/torch.sqrt(var+self.eps)
        out = self.gamma*out + self.beta
        
        return out

In [14]:
# 正则化层测试用例
norm_module = LayerNorm(10)
tensor_test = torch.rand(5,10)
print(tensor_test)
print(norm_module(tensor_test))

tensor([[0.5758, 0.3650, 0.7966, 0.0540, 0.3709, 0.3839, 0.6052, 0.8991, 0.8277,
         0.3066],
        [0.6916, 0.7370, 0.3320, 0.9683, 0.9773, 0.2058, 0.0138, 0.1888, 0.3622,
         0.3914],
        [0.5048, 0.8577, 0.0768, 0.5691, 0.2114, 0.0060, 0.5521, 0.9818, 0.2747,
         0.1764],
        [0.8550, 0.7710, 0.7156, 0.3065, 0.3730, 0.8709, 0.0638, 0.9368, 0.4317,
         0.8798],
        [0.6782, 0.1691, 0.2553, 0.4793, 0.6829, 0.5019, 0.7223, 0.3432, 0.9799,
         0.7509]])
tensor([[0.5185],
        [0.4868],
        [0.4211],
        [0.6204],
        [0.5563]])
tensor([[0.2555],
        [0.3189],
        [0.3110],
        [0.2867],
        [0.2389]])
tensor([[ 0.1134, -0.3037,  0.5502, -0.9189, -0.2920, -0.2662,  0.1715,  0.7530,
          0.6119, -0.4192],
        [ 0.3626,  0.4430, -0.2742,  0.8526,  0.8685, -0.4976, -0.8376, -0.5277,
         -0.2208, -0.1689],
        [ 0.1501,  0.7830, -0.6173,  0.2654, -0.3761, -0.7444,  0.2349,  1.0055,
         -0.2624, -0.43

In [15]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionWiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [None]:
# 编码层
class EncoderLayer(nn.Module):
    
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        
        self.ffn = PositionWiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)
        
    def forward(self, x, src_mask):
        # 计算自注意力
        _x = x
        x = self.attention(query=x, key=x, value=x, mask=src_mask)
        
        # 添加残差连接
        x = self.dropout1(x)
        x = self.norm1(x+_x)
        
        # 前馈网络
        _x = x
        x = self.ffn(x)
        
        # 添加残差连接
        x = self.dropout2(x)
        x = self.norm2(x+_x)

        return x
        