In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
# Draw a 4 x 4 tensor of uniform random samples from [0, 1) and show it.
random_torch = torch.rand((4, 4))
print(random_torch)

tensor([[0.9738, 0.1652, 0.1956, 0.4268],
        [0.0213, 0.2562, 0.2386, 0.0656],
        [0.5477, 0.9836, 0.1516, 0.8134],
        [0.7741, 0.1835, 0.9570, 0.6306]])


In [4]:
# A freshly built nn.Dropout is in training mode: each element is zeroed with
# probability p and the surviving elements are scaled by 1 / (1 - p).
dropout_module = nn.Dropout(0.1)
tensor_test = torch.ones((5, 5))
print(tensor_test)
dropped_test = dropout_module(tensor_test)
print(dropped_test)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([[1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111],
        [1.1111, 1.1111, 1.1111, 1.1111, 1.1111]])


In [13]:
encoding = torch.zeros(10, 10)
range_tensor = torch.arange(0, 10)
print(range_tensor)
range_tensor = range_tensor.float().unsqueeze(dim=1)
print(range_tensor)
_2i = torch.arange(0, 10, step=2).float()
print(_2i)
encoding[:,0::2] = torch.sin(range_tensor/(10000**(_2i/10)))
encoding[:,1::2] = torch.cos(range_tensor/(10000**(_2i/10)))
print(encoding)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])
tensor([0., 2., 4., 6., 8.])
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  1.5783e-01,  9.8747e-01,  2.5116e-02,
          9.9968e-01,  3.9811e-03,  9.9999e-01,  6.3096e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  3.1170e-01,  9.5018e-01,  5.0217e-02,
          9.9874e-01,  7.9621e-03,  9.9997e-01,  1.2619e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  4.5775e-01,  8.8908e-01,  7.5285e-02,
          9.9716e-01,  1.1943e-02,  9.9993e-01,  1.8929e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  5.9234e-01,  8.0569e-01,  1.0031e-01,
          9.9496e-01,  1.5924e-02,  9.9987e-01,  2.5238e-03,  1.0000e+00],
        [-9.5892e-01,  2.8366e-01,  7.1207e-01

In [2]:
# Converts input vocabulary indices into embedding vectors of the given width.
class TokenEmbedding(nn.Embedding):
    """Token lookup table built directly on ``nn.Embedding``.

    Args:
        vocab_size: number of rows (distinct token indices) in the table.
        d_model: width of each embedding vector.
        padding_idx: index passed to ``nn.Embedding`` as the padding entry.
            Defaults to 1, so two-argument construction behaves as before.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=padding_idx)


class PositionalEmbeddinh(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    Row ``pos`` holds ``sin(pos / 10000 ** (2i / d_model))`` in the even
    columns and the matching ``cos`` values in the odd columns. The table is
    computed once at construction time and is never trained.

    NOTE: the class name keeps its original spelling so that existing
    references to it (e.g. in ``TransformerEmbedding``) keep working.

    Args:
        d_model: width of each encoding vector (odd widths are supported).
        max_len: number of positions to precompute.
        device: device on which the table is created.
    """

    def __init__(self, d_model, max_len, device):
        super(PositionalEmbeddinh, self).__init__()
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # Shared angle matrix, shape (max_len, ceil(d_model / 2)).
        angles = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # The positional encoding needs no gradient. Registering it as a
        # non-persistent buffer lets it follow the module through .to()/.cuda()
        # while keeping it out of state_dict, so saved checkpoints are unchanged.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the leading ``seq_len`` rows of the encoding table.

        Args:
            x: 2-D tensor of shape (batch_size, seq_len); only its shape is read.

        Returns:
            Tensor of shape (min(seq_len, max_len), d_model).
        """
        _, seq_len = x.size()
        return self.encoding[:seq_len, :]
    
class TransformerEmbedding(nn.Module):
    """Token embedding plus positional encoding, followed by dropout.

    Args:
        vocab_size: vocabulary size forwarded to ``TokenEmbedding``.
        d_model: embedding width shared by both sub-modules.
        max_len: number of positions forwarded to ``PositionalEmbeddinh``.
        drop_prob: dropout probability applied to the summed embedding.
        device: device forwarded to ``PositionalEmbeddinh``.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.token_emb = TokenEmbedding(vocab_size, d_model)
        self.positional_emb = PositionalEmbeddinh(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed ``x``, add the positional encoding, and apply dropout.

        ``x`` is handed to both sub-modules; the positional module unpacks
        its size as (batch_size, seq_len), so it must be 2-D.
        """
        embedded = self.token_emb(x) + self.positional_emb(x)
        return self.dropout(embedded)
        

In [3]:
# Toy dimensions plus a random tensor shaped (batch_size, seq_len, d_model).
d_model, n_head = 512, 8
seq_len, batch_size = 100, 32
x = torch.rand((batch_size, seq_len, d_model))
print(x)

tensor([[[0.1718, 0.6347, 0.4844,  ..., 0.0751, 0.4225, 0.8231],
         [0.6993, 0.7753, 0.0491,  ..., 0.1735, 0.6124, 0.5339],
         [0.5230, 0.8160, 0.9122,  ..., 0.7412, 0.4844, 0.9861],
         ...,
         [0.8551, 0.2721, 0.2874,  ..., 0.8630, 0.7339, 0.7964],
         [0.1180, 0.1976, 0.6389,  ..., 0.8773, 0.7392, 0.7329],
         [0.8274, 0.4139, 0.5577,  ..., 0.6363, 0.0265, 0.1200]],

        [[0.9705, 0.4886, 0.4199,  ..., 0.2950, 0.8726, 0.1482],
         [0.7251, 0.1476, 0.7664,  ..., 0.6897, 0.3211, 0.0617],
         [0.1607, 0.3128, 0.7766,  ..., 0.4229, 0.1321, 0.6412],
         ...,
         [0.0977, 0.2108, 0.8233,  ..., 0.0927, 0.6918, 0.4467],
         [0.9970, 0.8300, 0.1543,  ..., 0.4448, 0.7686, 0.4423],
         [0.9139, 0.0541, 0.1475,  ..., 0.6415, 0.8883, 0.8334]],

        [[0.3270, 0.1988, 0.9699,  ..., 0.3274, 0.8873, 0.0438],
         [0.0225, 0.4085, 0.3254,  ..., 0.2133, 0.6447, 0.3613],
         [0.6707, 0.4262, 0.9569,  ..., 0.0288, 0.1423, 0.

In [9]:
# Softmax over the last dimension: every length-4 row of the result sums to 1.
softmax_module = nn.Softmax(dim=-1)
tensor_test = torch.rand((5, 5, 4))
print(tensor_test)
softmax_test = softmax_module(tensor_test)
print(softmax_test)

tensor([[[5.7576e-01, 7.4992e-01, 1.0883e-01, 3.1611e-01],
         [8.4574e-01, 2.5657e-01, 7.5338e-01, 7.3449e-01],
         [5.2536e-01, 5.6833e-01, 4.2851e-01, 7.0382e-01],
         [7.4450e-01, 6.7147e-01, 1.2334e-02, 1.1139e-01],
         [4.0722e-01, 1.5480e-01, 6.0856e-01, 7.6757e-01]],

        [[2.9967e-01, 5.0381e-01, 8.0809e-01, 7.2294e-01],
         [7.6542e-01, 9.3450e-01, 2.5317e-01, 7.7737e-01],
         [2.7964e-01, 6.5680e-01, 2.3038e-01, 7.4334e-01],
         [5.8598e-01, 7.9420e-01, 9.3936e-01, 2.3606e-01],
         [8.0755e-01, 9.2437e-01, 7.3659e-02, 3.7176e-01]],

        [[8.4527e-01, 4.7466e-01, 1.9511e-01, 3.6845e-01],
         [9.5119e-01, 4.3112e-01, 9.2169e-02, 9.8798e-01],
         [2.7633e-01, 1.0228e-01, 8.2631e-01, 4.6585e-01],
         [1.6554e-01, 5.4809e-01, 3.8546e-01, 4.2342e-01],
         [3.3007e-01, 5.6158e-01, 7.9330e-02, 9.3653e-01]],

        [[4.0462e-01, 7.7912e-01, 2.3384e-01, 5.0259e-01],
         [9.6370e-01, 3.3861e-01, 1.0660e-01, 7.21

In [10]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d)) @ V``.

    The last two dimensions of every input are (length, feature). Any leading
    dimensions are batch dimensions, so both (batch, head, length, d) and
    (batch, length, d) inputs are accepted.
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, mask=None, e=1e-12):
        """Compute the attention output and the attention weights.

        Args:
            query: tensor of shape (..., len_q, d).
            key: tensor of shape (..., len_k, d).
            value: tensor of shape (..., len_k, d_v).
            mask: optional tensor broadcastable to (..., len_q, len_k).
                Scores at positions where ``mask == 0`` are replaced by
                -10000 before the softmax.
            e: unused; kept so existing call sites keep working.

        Returns:
            Tuple ``(output, attn_score)`` where ``output`` has shape
            (..., len_q, d_v) and ``attn_score`` has shape (..., len_q, len_k).
        """
        # Address the trailing axes by negative index so any number of leading
        # batch dimensions works.
        d_tensor = key.size(-1)
        attn_score = (query @ key.transpose(-2, -1)) / math.sqrt(d_tensor)

        if mask is not None:
            attn_score = attn_score.masked_fill(mask == 0, -10000)

        attn_score = self.softmax(attn_score)

        return attn_score @ value, attn_score

In [11]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``ScaleDotProductAttention``.

    The inputs are linearly projected, split into ``n_head`` heads, attended
    per head, merged back together, and passed through a final linear layer.

    Args:
        d_model: feature width of the inputs and of the output.
        n_head: number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: if ``n_head`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                "d_model ({}) must be divisible by a positive n_head ({})".format(d_model, n_head)
            )
        self.d_model = d_model
        # split() reads self.n_head; if it is not stored here, every forward
        # pass fails with AttributeError.
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: tensors of shape (batch_size, seq_len, d_model).
            mask: optional mask forwarded unchanged to the attention module.

        Returns:
            Tensor of shape (batch_size, query_seq_len, d_model).
        """
        query, key, value = self.w_q(query), self.w_k(key), self.w_v(value)

        query, key, value = self.split(query), self.split(key), self.split(value)

        # The per-head attention weights are not part of this module's output.
        out, _ = self.attention(query, key, value, mask)

        out = self.concat(out)
        out = self.w_concat(out)

        return out

    def split(self, tensor):
        """Reshape (batch, seq_len, d_model) into (batch, n_head, seq_len, d_model // n_head)."""
        batch_size, seq_len, d_model = tensor.size()
        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)

        return tensor

    def concat(self, tensor):
        """Inverse of ``split``: merge (batch, head, seq_len, d_tensor) into (batch, seq_len, head * d_tensor)."""
        batch_size, head, seq_len, d_tensor = tensor.size()
        d_model = head * d_tensor
        # transpose() returns a non-contiguous view, so contiguous() is needed
        # before view() can reshape it.
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)

        return tensor

Bookmarked reference website:
https://github.com/hyunwoongko/transformer/blob/master/train.py