## refs
### paper
- https://arxiv.org/pdf/1706.03762

### vis
- https://bbycroft.net/llm
- https://docs.google.com/spreadsheets/d/10O-amPDV4zvnZZedlqX33pLFxlO4jj8CbhY6jzdk5rw/edit?gid=260373902#gid=260373902
- https://jalammar.github.io/illustrated-transformer/
- https://jalammar.github.io/illustrated-gpt2/

### impl
- karpathy: https://github.com/karpathy/build-nanogpt
- https://nlp.seas.harvard.edu/annotated-transformer/#attention-visualization
- https://course.fast.ai/Lessons/lesson24.html
- https://github.com/jadore801120/attention-is-all-you-need-pytorch


In [20]:
import torch
from torch import nn
from torch.nn.functional import log_softmax, pad


In [12]:
# Toy input: one random tensor laid out as (batch, sentence_length, embedding_dim).
batch = 20
sentence_length = 5
embedding_dim = 10
embedding = torch.randn((batch, sentence_length, embedding_dim))

# Show the shape first, then let the notebook display the tensor itself.
print(embedding.size())
embedding

torch.Size([20, 5, 10])


tensor([[[-2.7566e-01,  9.6622e-01, -4.1771e-01, -1.0384e+00, -1.1449e+00,
           3.3826e-01,  2.6574e+00,  3.9139e-01,  1.5089e+00, -9.1030e-01],
         [-6.7187e-01, -1.1564e+00,  1.6002e-01,  3.1982e-01, -5.8819e-01,
          -1.2096e+00,  5.9180e-01,  1.4962e+00,  4.5475e-02, -1.5257e-01],
         [ 3.0872e-01,  9.5537e-01,  6.0882e-01,  1.8214e+00, -3.7271e-01,
           4.6030e-02,  2.0253e+00,  4.7160e-01, -6.0692e-01,  1.6038e+00],
         [ 6.7682e-01, -1.7595e+00,  1.5512e+00, -8.7805e-01,  8.1387e-01,
          -5.6361e-01, -3.0955e-01, -9.3675e-01, -1.2904e+00,  1.3118e-01],
         [-2.0052e+00,  1.6342e+00, -1.6320e-02,  6.3341e-01,  1.6084e+00,
           3.1202e-01, -1.2441e+00,  4.8552e-01, -1.5531e-01, -1.5133e+00]],

        [[-4.3139e-01,  3.6371e-01, -1.7862e+00, -1.5335e+00, -2.3935e-01,
           3.1209e-01, -2.6215e+00, -3.3162e-02,  1.8825e+00,  8.4365e-01],
         [ 6.9610e-01,  7.8043e-01,  3.8258e-01,  1.0262e+00, -2.7433e-01,
          -5.3510

In [14]:
# Built-in layer normalisation configured with the embedding width.
layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
# Calling the module runs it on the toy batch created above.
y = layer_norm(embedding)
# Show the shape first, then let the notebook display the normalised tensor.
print(y.size())
y

torch.Size([20, 5, 10])


tensor([[[-0.4147,  0.6511, -0.5366, -1.0693, -1.1606,  0.1122,  2.1025,
           0.1578,  1.1169, -0.9593],
         [-0.7050, -1.3202,  0.3511,  0.5540, -0.5988, -1.3877,  0.8993,
           2.0475,  0.2057, -0.0458],
         [-0.4389,  0.3131, -0.0899,  1.3202, -1.2314, -0.7444,  1.5573,
          -0.2495, -1.5037,  1.0672],
         [ 0.9472, -1.5254,  1.8346, -0.6308,  1.0863, -0.3117, -0.0539,
          -0.6904, -1.0493,  0.3934],
         [-1.6806,  1.4098,  0.0083,  0.5600,  1.3879,  0.2871, -1.0343,
           0.4344, -0.1097, -1.2629]],

        [[-0.0844,  0.5424, -1.1524, -0.9532,  0.0670,  0.5017, -1.8108,
           0.2295,  1.7397,  0.9207],
         [ 0.9266,  1.0147,  0.5994,  1.2712, -0.0863, -0.3585,  0.2440,
          -1.5780, -1.8484, -0.1846],
         [-0.8789, -0.5243,  0.9759, -1.0221,  0.5850,  0.9365,  0.5587,
           0.9956,  0.4339, -2.0602],
         [ 1.2221, -0.9777,  0.3536,  1.2431, -1.5510, -0.3630,  0.5220,
           1.0140, -0.0345, -1.4286],

In [17]:
# TODO: check whether layer norm applies the same mean and variance to a given
# feature index across every sentence position and every batch item.
# Each line prints input / normalised output for feature 0 of one
# (batch item, sentence position) pair.
# NOTE(review): nn.LayerNorm(embedding_dim) presumably computes its statistics
# per position over the last dimension, so these ratios are not expected to
# match - confirm with y.mean(-1) and y.var(-1, unbiased=False).
for batch_idx, token_idx in ((0, 0), (0, 2), (1, 0)):
    print(embedding[batch_idx, token_idx, 0] / y[batch_idx, token_idx, 0])

tensor(0.6648, grad_fn=<DivBackward0>)
tensor(-0.7034, grad_fn=<DivBackward0>)
tensor(5.1102, grad_fn=<DivBackward0>)


![](https://upload.wikimedia.org/wikipedia/commons/thumb/8/8f/The-Transformer-model-architecture.png/580px-The-Transformer-model-architecture.png)

In [19]:

class EncoderDecoder(nn.Module):
    """Tie an encoder and a decoder together with their input embeddings.

    The constructor stores the five collaborators it receives. ``generator``
    is kept as an attribute but is not called by any method of this class.
    """

    def __init__(self, encoder, decoder, src_embed, target_embed, generator):
        super().__init__()
        # Assignment order is kept (encoder, decoder, src_embed, target_embed,
        # generator) so sub-module registration order stays the same.
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.target_embed = src_embed, target_embed
        self.generator = generator

    def encode(self, src, src_mask):
        """Embed ``src`` and pass it, with ``src_mask``, to the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, target, target_mask):
        """Embed ``target`` and pass it to the decoder with ``memory`` and both masks."""
        embedded_target = self.target_embed(target)
        return self.decoder(embedded_target, memory, src_mask, target_mask)

    def forward(self, src, target, src_mask, target_mask):
        """Encode ``src``, then decode ``target`` against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, target, target_mask)
    
# Last layer
class LinearSoftmax(nn.Module):
    """Generation step: a linear projection followed by log-softmax.

    ``d_model`` is the input feature size of the projection and ``vocab`` is
    its output size.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)


각 서브레이어의 출력은 $\mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))$ 이며, 여기서 $\mathrm{Sublayer}(x)$ 는 서브레이어 자체가 구현하는 함수입니다. 우리는 각 서브레이어의 출력에 드롭아웃을 적용한 뒤, 이를 서브레이어 입력에 더하고 정규화합니다.

이 잔차 연결(residual connection)을 용이하게 하기 위해, 모델의 모든 서브레이어와 임베딩 레이어는 차원 $d_{model} = 512$ 의 출력을 생성합니다.


In [None]:
import copy

def deepcopy(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        # copy.deepcopy gives every clone its own copy of the module's state.
        clones.append(copy.deepcopy(module))
    return clones


class Encoder(nn.Module):
    """A stack of ``N`` copies of ``layer`` followed by one final LayerNorm.

    ``layer`` must expose a ``size`` attribute; it is used as the feature
    count of the final normalisation.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = deepcopy(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Run ``x`` through every layer in order (each gets ``mask``), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
    

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias.

    Each vector along the last dimension is shifted to zero mean and scaled
    to unit variance, then multiplied by ``a_2`` and offset by ``b_2``.

    The variance is the biased (divide-by-n) estimate and ``eps`` is added
    inside the square root. This matches the standard layer-norm formula.
    The previous version used the Bessel-corrected ``x.std`` and added
    ``eps`` to the standard deviation instead, which produced slightly
    different values and NaN when the last dimension had a single element.

    Args:
        features: size of the last dimension of the input; also the length
            of ``a_2`` and ``b_2``.
        eps: small constant added to the variance for numerical stability.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # Parameter names are kept so existing state dicts still load.
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``x`` normalised along its last dimension, same shape as ``x``."""
        # keepdim=True leaves a trailing dimension of size 1 so the statistics
        # broadcast against x.
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: the mean of squared deviations.
        var = centered.pow(2).mean(-1, keepdim=True)
        return self.a_2 * centered / torch.sqrt(var + self.eps) + self.b_2


class SublayerConnection(nn.Module):
    """Residual wrapper around a sublayer, with the layer norm applied first.

    The input is normalised, passed through the sublayer, run through
    dropout, and added back onto the un-normalised input.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``; ``sublayer`` is any callable."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then feed-forward.

    Each of the two steps is wrapped in its own ``SublayerConnection``.
    ``size`` is stored so that a containing stack can read it.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = deepcopy(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply self-attention (query = key = value) with ``mask``, then feed-forward."""

        def attend(normed):
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

In [22]:
class Decoder(nn.Module):
    """A stack of ``N`` copies of ``layer`` with masking, then one final LayerNorm.

    ``layer`` must expose a ``size`` attribute; it is used as the feature
    count of the final normalisation.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = deepcopy(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through every layer in order, then normalise.

        Each layer receives ``memory`` and both masks.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three steps is wrapped in its own ``SublayerConnection``.
    ``size`` is stored so that a containing stack can read it.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Three wrappers: 0 = self-attention, 1 = source attention, 2 = feed-forward.
        self.sublayer = deepcopy(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-steps in order and return the result.

        Self-attention uses ``tgt_mask``. Source attention uses ``memory`` as
        both key and value, with ``src_mask``.
        """

        def attend_to_self(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_source(normed):
            return self.src_attn(normed, memory, memory, src_mask)

        after_self = self.sublayer[0](x, attend_to_self)
        after_source = self.sublayer[1](after_self, attend_to_source)
        return self.sublayer[2](after_source, self.feed_forward)