'''
  해당 레퍼런스를 자세히 뜯어보는 코드입니다. 
  code by Tae Hwan Jung(Jeff Jung) @graykode, Derek Miller @dmmiller612
  Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch
              https://github.com/JayParks/transformer
'''

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [2]:
dtype = torch.FloatTensor
# S: Symbol that marks the start of the decoding input
# E: Symbol that marks the end of the decoding output
# P: Padding symbol that fills the remaining positions when a sequence is shorter than the number of time steps
sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

In [5]:
# Transformer Parameters
# Padding Should be Zero
src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4}
src_vocab_size = len(src_vocab)

print(src_vocab)
print(src_vocab_size)

{'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
5


In [4]:
tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'S' : 5, 'E' : 6}
number_dict = {i: w for i, w in enumerate(tgt_vocab)}
tgt_vocab_size = len(tgt_vocab)

print(number_dict)
print(tgt_vocab_size)

{0: 'P', 1: 'i', 2: 'want', 3: 'a', 4: 'beer', 5: 'S', 6: 'E'}
7


#### model = Transformer()  
Encoder, Decoder 객체를 각각 만들어서 연결!  

1) Encoder 클래스 이해하기  
2) Decoder 클래스 이해하기  

<흐름>
0. batch 생성: enc_inputs, dec_inputs, target_batch
1. encoder(enc_inputs) -> enc_outputs, enc_self_attns
2. decoder(dec_inputs, enc_inputs, enc_outputs) -> dec_outputs, dec_self_attns

------------

#### 0) batch 생성
```python
def make_batch(sentences):
    """Turn the (source, decoder-input, target) sentence triple into index tensors.

    Args:
        sentences: Sequence of three whitespace-tokenised strings.
            ``sentences[0]`` is looked up in ``src_vocab``; ``sentences[1]``
            and ``sentences[2]`` are looked up in ``tgt_vocab``.

    Returns:
        Tuple ``(enc_inputs, dec_inputs, target_batch)`` of ``torch.LongTensor``,
        each with a leading batch dimension of 1.

    Raises:
        KeyError: If a token is missing from the corresponding vocabulary.
    """
    def encode(sentence, vocab):
        # The outer list adds the batch dimension (a single sentence per batch).
        return torch.LongTensor([[vocab[token] for token in sentence.split()]])

    # torch.autograd.Variable is a deprecated no-op wrapper (tensors carry
    # autograd state themselves since PyTorch 0.4), so plain tensors are returned.
    enc_inputs = encode(sentences[0], src_vocab)
    dec_inputs = encode(sentences[1], tgt_vocab)
    target_batch = encode(sentences[2], tgt_vocab)
    return enc_inputs, dec_inputs, target_batch

enc_inputs, dec_inputs, target_batch = make_batch(sentences)

```

In [10]:
sentences

['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

In [9]:
sentences[0].split()

['ich', 'mochte', 'ein', 'bier', 'P']

In [11]:
src_vocab

{'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}

In [12]:
[src_vocab[n] for n in sentences[0].split()]

[1, 2, 3, 4, 0]

In [17]:
input_batch = [[src_vocab[n] for n in sentences[0].split()]]
print(input_batch)

enc_inputs =  Variable(torch.LongTensor(input_batch))
print(enc_inputs)
print(enc_inputs.shape)

[[1, 2, 3, 4, 0]]
tensor([[1, 2, 3, 4, 0]])
torch.Size([1, 5])


In [19]:
output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]
target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]
dec_inputs, target_batch = Variable(torch.LongTensor(output_batch)), Variable(torch.LongTensor(target_batch))

print(dec_inputs, target_batch)
print(dec_inputs.shape, target_batch.shape)

tensor([[5, 1, 2, 3, 4]]) tensor([[1, 2, 3, 4, 6]])
torch.Size([1, 5]) torch.Size([1, 5])


--------------

#### 1) Encoder 클래스
```python

class Encoder(nn.Module):
    """Transformer encoder: token + sinusoidal position embeddings, then a stack of EncoderLayers.

    Args:
        n_position: Number of rows in the (frozen) sinusoidal position table.
            Defaults to ``src_vocab_size``, which is what the original code
            used. Row 0 is used for padding and rows 1..L for real tokens, so
            the table needs at least ``max_sequence_length + 1`` rows.
    """

    def __init__(self, n_position=None):
        super(Encoder, self).__init__()
        if n_position is None:
            n_position = src_vocab_size
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(n_position, d_model), freeze=True)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):  # enc_inputs : [batch_size x source_len]
        """Encode a batch of token-id sequences.

        Args:
            enc_inputs: LongTensor [batch_size x source_len]; id 0 is padding.

        Returns:
            Tuple ``(enc_outputs, enc_self_attns)``: the output of the last
            layer and a list with the attention returned by each layer.
        """
        # Position ids are derived from the input instead of being hard-coded
        # to [[1, 2, 3, 4, 0]]: real tokens get 1..source_len, padding gets 0.
        # For the example sentence this yields exactly [[1, 2, 3, 4, 0]].
        source_len = enc_inputs.size(1)
        positions = torch.arange(1, source_len + 1, device=enc_inputs.device)
        positions = positions.unsqueeze(0).expand_as(enc_inputs)
        positions = positions.masked_fill(enc_inputs.eq(0), 0)

        enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(positions)
        # Padding keys are masked out of every self-attention layer.
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns
```

  
  
필요한 것
* enc_inputs(o)
* src_vocab_size(o), d_model
* get_sinusoid_encoding_table

* get_attn_pad_mask
* EncoderLayer

1.1) src_emb  
1.2) pos_emb  
1.3) enc_outputs  
1.4) enc_self_attn_mask  
1.5) enc_self_attns  

-> enc_outputs, enc_self_attns

#### (잠시) 전체파일 구조보기

<함수>  
get_sinusoid_encoding_table  
get_attn_pad_mask  
get_attn_subsequent_mask  
  
<클래스>  
ScaledDotProductAttention  
MultiHeadAttention  
PoswiseFeedForwardNet  
EncoderLayer  
DecoderLayer  
Encoder  
Decoder  
Transformer  

In [21]:
# 1.1) self.src_emb

d_model = 512  # Embedding Size
src_emb = nn.Embedding(src_vocab_size, d_model)
print(src_emb)

Embedding(5, 512)


In [29]:
# src_emb : src -> 각각을 512 차원에 임베딩
print(src_emb.weight)
print(src_emb.weight.shape)
print(src_vocab)
print(src_emb.weight[0]) # P embedding

Parameter containing:
tensor([[-0.1280,  2.3881,  0.0602,  ..., -0.0226, -1.2408,  1.9292],
        [-1.7390,  0.9872, -0.2620,  ..., -0.9140,  0.7316, -0.1670],
        [ 0.1747, -1.1317, -0.5184,  ..., -0.7197, -0.0271, -0.4607],
        [-0.8533,  1.5153, -1.2670,  ..., -0.0183,  0.2211,  0.0245],
        [-1.8163,  1.3144, -2.3754,  ..., -1.1859,  0.4277, -0.6877]],
       requires_grad=True)
torch.Size([5, 512])
{'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
tensor([-1.2803e-01,  2.3881e+00,  6.0206e-02,  7.7500e-01, -4.1753e-01,
        -4.4098e-01,  6.2275e-01, -1.8652e+00, -2.1891e-01, -1.3015e+00,
        -9.3235e-01,  1.7837e-01, -7.0967e-01, -1.1873e+00, -8.4855e-01,
         2.6412e-01,  7.9109e-01, -1.9532e+00, -3.9711e-01,  2.3889e-01,
         1.3457e+00, -7.0276e-01, -6.2562e-01,  4.5948e-01,  2.2145e-01,
        -1.2298e+00,  1.0109e+00,  4.2429e-01, -4.9430e-01,  7.1398e-01,
        -1.6573e+00,  3.2635e-01,  1.4090e+00,  4.7146e-01,  1.0180e+00,
         1.9441

```python
# 1.2) pos_emb
# self.pos_emb 
pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_vocab_size, d_model), freeze=True)

# src_vocab_size(n_position) = 5, d_model = 512
def get_sinusoid_encoding_table(n_position, d_model):
    """Build the sinusoidal position-encoding table.

    Entry ``[pos, j]`` starts as ``pos / 10000 ** (2 * (j // 2) / d_model)``;
    even columns are then passed through ``sin`` and odd columns through ``cos``.

    Args:
        n_position: Number of positions (rows) in the table.
        d_model: Embedding size (columns).

    Returns:
        ``torch.FloatTensor`` of shape ``[n_position, d_model]``. An
        ``n_position`` of 0 yields an empty ``[0, d_model]`` table.
    """
    # Column vector of positions so that the division broadcasts to a 2-D grid.
    positions = np.arange(n_position, dtype=np.float64).reshape(-1, 1)
    # Columns 2i and 2i+1 share one frequency, hence the floor division.
    exponents = 2 * (np.arange(d_model) // 2) / d_model
    sinusoid_table = positions / np.power(10000, exponents)

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.from_numpy(sinusoid_table).float()

pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_vocab_size, d_model), freeze=True)
```

In [30]:
# sinusoid_table 부터 만들어보자.

def get_posi_angle_vec(position):
    """Return the raw (pre-sin/cos) angle of every embedding dimension for `position`.

    In this walkthrough `position` takes the values 0 through 4.
    """
    return [cal_angle(position, dim) for dim in range(d_model)]

def cal_angle(position, hid_idx):
    """Return position / 10000 ** (2 * (hid_idx // 2) / d_model).

    In this walkthrough `hid_idx` takes the values 0 through 511.
    """
    exponent = 2 * (hid_idx // 2) / d_model
    return position / np.power(10000, exponent)

n_position = 5
np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])

5 512


In [40]:
def get_posi_angle_vec(position):
    """Return the raw (pre-sin/cos) angle of every embedding dimension for `position`.

    In this walkthrough `position` takes the values 0 through 4.
    """
    return [cal_angle(position, dim) for dim in range(d_model)]

In [34]:
def cal_angle(position, hid_idx):
    """Return position / 10000 ** (2 * (hid_idx // 2) / d_model).

    In this walkthrough `hid_idx` takes the values 0 through 511.
    """
    exponent = 2 * (hid_idx // 2) / d_model
    return position / np.power(10000, exponent)

In [179]:
[cal_angle(4, hid_j) for hid_j in range(d_model)] # 한번에 512개씩 생성

[4.0,
 4.0,
 3.8586464796447966,
 3.8586464796447966,
 3.722288163718796,
 3.722288163718796,
 3.590748529789257,
 3.590748529789257,
 3.4638572934402614,
 3.4638572934402614,
 3.3414501878313048,
 3.3414501878313048,
 3.2233687510459275,
 3.2233687510459275,
 3.109460120955103,
 3.109460120955103,
 2.9995768373298235,
 2.9995768373298235,
 2.893576650946699,
 2.893576650946699,
 2.7913223394394655,
 2.7913223394394655,
 2.692681529657993,
 2.692681529657993,
 2.5975265263048453,
 2.5975265263048453,
 2.505734146627542,
 2.505734146627542,
 2.417185560952531,
 2.417185560952531,
 2.3317661388544297,
 2.3317661388544297,
 2.2493653007613963,
 2.2493653007613963,
 2.1698763748045304,
 2.1698763748045304,
 2.093196458725979,
 2.093196458725979,
 2.019226286666988,
 2.019226286666988,
 1.9478701006634522,
 1.9478701006634522,
 1.8790355266825967,
 1.8790355266825967,
 1.812633455040327,
 1.812633455040327,
 1.7485779250444358,
 1.7485779250444358,
 1.6867860137143291,
 1.6867860137143291,


In [43]:
sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
print(sinusoid_table.shape)

(5, 512)


In [47]:
sinusoid_table[2]

array([2.00000000e+00, 2.00000000e+00, 1.92932324e+00, 1.92932324e+00,
       1.86114408e+00, 1.86114408e+00, 1.79537426e+00, 1.79537426e+00,
       1.73192865e+00, 1.73192865e+00, 1.67072509e+00, 1.67072509e+00,
       1.61168438e+00, 1.61168438e+00, 1.55473006e+00, 1.55473006e+00,
       1.49978842e+00, 1.49978842e+00, 1.44678833e+00, 1.44678833e+00,
       1.39566117e+00, 1.39566117e+00, 1.34634076e+00, 1.34634076e+00,
       1.29876326e+00, 1.29876326e+00, 1.25286707e+00, 1.25286707e+00,
       1.20859278e+00, 1.20859278e+00, 1.16588307e+00, 1.16588307e+00,
       1.12468265e+00, 1.12468265e+00, 1.08493819e+00, 1.08493819e+00,
       1.04659823e+00, 1.04659823e+00, 1.00961314e+00, 1.00961314e+00,
       9.73935050e-01, 9.73935050e-01, 9.39517763e-01, 9.39517763e-01,
       9.06316728e-01, 9.06316728e-01, 8.74288963e-01, 8.74288963e-01,
       8.43393007e-01, 8.43393007e-01, 8.13588864e-01, 8.13588864e-01,
       7.84837952e-01, 7.84837952e-01, 7.57103050e-01, 7.57103050e-01,
      

:: 의미
```python
a = [1,2,3,4,5,6,7,8,9]
a[2::3]

# [3, 6, 9]
```

In [49]:
sinusoid_table[:, 0::2].shape # even columns 0, 2, 4, ..., 510 (256 of the 512 columns)

(5, 256)

In [50]:
sinusoid_table[:, 0::2][2]

array([2.00000000e+00, 1.92932324e+00, 1.86114408e+00, 1.79537426e+00,
       1.73192865e+00, 1.67072509e+00, 1.61168438e+00, 1.55473006e+00,
       1.49978842e+00, 1.44678833e+00, 1.39566117e+00, 1.34634076e+00,
       1.29876326e+00, 1.25286707e+00, 1.20859278e+00, 1.16588307e+00,
       1.12468265e+00, 1.08493819e+00, 1.04659823e+00, 1.00961314e+00,
       9.73935050e-01, 9.39517763e-01, 9.06316728e-01, 8.74288963e-01,
       8.43393007e-01, 8.13588864e-01, 7.84837952e-01, 7.57103050e-01,
       7.30348255e-01, 7.04538930e-01, 6.79641666e-01, 6.55624230e-01,
       6.32455532e-01, 6.10105578e-01, 5.88545435e-01, 5.67747193e-01,
       5.47683927e-01, 5.28329664e-01, 5.09659350e-01, 4.91648814e-01,
       4.74274741e-01, 4.57514640e-01, 4.41346814e-01, 4.25750332e-01,
       4.10705005e-01, 3.96191356e-01, 3.82190595e-01, 3.68684598e-01,
       3.55655882e-01, 3.43087579e-01, 3.30963420e-01, 3.19267709e-01,
       3.07985305e-01, 2.97101603e-01, 2.86602514e-01, 2.76474445e-01,
      

In [51]:
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

In [53]:
sinusoid_table[:, 0::2][2]

array([9.09297427e-01, 9.36414739e-01, 9.58144376e-01, 9.74888185e-01,
       9.87046251e-01, 9.95011274e-01, 9.99164200e-01, 9.99870940e-01,
       9.97479998e-01, 9.92320856e-01, 9.84702998e-01, 9.74915430e-01,
       9.63226623e-01, 9.49884770e-01, 9.35118300e-01, 9.19136572e-01,
       9.02130715e-01, 8.84274552e-01, 8.65725587e-01, 8.46626027e-01,
       8.27103803e-01, 8.07273589e-01, 7.87237797e-01, 7.67087535e-01,
       7.46903535e-01, 7.26757021e-01, 7.06710541e-01, 6.86818743e-01,
       6.67129105e-01, 6.47682605e-01, 6.28514353e-01, 6.09654169e-01,
       5.91127117e-01, 5.72953994e-01, 5.55151778e-01, 5.37734041e-01,
       5.20711320e-01, 5.04091459e-01, 4.87879918e-01, 4.72080050e-01,
       4.56693360e-01, 4.41719725e-01, 4.27157610e-01, 4.13004247e-01,
       3.99255804e-01, 3.85907533e-01, 3.72953906e-01, 3.60388735e-01,
       3.48205276e-01, 3.36396328e-01, 3.24954314e-01, 3.13871360e-01,
       3.03139357e-01, 2.92750025e-01, 2.82694961e-01, 2.72965685e-01,
      

In [54]:
sinusoid_table[:, 1::2][2]

array([-0.41614684, -0.35089519, -0.28628544, -0.22269492, -0.16043596,
       -0.09976254, -0.04087666,  0.01606558,  0.07094825,  0.12369041,
        0.17424123,  0.22257561,  0.26869029,  0.31260026,  0.35433567,
        0.39393903,  0.43146283,  0.46696736,  0.50051894,  0.53218828,
        0.5620492 ,  0.59017739,  0.61664954,  0.64154245,  0.66493241,
        0.68689463,  0.7075028 ,  0.72682874,  0.74494212,  0.76191026,
        0.77779799,  0.79266752,  0.80657841,  0.81958753,  0.83174906,
        0.84311452,  0.85373282,  0.86365028,  0.87291075,  0.88155569,
        0.88962418,  0.8971531 ,  0.90417718,  0.9107291 ,  0.91683957,
        0.92253747,  0.92784987,  0.93280221,  0.93741831,  0.94172051,
        0.94572971,  0.94946552,  0.95294624,  0.95618901,  0.95920986,
        0.96202377,  0.96464473,  0.96708582,  0.96935924,  0.97147639,
        0.97344791,  0.97528374,  0.97699312,  0.97858472,  0.98006658,
        0.98144622,  0.98273065,  0.9839264 ,  0.98503957,  0.98

In [55]:
# pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_vocab_size, d_model), freeze=True)
pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(sinusoid_table), freeze=True)

In [58]:
pos_emb.weight.shape

torch.Size([5, 512])

In [63]:
# 1.3) enc_outputs
# pos_emb must be indexed with position ids, not token ids. For this sentence
# the token ids happen to be [1, 2, 3, 4, 0] as well, so the values coincide,
# but that is an accident of the toy vocabulary. Encoder.forward uses the same
# explicit position ids.
positions = torch.LongTensor([[1, 2, 3, 4, 0]])
enc_outputs = src_emb(enc_inputs) + pos_emb(positions)

In [68]:
src_emb(enc_inputs).shape

torch.Size([1, 5, 512])

In [69]:
pos_emb(enc_inputs).shape

torch.Size([1, 5, 512])

In [65]:
print(enc_outputs)
print(enc_outputs.shape)

## enc_outputs 의미 : enc_inputs의 src_emb + pos_emb 
## -> 1 batch, 5 tokens, 512 embedding dimensions

tensor([[[-0.8975,  1.5275,  0.5599,  ...,  0.0860,  0.7317,  0.8330],
         [ 1.0840, -1.5479,  0.4180,  ...,  0.2803, -0.0269,  0.5393],
         [-0.7122,  0.5253, -1.0219,  ...,  0.9817,  0.2214,  1.0245],
         [-2.5731,  0.6607, -3.0326,  ..., -0.1859,  0.4281,  0.3123],
         [-0.1280,  3.3881,  0.0602,  ...,  0.9774, -1.2408,  2.9292]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])


```python
# 1.4) enc_self_attn_mask
enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)

def get_attn_pad_mask(seq_q, seq_k):
    """Build the attention mask that hides padding keys.

    Args:
        seq_q: Token ids of the queries, [batch_size x len_q].
        seq_k: Token ids of the keys, [batch_size x len_k]; id 0 is padding.

    Returns:
        Mask of shape [batch_size x len_q x len_k] that is set wherever the
        key token is padding. Every query row carries the same key pattern.
    """
    _, len_q = seq_q.size()
    n_batch, len_k = seq_k.size()
    # Mark the padding keys, then add a query axis to broadcast over.
    key_is_pad = seq_k.data.eq(0)
    return key_is_pad.unsqueeze(1).expand(n_batch, len_q, len_k)
```

In [71]:
print(enc_inputs)
print(enc_inputs.size())

tensor([[1, 2, 3, 4, 0]])
torch.Size([1, 5])


In [72]:
batch_size, len_q = enc_inputs.size()
batch_size, len_k = enc_inputs.size()
print(batch_size, len_q, len_k)

1 5 5


In [77]:
print(enc_inputs.data)
print(enc_inputs.data.eq(0))
print(enc_inputs.data.eq(0).unsqueeze(1))
print(enc_inputs.data.eq(0).unsqueeze(1).shape) # index=1 크기 하나 늘리기

tensor([[1, 2, 3, 4, 0]])
tensor([[0, 0, 0, 0, 1]], dtype=torch.uint8)
tensor([[[0, 0, 0, 0, 1]]], dtype=torch.uint8)
torch.Size([1, 1, 5])


In [80]:
pad_attn_mask = enc_inputs.data.eq(0).unsqueeze(1)
print(pad_attn_mask.shape)

torch.Size([1, 1, 5])


In [86]:
enc_self_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)
print(pad_attn_mask.expand(batch_size, len_q, len_k))
print(pad_attn_mask.expand(batch_size, len_q, len_k).shape)
## enc_self_attn_mask 이렇게 하는 이유 : 여기는 패딩이라 안볼거야

tensor([[[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1]]], dtype=torch.uint8)
torch.Size([1, 5, 5])


```python

# 1.5) enc_self_attns
layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

enc_self_attns = []
for layer in self.layers:
    enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
    enc_self_attns.append(enc_self_attn)

# EncoderLayer() 가 필요!

class EncoderLayer(nn.Module):
    """One encoder block: multi-head self-attention followed by the position-wise feed-forward net."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """Run one encoder block.

        Returns:
            Tuple ``(enc_outputs, attn)``; enc_outputs is
            [batch_size x len_q x d_model].
        """
        # Self-attention: the same tensor is used as Q, K and V.
        attended, attn = self.enc_self_attn(
            enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask
        )
        return self.pos_ffn(attended), attn
```
MultiHeadAttention() ---- Q: enc_outputs, K: enc_outputs, V: enc_outputs, attn_mask: enc_self_attn_mask (enc_inputs 아님 주의)  
PoswiseFeedForwardNet() ---- inputs: enc_outputs 

In [100]:
print(enc_outputs)
print(enc_outputs.shape) # 1 batch, 5 tokens, 512 dimensions

tensor([[[-0.8975,  1.5275,  0.5599,  ...,  0.0860,  0.7317,  0.8330],
         [ 1.0840, -1.5479,  0.4180,  ...,  0.2803, -0.0269,  0.5393],
         [-0.7122,  0.5253, -1.0219,  ...,  0.9817,  0.2214,  1.0245],
         [-2.5731,  0.6607, -3.0326,  ..., -0.1859,  0.4281,  0.3123],
         [-0.1280,  3.3881,  0.0602,  ...,  0.9774, -1.2408,  2.9292]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])


In [87]:
enc_self_attn_mask

tensor([[[0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1]]], dtype=torch.uint8)

```python
# MultiHeadAttention(), PoswiseFeedForwardNet() 필요!!

class MultiHeadAttention(nn.Module):
    """Multi-head attention with output projection, residual connection and LayerNorm.

    The output projection and the LayerNorm are created once in ``__init__``
    so that they are registered as sub-modules (and therefore trained, moved
    with ``.to()`` and saved in ``state_dict``). Building them inside
    ``forward`` would re-initialise them randomly on every call.
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.attention = ScaledDotProductAttention()

    def forward(self, Q, K, V, attn_mask):
        """Apply multi-head attention.

        Args:
            Q: [batch_size x len_q x d_model]
            K: [batch_size x len_k x d_model]
            V: [batch_size x len_k x d_model]
            attn_mask: [batch_size x len_q x len_k]; set entries are masked out.

        Returns:
            Tuple ``(output, attn)``: output is [batch_size x len_q x d_model],
            attn is [batch_size x n_heads x len_q x len_k].
        """
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch_size x n_heads x len_k x d_v]

        # The same padding mask applies to every head.
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v]
        context, attn = self.attention(q_s, k_s, v_s, attn_mask)
        # Concatenate the heads back together: [batch_size x len_q x n_heads * d_v]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        output = self.linear(context)
        return self.layer_norm(output + residual), attn  # output: [batch_size x len_q x d_model]
```

In [89]:
d_k = d_v = 64  # dimension of K(=Q), V
n_heads = 8  # number of heads in Multi-Head Attention

# key, query, value 에 대한 각각의 weight matrix
W_Q = nn.Linear(d_model, d_k * n_heads)
W_K = nn.Linear(d_model, d_k * n_heads)
W_V = nn.Linear(d_model, d_v * n_heads)

print(W_Q)
print(W_K)
print(W_V)
# 512 -> 512 의미

Linear(in_features=512, out_features=512, bias=True)
Linear(in_features=512, out_features=512, bias=True)
Linear(in_features=512, out_features=512, bias=True)


In [101]:
# enc_outputs 의미 : 'ich mochte ein bier P' (5 tokens -> 512 )
Q, K, V, attn_mask = enc_outputs, enc_outputs, enc_outputs, enc_self_attn_mask

In [102]:
print(Q)
print(Q.size())
print(Q.size(0))

tensor([[[-0.8975,  1.5275,  0.5599,  ...,  0.0860,  0.7317,  0.8330],
         [ 1.0840, -1.5479,  0.4180,  ...,  0.2803, -0.0269,  0.5393],
         [-0.7122,  0.5253, -1.0219,  ...,  0.9817,  0.2214,  1.0245],
         [-2.5731,  0.6607, -3.0326,  ..., -0.1859,  0.4281,  0.3123],
         [-0.1280,  3.3881,  0.0602,  ...,  0.9774, -1.2408,  2.9292]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])
1


In [103]:
residual, batch_size = Q, Q.size(0)
print(residual, batch_size)

tensor([[[-0.8975,  1.5275,  0.5599,  ...,  0.0860,  0.7317,  0.8330],
         [ 1.0840, -1.5479,  0.4180,  ...,  0.2803, -0.0269,  0.5393],
         [-0.7122,  0.5253, -1.0219,  ...,  0.9817,  0.2214,  1.0245],
         [-2.5731,  0.6607, -3.0326,  ..., -0.1859,  0.4281,  0.3123],
         [-0.1280,  3.3881,  0.0602,  ...,  0.9774, -1.2408,  2.9292]]],
       grad_fn=<AddBackward0>) 1


```python
q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]
```

In [97]:
W_Q

Linear(in_features=512, out_features=512, bias=True)

In [107]:
# x1를 weight 행렬인 WQ로 곱하는 것은 현재 단어와 연관된 query 벡터인 q1를 생성합니다.
print(W_Q(Q))
print(W_Q(Q).shape)

tensor([[[ 0.6816,  0.6673, -0.4468,  ..., -1.0690,  0.6963, -0.2285],
         [ 0.7088,  0.8162,  0.3654,  ...,  0.6470,  0.3708,  0.5197],
         [ 1.5571,  0.1647,  1.2334,  ..., -0.5666,  0.5689, -0.5585],
         [ 1.3335,  0.4457,  0.0876,  ...,  0.5219, -0.1814,  0.5346],
         [ 0.0571,  0.7212, -0.4121,  ..., -0.3992, -0.3169, -1.4227]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])


In [109]:
# 512 -> 8, 64
# d_k = d_v = 64  # dimension of K(=Q), V
# n_heads = 8  # number of heads in Multi-Head Attention
W_Q(Q).view(batch_size, -1, n_heads, d_k).shape

# torch.Size([1, 5, 8, 64]) -> 1 batch, 5 tokens, 8 attention heads, 64 dimensions

torch.Size([1, 5, 8, 64])

In [110]:
W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2).shape

torch.Size([1, 8, 5, 64])

In [111]:
q_s = W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
k_s = W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
v_s = W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

In [112]:
print(q_s.shape)
print(k_s.shape)
print(v_s.shape)

torch.Size([1, 8, 5, 64])
torch.Size([1, 8, 5, 64])
torch.Size([1, 8, 5, 64])


In [117]:
# attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
print(attn_mask.shape)
print(attn_mask.unsqueeze(1).shape)
print(attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1).shape)

attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

torch.Size([1, 5, 5])
torch.Size([1, 1, 5, 5])
torch.Size([1, 8, 5, 5])


In [118]:
attn_mask 
# attn_mask : [batch_size x n_heads x len_q x len_k]

tensor([[[[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1],
          [0, 0, 0, 0, 1]],

         [[0, 0, 0, 0, 1],
          [0, 

```python

# context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
output = nn.Linear(n_heads * d_v, d_model)(context)


class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V with masking."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """Compute attention weights and the attended values.

        Args:
            Q: [... x len_q x d_k]
            K: [... x len_k x d_k]
            V: [... x len_k x d_v]
            attn_mask: Broadcast-compatible with the [... x len_q x len_k]
                scores; non-zero / True entries are masked out.

        Returns:
            Tuple ``(context, attn)``: context is [... x len_q x d_v] and attn
            is [... x len_q x len_k].
        """
        # Scale by the actual key/query width instead of a module-level d_k,
        # so the layer cannot silently disagree with its inputs.
        scale = Q.size(-1) ** 0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) / scale
        # masked_fill needs a boolean mask in current PyTorch; older versions
        # produced uint8 masks from eq(), so cast explicitly.
        scores = scores.masked_fill(attn_mask.bool(), -1e9)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return context, attn
```

In [123]:
q_s.shape

torch.Size([1, 8, 5, 64])

In [124]:
k_s.transpose(-1, -2).shape

torch.Size([1, 8, 64, 5])

In [125]:
torch.matmul(q_s, k_s.transpose(-1, -2)).shape

torch.Size([1, 8, 5, 5])

In [131]:
print(d_k)
scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)

64


In [132]:
scores

tensor([[[[-5.9383e-01, -2.4807e-01,  6.5411e-01, -1.1670e-01, -7.2737e-01],
          [-9.3032e-01, -4.2527e-01,  4.5711e-01, -2.3553e-01, -4.9635e-01],
          [-1.6991e-01,  2.3219e-01,  5.1079e-01,  2.6412e-01, -5.9693e-01],
          [-1.9371e-01,  1.4250e+00, -3.6556e-01, -1.8922e-01,  2.2626e-01],
          [-9.7784e-01, -6.4192e-01, -3.1446e-01, -9.6350e-01, -1.0996e+00]],

         [[-3.0698e-01, -9.2977e-01,  5.2781e-01,  1.8117e-01,  7.2998e-02],
          [-4.8617e-01, -5.0216e-01, -2.7823e-01, -2.1277e-01, -4.1139e-01],
          [ 6.6586e-02, -8.6792e-01, -6.5248e-01,  4.0352e-01, -1.4613e-01],
          [ 3.2779e-01,  7.2557e-01, -5.0681e-02,  4.2920e-01,  2.7464e-01],
          [-3.8958e-01, -5.4248e-01, -1.4609e-01, -2.3969e-01, -5.1014e-01]],

         [[-3.2309e-02,  4.2039e-01, -2.9104e-01,  6.6914e-01, -4.7894e-01],
          [ 3.9633e-01,  8.9111e-01,  6.4468e-01,  9.0339e-01,  1.8073e-01],
          [-3.1823e-01, -7.3168e-02,  7.1581e-02, -1.6773e-02, -6.5338e-

In [129]:
scores.shape

torch.Size([1, 8, 5, 5])

In [133]:
scores.masked_fill_(attn_mask, -1e9)

tensor([[[[-5.9383e-01, -2.4807e-01,  6.5411e-01, -1.1670e-01, -1.0000e+09],
          [-9.3032e-01, -4.2527e-01,  4.5711e-01, -2.3553e-01, -1.0000e+09],
          [-1.6991e-01,  2.3219e-01,  5.1079e-01,  2.6412e-01, -1.0000e+09],
          [-1.9371e-01,  1.4250e+00, -3.6556e-01, -1.8922e-01, -1.0000e+09],
          [-9.7784e-01, -6.4192e-01, -3.1446e-01, -9.6350e-01, -1.0000e+09]],

         [[-3.0698e-01, -9.2977e-01,  5.2781e-01,  1.8117e-01, -1.0000e+09],
          [-4.8617e-01, -5.0216e-01, -2.7823e-01, -2.1277e-01, -1.0000e+09],
          [ 6.6586e-02, -8.6792e-01, -6.5248e-01,  4.0352e-01, -1.0000e+09],
          [ 3.2779e-01,  7.2557e-01, -5.0681e-02,  4.2920e-01, -1.0000e+09],
          [-3.8958e-01, -5.4248e-01, -1.4609e-01, -2.3969e-01, -1.0000e+09]],

         [[-3.2309e-02,  4.2039e-01, -2.9104e-01,  6.6914e-01, -1.0000e+09],
          [ 3.9633e-01,  8.9111e-01,  6.4468e-01,  9.0339e-01, -1.0000e+09],
          [-3.1823e-01, -7.3168e-02,  7.1581e-02, -1.6773e-02, -1.0000e+

In [136]:
attn = nn.Softmax(dim=-1)(scores)
print(attn)

tensor([[[[0.1332, 0.1882, 0.4639, 0.2146, 0.0000],
          [0.1154, 0.1912, 0.4622, 0.2312, 0.0000],
          [0.1663, 0.2486, 0.3285, 0.2567, 0.0000],
          [0.1267, 0.6394, 0.1067, 0.1273, 0.0000],
          [0.1867, 0.2613, 0.3625, 0.1894, 0.0000]],

         [[0.1828, 0.0981, 0.4213, 0.2979, 0.0000],
          [0.2208, 0.2173, 0.2718, 0.2902, 0.0000],
          [0.3048, 0.1197, 0.1485, 0.4269, 0.0000],
          [0.2336, 0.3478, 0.1600, 0.2586, 0.0000],
          [0.2328, 0.1998, 0.2970, 0.2704, 0.0000]],

         [[0.1865, 0.2933, 0.1440, 0.3762, 0.0000],
          [0.1791, 0.2938, 0.2296, 0.2974, 0.0000],
          [0.1958, 0.2502, 0.2892, 0.2647, 0.0000],
          [0.3097, 0.2648, 0.2795, 0.1459, 0.0000],
          [0.1579, 0.2631, 0.1800, 0.3990, 0.0000]],

         [[0.4271, 0.2079, 0.1840, 0.1810, 0.0000],
          [0.1468, 0.4142, 0.2366, 0.2024, 0.0000],
          [0.3658, 0.2158, 0.1696, 0.2487, 0.0000],
          [0.3922, 0.1659, 0.2082, 0.2337, 0.0000],
      

In [141]:
print(attn.shape)
print(v_s.shape)
context = torch.matmul(attn, v_s) # softmax * value
print(context.shape)

torch.Size([1, 8, 5, 5])
torch.Size([1, 8, 5, 64])
torch.Size([1, 8, 5, 64])


In [144]:
context.transpose(1, 2).shape

torch.Size([1, 5, 8, 64])

In [145]:
context.transpose(1, 2).contiguous().shape

torch.Size([1, 5, 8, 64])

In [146]:
context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) 
# context: [batch_size x len_q x n_heads * d_v]

In [147]:
context.shape # not a sum: the 8 heads (64 dims each) are concatenated -> 8 * 64 = 512

torch.Size([1, 5, 512])

In [148]:
mh_outputs = nn.Linear(n_heads * d_v, d_model)(context) # 512 -> 512

In [150]:
mh_outputs.shape

torch.Size([1, 5, 512])

In [151]:
nn.LayerNorm(d_model)

LayerNorm(torch.Size([512]), eps=1e-05, elementwise_affine=True)

In [153]:
residual.shape

torch.Size([1, 5, 512])

In [155]:
enc_mh_outputs = nn.LayerNorm(d_model)(mh_outputs + residual)

In [156]:
enc_mh_outputs.shape

torch.Size([1, 5, 512])

```python
enc_pf_outputs = self.pos_ffn(enc_mh_outputs)

class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward sub-layer with residual connection and LayerNorm.

    The two kernel-size-1 convolutions map d_model -> d_ff -> d_model at each
    position. The LayerNorm is created in ``__init__`` so that it is
    registered as a sub-module and trained; building it inside ``forward``
    would discard its parameters after every call.
    """

    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        """Apply the feed-forward sub-layer.

        Args:
            inputs: [batch_size, len_q, d_model]

        Returns:
            Tensor of the same shape as ``inputs``.
        """
        residual = inputs  # inputs : [batch_size, len_q, d_model]
        # Conv1d expects channels first, so swap the length and feature axes.
        output = torch.relu(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(output).transpose(1, 2)
        return self.layer_norm(output + residual)
```

In [157]:
d_ff = 2048 # FeedForward dimension
conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)

pf_residual = enc_mh_outputs

In [162]:
print(conv1)
print(pf_residual.shape)
print(enc_mh_outputs.transpose(1, 2).shape)
print(conv1(enc_mh_outputs.transpose(1, 2)).shape)

Conv1d(512, 2048, kernel_size=(1,), stride=(1,))
torch.Size([1, 5, 512])
torch.Size([1, 512, 5])
torch.Size([1, 2048, 5])


In [166]:
pf_output = nn.ReLU()(conv1(enc_mh_outputs.transpose(1, 2)))
print(pf_output)
print(pf_output.shape)

tensor([[[0.0000, 0.3408, 0.0000, 0.0000, 0.2808],
         [0.0000, 0.0681, 0.0000, 0.0000, 0.7553],
         [0.0000, 0.0000, 0.1571, 0.0000, 0.0000],
         ...,
         [0.5268, 0.0000, 0.1336, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.3500, 0.2845],
         [0.8660, 0.4463, 0.0000, 0.6833, 0.0000]]],
       grad_fn=<ThresholdBackward0>)
torch.Size([1, 2048, 5])


In [170]:
d_ff

2048

In [167]:
conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
print(conv2)
print(conv2(pf_output).shape)
pf_output = conv2(pf_output).transpose(1, 2)
print(pf_output.shape)

Conv1d(2048, 512, kernel_size=(1,), stride=(1,))
torch.Size([1, 512, 5])
torch.Size([1, 5, 512])


In [169]:
# The feed-forward sub-layer's residual is its own input (pf_residual, i.e.
# enc_mh_outputs), not `residual`, which is the input of the attention sub-layer.
nn.LayerNorm(d_model)(pf_output + pf_residual).shape
# This is the enc_outputs of the first EncoderLayer; the attention map returned
# alongside it is what gets appended to enc_self_attns as element 0.

torch.Size([1, 5, 512])