In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy

from autoencoder import LayerNorm, FeedForward, clones, Encoder, Decoder

In [2]:
# Seed the global RNG before anything random happens (module initialisation
# here, torch.rand in later cells) so that Restart & Run All is reproducible.
SEED = 0
torch.manual_seed(SEED)

# Hyper-parameters shared by the encoder and the decoder.
d_model = 10   # size of the last (feature) axis of the input batch
N = 3          # number of layers
head_num = 2   # number of heads
d_ff = 20      # presumably the feed-forward width -- confirm in autoencoder.py

encoder = Encoder(d_model, N, head_num, d_ff)
decoder = Decoder(d_model, N, head_num, d_ff)

In [3]:
# Random input batch of shape (batch_size, seq_len, d_model) together with an
# all-ones mask of shape (batch_size, 1, seq_len).
batch_size = 2
seq_len = 64
x = torch.rand((batch_size, seq_len, d_model))
mask = torch.ones((batch_size, 1, seq_len))

In [4]:
# Run the batch through the encoder. The recorded output below lists the
# activation shape at each step: the sequence axis stays at 64 three times and
# then shrinks 64 -> 32 -> 16 -> 8 while the feature axis stays at 10.
memory = encoder(x, mask)

- encoder input: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 32, 10])
- encoder: torch.Size([2, 16, 10])
- encoder: torch.Size([2, 8, 10])


In [5]:
# Shape of the encoder output; (2, 8, 10) in the recorded run, i.e. the
# sequence axis went from 64 down to 8.
memory.shape

torch.Size([2, 8, 10])

In [6]:
# Decode the encoder output, using an all-ones mask whose last axis matches the
# (shortened) sequence length of `memory`.
output = decoder(
    memory,
    torch.ones(batch_size, 1, memory.size(1)),
)

- decoder input: torch.Size([2, 8, 10])
- decoder: torch.Size([2, 16, 10])
- decoder: torch.Size([2, 32, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])


In [7]:
# Shape of the decoder output; (2, 64, 10) in the recorded run, the same shape
# as the tensor `x` that was fed to the encoder.
output.shape

torch.Size([2, 64, 10])

In [8]:
# Negative log-likelihood loss; it expects log-probabilities as input.
criterion = nn.NLLLoss()
# Module form of log-softmax, normalising over dim 1.
log_softmax = nn.LogSoftmax(dim=1)

In [9]:
# Hand-written 3 x 5 score matrix; gradient tracking is switched on so the
# gradient of the loss with respect to these scores can be inspected later.
inputs = torch.tensor(
    [
        [1.3956, 1.1497, -1.3392, 0.8979, 1.0989],
        [-0.3986, 0.0929, -0.2454, 2.4595, 0.3489],
        [0.4532, 0.6627, 0.6888, -0.9131, -1.8882],
    ]
).requires_grad_()
# One column index per row of `inputs`.
targets = torch.tensor([1, 0, 4], dtype=torch.long)
print("Inputs: {}".format(inputs.size()))
print("Targets: {}".format(targets.size()))

Inputs: torch.Size([3, 5])
Targets: torch.Size([3])


In [10]:
# Row-wise log-probabilities of `inputs` (softmax taken over the last axis).
ls = F.log_softmax(inputs, dim=-1)
print(ls)

tensor([[-1.1626, -1.4085, -3.8974, -1.6603, -1.4593],
        [-3.1502, -2.6587, -2.9970, -0.2921, -2.4027],
        [-1.3481, -1.1386, -1.1125, -2.7144, -3.6895]],
       grad_fn=<LogSoftmaxBackward>)


In [11]:
# Mean negative log-likelihood of `targets` under softmax(inputs).
# The log-probabilities are recomputed from `inputs` instead of being read from
# `ls`: this cell rebinds `ls` to the scalar loss, so passing `ls` back in would
# make a second run of the cell operate on the already-reduced value rather
# than on the log-probabilities.
ls = criterion(F.log_softmax(inputs, dim=-1), targets)

In [12]:
# Back-propagate from the scalar loss. `inputs` was created with
# requires_grad=True, so its gradient becomes available as `inputs.grad`.
ls.backward()

In [13]:
# Gradient of the loss with respect to the raw scores. Each row equals
# (softmax(row) - one_hot(target)) / 3, the 1/3 coming from NLLLoss averaging
# over the three rows; e.g. row 0, column 1: (exp(-1.4085) - 1) / 3 = -0.2518.
inputs.grad

tensor([[ 0.1042, -0.2518,  0.0068,  0.0634,  0.0775],
        [-0.3191,  0.0233,  0.0166,  0.2489,  0.0302],
        [ 0.0866,  0.1068,  0.1096,  0.0221, -0.3250]])

In [14]:
# Reference copy of the `inputs.grad` values displayed by the previous cell
# (comment only; this cell executes nothing):
# tensor([[ 0.1042, -0.2518,  0.0068,  0.0634,  0.0775],
#         [-0.3191,  0.0233,  0.0166,  0.2489,  0.0302],
#         [ 0.0866,  0.1068,  0.1096,  0.0221, -0.3250]])

In [15]:
# At this point `ls` holds the scalar loss: the mean of the negated
# log-probabilities at the target columns, (1.4085 + 3.1502 + 3.6895) / 3.
ls

tensor(2.7494, grad_fn=<NllLossBackward>)

In [16]:
# `ls` is already a single averaged value, so taking its mean returns the same
# number; only the recorded grad_fn differs (MeanBackward0 vs NllLossBackward).
ls.mean()

tensor(2.7494, grad_fn=<MeanBackward0>)

In [17]:
# Float32 tensor of shape (1, 3, 3): 10 on the diagonal of the inner 3 x 3
# matrix and 1 everywhere else.
logits = torch.tensor(
    [
        [
            [10, 1, 1],
            [1, 10, 1],
            [1, 1, 10],
        ]
    ],
    dtype=torch.float,
)

In [18]:
# Display the logits: shape (1, 3, 3), with the largest score of each row on
# the diagonal.
logits

tensor([[[10.,  1.,  1.],
         [ 1., 10.,  1.],
         [ 1.,  1., 10.]]])

In [19]:
# Log-softmax of the logits, normalised over the last axis.
log_probs = logits.log_softmax(dim=-1)

In [20]:
# Display the log-probabilities: about -2.5e-04 on the diagonal (probability
# close to 1) and about -9.0 everywhere else in the recorded run.
log_probs

tensor([[[-2.4673e-04, -9.0002e+00, -9.0002e+00],
         [-9.0002e+00, -2.4673e-04, -9.0002e+00],
         [-9.0002e+00, -9.0002e+00, -2.4673e-04]]])

In [21]:
# Index of the largest log-probability along the last axis.
# argmax returns only the indices, so nothing has to be unpacked into `_`,
# a name IPython uses for the previous cell's output.
preds = log_probs.argmax(dim=-1)

In [22]:
# Display the predicted indices; [[0, 1, 2]] in the recorded run, matching the
# position of the 10 in each row of `logits`.
preds

tensor([[0, 1, 2]])

In [23]:
# Two 2 x 4 int32 tensors that agree in every column except the last one
# (6 of the 8 entries match). Note that `targets` is rebound here.
labels = torch.tensor(
    [[1, 2, 3, 4], [2, 3, 4, 5]],
    dtype=torch.int,
)
targets = torch.tensor(
    [[1, 2, 3, 0], [2, 3, 4, 0]],
    dtype=torch.int,
)

In [24]:
# Fraction of matching entries: number of equal positions divided by the total
# number of elements.
(labels == targets).sum().item() / labels.numel()

0.75

In [25]:
# Element-wise equality mask between the two tensors.
torch.eq(labels, targets)

tensor([[ True,  True,  True, False],
        [ True,  True,  True, False]])

In [26]:
# Same fraction as above, obtained by averaging the equality mask after
# converting it to float.
(labels == targets).float().mean().item()

0.75

In [31]:
# Uniform random tensor of shape (1, 4, 2) used to experiment with `view`.
a = torch.rand((1, 4, 2))

In [32]:
# Confirm the shape of `a`: (1, 4, 2).
a.shape

torch.Size([1, 4, 2])

In [35]:
# Display `a` in its (1, 4, 2) layout: four rows of two values.
a

tensor([[[0.5357, 0.6174],
         [0.5388, 0.3659],
         [0.3637, 0.4416],
         [0.3376, 0.9531]]])

In [36]:
# Reinterpret the same eight values as shape (1, 2, 4). As the recorded output
# shows, the flat element order is kept (this is not a transpose), and `a`
# itself is not rebound.
a.view((1, 2, 4))

tensor([[[0.5357, 0.6174, 0.5388, 0.3659],
         [0.3637, 0.4416, 0.3376, 0.9531]]])

In [30]:
# NOTE(review): this cell carries execution count 30, i.e. it was run before
# the `a = torch.rand(1, 4, 2)` cell above (count 31). Its recorded output has
# shape (1, 2, 4) and different values, so it shows an earlier `a`; re-run the
# notebook top to bottom to refresh it.
a

tensor([[[0.9640, 0.3859, 0.9873, 0.9585],
         [0.1057, 0.9411, 0.6776, 0.8257]]])