In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy

from autoencoder_a import LayerNorm, FeedForward, clones, Encoder, Decoder

In [37]:
# Hyperparameters shared by the encoder and the decoder.
d_model = 10   # size of the last (feature) axis of the tensors fed to the model
N = 3          # number of layers
head_num = 2   # number of heads
d_ff = 20      # presumably the feed-forward hidden width -- confirm in autoencoder_a

# Both halves of the autoencoder are built from the same configuration.
encoder = Encoder(d_model, N, head_num, d_ff)
decoder = Decoder(d_model, N, head_num, d_ff)

In [38]:
# Random input batch of shape (batch_size, seq_len, d_model) together with an
# all-ones mask of shape (batch_size, 1, seq_len).
batch_size = 2
seq_len = 64
x = torch.rand((batch_size, seq_len, d_model))
mask = torch.ones((batch_size, 1, seq_len))

In [39]:
# Encode the random batch using the all-ones mask. The forward pass prints the
# tensor shape at every stage; per that trace the sequence axis shrinks from
# 64 to 8 while the feature axis stays at d_model.
memory = encoder(x, mask)

- encoder input: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 64, 10])
- encoder: torch.Size([2, 32, 10])
- encoder: torch.Size([2, 16, 10])
- encoder: torch.Size([2, 8, 10])


In [40]:
# Shape of the encoder output (the compressed representation).
memory.size()

torch.Size([2, 8, 10])

In [41]:
# Build an all-ones mask sized to the encoder output's sequence axis, then
# decode the compressed representation.
memory_mask = torch.ones(batch_size, 1, memory.shape[1])
output = decoder(memory, memory_mask)

- decoder input: torch.Size([2, 8, 10])
- decoder: torch.Size([2, 16, 10])
- decoder: torch.Size([2, 32, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])
- decoder: torch.Size([2, 64, 10])


In [42]:
# Shape of the decoder output; it matches the shape of the original input batch.
output.size()

torch.Size([2, 64, 10])

In [43]:
# Loss building blocks: the negative log-likelihood criterion (default
# settings) and a log-softmax module that normalises over axis 1.
criterion = nn.NLLLoss()
log_softmax = nn.LogSoftmax(dim=1)

In [44]:
# Toy batch of 3 samples x 5 class scores, tracked by autograd so that the
# gradient of the loss with respect to the scores can be inspected later.
inputs = torch.tensor(
    [
        [1.3956, 1.1497, -1.3392, 0.8979, 1.0989],
        [-0.3986, 0.0929, -0.2454, 2.4595, 0.3489],
        [0.4532, 0.6627, 0.6888, -0.9131, -1.8882],
    ],
    requires_grad=True,
)
# One class index per sample.
targets = torch.tensor([1, 0, 4])
print(
    "Inputs: {}".format(inputs.shape),
    "Targets: {}".format(targets.shape),
    sep="\n",
)

Inputs: torch.Size([3, 5])
Targets: torch.Size([3])


In [45]:
# Row-wise log-probabilities of the toy scores (normalised over the last axis).
ls = F.log_softmax(inputs, dim=-1)
print(ls)

tensor([[-1.1626, -1.4085, -3.8974, -1.6603, -1.4593],
        [-3.1502, -2.6587, -2.9970, -0.2921, -2.4027],
        [-1.3481, -1.1386, -1.1125, -2.7144, -3.6895]],
       grad_fn=<LogSoftmaxBackward>)


In [46]:
# Mean negative log-likelihood of `targets` under the row-wise log-softmax of
# `inputs`. The log-probabilities are recomputed from `inputs` instead of being
# read back from `ls`, which makes this cell safe to re-run: the previous form
# `ls = criterion(ls, targets)` fed its own scalar result back into the
# criterion on a second execution. `ls` is still rebound to the scalar loss
# because the following cells call `ls.backward()` and display `ls`.
ls = criterion(F.log_softmax(inputs, dim=-1), targets)

In [47]:
# Backpropagate the scalar loss; the resulting gradient with respect to
# `inputs` is read from `inputs.grad` in the next cell.
ls.backward()

In [48]:
# Gradient of the loss with respect to each input score, as filled in by the
# backward pass.
inputs.grad

tensor([[ 0.1042, -0.2518,  0.0068,  0.0634,  0.0775],
        [-0.3191,  0.0233,  0.0166,  0.2489,  0.0302],
        [ 0.0866,  0.1068,  0.1096,  0.0221, -0.3250]])

In [49]:
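# Reference copy of the gradient displayed above (d loss / d inputs).
# Each row equals (softmax(inputs) - one_hot(targets)) / 3, i.e. the gradient
# of the NLL loss averaged over the 3 samples:
# tensor([[ 0.1042, -0.2518,  0.0068,  0.0634,  0.0775],
#         [-0.3191,  0.0233,  0.0166,  0.2489,  0.0302],
#         [ 0.0866,  0.1068,  0.1096,  0.0221, -0.3250]])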

In [50]:
# Display the scalar loss (at this point `ls` holds the criterion's result,
# not the log-softmax matrix).
ls

tensor(2.7494, grad_fn=<NllLossBackward>)

In [51]:
# `ls` is already a scalar, so averaging it leaves the value unchanged.
torch.mean(ls)

tensor(2.7494, grad_fn=<MeanBackward0>)

In [52]:
# A batch containing one 3x3 score matrix whose diagonal entry dominates
# every row.
logits = torch.tensor(
    [
        [
            [10.0, 1.0, 1.0],
            [1.0, 10.0, 1.0],
            [1.0, 1.0, 10.0],
        ]
    ],
    dtype=torch.float32,
)

In [53]:
# Display the raw scores.
logits

tensor([[[10.,  1.,  1.],
         [ 1., 10.,  1.],
         [ 1.,  1., 10.]]])

In [54]:
# Log-probabilities over the last axis (one distribution per row of scores).
log_probs = logits.log_softmax(dim=-1)

In [55]:
# Display the log-probabilities; the dominant entry of each row is close to 0.
log_probs

tensor([[[-2.4673e-04, -9.0002e+00, -9.0002e+00],
         [-9.0002e+00, -2.4673e-04, -9.0002e+00],
         [-9.0002e+00, -9.0002e+00, -2.4673e-04]]])

In [56]:
# Predicted class per position = index of the largest log-probability along
# the last axis. `argmax` returns only the indices, so nothing is assigned to
# `_`: binding `_` in a notebook shadows IPython's "last output" variable, and
# the maximum values themselves were never used.
preds = log_probs.argmax(dim=-1)

In [57]:
# Display the predicted class indices (one per row of scores).
preds

tensor([[0, 1, 2]])

In [58]:
# Two rows of four int32 ids each. The two tensors agree everywhere except in
# the last column, where `targets` holds 0.
# NOTE: this rebinds `targets`, which earlier in the notebook held the class
# indices used for the NLL loss.
targets = torch.tensor([[1, 2, 3, 0], [2, 3, 4, 0]], dtype=torch.int32)
labels = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]], dtype=torch.int32)

In [59]:
# Accuracy = number of matching positions / total number of positions.
# `numel()` gives the element count directly instead of summing a freshly
# allocated tensor of ones.
(labels == targets).sum().item() / labels.numel()

0.75

In [60]:
# Element-wise agreement between the two id tensors (boolean mask).
torch.eq(labels, targets)

tensor([[ True,  True,  True, False],
        [ True,  True,  True, False]])

In [61]:
# Same accuracy, computed as the mean of the boolean match mask cast to float.
(labels == targets).float().mean().item()

0.75

In [62]:
# Small random tensor of shape (1, 4, 2) used to illustrate `view` below.
a = torch.rand((1, 4, 2))

In [63]:
# Confirm the shape of the random tensor.
a.size()

torch.Size([1, 4, 2])

In [64]:
# Display the tensor before reshaping, for comparison with the view below.
a

tensor([[[0.4793, 0.1453],
         [0.3791, 0.8151],
         [0.0494, 0.4479],
         [0.5376, 0.1685]]])

In [65]:
# Reinterpret the same 8 values as shape (1, 2, 4). The elements keep their
# row-major order, so this regroups consecutive values and is not a transpose
# of the last two axes. The last dimension is inferred (8 / 2 = 4).
a.view(1, 2, -1)

tensor([[[0.4793, 0.1453, 0.3791, 0.8151],
         [0.0494, 0.4479, 0.5376, 0.1685]]])

In [66]:
# `a` itself is unchanged: the `view` call returned a new tensor object and
# its result was not assigned back.
a

tensor([[[0.4793, 0.1453],
         [0.3791, 0.8151],
         [0.0494, 0.4479],
         [0.5376, 0.1685]]])