In [13]:
from torch import Tensor
import torch
import torch.nn.functional as f


def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor) -> Tensor:
    temp = query.bmm(key.transpose(1, 2))
    scale = query.size(-1) ** 0.5
    softmax = f.softmax(temp / scale, dim=-1)
    return softmax.bmm(value)

In [3]:
import torch
from torch import nn


class AttentionHead(nn.Module):
    def __init__(self, dim_in: int, dim_q: int, dim_k: int):
        super().__init__()
        self.q = nn.Linear(dim_in, dim_q)
        self.k = nn.Linear(dim_in, dim_k)
        self.v = nn.Linear(dim_in, dim_k)
    def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tensor:
        return scaled_dot_product_attention(self.q(query), self.k(key), self.v(value))

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads: int, dim_in: int, dim_q: int, dim_k: int):
        super().__init__()
        self.heads = nn.ModuleList(
            [AttentionHead(dim_in, dim_q, dim_k) for _ in range(num_heads)]
        )
        self.linear = nn.Linear(num_heads * dim_k, dim_in)

    def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tensor:
        return self.linear(
            torch.cat([h(query, key, value) for h in self.heads], dim=-1)
        )

In [5]:
def position_encoding(
    seq_len: int, dim_model: int, device: torch.device = torch.device("cpu"),
) -> Tensor:
    pos = torch.arange(seq_len, dtype=torch.float, device=device).reshape(1, -1, 1)
    dim = torch.arange(dim_model, dtype=torch.float, device=device).reshape(1, 1, -1)
    phase = pos / (1e4 ** (dim // dim_model))

    return torch.where(dim.long() % 2 == 0, torch.sin(phase), torch.cos(phase))

In [6]:
def feed_forward(dim_input: int = 512, dim_feedforward: int = 2048) -> nn.Module:
    return nn.Sequential(
        nn.Linear(dim_input, dim_feedforward),
        nn.ReLU(),
        nn.Linear(dim_feedforward, dim_input),
    )

In [7]:
class Residual(nn.Module):
    def __init__(self, sublayer: nn.Module, dimension: int, dropout: float = 0.1):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, *tensors: Tensor) -> Tensor:
        # Assume that the "query" tensor is given first, so we can compute the
        # residual.  This matches the signature of 'MultiHeadAttention'.
        return self.norm(tensors[0] + self.dropout(self.sublayer(*tensors)))

In [8]:
class TransformerEncoderLayer(nn.Module):
    def __init__(
        self,
        dim_model: int = 512,
        num_heads: int = 6,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
    ):
        super().__init__()
        dim_q = dim_k = max(dim_model // num_heads, 1)
        self.attention = Residual(
            MultiHeadAttention(num_heads, dim_model, dim_q, dim_k),
            dimension=dim_model,
            dropout=dropout,
        )
        self.feed_forward = Residual(
            feed_forward(dim_model, dim_feedforward),
            dimension=dim_model,
            dropout=dropout,
        )

    def forward(self, src: Tensor) -> Tensor:
        src = self.attention(src, src, src)
        return self.feed_forward(src)


class TransformerEncoder(nn.Module):
    def __init__(
        self,
        num_layers: int = 6,
        dim_model: int = 512,
        num_heads: int = 8,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                TransformerEncoderLayer(dim_model, num_heads, dim_feedforward, dropout)
                for _ in range(num_layers)
            ]
        )

    def forward(self, src: Tensor) -> Tensor:
        seq_len, dimension = src.size(1), src.size(2)
        src += position_encoding(seq_len, dimension)
        for layer in self.layers:
            src = layer(src)

        return src

In [9]:
class TransformerDecoderLayer(nn.Module):
    def __init__(
        self,
        dim_model: int = 512,
        num_heads: int = 6,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
    ):
        super().__init__()
        dim_q = dim_k = max(dim_model // num_heads, 1)
        self.attention_1 = Residual(
            MultiHeadAttention(num_heads, dim_model, dim_q, dim_k),
            dimension=dim_model,
            dropout=dropout,
        )
        self.attention_2 = Residual(
            MultiHeadAttention(num_heads, dim_model, dim_q, dim_k),
            dimension=dim_model,
            dropout=dropout,
        )
        self.feed_forward = Residual(
            feed_forward(dim_model, dim_feedforward),
            dimension=dim_model,
            dropout=dropout,
        )

    def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
        tgt = self.attention_1(tgt, tgt, tgt)
        tgt = self.attention_2(tgt, memory, memory)
        return self.feed_forward(tgt)


class TransformerDecoder(nn.Module):
    def __init__(
        self,
        num_layers: int = 6,
        dim_model: int = 512,
        num_heads: int = 8,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                TransformerDecoderLayer(dim_model, num_heads, dim_feedforward, dropout)
                for _ in range(num_layers)
            ]
        )
        self.linear = nn.Linear(dim_model, dim_model)

    def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
        seq_len, dimension = tgt.size(1), tgt.size(2)
        tgt += position_encoding(seq_len, dimension)
        for layer in self.layers:
            tgt = layer(tgt, memory)

        return torch.softmax(self.linear(tgt), dim=-1)

In [10]:
class Transformer(nn.Module):
    def __init__(
        self, 
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        dim_model: int = 512, 
        num_heads: int = 6, 
        dim_feedforward: int = 2048, 
        dropout: float = 0.1, 
        activation: nn.Module = nn.ReLU(),
    ):
        super().__init__()
        self.encoder = TransformerEncoder(
            num_layers=num_encoder_layers,
            dim_model=dim_model,
            num_heads=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.decoder = TransformerDecoder(
            num_layers=num_decoder_layers,
            dim_model=dim_model,
            num_heads=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )

    def forward(self, src: Tensor, tgt: Tensor) -> Tensor:
        return self.decoder(tgt, self.encoder(src))

In [12]:
src = torch.rand(64, 32, 512)
tgt = torch.rand(64, 16, 512)
out = Transformer()(src, tgt)
print(out.shape)
# torch.Size([64, 16, 512])

torch.Size([64, 16, 512])


In [11]:
print(out)

tensor([[[0.0026, 0.0016, 0.0004,  ..., 0.0019, 0.0014, 0.0060],
         [0.0016, 0.0015, 0.0008,  ..., 0.0012, 0.0018, 0.0025],
         [0.0008, 0.0030, 0.0010,  ..., 0.0006, 0.0018, 0.0030],
         ...,
         [0.0021, 0.0020, 0.0007,  ..., 0.0022, 0.0008, 0.0031],
         [0.0008, 0.0024, 0.0007,  ..., 0.0011, 0.0021, 0.0030],
         [0.0005, 0.0027, 0.0007,  ..., 0.0007, 0.0017, 0.0039]],

        [[0.0024, 0.0013, 0.0007,  ..., 0.0015, 0.0022, 0.0055],
         [0.0014, 0.0018, 0.0004,  ..., 0.0012, 0.0035, 0.0020],
         [0.0012, 0.0030, 0.0006,  ..., 0.0011, 0.0027, 0.0031],
         ...,
         [0.0022, 0.0018, 0.0011,  ..., 0.0020, 0.0028, 0.0055],
         [0.0007, 0.0030, 0.0009,  ..., 0.0009, 0.0012, 0.0039],
         [0.0008, 0.0026, 0.0010,  ..., 0.0005, 0.0023, 0.0035]],

        [[0.0021, 0.0011, 0.0009,  ..., 0.0030, 0.0020, 0.0024],
         [0.0013, 0.0011, 0.0007,  ..., 0.0008, 0.0020, 0.0012],
         [0.0007, 0.0028, 0.0008,  ..., 0.0009, 0.0022, 0.

In [15]:
import GPUtil
from torch.autograd import Variable
import numpy as np


def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)
data_gen(   )

In [48]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        part = self.proj(x)
        print(part)
        return f.log_softmax(part, dim=-1)
gen = Generator(10,4)
data = torch.randn(10)
print(data)
data_nw = gen(data)
print(data_nw)

tensor([ 1.0136,  0.1570,  0.7114,  0.4319,  0.0539,  0.6642,  0.9094, -0.5927,
        -0.7690,  0.7795])
tensor([ 0.9688, -0.0738, -0.3291, -1.0385], grad_fn=<AddBackward0>)
tensor([-0.5653, -1.6079, -1.8632, -2.5726], grad_fn=<LogSoftmaxBackward0>)


In [44]:
data

tensor([ 1.1338,  0.5175, -0.1311, -0.4373, -0.9687, -0.2133,  0.4102, -1.1990,
        -0.0418, -0.8016])

In [46]:
data_nw

tensor([-1.4662, -1.8699, -0.9613, -1.4582], grad_fn=<LogSoftmaxBackward0>)

In [37]:

print(input)
output = f.log_softmax(input,dim=1)
output

tensor([[ 0.1566, -0.4103, -0.8070],
        [ 0.9630, -0.4335,  0.7595]])


tensor([[-0.6672, -1.2341, -1.6308],
        [-0.7243, -2.1208, -0.9278]])

In [40]:
m = nn.LogSoftmax(dim=-1)

print(input)
output = m(input)
output

tensor([[ 0.1566, -0.4103, -0.8070],
        [ 0.9630, -0.4335,  0.7595]])


tensor([[-0.6672, -1.2341, -1.6308],
        [-0.7243, -2.1208, -0.9278]])