# Attention is all you need

### Rasul Alakbarli, Mahammad Nuriyev, Petko Petkov

## Required libraries

In [None]:
import torch
from torch import nn
import math

A general `Module` base class, so that every layer always has access to the device in use:

In [None]:
class Module(nn.Module):
    """Base ``nn.Module`` subclass that exposes a class-level ``device``."""

    # Evaluated once, when the class body runs: CUDA if available, else CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Implementation of the embeddings in the transformer architecture (used for both the `input` and the `output` embeddings, with the positional encodings added):

In [None]:
class Embedding(Module):
    """Token embedding plus sinusoidal positional encoding, followed by dropout.

    Args:
        d_model: Size of each embedding vector.
        vocab_len: Number of rows in the embedding table.
        pad_index: Index passed to ``nn.Embedding`` as ``padding_idx``.
        dropout_rate: Probability of the dropout applied to the sum of the
            token embeddings and the positional encodings.
    """

    def __init__(self, d_model, vocab_len, pad_index, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_len, self.d_model, padding_idx=pad_index)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Embed the token indices in ``x`` and add the positional encoding.

        Args:
            x: Tensor of token indices, indexed as (batch, sequence_len).

        Returns:
            Tensor of shape (batch, sequence_len, d_model).
        """
        # Embedding shape: (batch, sequence_len, d_model)
        # Positional encoding shape: (sequence_len, d_model), broadcast over batch
        return self.dropout(self.embedding(x) + self.positional_encoding(x))

    def positional_encoding(self, x):
        """Build the sinusoidal positional encoding for the sequence length of ``x``.

        Only ``x.size(1)`` (the sequence length) is read from ``x``.

        Returns:
            Float tensor of shape (sequence_len, d_model), placed on the same
            device as the embedding weights so it can be added to them.
        """
        seq_len = x.size(1)

        # result.shape = (seq_len, d_model)
        result = torch.zeros(
            (seq_len, self.d_model),
            dtype=torch.float,
            requires_grad=False
        )

        # pos.shape = (seq_len, 1)
        pos = torch.arange(0, seq_len).unsqueeze(1)

        # dim.shape = (ceil(d_model / 2),)
        dim = torch.arange(0, self.d_model, step=2)

        # angles.shape = (seq_len, ceil(d_model / 2))
        angles = pos / (10_000 ** (dim / self.d_model))

        # Sine for even dimensions, cosine for odd dimensions.
        result[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even columns,
        # so drop the surplus angle column instead of failing on a shape mismatch.
        result[:, 1::2] = torch.cos(angles[:, :self.d_model // 2])

        # Follow the embedding weights rather than the class-level default
        # device, so the sum in forward() never mixes devices.
        return result.to(self.embedding.weight.device)

Implementation of the feed-forward neural network, which is used in both the encoder and the decoder parts of the transformer:

In [None]:
class FeedForwardNetwork(Module):
    """Two linear layers with a ReLU in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model, d_ff=2048):
        """Create the layers.

        Args:
            d_model: Input and output feature size.
            d_ff: Feature size of the hidden layer.
        """
        super().__init__()

        # Expansion layer followed by the projection back to d_model.
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply linear1, then ReLU, then linear2 to ``x``."""
        hidden = self.linear1(x)
        activated = self.relu(hidden)
        return self.linear2(activated)