# Transformer

Ya tenemos todo el código necesario para crear una clase que implemente el transformer entero.

<div style="text-align:center;">
  <img src="Imagenes/transformer_architecture_model.png" alt="Transformer architecture" style="width:425px;height:626px;">
</div>

Así que vamos a ello

## Implementación

Escribimos todas las clases de más bajo nivel necesarias

In [1]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class Embedding(nn.Module):
    """Token-embedding lookup table; a thin wrapper around ``nn.Embedding``."""

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: number of entries in the lookup table (vocabulary size)
            embedding_dim: length of each embedding vector
        """
        super().__init__()
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """
        Args:
            x: tensor of indices into the lookup table
        Returns:
            the embedding vectors selected by ``x``
        """
        embedded = self.embedding(x)
        return embedded

class PositionalEncoding(nn.Module):
    """Scales embeddings and adds a fixed sinusoidal positional-encoding table."""

    def __init__(self, max_sequence_len, embedding_model_dim):
        """
        Args:
            max_sequence_len: number of positions covered by the table
            embedding_model_dim: dimension of the embedding vectors
        """
        super().__init__()
        self.embedding_dim = embedding_model_dim

        # Column vector of positions: (max_sequence_len, 1).
        positions = torch.arange(max_sequence_len, dtype=torch.float32).unsqueeze(1)
        # Even column indices 0, 2, 4, ...; each sin/cos pair shares one
        # frequency, 10000 ** (even_index / embedding_dim).
        even_dims = torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
        divisors = torch.pow(10000.0, even_dims / self.embedding_dim)
        angles = positions / divisors

        # Constant table: sin on even columns, cos on odd columns.
        positional_encoding = torch.zeros(max_sequence_len, self.embedding_dim)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # columns, so the last frequency has no cos partner.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : self.embedding_dim // 2])
        positional_encoding = positional_encoding.unsqueeze(0)
        # Registered as a buffer: follows the module across devices and is
        # stored in the state dict, but is not a trainable parameter.
        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x):
        """
        Args:
            x: embedding tensor whose dim 1 is the sequence axis and whose
               last dim is the embedding dimension
        Returns:
            x scaled by sqrt(embedding_dim) plus the positional encodings
            of the first ``x.size(1)`` positions
        """
        # make embeddings relatively larger
        x = x * math.sqrt(self.embedding_dim)

        # add encoding matrix to embedding (x)
        sequence_len = x.size(1)
        x = x + self.positional_encoding[:, :sequence_len]
        return x

class ScaledDotProductAttention(nn.Module):
    """Computes softmax(query @ key^T / sqrt(dim_embedding)) @ value."""

    def __init__(self, dim_embedding):
        """
        Args:
            dim_embedding: dimension used for the 1/sqrt(d) score scaling
        """
        super().__init__()
        self.dim_embedding = dim_embedding

    def forward(self, key, query, value, mask=None):
        """
        Note the positional order: ``key`` comes BEFORE ``query``.

        Args:
            key: key tensor
            query: query tensor
            value: value tensor
            mask: optional tensor broadcastable to the score matrix; score
                  entries where ``mask == 0`` are set to -inf before softmax

        Returns:
            output tensor from scaled dot product attention
        """
        # MatMul: one score per (query position, key position) pair
        key_trasposed = key.transpose(-1, -2)
        product = torch.matmul(query, key_trasposed)
        # scale
        scale = product / math.sqrt(self.dim_embedding)
        # Mask (optional)
        if mask is not None:
            scale = scale.masked_fill(mask == 0, float('-inf'))
        # softmax over the key axis
        attention_matrix = torch.nn.functional.softmax(scale, dim=-1)
        # MatMul
        output = torch.matmul(attention_matrix, value)

        return output

class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project."""

    def __init__(self, heads, dim_embedding):
        """
        Args:
            heads: number of heads
            dim_embedding: dimension of embedding vector

        Raises:
            ValueError: if ``dim_embedding`` is not divisible by ``heads``
        """
        super().__init__()

        # A non-divisible size would make the per-head reshape in forward()
        # either fail or silently scramble features.
        if dim_embedding % heads != 0:
            raise ValueError(
                f"dim_embedding ({dim_embedding}) must be divisible by heads ({heads})"
            )

        self.dim_embedding = dim_embedding
        self.dim_proyection = dim_embedding // heads
        self.heads = heads

        self.proyection_Q = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_K = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_V = nn.Linear(dim_embedding, dim_embedding)
        self.attention = nn.Linear(dim_embedding, dim_embedding)

        self.scaled_dot_product_attention = ScaledDotProductAttention(self.dim_proyection)

    def forward(self, Q, K, V, mask=None):
        """
        Args:
            Q: query tensor, (batch_size, query_len, dim_embedding)
            K: key tensor, (batch_size, key_len, dim_embedding)
            V: value tensor, (batch_size, key_len, dim_embedding)
            mask: optional mask; positions where ``mask == 0`` are not
                  attended to. Must broadcast against
                  (batch_size, heads, query_len, key_len); a 3-D
                  (batch_size, query_len, key_len) mask is expanded over heads.

        Returns:
            output tensor from multi-head attention,
            (batch_size, query_len, dim_embedding)
        """
        batch_size = Q.size(0)

        # perform linear operation and split into h heads
        proyection_Q = self.proyection_Q(Q).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_K = self.proyection_K(K).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_V = self.proyection_V(V).view(batch_size, -1, self.heads, self.dim_proyection)

        # transpose to get dimensions (batch_size, heads, seq_len, dim_proyection)
        proyection_Q = proyection_Q.transpose(1, 2)
        proyection_K = proyection_K.transpose(1, 2)
        proyection_V = proyection_V.transpose(1, 2)

        # insert a heads axis so a per-batch 3-D mask broadcasts over heads
        if mask is not None and mask.dim() == 3:
            mask = mask.unsqueeze(1)

        # calculate attention; keyword arguments are required because the
        # attention module's positional order is (key, query, value)
        scaled_dot_product_attention = self.scaled_dot_product_attention(
            key=proyection_K,
            query=proyection_Q,
            value=proyection_V,
            mask=mask,
        )

        # concatenate heads and put through final linear layer
        concat = scaled_dot_product_attention.transpose(1, 2).contiguous().view(batch_size, -1, self.dim_embedding)

        output = self.attention(concat)

        return output

class AddAndNorm(nn.Module):
    """Residual sum of two tensors followed by layer normalization."""

    def __init__(self, dim_embedding):
        """
        Args:
            dim_embedding (int): Size of the last axis, which LayerNorm normalizes.
        """
        super().__init__()
        self.normalization = nn.LayerNorm(dim_embedding)

    def forward(self, x, sublayer):
        """
        Args:
            x (torch.Tensor): Residual (skip-connection) tensor.
            sublayer (torch.Tensor): Output of the sublayer to add to ``x``.

        Returns:
            torch.Tensor: LayerNorm applied to ``x + sublayer``.
        """
        residual_sum = x + sublayer
        return self.normalization(residual_sum)

class FeedForward(nn.Module):
    """Two-layer MLP: expand, ReLU, project back to the embedding size."""

    def __init__(self, dim_embedding, increment=4):
        """
        Args:
            dim_embedding: input and output feature size
            increment: multiplier giving the hidden layer size
                       (``dim_embedding * increment``)
        """
        super().__init__()
        hidden_dim = dim_embedding * increment
        stages = [
            nn.Linear(dim_embedding, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, dim_embedding),
        ]
        self.feed_forward = nn.Sequential(*stages)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        return self.feed_forward(x)

class Linear(nn.Module):
    """Linear projection from the embedding dimension to vocabulary size."""

    def __init__(self, dim_embedding, vocab_size):
        """
        Args:
            dim_embedding: input feature size
            vocab_size: output feature size
        """
        super().__init__()
        self.linear = nn.Linear(in_features=dim_embedding, out_features=vocab_size)

    def forward(self, x):
        """Apply the projection to the last axis of ``x``."""
        return self.linear(x)

class Softmax(nn.Module):
    """Softmax over a configurable axis (the last one by default)."""

    def __init__(self, dim=-1):
        """
        Args:
            dim: axis along which the outputs sum to 1. Defaults to the last
                 axis so that (batch, seq_len, vocab_size) inputs are
                 normalized over the vocabulary rather than over sequence
                 positions; for 2-D (batch, vocab_size) inputs this is the
                 same axis as ``dim=1``.
        """
        super().__init__()
        self.softmax = nn.Softmax(dim=dim)

    def forward(self, x):
        """
        Args:
            x: input tensor

        Returns:
            tensor of the same shape as ``x`` whose values along the
            configured axis are non-negative and sum to 1
        """
        x = self.softmax(x)
        return x

Escribimos las clases de encoder layer y decoder layer

In [2]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by add & norm."""

    def __init__(self, heads, dim_embedding):
        """
        Args:
            heads: number of attention heads
            dim_embedding: dimension of embedding vector
        """
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        # Self-attention: x serves as query, key and value.
        attended = self.add_and_norm_1(x, self.multi_head_attention(x, x, x))
        # Feed-forward sublayer with its own residual connection.
        return self.add_and_norm_2(attended, self.feed_forward(attended))

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention and
    feed-forward, each followed by add & norm."""

    def __init__(self, heads, dim_embedding):
        """
        Args:
            heads: number of heads
            dim_embedding: dimension of embedding vector
        """
        super().__init__()
        self.masked_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.encoder_decoder_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
        self.add_and_norm_3 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)

    def forward(self, x, encoder_output, mask=None):
        """
        Args:
            x: decoder input tensor
            encoder_output: output tensor from the encoder
            mask: mask matrix (optional); handed to the self-attention module
                  as its fourth positional argument, so that module's
                  forward must accept one

        Returns:
            output tensor from decoder layer
        """
        # Self-attention over the decoder input (the only masked sublayer).
        self_attention = self.masked_multi_head_attention(x, x, x, mask)
        normed_self = self.add_and_norm_1(x, self_attention)

        # Queries come from the decoder; keys and values from the encoder.
        cross_attention = self.encoder_decoder_multi_head_attention(
            normed_self, encoder_output, encoder_output
        )
        normed_cross = self.add_and_norm_2(normed_self, cross_attention)

        # Feed-forward sublayer with its own residual connection.
        return self.add_and_norm_3(normed_cross, self.feed_forward(normed_cross))

Escribimos las clases de encoder y decoder

In [3]:
class Encoder(nn.Module):
    """Stack of ``Nx`` encoder layers applied one after another."""

    def __init__(self, heads, dim_embedding, Nx):
        """
        Args:
            heads: number of heads
            dim_embedding: dimension of embedding vector
            Nx: number of encoder layers
        """
        super().__init__()
        stack = [EncoderLayer(heads, dim_embedding) for _ in range(Nx)]
        self.encoder_layers = nn.ModuleList(stack)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        hidden = x
        for layer in self.encoder_layers:
            hidden = layer(hidden)
        return hidden

class Decoder(nn.Module):
    """Stack of ``Nx`` decoder layers applied one after another."""

    def __init__(self, heads, dim_embedding, Nx):
        """
        Args:
            heads: number of heads
            dim_embedding: dimension of embedding vector
            Nx: number of decoder layers
        """
        super().__init__()
        stack = [DecoderLayer(heads, dim_embedding) for _ in range(Nx)]
        self.layers = nn.ModuleList(stack)

    def forward(self, x, encoder_output, mask=None):
        """
        Args:
            x: decoder input tensor
            encoder_output: output tensor from the encoder, passed unchanged
                            to every layer
            mask: mask matrix (optional), passed unchanged to every layer

        Returns:
            output tensor from the last decoder layer
        """
        hidden = x
        for layer in self.layers:
            hidden = layer(hidden, encoder_output, mask)
        return hidden

Escribimos las clases transformer encoder y transformer decoder

In [4]:
class TransformerEncoder(nn.Module):
    """Encoder side: token embedding, positional encoding, encoder stack."""

    def __init__(self, vocab_size, dim_embedding, max_sequence_len, heads, Nx):
        """
        Args:
            vocab_size: size of vocabulary
            dim_embedding: dimension of embedding vector
            max_sequence_len: maximum length of sequence
            heads: number of heads
            Nx: number of encoder layers
        """
        super().__init__()
        self.input_embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.encoder = Encoder(heads, dim_embedding, Nx)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        embedded = self.input_embedding(x)
        return self.encoder(self.positional_encoding(embedded))

class TransformerDecoder(nn.Module):
    """Decoder side: embedding, positional encoding, decoder stack, then a
    linear projection to the vocabulary followed by softmax."""

    def __init__(self, heads, dim_embedding, Nx, vocab_size, max_sequence_len):
        """
        Args:
            heads: number of heads
            dim_embedding: dimension of embedding vector
            Nx: number of decoder layers
            vocab_size: size of vocabulary
            max_sequence_len: maximum length of sequence
        """
        super().__init__()
        self.embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.decoder = Decoder(heads, dim_embedding, Nx)
        self.linear = Linear(dim_embedding, vocab_size)
        self.softmax = Softmax()

    def forward(self, x, encoder_output, mask=None):
        """
        Args:
            x: target-side input, fed to the embedding layer
            encoder_output: output tensor from the encoder
            mask: mask matrix (optional), forwarded to the decoder stack

        Returns:
            result of applying the softmax module to the projected decoder output
        """
        embedded = self.positional_encoding(self.embedding(x))
        decoded = self.decoder(embedded, encoder_output, mask)
        logits = self.linear(decoded)
        return self.softmax(logits)

Y por último escribimos la clase transformer

In [5]:
class Transformer(nn.Module):
    """Full encoder-decoder transformer."""

    def __init__(self, vocab_size, dim_embedding, max_sequence_len, heads, Nx):
        """
        Args:
            vocab_size: size of vocabulary (shared by encoder and decoder)
            dim_embedding: dimension of embedding vector
            max_sequence_len: maximum length of sequence
            heads: number of heads
            Nx: number of layers in the encoder stack and in the decoder stack
        """
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, dim_embedding, max_sequence_len, heads, Nx)
        self.decoder = TransformerDecoder(heads, dim_embedding, Nx, vocab_size, max_sequence_len)

    def forward(self, source, target, target_mask=None):
        """
        Args:
            source: source input, fed to the encoder
            target: target input, fed to the decoder
            target_mask: optional mask forwarded to the decoder as its
                         ``mask`` argument (e.g. a lower-triangular matrix
                         for causal decoding). Defaults to None, which
                         forwards no mask.

        Returns:
            output of the decoder
        """
        encoder_output = self.encoder(source)
        # Forward the mask so callers can request masked decoder
        # self-attention through the top-level model.
        decoder_output = self.decoder(target, encoder_output, target_mask)
        return decoder_output

Enhorabuena: si has entendido todo hasta aquí, ya tienes lo necesario para implementar un transformer. Ahora vamos a ver una última cosa: las consideraciones finales sobre el entrenamiento que los autores explican en el paper.