# Encoder Layer

Si vemos la arquitectura del transformer parece que ya tenemos todo lo necesario para construir el encoder

<div style="text-align:center;">
  <img src="Imagenes/transformer_architecture_model.png" alt="Transformer architecture" style="width:425px;height:626px;">
</div>

Pues vamos a implementarlo

## Implementación

Primero volvemos a escribir todas las clases creadas hasta ahora

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``.

    Mind the positional order of ``forward``: key first, then query, then
    value.
    """

    def __init__(self, dim_embedding):
        """
        Args:
            dim_embedding: size used for the scaling factor; the raw scores
                are divided by ``sqrt(dim_embedding)``.
        """
        super().__init__()
        self.dim_embedding = dim_embedding

    def forward(self, key, query, value):
        """
        Args:
            key: key tensor; its last two dimensions are swapped before it
                is multiplied with ``query``.
            query: query tensor.
            value: value tensor, combined using the attention weights.

        Returns:
            The attention-weighted combination of ``value``.
        """
        # Similarity of every query against every key.
        scores = torch.matmul(query, key.transpose(-1, -2))
        # Scale the scores, then normalise them along the last axis.
        weights = (scores / math.sqrt(self.dim_embedding)).softmax(dim=-1)
        return torch.matmul(weights, value)

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on top of ``ScaledDotProductAttention``.

    The inputs are linearly projected, split into ``heads`` slices of size
    ``dim_embedding // heads``, attended independently, concatenated again
    and passed through a final linear layer.
    """

    def __init__(self, heads, dim_embedding):
        """
        Args:
            heads (int): number of attention heads.
            dim_embedding (int): embedding dimension; must be a multiple of
                ``heads`` so it can be split evenly between them.

        Raises:
            ValueError: if ``heads`` is not positive or does not divide
                ``dim_embedding``.
        """
        super().__init__()

        # Without this check the ``view`` calls in ``forward`` either fail
        # late or, worse, succeed for some sequence lengths while mixing
        # features of different tokens.
        if heads < 1 or dim_embedding % heads != 0:
            raise ValueError(
                f"dim_embedding ({dim_embedding}) must be divisible by heads ({heads})"
            )

        self.dim_embedding = dim_embedding
        self.dim_proyection = dim_embedding // heads
        self.heads = heads

        self.proyection_Q = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_K = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_V = nn.Linear(dim_embedding, dim_embedding)
        self.attention = nn.Linear(dim_embedding, dim_embedding)

        self.scaled_dot_product_attention = ScaledDotProductAttention(self.dim_proyection)

    def forward(self, Q, K, V):
        """
        Args:
            Q (torch.Tensor): queries, (batch_size, seq_len_q, dim_embedding)
            K (torch.Tensor): keys, (batch_size, seq_len_k, dim_embedding)
            V (torch.Tensor): values, (batch_size, seq_len_k, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len_q, dim_embedding)
        """
        batch_size = Q.size(0)

        # perform linear operation and split into h heads
        proyection_Q = self.proyection_Q(Q).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_K = self.proyection_K(K).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_V = self.proyection_V(V).view(batch_size, -1, self.heads, self.dim_proyection)

        # transpose to get dimensions (batch_size, heads, seq_len, dim_proyection)
        proyection_Q = proyection_Q.transpose(1, 2)
        proyection_K = proyection_K.transpose(1, 2)
        proyection_V = proyection_V.transpose(1, 2)

        # calculate attention; the attention module declares its arguments
        # as (key, query, value), so pass them by keyword to avoid swapping
        # queries and keys
        scaled_dot_product_attention = self.scaled_dot_product_attention(
            key=proyection_K,
            query=proyection_Q,
            value=proyection_V,
        )

        # concatenate heads and put through final linear layer
        concat = scaled_dot_product_attention.transpose(1, 2).contiguous().view(batch_size, -1, self.dim_embedding)

        output = self.attention(concat)

        return output

class AddAndNorm(nn.Module):
    """Residual addition followed by layer normalisation."""

    def __init__(self, dim_embedding):
        """
        Args:
            dim_embedding (int): size of the last dimension, which is the
                one being normalised.
        """
        super().__init__()
        self.normalization = nn.LayerNorm(dim_embedding)

    def forward(self, x, sublayer):
        """
        Args:
            x (torch.Tensor): input that was fed to the sublayer.
            sublayer (torch.Tensor): output produced by the sublayer.

        Returns:
            torch.Tensor: layer-normalised sum of both tensors.
        """
        residual = x + sublayer
        return self.normalization(residual)

class FeedForward(nn.Module):
    """Feed-forward network: expand, apply ReLU, project back."""

    def __init__(self, dim_embedding, increment=4):
        """
        Args:
            dim_embedding (int): input and output feature size.
            increment (int): factor by which the hidden layer is wider than
                ``dim_embedding``.
        """
        super().__init__()
        hidden_dim = dim_embedding * increment
        expand = nn.Linear(dim_embedding, hidden_dim)
        contract = nn.Linear(hidden_dim, dim_embedding)
        self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        return self.feed_forward(x)

Ahora creamos una clase encoder

In [2]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    wrapped in a residual connection plus layer normalisation."""

    def __init__(self, heads, dim_embedding):
        """
        Args:
            heads (int): number of attention heads.
            dim_embedding (int): embedding dimension used by every sublayer.
        """
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(heads=heads, dim_embedding=dim_embedding)
        self.add_and_norm_1 = AddAndNorm(dim_embedding=dim_embedding)
        self.feed_forward = FeedForward(dim_embedding=dim_embedding)
        self.add_and_norm_2 = AddAndNorm(dim_embedding=dim_embedding)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (batch_size, seq_len, dim_embedding)

        Returns:
            torch.Tensor: (batch_size, seq_len, dim_embedding)
        """
        # Self-attention: the same tensor acts as query, key and value.
        attended = self.add_and_norm_1(x, self.multi_head_attention(x, x, x))
        # Feed-forward sublayer with its own residual connection.
        return self.add_and_norm_2(attended, self.feed_forward(attended))

Volvemos a definir la función que obtiene el embedding más el positional encoding de BERT

In [3]:
from transformers import BertModel, BertTokenizer
def extract_embeddings(input_sentences, model_name='bert-base-uncased'):
    """Run sentences through a pretrained BERT model and add its position
    embeddings to the result.

    The tokenizer and the model are loaded from ``model_name`` on every call.

    Args:
        input_sentences: a sentence, or a batch of sentences, to tokenize.
        model_name: name of the pretrained checkpoint to load.

    Returns:
        A 5-tuple ``(tokens, input_ids, token_embeddings,
        positional_encodings, embeddings_with_positional_encoding)``.
        ``tokens`` holds one list of token strings per row of ``input_ids``;
        the last element is the sum of the two tensors before it.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Batch tokenization, padded and truncated, returned as PyTorch tensors.
    inputs = tokenizer(input_sentences, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids']

    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): per the transformers docs, outputs[0] looks like the
    # model's last hidden state rather than the raw input token embeddings;
    # confirm that adding the position table on top of it is intended.
    token_embeddings = outputs[0]
    batch_size, seq_len = token_embeddings.shape[0], token_embeddings.shape[1]

    # Position table of the model's embedding module: keep the first
    # seq_len rows and replicate them for every sentence in the batch.
    position_table = model.embeddings.position_embeddings.weight
    positional_encodings = (
        position_table[:seq_len, :]
        .detach()
        .unsqueeze(0)
        .repeat(batch_size, 1, 1)
    )

    embeddings_with_positional_encoding = token_embeddings + positional_encodings

    # Map the token ids of every sentence back to token strings.
    tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in input_ids]

    return tokens, input_ids, token_embeddings, positional_encodings, embeddings_with_positional_encoding

Obtenemos el resultado del input embedding más el positional encoding de una de las secuencias

In [4]:
# Example sentence; keep every value returned by extract_embeddings so the
# following cells can use them.
sentence1 = "I gave the dog a bone because it was hungry"
(
    tokens1,
    input_ids1,
    token_embeddings1,
    positional_encodings1,
    embeddings_with_positional_encoding1,
) = extract_embeddings(sentence1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Instanciamos un objeto de nuestra clase encoder

In [5]:
# The layer's embedding size is read from the last dimension of the input
# tensor so both always agree.
heads = 8
dim_embedding = embeddings_with_positional_encoding1.size(-1)
encoder_layer = EncoderLayer(heads, dim_embedding)

Obtenemos la salida del encoder

In [6]:
# Forward pass through the encoder layer; the trailing expression displays
# the shape of the result.
encoder_layer_output = encoder_layer(embeddings_with_positional_encoding1)
encoder_layer_output.size()

torch.Size([1, 12, 768])