
# Replication of the paper "Attention Is All You Need"

https://arxiv.org/pdf/1706.03762.pdf

# 0. Imports

In [None]:
try:
    import torch
    import torchvision
    assert int(torch.__version__.split(".")[1]) >= 12, "torch version should be 1.12+"
    assert int(torchvision.__version__.split(".")[1]) >= 13, "torchvision version should be 0.13+"
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")
except:
    print(f"[INFO] torch/torchvision versions not as required, installing nightly versions.")
    !pip3 install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
    import torch
    import torchvision
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")

[INFO] torch/torchvision versions not as required, installing nightly versions.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://download.pytorch.org/whl/cu113
torch version: 2.0.0+cu118
torchvision version: 0.15.1+cu118


In [None]:
# Continue with regular imports
import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms

# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

[INFO] Couldn't find torchinfo... installing it.
[INFO] Couldn't find going_modular or helper_functions scripts... downloading them from GitHub.
Cloning into 'pytorch-deep-learning'...
remote: Enumerating objects: 3578, done.[K
remote: Counting objects: 100% (200/200), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 3578 (delta 88), reused 189 (delta 82), pack-reused 3378[K
Receiving objects: 100% (3578/3578), 647.31 MiB | 25.98 MiB/s, done.
Resolving deltas: 100% (2044/2044), done.
Updating files: 100% (240/240), done.


In [None]:
# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

# 1. Model Definition

In [1]:
#test IGNORE THIS
# import torch

# # Create a 2D tensor of size 6x6 filled with zeros
# tensor = torch.zeros(6, 6)

# # Set the values above the main diagonal to 0.0001
# tensor = tensor + 0.0001 * torch.triu(torch.ones(6, 6), diagonal=1)

# # Print the tensor
# tensor

## 1.1 Defining the basic blocks of the model


From paper page number 3, paragraph 1:

"...The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. We employ a residual connection [11] around each of the two sub-layers, followed by layer normalization.."

From paper page number 7, last paragraph:

"We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop = 0.1."



### 1.1.1 Multihead Self-Attention Sublayer with Normalization

In [None]:
class MultiHeadSelfAttentionAndNormBlock(nn.Module):
  """Multi-head attention sub-layer: attention -> dropout -> residual add -> LayerNorm.

  The wrapped ``nn.MultiheadAttention`` is built with ``batch_first=False``, so
  batched inputs are laid out as (sequence, batch, embed_dim).

  Args:
    embed_dim: Size of the feature dimension of the inputs.
    num_heads: Number of attention heads. ``nn.MultiheadAttention`` requires
      ``embed_dim`` to be divisible by it (the paper's base model uses 8).
    masked: If True, apply a causal mask so that a query position cannot
      attend to key positions that come after it.
    dropout: Dropout probability applied to the attention output before it is
      added to the residual input.
  """

  def __init__(self,
               embed_dim:int = 512,
               num_heads:int = 8,
               masked:bool = False,
               dropout:float = 0.1):
    super().__init__()

    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.masked = masked

    self.multihead_attn = nn.MultiheadAttention(embed_dim=embed_dim,
                                                num_heads=num_heads,
                                                batch_first=False)
    self.layer_norm = nn.LayerNorm(normalized_shape=embed_dim)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, x, query, key, value):
    """Run attention and return ``LayerNorm(x + Dropout(Attention(query, key, value)))``.

    Args:
      x: Residual input added to the attention output; must be broadcastable
        with the attention output (same shape as ``query``).
      query: Attention queries.
      key: Attention keys.
      value: Attention values.

    Returns:
      Tensor with the shape of ``query``.
    """
    attn_mask = None
    if self.masked:
      # The mask is indexed by sequence positions, not by feature index:
      # shape (target_len, source_len). With batch_first=False the sequence
      # axis is dim 0 for both batched and unbatched inputs.
      target_len, source_len = query.shape[0], key.shape[0]
      # Boolean mask: True marks positions that must NOT be attended to,
      # i.e. everything strictly above the main diagonal (future positions).
      attn_mask = torch.triu(
          torch.ones(target_len, source_len, dtype=torch.bool, device=query.device),
          diagonal=1)

    attention_out, _ = self.multihead_attn(query=query,
                                           key=key,
                                           value=value,
                                           attn_mask=attn_mask,
                                           need_weights=False)
    dp = self.dropout(attention_out)
    residual_conn = dp + x
    block_out = self.layer_norm(residual_conn)
    return block_out

### 1.1.2 FeedForward Sublayer with Normalization

In [None]:
class FeedForwardAndNorm(nn.Module):
  """Feed-forward sub-layer: Linear -> ReLU -> Linear, then dropout, residual add and LayerNorm.

  Args:
    embed_dim: Input and output feature size.
    ffn_size: Hidden feature size between the two linear layers.
    dropout: Dropout probability applied to the feed-forward output.
  """

  def __init__(self,
               embed_dim:int = 512,
               ffn_size:int = 2048,
               dropout:float = 0.1):
    super().__init__()

    self.dropout = nn.Dropout(dropout)

    # Expand to the hidden width, apply the non-linearity, project back.
    expand = nn.Linear(embed_dim, ffn_size)
    project = nn.Linear(ffn_size, embed_dim)
    self.ffn = nn.Sequential(expand, nn.ReLU(), project)

    self.layer_norm = nn.LayerNorm(embed_dim)

  def forward(self, x):
    """Return ``LayerNorm(x + Dropout(FFN(x)))``."""
    return self.layer_norm(x + self.dropout(self.ffn(x)))


## 1.2 Encoder Block

From paper page number 3, paragraph 1: 

"The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. ..."



Also check out Figure 1 (The Transformer - model architecture) for a better understanding of the architecture.

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: a self-attention sub-layer followed by a feed-forward sub-layer.

  Args:
    embed_dim: Feature size of the inputs and outputs.
    num_heads: Number of attention heads; must divide ``embed_dim``.
    ffn_size: Hidden size of the feed-forward sub-layer.
  """

  # NOTE: the constructor must be spelled ``__init__``; with a misspelled name
  # Python never calls it and the sub-blocks below are never created.
  def __init__(self,
              embed_dim:int = 512,
              num_heads:int = 8,
              ffn_size:int = 2048):
    super().__init__()

    self.mhsan_block = MultiHeadSelfAttentionAndNormBlock(embed_dim=embed_dim,
                                                num_heads=num_heads)
    
    self.ffn_block = FeedForwardAndNorm(embed_dim = embed_dim,
                                        ffn_size = ffn_size)
    
  def forward(self, x):
    """Apply self-attention (x is residual, query, key and value) then the feed-forward block."""
    x = self.mhsan_block(x,x,x,x)
    x = self.ffn_block(x)
    return x


## 1.3 Decoder Block

From paper page number 3, paragraph 1:


"The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. ..." 




In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, encoder-decoder attention, feed-forward.

  Args:
    embed_dim: Feature size of the inputs and outputs.
    num_heads: Number of attention heads; must divide ``embed_dim``.
    ffn_size: Hidden size of the feed-forward sub-layer.
  """

  # NOTE: the constructor must be spelled ``__init__``; with a misspelled name
  # Python never calls it and the sub-blocks below are never created.
  def __init__(self,
              embed_dim = 512,
              num_heads = 8,
              ffn_size = 2048
              ):
    super().__init__()

    # Masked self-attention over the decoder input.
    self.mhsan_block1 = MultiHeadSelfAttentionAndNormBlock(embed_dim=embed_dim,
                                                num_heads=num_heads,masked=True)

    # Attention over the encoder output.
    self.mhsan_block2 = MultiHeadSelfAttentionAndNormBlock(embed_dim=embed_dim,
                                                num_heads=num_heads)
    
    self.ffn_block = FeedForwardAndNorm(embed_dim = embed_dim,
                                        ffn_size = ffn_size)
    
  def forward(self, x, encoder_out):
    """Run the three decoder sub-layers.

    Args:
      x: Decoder input.
      encoder_out: Output of the encoder stack.

    Returns:
      Output of the feed-forward sub-layer.
    """
    block1_out = self.mhsan_block1(x,x,x,x)
    # Sub-block signature is (residual, query, key, value): queries come from
    # the decoder, keys and values from the encoder output (paper sec. 3.2.3).
    block2_out = self.mhsan_block2(block1_out,block1_out,encoder_out,encoder_out)
    out = self.ffn_block(block2_out)
    return out

## 1.4 Transformer Model

In [2]:
class TransformerBlock(nn.Module):
  """Encoder-decoder stack built from ``EncoderBlock`` and ``DecoderBlock`` layers.

  Args:
    embed_dim: Feature size used by every layer.
    num_heads: Number of attention heads per attention sub-layer; must divide ``embed_dim``.
    ffn_size: Hidden size of each feed-forward sub-layer.
    enc_layer_num: Number of encoder layers.
    dec_layer_num: Number of decoder layers.
  """

  # NOTE: the constructor must be spelled ``__init__``; with a misspelled name
  # Python never calls it and the layer lists below are never created.
  def __init__(self,
              embed_dim = 512,
              num_heads = 8,
              ffn_size = 2048,
              enc_layer_num = 6,
              dec_layer_num = 6
              ):
    super().__init__()
    
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.ffn_size = ffn_size

    self.encoder_layers = nn.ModuleList(
        [
            EncoderBlock(embed_dim=embed_dim, num_heads=num_heads, ffn_size=ffn_size)
            for _ in range(enc_layer_num)
        
        ]
    )

    self.decoder_layers = nn.ModuleList(
        [
            DecoderBlock(embed_dim=embed_dim, num_heads=num_heads, ffn_size=ffn_size)
            for _ in range(dec_layer_num)
        
        ]
    )
  
  def forward(self, encoder_input, decoder_input):
    """Run the encoder stack, then the decoder stack conditioned on the encoder output.

    Args:
      encoder_input: Input passed through every encoder layer in order.
      decoder_input: Input passed through every decoder layer in order.

    Returns:
      Output of the last decoder layer (no final linear/softmax is applied yet).
    """
    x = encoder_input
    for layer in self.encoder_layers:
      x = layer(x)
    
    
    encoder_out = x

    # Every decoder layer receives the same final encoder output.
    x = decoder_input
    for layer in self.decoder_layers:
      x = layer(x, encoder_out)
    
    decoder_out = x


    #TODO: add linear and softmax layer

    return decoder_out

NameError: ignored

# 2. Positional embeddings

In [None]:
#word_embedding = nn.Embedding(src_vocab_size, embed_size)
#position_embedding = nn.Embedding(max_length, embed_size)