# Attention Head
* The Transformer uses multiplicative (dot-product) attention. There are two common types of attention: additive attention and scaled dot-product (multiplicative) attention.
* The research paper "Attention Is All You Need" uses scaled dot-product attention.
* Additive attention, as the name suggests, computes the score from a sum of the (projected) query and key passed through a small feed-forward layer, rather than from their dot product.

The variance of the dot product between a query and a key is directly proportional to the dimension of the vectors.

Hence the scores are scaled down by the square root of the key dimension, i.e. divided by (d_k)**(1/2).


In [None]:
import torch 
import numpy as np

In [None]:
# attention_head accepts 2-D inputs as well as 3-D inputs whose leading
# dimension indexes the attention heads.

def attention_head(queries, keys, values, dropout=None, mask=None):
    """Scaled dot-product attention.

    Computes ``softmax(Q @ K^T / sqrt(d_k)) @ V`` over the last two
    dimensions; any leading dimensions (for example one entry per head) are
    handled by the batched matrix multiplication.

    Args:
        queries: tensor of shape (..., n_queries, d_k).
        keys: tensor of shape (..., n_keys, d_k).
        values: tensor of shape (..., n_keys, d_v).
        dropout: optional callable applied to the attention weights.
        mask: optional tensor broadcastable to the score shape
            (..., n_queries, n_keys). Positions where it equals 0 are filled
            with -1e9 before the softmax (e.g. a look-ahead mask in a
            decoder).

    Returns:
        Tensor of shape (..., n_queries, d_v): the attention-weighted sum of
        ``values``.
    """
    key_dim = keys.size(-1)

    # Raw similarity of every query with every key, scaled by sqrt(d_k).
    logits = (queries @ keys.transpose(-2, -1)) / key_dim**0.5

    if mask is not None:
        logits = logits.masked_fill(mask.eq(0), -1e9)

    # dim=-1 runs over the keys, so each query's weights sum to 1.
    weights = torch.softmax(logits, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return weights @ values

### We will test our attention head with custom arguments

In [None]:
import torch

In [None]:
# Problem sizes for the 2-D smoke test
batch_size, sequence_length = 2, 3
embedding_dim = 4
num_heads = 2
head_dim = embedding_dim // num_heads

# Flattened embeddings, one row per token:
# (batch_size * sequence_length, embedding_dim)
sample_embeddings = torch.randn((batch_size * sequence_length, embedding_dim))

# Dummy projections: each maps an embedding_dim row to a head_dim row, so the
# projected tensors have shape (batch_size * sequence_length, head_dim)
dummy_key_linear = torch.nn.Linear(in_features=embedding_dim, out_features=head_dim)
dummy_query_linear = torch.nn.Linear(in_features=embedding_dim, out_features=head_dim)
dummy_value_linear = torch.nn.Linear(in_features=embedding_dim, out_features=head_dim)

# Dummy dropout applied to the attention weights
dummy_dropout = torch.nn.Dropout(p=0.1)

# All-ones mask (nothing is masked), one row and one column per token
dummy_mask = torch.ones((batch_size * sequence_length, batch_size * sequence_length))

# Project the embeddings into keys, queries and values
sample_keys = dummy_key_linear(sample_embeddings)
sample_queries = dummy_query_linear(sample_embeddings)
sample_values = dummy_value_linear(sample_embeddings)

print("Sample Keys Shape:", sample_keys.shape)
print("Sample Queries Shape:", sample_queries.shape)
print("Sample Values Shape:", sample_values.shape)

# Run attention_head on the 2-D queries / keys / values
attention_output = attention_head(
    queries=sample_queries,
    keys=sample_keys,
    values=sample_values,
    dropout=dummy_dropout,
    mask=dummy_mask,
)
print("Attention head output shape:", attention_output.shape)


# Testing with 3D inputs

In [None]:
# Define input dimensions
batch_size = 2
sequence_length = 3
embedding_dim = 4
num_heads = 2
head_dim = embedding_dim // num_heads

# Flattened embeddings, one row per token:
# (batch_size * sequence_length, embedding_dim)
sample_embeddings = torch.randn(batch_size * sequence_length, embedding_dim)

# Dummy projections. Unlike the 2-D test they keep the full embedding_dim
# (= num_heads * head_dim) so every projected row can be split into one
# head_dim slice per head.
dummy_key_linear = torch.nn.Linear(embedding_dim, embedding_dim)
dummy_query_linear = torch.nn.Linear(embedding_dim, embedding_dim)
dummy_value_linear = torch.nn.Linear(embedding_dim, embedding_dim)

# Create a dummy dropout layer
dummy_dropout = torch.nn.Dropout(0.1)

# All-ones mask (nothing is masked). Its 2-D shape
# (batch_size * sequence_length, batch_size * sequence_length)
# broadcasts over the leading head dimension of the scores.
dummy_mask = torch.ones(batch_size * sequence_length, batch_size * sequence_length)

# Project, then split into heads:
# (N, embedding_dim) -> (N, num_heads, head_dim) -> (num_heads, N, head_dim)
sample_keys = dummy_key_linear(sample_embeddings).view(-1, num_heads, head_dim).transpose(0, 1)
sample_queries = dummy_query_linear(sample_embeddings).view(-1, num_heads, head_dim).transpose(0, 1)
sample_values = dummy_value_linear(sample_embeddings).view(-1, num_heads, head_dim).transpose(0, 1)

print("Sample Keys Shape:", sample_keys.shape)
print("Sample Queries Shape:", sample_queries.shape)
print("Sample Values Shape:", sample_values.shape)

# Run attention_head on the 3-D inputs; the expected output shape is
# (num_heads, batch_size * sequence_length, head_dim)
attention_output = attention_head(sample_queries, sample_keys, sample_values, dummy_dropout, dummy_mask)
print("Attention head output shape:", attention_output.shape)


# Multi-Head Attention
#### We create tunable (learnable) parameters for each head: one set each for the query, key and value projections

![attention flow](./attention_flow.png)

In [None]:
import torch.nn as nn


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built on top of ``attention_head``.

    The input is projected to queries, keys and values, each projection is
    split into ``heads`` slices of ``d_model // heads`` features, attention
    is run on all heads at once, and the per-head results are concatenated
    and passed through a final projection layer.

    Args:
        heads: number of attention heads; must divide ``d_model``.
        d_model: size of the embedding (last) dimension.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, dropout=0.01):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )
        self.heads = heads
        self.d_model = d_model
        self.d_key = d_model // heads
        # Linear layers for query, key and value; each covers all heads at once
        self.query_linears = nn.Linear(d_model, d_model)
        self.key_linears = nn.Linear(d_model, d_model)
        self.value_linears = nn.Linear(d_model, d_model)
        # Projects the concatenated output of all heads back to d_model
        self.projection_layer = nn.Linear(d_model, self.d_model)
        # Reserved for debugging; it is not populated by forward()
        self.attn = None
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected):
        """Reshape (..., seq, d_model) into (..., heads, seq, d_key)."""
        per_head = projected.reshape(*projected.shape[:-1], self.heads, self.d_key)
        return per_head.transpose(-3, -2)

    def forward(self, input_embedding):
        """Apply self-attention to ``input_embedding``.

        Args:
            input_embedding: tensor of shape (seq, d_model) or
                (..., seq, d_model). Attention is computed within each
                sequence; leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same shape as ``input_embedding``. A 1-D input of
            shape (d_model,) is treated as a one-token sequence and yields
            (1, d_model).
        """
        if input_embedding.dim() == 1:
            input_embedding = input_embedding.unsqueeze(0)

        # Each head must receive a contiguous d_key slice of every token's
        # features, so the split happens on the last dimension and the head
        # axis is then moved in front of the sequence axis.
        queries = self._split_heads(self.query_linears(input_embedding))
        keys = self._split_heads(self.key_linears(input_embedding))
        values = self._split_heads(self.value_linears(input_embedding))

        # Use the registered dropout module so the configured probability is
        # honoured and dropout is switched off in eval() mode.
        context = attention_head(queries, keys, values, dropout=self.dropout)

        # (..., heads, seq, d_key) -> (..., seq, heads, d_key) -> (..., seq, d_model)
        context = context.transpose(-3, -2)
        merged = context.reshape(*context.shape[:-2], self.d_model)
        return self.projection_layer(merged)


In [None]:
import torch

# Hyper-parameters for the MultiHeadAttention smoke test
heads = 4
d_model = 128  # must be divisible by heads
dropout_prob = 0.1

# Dummy batched input of shape (batch_size, sequence_length, d_model)
batch_size, sequence_length = 2, 10
input_embedding = torch.randn((batch_size, sequence_length, d_model))

# Build the module under test
multi_head_attention = MultiHeadAttention(heads=heads, d_model=d_model, dropout=dropout_prob)

# Run a forward pass
output = multi_head_attention(input_embedding)

print("Input embedding shape:", input_embedding.shape)
print("Output shape:", output.shape)

# Adding the add and normalize layer

In [None]:
# This add-and-normalize step is used for the residual (skip) connections.
# The residual connection keeps the influence of the original input from
# diminishing as it passes through the sublayers.
def add_norm(prev, input):
    """Add a sublayer output to its residual input and layer-normalize.

    Args:
        prev: residual tensor (the sublayer's input).
        input: sublayer output, broadcastable against ``prev``. The name
            shadows the builtin but is kept because callers pass it by
            keyword.

    Returns:
        ``prev + input`` normalized to zero mean and unit variance over the
        last (feature) dimension, without learnable scale or shift.
    """
    result = prev + input
    # Normalize over the feature axis only, so the behaviour is the same for
    # (tokens, features) and (batch, tokens, features) inputs.
    return torch.nn.functional.layer_norm(result, result.shape[-1:])

# Adding the position wise feed forward 

In [None]:
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: size of the input and output feature dimension.
        d_ff: size of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden = self.dropout(torch.relu(self.linear1(x)))
        return self.linear2(hidden)
    

# test case for the position wise feed forward network 

In [None]:
# Test case for PositionWiseFFN

# Define input dimensions
batch_size = 2
sequence_length = 3
embedding_dim = 4
d_ff = 8 # Dimension of the feed-forward hidden layer

# Create sample input tensor with shape (batch_size * sequence_length, embedding_dim)
sample_input = torch.randn(batch_size * sequence_length, embedding_dim)

print("Sample Input Shape for PositionWiseFFN:", sample_input.shape)

# Instantiate the PositionWiseFFN module
position_wise_ffn = PositionWiseFFN(d_model=embedding_dim, d_ff=d_ff)

# Pass the sample input through the PositionWiseFFN module
try:
  print(sample_input)
  output_ffn = position_wise_ffn(sample_input)
  # Print the shape (as the label says), not the full tensor
  print("PositionWiseFFN Output Shape:", output_ffn.shape)
except Exception as e:
  # Best-effort smoke test: report the failure instead of aborting the notebook
  print(f"An error occurred during the PositionWiseFFN test: {e}")

# Adding positional encoding

In [None]:
import numpy as np
def get_position_encoding(seq_len, d_model, n=10000):
    """Build the sinusoidal positional-encoding table.

    Entry ``[pos, i]`` is ``sin(pos / n**(2*(i//2)/d_model))`` for even ``i``
    and ``cos`` of the same angle for odd ``i``.

    Args:
        seq_len: number of positions (rows).
        d_model: embedding dimension (columns).
        n: base of the geometric progression of wavelengths.

    Returns:
        Tensor of shape (seq_len, d_model) created by ``torch.zeros`` (so it
        has torch's default dtype) and filled with the encoding values.
    """
    P = torch.zeros((seq_len, d_model))

    # Column vector of positions and row vector of per-column exponents;
    # broadcasting them yields the full (seq_len, d_model) angle table at once
    # instead of filling it element by element.
    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    exponents = (2 * (np.arange(d_model) // 2)) / d_model
    angles = positions / np.power(n, exponents)

    # Even columns take the sine, odd columns the cosine.
    P[:, 0::2] = torch.from_numpy(np.sin(angles[:, 0::2]))
    P[:, 1::2] = torch.from_numpy(np.cos(angles[:, 1::2]))

    return P

In [None]:
# Smoke test for get_position_encoding

# A single position encoded into a 10-dimensional vector
seq_len, d_model = 1, 10

# Build the encoding table
position_encoding = get_position_encoding(seq_len=seq_len, d_model=d_model)

print("Position Encoding Shape:", position_encoding.shape)
print("Position Encoding:\n", position_encoding)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sizes of the table to visualise (feel free to change these)
seq_len, d_model = 50, 128

# Build the encoding table
position_encoding = get_position_encoding(seq_len=seq_len, d_model=d_model)

# Draw the table as a heatmap: one row per position, one column per dimension
plt.figure(figsize=(12, 6))
sns.heatmap(position_encoding.numpy(), cmap="viridis")
plt.gca().set(
    title="Positional Encoding Heatmap",
    xlabel="Embedding Dimension",
    ylabel="Position in Sequence",
)
plt.show()

# creating the add and normalisation layer

In [None]:
def add_norm(prev, input):
    """Add a sublayer output to its residual input and layer-normalize.

    Args:
        prev: residual tensor (the sublayer's input).
        input: sublayer output, broadcastable against ``prev``. The name
            shadows the builtin but is kept because callers pass it by
            keyword.

    Returns:
        ``prev + input`` normalized to zero mean and unit variance over the
        last (feature) dimension, without learnable scale or shift.
    """
    result = prev + input
    # Normalize over the feature axis only, so the behaviour is the same for
    # (tokens, features) and (batch, tokens, features) inputs.
    return torch.nn.functional.layer_norm(result, result.shape[-1:])

# Creating the Encoder Sublayer

In [None]:
class EncoderSublayer(nn.Module):
    """One encoder layer: self-attention, then a feed-forward network.

    Each of the two sublayers is wrapped in ``add_norm`` together with its
    own input (residual connection).

    Args:
        d_model: embedding dimension.
        num_heads: number of heads passed to ``MultiHeadAttention``.
        ffn_dim: hidden size passed to ``PositionWiseFFN`` as ``d_ff``.
    """

    def __init__(self, d_model, num_heads, ffn_dim) -> None:
        super().__init__()
        self.d_model = d_model
        self.ffn_dim = ffn_dim
        self.multihead = MultiHeadAttention(heads=num_heads, d_model=d_model)
        self.ffn = PositionWiseFFN(d_model=d_model, d_ff=ffn_dim)

    def forward(self, input_embeddings):
        """Run attention and the feed-forward network, each with add_norm."""
        attended = self.multihead(input_embeddings)
        after_attention = add_norm(prev=input_embeddings, input=attended)

        transformed = self.ffn(after_attention)
        return add_norm(prev=after_attention, input=transformed)

## Adding the test case for the encoder sublayer

In [None]:
# Smoke test for EncoderSublayer

# Problem sizes
batch_size, sequence_length = 2, 3
embedding_dim = 4
num_heads = 2
ffn_dim = 8  # hidden size of the feed-forward network

# Flattened embeddings, one row per token:
# (batch_size * sequence_length, embedding_dim)
sample_input_embeddings = torch.randn((batch_size * sequence_length, embedding_dim))

print("Sample Input Embeddings Shape for EncoderSublayer:", sample_input_embeddings.shape)

# Build the module under test
encoder_sublayer = EncoderSublayer(d_model=embedding_dim, num_heads=num_heads, ffn_dim=ffn_dim)

# Run a forward pass; report any failure instead of aborting the notebook
try:
  output_encoder_sublayer = encoder_sublayer(sample_input_embeddings)
  print("EncoderSublayer Output Shape:", output_encoder_sublayer.shape)
except Exception as e:
  print(f"An error occurred during the EncoderSublayer test: {e}")

# Creating the Encoder

In [None]:
class Encoder(nn.Module):
    """Stack of ``EncoderSublayer`` modules preceded by positional encoding.

    Args:
        d_model: embedding dimension.
        num_heads: number of attention heads in every layer.
        ffn_dim: hidden size of the feed-forward network in every layer.
        num_layers: number of stacked ``EncoderSublayer`` modules.
        n: optional base forwarded to ``get_position_encoding``; when
            ``None`` that function's own default is used.
    """

    def __init__(self, d_model, num_heads, ffn_dim, num_layers, n=None) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_model = d_model
        self.ffn_dim = ffn_dim
        self.n = n
        # nn.ModuleList (not a plain list) so the layers are registered as
        # submodules: parameters(), state_dict(), .to() and train()/eval()
        # then reach them.
        self.encoder_layers = nn.ModuleList(
            [
                EncoderSublayer(d_model=d_model, num_heads=num_heads, ffn_dim=ffn_dim)
                for _ in range(num_layers)
            ]
        )

    def forward(self, input_embedding):
        """Add positional encoding and run the input through every layer.

        Args:
            input_embedding: tensor of shape (seq, d_model) or
                (..., seq, d_model).

        Returns:
            The output of the last encoder layer.
        """
        # The sequence axis is the second-to-last one; for 2-D input this is
        # the same as size(0).
        seq_len = input_embedding.size(-2)
        if self.n is not None:
            pe = get_position_encoding(seq_len, self.d_model, self.n)
        else:
            pe = get_position_encoding(seq_len, self.d_model)
        # Match the input's device and dtype so the addition below is valid.
        pe = pe.to(device=input_embedding.device, dtype=input_embedding.dtype)

        # Shape printouts kept as a debugging aid
        print("positional encoding", pe.shape)
        print("input embedding", input_embedding.shape)

        result = add_norm(prev=input_embedding, input=pe)
        for layer in self.encoder_layers:
            result = layer(result)
        return result

# Adding the test case for the encoder

In [None]:
# Smoke test for Encoder

# Problem sizes
batch_size, sequence_length = 2, 10
d_model = 128  # must be divisible by num_heads
num_heads = 4
ffn_dim = 256  # hidden size of the feed-forward network
num_layers = 2  # number of stacked encoder layers

# Flattened embeddings, one row per token:
# (batch_size * sequence_length, d_model)
sample_input_embeddings = torch.randn((batch_size * sequence_length, d_model))

print("Sample Input Embeddings Shape for Encoder:", sample_input_embeddings.shape)

# Build the module under test
encoder = Encoder(
    d_model=d_model,
    num_heads=num_heads,
    ffn_dim=ffn_dim,
    num_layers=num_layers,
)

# Run a forward pass through the whole stack
output_encoder = encoder(sample_input_embeddings)
print("Encoder Output Shape:", output_encoder.shape)
