## Additive Attention Mechanism
Bahdanau attention, also known as additive attention, is a mechanism introduced by Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio in the paper "Neural Machine Translation by Jointly Learning to Align and Translate." At each decoding step it scores every encoder hidden state against the current decoder state, normalizes those scores into attention weights, and uses the weights to build a context vector. The three equations below define the mechanism:

### 1. **Score Function**:
The score function computes a similarity measure between the decoder hidden state ($ s_t $) and each encoder hidden state ($ h_i $):
$$
e_{t,i} = v_a^\top \tanh(W_a h_i + U_a s_t + b_a)
$$

Where:
- $ h_i $: Encoder hidden state at step $ i $
- $ s_t $: Decoder hidden state at step $ t $
- $ W_a, U_a $: Learned weight matrices that project $ h_i $ and $ s_t $, respectively, into a common space
- $ b_a $: Bias vector
- $ v_a $: Weight vector for producing a scalar score

In [1]:
import torch
import torch.nn as nn
# Toy dimensions: batch size, encoder sequence length, hidden width.
bs, seq_len, hidden_dim = 2, 10, 256

# Random stand-ins for real RNN states (values drawn uniformly from [0, 1)).
encoder_hidden_states = torch.rand(bs, seq_len, hidden_dim)  # one state per encoder step
decoder_hn = torch.rand(bs, hidden_dim)  # a single decoder state per batch element



In [2]:
# Learnable pieces of the additive score v_a^T tanh(W_a h_i + U_a s_t + b_a).
# NOTE: nn.Linear includes a bias by default, so all three layers carry their
# own bias here, whereas the formula has a single b_a and no bias on v_a.
v_a = nn.Linear(in_features=hidden_dim, out_features=1)
Wa = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
Ua = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)

In [3]:
# Additive score e_{t,i} for every encoder position of every batch element.
scores = v_a(
    torch.tanh(
        Wa(encoder_hidden_states)  # (bs, seq_len, hidden_dim)
        + Ua(decoder_hn).unsqueeze(1)  # (bs, 1, hidden_dim), broadcast over seq_len
    )
).squeeze(-1)  # (bs, seq_len, 1) -> (bs, seq_len)

### 2. **Attention Weights**:
The attention weights are obtained by applying a softmax over the scores of all encoder positions, so that for each decoder step $ t $ the weights are non-negative and sum to 1:
$$
\alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_{j=1}^{T_x} \exp(e_{t,j})}
$$

Where:
- $ \alpha_{t,i} $: Attention weight for encoder hidden state $ h_i $ at time $ t $
- $ T_x $: Number of encoder time steps


In [4]:
# Normalise the scores across the encoder positions (dim 1) into attention weights.
weights = scores.softmax(dim=1)

In [5]:
# Sanity check: the weights of the first batch element should add up to 1.
weights[0].sum()

tensor(1., grad_fn=<SumBackward0>)

### 3. **Context Vector**:
The context vector $ c_t $ is a weighted sum of the encoder hidden states, where the weights are the attention weights $ \alpha_{t,i} $ (the softmax-normalized scores):
$$
c_t = \sum_{i=1}^{T_x} \alpha_{t,i} h_i
$$


In [6]:
# Encoder states are laid out as (bs, seq_len, hidden_dim).
encoder_hidden_states.shape

torch.Size([2, 10, 256])

In [7]:
# One attention weight per encoder position: (bs, seq_len).
weights.shape

torch.Size([2, 10])

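For each batch element the weighted sum is a matrix product with shapes (1 x 10) @ (10 x 256) -> (1 x 256), which is why `weights` needs an extra dimension before calling `torch.bmm`.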

In [8]:
# Insert a singleton row axis so each batch element becomes a (1, seq_len) matrix.
weights.unsqueeze(1).shape

torch.Size([2, 1, 10])

In [9]:
# Context vector: per batch element, (1, seq_len) @ (seq_len, hidden_dim) -> (1, hidden_dim),
# then the singleton row axis is dropped again.
context = torch.bmm(
    weights[:, None, :],
    encoder_hidden_states,
).squeeze(1)

In [10]:
# One context vector per batch element: (bs, hidden_dim).
context.shape

torch.Size([2, 256])

### Old Scripts

In [11]:
from collections import Counter

In [12]:
# Toy English-French parallel corpus: each entry is an (English, French) sentence pair.
sentence_pairs = [
    ("I love programming.", "J'aime programmer."),
    ("The weather is nice today.", "Il fait beau aujourd'hui."),
    ("Can you help me?", "Peux-tu m'aider ?")
]

In [13]:
# Flatten the corpus into lower-cased, whitespace-split token lists
# (A = first element of each pair, B = second). Punctuation stays attached
# to the word it touches because only str.split() is used.
train_tokens_A = [
    token
    for source, _target in sentence_pairs
    for token in source.lower().split()
]
train_tokens_B = [
    token
    for _source, target in sentence_pairs
    for token in target.lower().split()
]

In [14]:
# Token frequency tables double as the vocabularies for each language.
vocab_A = Counter(train_tokens_A)
vocab_B = Counter(train_tokens_B)

# Register the special tokens with a count of 1. They are inserted after the
# corpus tokens, in the order <pad>, <bos>, <eos>, in both vocabularies.
for special_token in ('<pad>', '<bos>', '<eos>'):
    vocab_A[special_token] = 1
    vocab_B[special_token] = 1

In [15]:
# Word-to-index lookup tables. Indices follow the insertion order of each
# vocabulary; only the keys are needed, so iterate the mapping directly
# instead of unpacking (and discarding) the counts from .items().
w2i_A = {token: index for index, token in enumerate(vocab_A)}
w2i_B = {token: index for index, token in enumerate(vocab_B)}

In [16]:
# Encode every sentence as a 1-D tensor of vocabulary indices, wrapped in
# <bos> ... <eos>. Sequences keep their own lengths here.
inputs_A = [
    torch.tensor(
        [w2i_A[token] for token in ['<bos>', *source.lower().split(), '<eos>']]
    )
    for source, _target in sentence_pairs
]
inputs_B = [
    torch.tensor(
        [w2i_B[token] for token in ['<bos>', *target.lower().split(), '<eos>']]
    )
    for _source, target in sentence_pairs
]

In [17]:
# Pad the variable-length sequences of each language with that language's
# <pad> index so they stack into a single (batch, max_len) tensor.
padded_sequences_A = torch.nn.utils.rnn.pad_sequence(
    sequences=inputs_A,
    batch_first=True,
    padding_value=w2i_A['<pad>'],
)

padded_sequences_B = torch.nn.utils.rnn.pad_sequence(
    sequences=inputs_B,
    batch_first=True,
    padding_value=w2i_B['<pad>'],
)

In [34]:
class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention.

    Each key is scored against the query with
    ``Va(tanh(Wa(query) + Ua(keys)))``; the scores are softmax-normalised
    over the key positions and used to form a weighted sum of the keys.
    Note that in this class ``Wa`` projects the query and ``Ua`` the keys.
    """

    def __init__(self, hidden_size):
        """Create the projections; ``hidden_size`` is the feature width of query and keys."""
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys, mask=None):
        """Attend over ``keys`` with a single-step ``query``.

        Args:
            query: Tensor of shape (batch, 1, hidden_size).
            keys: Tensor of shape (batch, src_len, hidden_size).
            mask: Optional tensor of shape (batch, src_len) or
                (batch, 1, src_len). True / non-zero marks positions that may
                be attended to; all other positions (e.g. padding) receive
                zero weight. ``None`` (the default) attends to every
                position, which is the original behaviour.

        Returns:
            A ``(context, weights)`` tuple with shapes
            (batch, 1, hidden_size) and (batch, 1, src_len).
        """
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        # (batch, src_len, 1) -> (batch, 1, src_len) so it can left-multiply keys.
        scores = scores.squeeze(2).unsqueeze(1)

        if mask is not None:
            blocked = ~mask.to(torch.bool)
            if blocked.dim() == 2:
                blocked = blocked.unsqueeze(1)
            # Use the most negative finite value rather than -inf: blocked
            # positions still get exactly zero weight after the softmax, but a
            # row that is blocked everywhere yields uniform weights, not NaN.
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        weights = torch.nn.functional.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights



In [35]:
class EncoderRNN(nn.Module):
    """Embedding -> dropout -> single GRU encoder over a batch of token ids."""

    def __init__(self, input_size, hidden_size, padding_idx_A, dropout_p=0.1):
        """Build the encoder.

        Args:
            input_size: Number of rows in the embedding table (vocabulary size).
            hidden_size: Width of both the embeddings and the GRU state.
            padding_idx_A: Embedding index treated as padding.
            dropout_p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-modules are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=hidden_size,
            padding_idx=padding_idx_A,
        )
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode ``input`` (token ids, batch first).

        Returns:
            ``(output, hidden)`` exactly as produced by the GRU: the per-step
            states and the final state.
        """
        token_vectors = self.embedding(input)
        token_vectors = self.dropout(token_vectors)
        step_states, final_state = self.gru(token_vectors)
        return step_states, final_state

In [45]:
class Seq2Seq(nn.Module):
    """GRU encoder-decoder with additive attention, decoded with teacher forcing.

    The encoder consumes language-A token ids; the decoder embeds language-B
    token ids, attends over the encoder outputs at every step and emits one
    vector of size ``num_embeddings_B`` per target position.
    """

    def __init__(self, num_embeddings_A, num_embeddings_B, hidden_size, padding_idx_A, padding_idx_B, dropout_rate=0.1):
        """Build encoder, target embedding, attention, decoder GRU and output layer.

        Args:
            num_embeddings_A: Source vocabulary size.
            num_embeddings_B: Target vocabulary size (also the output width).
            hidden_size: Width of embeddings and recurrent states.
            padding_idx_A: Padding index of the source vocabulary.
            padding_idx_B: Padding index of the target vocabulary.
            dropout_rate: Dropout probability for encoder and decoder embeddings.
        """
        super().__init__()

        # Creation order is kept so seeded initialisation stays the same.
        self.encoder = EncoderRNN(num_embeddings_A, hidden_size, padding_idx_A, dropout_rate)
        self.embedding = nn.Embedding(
            num_embeddings=num_embeddings_B,
            embedding_dim=hidden_size,
            padding_idx=padding_idx_B,
        )

        self.attention = BahdanauAttention(hidden_size)

        # The decoder GRU reads [embedded token ; context], hence 2 * hidden_size.
        self.gru = nn.GRU(input_size=2 * hidden_size, hidden_size=hidden_size, batch_first=True)
        self.out = nn.Linear(in_features=hidden_size, out_features=num_embeddings_B)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, input_tensor, target_tensor):
        """Run the encoder, then decode one target position at a time.

        Every column of ``target_tensor`` is fed as decoder input (teacher
        forcing); the decoder state starts from the encoder's final state.

        Returns:
            ``(decoder_outputs, attentions)``: the per-step outputs and the
            per-step attention weights, each concatenated along dim 1.
        """
        encoder_outputs, state = self.encoder(input_tensor)
        # target_tensor must be 2-D (batch, target_len); only the length is used.
        _, num_steps = target_tensor.shape

        step_outputs = []
        step_attentions = []
        for step in range(num_steps):
            # Slice keeps the time axis: (batch, 1).
            token_column = target_tensor[:, step:step + 1]
            step_output, state, step_weights = self.forward_step(
                token_column, state, encoder_outputs
            )
            step_outputs.append(step_output)
            step_attentions.append(step_weights)

        return torch.cat(step_outputs, dim=1), torch.cat(step_attentions, dim=1)

    def forward_step(self, input, hidden, encoder_outputs):
        """Decode a single position.

        Args:
            input: Token ids for this step, one column per batch element.
            hidden: Current decoder GRU state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(output, hidden, attn_weights)`` for this step.
        """
        token_vectors = self.embedding(input)
        token_vectors = self.dropout(token_vectors)

        # Swap the first two axes of the GRU state so the batch axis comes
        # first, as the attention module expects for its query.
        query = hidden.transpose(0, 1)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((token_vectors, context), dim=2)
        gru_output, next_hidden = self.gru(gru_input, hidden)

        return self.out(gru_output), next_hidden, attn_weights

In [46]:
# Vocabulary sizes and padding indices for the source (A) and target (B) languages.
num_embeddings_A = len(w2i_A)
padding_idx_A = w2i_A['<pad>']

num_embeddings_B = len(w2i_B)
padding_idx_B = w2i_B['<pad>']

# Hyper-parameters.
embedding_dim = 300  # NOTE(review): not passed to the model below; embeddings use hidden_size
hidden_size = 256
dropout_rate = 0.25

model = Seq2Seq(
    num_embeddings_A,
    num_embeddings_B,
    hidden_size,
    padding_idx_A,
    padding_idx_B,
    dropout_rate,
)

# Smoke-test forward pass on the padded batches.
# NOTE(review): the full target batch (including its final position) is fed as
# decoder input; a training loop would presumably shift inputs vs. labels - confirm.
outputs, attention_weights = model(padded_sequences_A, padded_sequences_B)