# NLP Exercise 3: Seq2Seq Model and Attention Mechanisms
---

In [None]:
import torch
import torch.nn as nn
import random

## Define the Encoder and Decoder for Seq2Seq without Attention

Encoder

In [3]:
class Encoder(nn.Module):
    """Embed a source token sequence and run it through a single-layer LSTM.

    :param input_dim: Number of entries in the embedding table (source vocabulary size).
    :param embedding_dim: Size of each token embedding.
    :param hidden_dim: Size of the LSTM hidden and cell states.
    :param return_outputs: When ``False`` (default) ``forward`` returns
        ``(hidden, cell)``. When ``True`` it returns
        ``(outputs, (hidden, cell))`` so that a consumer needing the per-step
        LSTM outputs (e.g. an attention mechanism) can use them.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, return_outputs=False):
        # The parent constructor must actually be *called*; without the
        # parentheses nn.Module is never initialised and registering the
        # first submodule below raises.
        super(Encoder, self).__init__()
        self.return_outputs = return_outputs
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

    def forward(self, src):
        """Encode ``src``.

        :param src: Integer tensor of token indices. The LSTM uses the default
            sequence-first layout, so a batched input is ``(src_len, batch)``.

        Return: ``(hidden, cell)`` -- the final LSTM states -- or
            ``(outputs, (hidden, cell))`` when ``return_outputs`` is set.
        """
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        if self.return_outputs:
            return outputs, (hidden, cell)
        return hidden, cell

Decoder

In [2]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    :param output_dim: Target vocabulary size (embedding rows and width of the
        prediction vector).
    :param embedding_dim: Size of each token embedding.
    :param hidden_dim: Size of the LSTM hidden and cell states.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim):
        # The parent constructor must actually be *called*; without the
        # parentheses nn.Module is never initialised and registering the
        # first submodule below raises.
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        """Decode one time step.

        :param input: Integer tensor with the current token index per batch
            element, shape ``(batch,)``.
        :param hidden: Previous LSTM hidden state.
        :param cell: Previous LSTM cell state.

        Return: ``(prediction, hidden, cell)`` where ``prediction`` holds the
            unnormalised scores over the target vocabulary, ``(batch, output_dim)``.
        """
        # Add a length-1 sequence axis: the LSTM expects (seq_len, batch, ...).
        input = input.unsqueeze(0)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the length-1 sequence axis again before the projection.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

Seq2Seq Model without Attention

In [4]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper without attention.

    :param encoder: Module whose call ``encoder(src)`` returns ``(hidden, cell)``.
    :param decoder: Module exposing ``fc_out.out_features`` and whose call
        ``decoder(input, hidden, cell)`` returns ``(prediction, hidden, cell)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, conditioned on the encoded ``src``.

        :param src: Source token indices, passed unchanged to the encoder.
        :param trg: Target token indices, shape ``(trg_len, batch)``; row 0 is
            used as the first decoder input.
        :param teacher_forcing_ratio: Probability, per time step, of feeding
            the ground-truth token instead of the model's own prediction.

        Return: Tensor ``(trg_len, batch, trg_vocab_size)`` with the decoder
            predictions; index 0 is never written and stays zero.
        """
        trg_len, batch_size = trg.shape
        trg_vocab_size = self.decoder.fc_out.out_features
        # Allocate on the inputs' device so the result does not silently end
        # up on CPU when the model and data live on an accelerator.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)

        hidden, cell = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            # Draw once per step whether to teacher-force the next input.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs


## Define the Attention Mechanism and Decoder with Attention

In [5]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    :param hidden_dim: Size of both the decoder hidden state and each encoder
        output vector.
    """

    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for one decoding step.

        :param hidden: Decoder hidden state ``(num_layers, batch, hidden_dim)``;
            only the last layer is used.
        :param encoder_outputs: Encoder outputs ``(src_len, batch, hidden_dim)``.

        Return: Tensor ``(batch, src_len)`` whose rows sum to 1.
        """
        src_len = encoder_outputs.shape[0]
        # Repeat the last-layer state once per source position:
        # (batch, hidden_dim) -> (batch, src_len, hidden_dim).
        query = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        # Bring the encoder outputs into the same batch-first layout; without
        # this the concatenation pairs mismatched axes (and fails outright
        # whenever batch != src_len).
        keys = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((query, keys), dim=2)))
        scores = torch.sum(self.v * energy, dim=2)
        # Normalise over the source positions.
        return torch.softmax(scores, dim=1)


Decoder with Attention

In [None]:
class DecoderWithAttention(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    :param output_dim: Target vocabulary size.
    :param embedding_dim: Size of each token embedding.
    :param hidden_dim: Size of the LSTM states; the encoder outputs must have
        the same feature size because they are concatenated to the embedding.
    :param attention: Callable ``attention(hidden, encoder_outputs)`` returning
        weights of shape ``(batch, src_len)``.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, attention):
        super(DecoderWithAttention, self).__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one time step.

        :param input: Current token index per batch element, shape ``(batch,)``.
        :param hidden: Previous LSTM hidden state.
        :param cell: Previous LSTM cell state.
        :param encoder_outputs: Encoder outputs ``(src_len, batch, hidden_dim)``.

        Return: ``(prediction, hidden, cell, attention_weights)`` with
            ``prediction`` of shape ``(batch, output_dim)`` and
            ``attention_weights`` of shape ``(batch, src_len)``.
        """
        # unsqueeze is not in-place: the result has to be re-bound, otherwise
        # the embedding lacks the sequence axis the LSTM input needs.
        input = input.unsqueeze(0)
        embedded = self.embedding(input)  # (1, batch, embedding_dim)
        a = self.attention(hidden, encoder_outputs).unsqueeze(1)  # (batch, 1, src_len)
        # Attention-weighted sum of encoder outputs: (batch, 1, hidden_dim).
        weighted = torch.bmm(a, encoder_outputs.permute(1, 0, 2))
        rnn_input = torch.cat((embedded, weighted.permute(1, 0, 2)), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        # ``weighted`` is batch-first, so its singleton axis is 1 (not 0).
        prediction = self.fc_out(torch.cat((output.squeeze(0), weighted.squeeze(1)), dim=1))
        return prediction, hidden, cell, a.squeeze(1)


Define the Seq2Seq Model with Attention

In [7]:
class Seq2SeqWithAttention(nn.Module):
    """Encoder-decoder wrapper whose decoder attends over the encoder outputs.

    :param encoder: Module whose call ``encoder(src)`` returns
        ``(encoder_outputs, (hidden, cell))``. An encoder that returns only
        ``(hidden, cell)`` cannot be used here.
    :param decoder: Module exposing ``output_dim`` and whose call
        ``decoder(input, hidden, cell, encoder_outputs)`` returns
        ``(prediction, hidden, cell, attention)`` with ``attention`` of shape
        ``(batch, src_len)``.
    """

    def __init__(self, encoder, decoder):
        super(Seq2SeqWithAttention, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step while recording the attention weights.

        :param src: Source token indices, shape ``(src_len, batch)``.
        :param trg: Target token indices, shape ``(trg_len, batch)``; row 0 is
            used as the first decoder input.
        :param teacher_forcing_ratio: Probability, per time step, of feeding
            the ground-truth token instead of the model's own prediction.

        Return: ``(outputs, attentions)`` where ``outputs`` is
            ``(trg_len, batch, trg_vocab_size)`` and ``attentions`` is
            ``(trg_len, src_len, batch)``. Index 0 of both is never written
            and stays zero.
        """
        trg_len, batch_size = trg.shape
        src_len = src.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # Allocate on the inputs' device so the results do not silently end
        # up on CPU when the model and data live on an accelerator.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)
        attentions = torch.zeros(trg_len, src_len, batch_size, device=trg.device)

        encoder_outputs, (hidden, cell) = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell, attention = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            # The decoder yields (batch, src_len); the buffer is sequence-first,
            # so swap the axes before storing.
            attentions[t] = attention.transpose(0, 1)
            # Draw once per step whether to teacher-force the next input.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs, attentions


## BLEU Score Calculation

In [9]:
from nltk.translate.bleu_score import sentence_bleu

In [31]:
def calculate_bleu_score(reference, candidate, smoothing_function=None):
    """
    Calculate BLEU score for a single reference and candidate sentence pair.

        :param reference: List of words in the target sentence (ground truth).
        :param candidate: List of words in the predicted sentence.
        :param smoothing_function: Optional smoothing callable forwarded to
            ``sentence_bleu`` (for example a method of
            ``nltk.translate.bleu_score.SmoothingFunction``). With the default
            ``None`` no smoothing is applied, so a short candidate that shares
            no 3- or 4-gram with the reference scores practically zero.

    Return: BLEU score (float)
    """
    # sentence_bleu expects a list of references, hence the wrapping list.
    return sentence_bleu([reference], candidate, smoothing_function=smoothing_function)

# Example usage: score two hand-written candidates against a single reference.
reference = ["I", "am", "learning", "NLP"]
candidate_seq2seq = ["I", "am", "learni", "NLP"]  # Example output without attention
candidate_with_attention = ["I", "am", "learning", "NLP"]  # Example output with attention

# Score both candidates, in the order they are printed below.
bleu_seq2seq, bleu_with_attention = (
    calculate_bleu_score(reference, hypothesis)
    for hypothesis in (candidate_seq2seq, candidate_with_attention)
)

print(f"BLEU score for Seq2Seq without attention: {bleu_seq2seq}")
print(f"BLEU score for Seq2Seq with attention: {bleu_with_attention}")


BLEU score for Seq2Seq without attention: 1.0547686614863434e-154
BLEU score for Seq2Seq with attention: 1.0
