## Seq2Seq LSTM Model in PyTorch
This lab demonstrates the implementation of a Sequence-to-Sequence (Seq2Seq) model using LSTM layers in PyTorch.

The Seq2Seq model architecture consists of an encoder and a decoder. The encoder processes the input sequence and outputs a context vector, which the decoder uses to generate the output sequence.

### Mathematical Formulation
Let $ x = (x_1, x_2, \dots, x_T) $ be the input sequence of length $ T $, and $ y = (y_1, y_2, \dots, y_{T'}) $ be the target sequence of length $ T' $. The encoder computes hidden states $ h_t $ as follows:
$$
h_t = f(x_t, h_{t-1})
$$
where $ f $ is an LSTM cell.

The decoder predicts $ y_t $ based on the context vector $ c $ and previous outputs:
$$
y_t = g(y_{t-1}, s_t, c)
$$
where $ s_t $ is the decoder's hidden state, and $ g $ is another LSTM cell.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Attention(nn.Module):
    """Additive attention that scores every source position for one decoder step.

    The top-layer decoder hidden state is concatenated with each encoder
    output, passed through a linear layer and ``tanh``, and projected to a
    scalar by a bias-free linear layer. The scalars are normalised with a
    softmax over the source positions.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.attn = nn.Linear(2 * hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: decoder hidden state, [n_layers, batch_size, hidden_dim].
                Only the last layer (``hidden[-1]``) is used.
            encoder_outputs: [src_len, batch_size, hidden_dim].

        Returns:
            Tensor of shape [batch_size, src_len] whose rows sum to 1.
        """
        num_positions = encoder_outputs.size(0)

        # Broadcast the top-layer state across all source positions.
        query = hidden[-1].unsqueeze(1).expand(-1, num_positions, -1)  # [batch_size, src_len, hidden_dim]
        keys = encoder_outputs.transpose(0, 1)  # [batch_size, src_len, hidden_dim]

        combined = torch.cat((query, keys), dim=2)  # [batch_size, src_len, 2 * hidden_dim]
        energy = torch.tanh(self.attn(combined))  # [batch_size, src_len, hidden_dim]
        scores = self.v(energy).squeeze(2)  # [batch_size, src_len]

        return torch.softmax(scores, dim=1)

class Encoder(nn.Module):
    """Embeds source tokens and runs them through a multi-layer LSTM."""

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: token indices, [src_len, batch_size].

        Returns:
            Tuple ``(outputs, hidden, cell)`` where ``outputs`` is
            [src_len, batch_size, hidden_dim] and ``hidden`` / ``cell`` are
            each [n_layers, batch_size, hidden_dim].
        """
        token_vectors = self.embedding(src)  # [src_len, batch_size, emb_dim]
        regularised = self.dropout(token_vectors)

        outputs, final_state = self.lstm(regularised)
        hidden, cell = final_state
        return outputs, hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one token per batch element, builds an attention-
    weighted context vector from the encoder outputs, feeds the embedded token
    concatenated with that context to the LSTM, and projects the LSTM output
    concatenated with the context to vocabulary scores.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        # Submodules are registered in this order; keep it so parameter order
        # and seeded initialisation stay the same.
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one time step.

        Args:
            input: current token indices, [batch_size].
            hidden: LSTM hidden state, [n_layers, batch_size, hidden_dim].
            cell: LSTM cell state, [n_layers, batch_size, hidden_dim].
            encoder_outputs: [src_len, batch_size, hidden_dim].

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` is
            [batch_size, output_dim] and the states are the updated LSTM
            states.
        """
        step_tokens = input.unsqueeze(0)  # [1, batch_size]
        embedded = self.dropout(self.embedding(step_tokens))  # [1, batch_size, emb_dim]

        # The attention weights are computed from the state *before* this step.
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)  # [batch_size, 1, src_len]
        memory = encoder_outputs.transpose(0, 1)  # [batch_size, src_len, hidden_dim]
        context = torch.bmm(weights, memory).transpose(0, 1)  # [1, batch_size, hidden_dim]

        rnn_input = torch.cat((embedded, context), dim=2)  # [1, batch_size, emb_dim + hidden_dim]
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))  # output: [1, batch_size, hidden_dim]

        features = torch.cat((output.squeeze(0), context.squeeze(0)), dim=1)  # [batch_size, 2 * hidden_dim]
        prediction = self.fc_out(features)  # [batch_size, output_dim]
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder together with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps and collect the per-step scores.

        Args:
            src: source token indices, [src_len, batch_size].
            trg: target token indices, [trg_len, batch_size]; row 0 is used
                as the first decoder input (the <sos> token).
            teacher_forcing_ratio: a fresh uniform sample is drawn at every
                step; when it is below this value the next input is the
                ground-truth token, otherwise the decoder's own argmax.

        Returns:
            Tensor [trg_len, batch_size, trg_vocab_size]. Index 0 is never
            written and stays all zeros.
        """
        trg_len, batch_size = trg.shape
        vocab_size = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        decoder_input = trg[0, :]
        for step in range(1, trg_len):
            scores, hidden, cell = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[step] = scores
            greedy = scores.argmax(1)

            # Draw after the decoder call so random-number consumption order
            # relative to dropout is unchanged.
            use_ground_truth = torch.rand(1).item() < teacher_forcing_ratio
            if use_ground_truth:
                decoder_input = trg[step]
            else:
                decoder_input = greedy

        return outputs

# Hyperparameters
# Vocabulary sizes for the source and target languages.
INPUT_DIM = 1000
OUTPUT_DIM = 1000
# Embedding widths for the encoder and decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# LSTM width and depth; the encoder and decoder use the same values so the
# encoder's final states can seed the decoder directly.
HIDDEN_DIM = 512
N_LAYERS = 2
# Dropout probabilities.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Instantiate models
# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Construction order (attention, encoder, decoder) is kept so that seeded
# parameter initialisation is reproducible.
attention = Attention(hidden_dim=HIDDEN_DIM)
encoder = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    attention=attention,
)
model = Seq2Seq(encoder=encoder, decoder=decoder, device=device)
model = model.to(device)

# Training setup
# Adam with its default settings, and cross-entropy over the target vocabulary.
optimizer = optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(src, trg)`` that returns scores of
            shape [trg_len, batch_size, output_dim].
        iterator: any iterable of ``(src, trg)`` batches. It does not need to
            support ``len()``; batches are counted as they are consumed.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss called as ``criterion(scores, targets)`` with scores
            flattened to [N, output_dim] and targets flattened to [N].
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when the
        iterator yields no batches.
    """
    model.train()

    # Move batches to wherever the model's parameters live, so the function
    # works for any model passed in rather than depending on a module-level
    # device variable. A parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    total_loss = 0.0
    num_batches = 0

    for src, trg in iterator:
        if model_device is not None:
            src, trg = src.to(model_device), trg.to(model_device)

        optimizer.zero_grad()
        output = model(src, trg)
        # output: [trg_len, batch_size, output_dim]
        # trg: [trg_len, batch_size]

        # Position 0 is the <sos> slot, which the model never predicts.
        # reshape (not view) is used because trg is often a transposed,
        # non-contiguous tensor, on which view raises a RuntimeError.
        output_dim = output.shape[-1]
        scores = output[1:].reshape(-1, output_dim)
        targets = trg[1:].reshape(-1)

        loss = criterion(scores, targets)
        loss.backward()

        # Clip before the step to limit exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty epoch instead of dividing by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Example usage:
# Create your data iterators, initialize training loop, and start training the Seq2Seq model.


### Generate synthetic sequential data

### Model Implementation

### Model, loss, and optimizer