In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import random
from docx import Document


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

'''
devNumber = torch.cuda.current_device()
devName = torch.cuda.get_device_name(devNumber)

print(f"Current device number is: {devNumber}")
print(f"GPU name is: {devName}")
'''



Using device: cuda


'\ndevNumber = torch.cuda.current_device()\ndevName = torch.cuda.get_device_name(devNumber)\n\nprint(f"Current device number is: {devNumber}")\nprint(f"GPU name is: {devName}")\n'

In [83]:
'''
Problem 1 (30pts)
In this homework, we focus on sequence-to-sequence modeling. Use the English to French Dataset provided. 
Developed a GRU-based encoder-decoder architecture for English to French Translation. 
Train the model on the entire dataset and evaluate it on the entire dataset. 
Report training loss, validation loss, and validation accuracy. 
Also, try some qualitative validation as well, asking the network to generate French translations for some English sentences.
'''

# Load dataset from .docx file
def load_english_french_pairs(docx_path):
    doc = Document(docx_path)
    text = "\n".join([p.text for p in doc.paragraphs])
    english_to_french = []
    
    for line in text.split("\n"):
        if '", "' in line:
            en, fr = line.split('", "')
            en = en.replace('("', '').strip()
            fr = fr.replace('")', '').strip()
            english_to_french.append((en, fr))
            
    return english_to_french

# Load dataset
dataset = load_english_french_pairs("Dataset - English to French.docx")

# Vocabulary builder
class Vocabulary:
    def __init__(self):
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
    
    def add_sentence(self, sentence):
        for word in sentence.split():
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            index = len(self.word2index)
            self.word2index[word] = index
            self.index2word[index] = word

    def sentence_to_indices(self, sentence):
        return [self.word2index.get(word, self.word2index["<UNK>"]) for word in sentence.split()] + [self.word2index["<EOS>"]]

# Build vocabularies
english_vocab = Vocabulary()
french_vocab = Vocabulary()

for en, fr in dataset:
    english_vocab.add_sentence(en)
    french_vocab.add_sentence(fr)

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, pairs, english_vocab, french_vocab):
        self.pairs = pairs
        self.english_vocab = english_vocab
        self.french_vocab = french_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        en_sentence, fr_sentence = self.pairs[idx]
        en_indices = self.english_vocab.sentence_to_indices(en_sentence)
        fr_indices = self.french_vocab.sentence_to_indices(fr_sentence)
        
        return torch.tensor(en_indices), torch.tensor(fr_indices)

# Collate function for padding
def collate_fn(batch):
    en_batch = [item[0] for item in batch]
    fr_batch = [item[1] for item in batch]

    en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=english_vocab.word2index["<PAD>"])
    fr_batch = nn.utils.rnn.pad_sequence(fr_batch, batch_first=True, padding_value=french_vocab.word2index["<PAD>"])

    return en_batch, fr_batch

# DataLoader
train_dataset = TranslationDataset(dataset, english_vocab, french_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Encoder with LSTM
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded)
        return output, hidden

    def initHidden(self, batch_size):
        return (torch.zeros(1, batch_size, hidden_size, device=device),
                torch.zeros(1, batch_size, hidden_size, device=device))

# Decoder with LSTM
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, embedding_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        embedded = self.embedding(x).unsqueeze(1)
        output, hidden = self.lstm(embedded, hidden)
        output = self.softmax(self.fc(output.squeeze(1)))
        return output, hidden

# Model parameters
embedding_size = 256
hidden_size = 512

input_size = len(english_vocab.word2index)
output_size = len(french_vocab.word2index)

encoder = Encoder(input_size, embedding_size, hidden_size).to(device)
decoder = Decoder(hidden_size, output_size, embedding_size).to(device)

# Optimizers and loss function
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.NLLLoss(ignore_index=english_vocab.word2index["<PAD>"])

# Training epoch with teacher forcing
def train_epoch(encoder, decoder, train_loader, criterion, encoder_optimizer, decoder_optimizer, device, teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    total_loss = 0

    for en_tensor, fr_tensor in train_loader:
        en_tensor = en_tensor.to(device)
        fr_tensor = fr_tensor.to(device)

        batch_size = en_tensor.size(0)
        target_length = fr_tensor.size(1)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs, encoder_hidden = encoder(en_tensor)

        decoder_input = torch.full((batch_size,), french_vocab.word2index["<SOS>"], dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden

        loss = 0

        # Use teacher forcing or not
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Use the true target as the next input (teacher forcing)
            for t in range(target_length):
                output, decoder_hidden = decoder(decoder_input, decoder_hidden)

                # Output: [batch_size, vocab_size], Target: [batch_size]
                loss += criterion(output, fr_tensor[:, t])

                decoder_input = fr_tensor[:, t]  # Teacher forcing
        else:
            # Use the model's predictions as the next input
            for t in range(target_length):
                output, decoder_hidden = decoder(decoder_input, decoder_hidden)

                # Output: [batch_size, vocab_size], Target: [batch_size]
                loss += criterion(output, fr_tensor[:, t])

                # Get the top prediction
                _, topi = output.topk(1)
                decoder_input = topi.squeeze().detach()

        # Normalize the loss by the sequence length
        loss = loss / target_length

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)


def evaluate_and_show_examples(encoder, decoder, dataloader, criterion, n_examples):
    encoder.eval()
    decoder.eval()

    total_loss = 0
    correct_predictions = 0  
    printed_examples = 0  # Track how many examples have been printed

    with torch.no_grad():
        for i, (en_tensor, fr_tensor) in enumerate(dataloader):
            en_tensor = en_tensor.to(device)
            fr_tensor = fr_tensor.to(device)

            batch_size = en_tensor.size(0)
            encoder_hidden = encoder.initHidden(batch_size)

            encoder_outputs, encoder_hidden = encoder(en_tensor)

            decoder_input = torch.full((batch_size,), french_vocab.word2index["<SOS>"], dtype=torch.long, device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = [[] for _ in range(batch_size)]

            # Generate predictions
            for t in range(fr_tensor.size(1)):
                output, decoder_hidden = decoder(decoder_input, decoder_hidden)

                # Get the top prediction
                _, topi = output.topk(1)

                for b in range(batch_size):
                    predicted_indices[b].append(topi[b].item())

                decoder_input = topi.squeeze().detach()

                # Stop decoding if all sentences produce EOS
                if all(decoder_input == french_vocab.word2index["<EOS>"]):
                    break

            total_loss += criterion(output, fr_tensor[:, t]).item()

            # Check for correctness
            if predicted_indices == fr_tensor.tolist():
                correct_predictions += 1

            # Print examples while keeping track of the number of examples printed
            for batch_idx in range(batch_size):
                if printed_examples >= n_examples:
                    break  # Stop once n_examples have been printed

                predicted_words = [
                    french_vocab.index2word[idx]
                    for idx in predicted_indices[batch_idx]
                    if idx not in [french_vocab.word2index["<PAD>"], french_vocab.word2index["<EOS>"]]
                ]
                target_words = [
                    french_vocab.index2word[idx.item()]
                    for idx in fr_tensor[batch_idx]
                    if idx.item() not in [french_vocab.word2index["<PAD>"], french_vocab.word2index["<EOS>"]]
                ]
                input_sentence = " ".join(
                    [english_vocab.index2word[idx.item()] 
                     for idx in en_tensor[batch_idx] 
                     if idx.item() not in [english_vocab.word2index["<PAD>"], english_vocab.word2index["<EOS>"]]]
                )          
                predicted_sentence = " ".join(predicted_words)
                target_sentence = " ".join(target_words)

                print(f"Input: {input_sentence}")
                print(f"Target: {target_sentence}")
                print(f"Predicted: {predicted_sentence}")
                print("-" * 30)

                printed_examples += 1  # Increment the counter

            if printed_examples >= n_examples:
                break  # Stop iterating once n_examples have been printed

        # Calculate and display overall validation loss and accuracy
        average_loss = total_loss / len(dataloader)
        accuracy = correct_predictions / len(dataloader)
        print(f'Evaluation Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}')


num_epochs = 10
for epoch in range(num_epochs):
    loss = train_epoch(encoder, decoder, train_loader, criterion, encoder_optimizer, decoder_optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# Evaluate with examples
evaluate_and_show_examples(encoder, decoder, train_loader, criterion, 5)


Epoch 1, Loss: 5.5559
Epoch 2, Loss: 4.4981
Epoch 3, Loss: 3.8122
Epoch 4, Loss: 3.5858
Epoch 5, Loss: 3.4210
Epoch 6, Loss: 3.2114
Epoch 7, Loss: 3.0451
Epoch 8, Loss: 3.0644
Epoch 9, Loss: 2.9019
Epoch 10, Loss: 2.8360
Input: We dance at the wedding
Target: Nous dansons au mariage,
Predicted: Nous le le le
------------------------------
Input: The restaurant serves delicious food
Target: Le restaurant sert une délicieuse cuisine,
Predicted: Le Nous le
------------------------------
Input: She teaches English at school
Target: Elle enseigne l'anglais à l'école,
Predicted: Elle étudie le le
------------------------------
Input: He enjoys reading books
Target: Il aime lire des livres,
Predicted: Il se la
------------------------------
Input: They drink coffee in the morning
Target: Ils boivent du café le matin,
Predicted: Ils jouent la le
------------------------------
Evaluation Loss: 0.7764, Accuracy: 0.0000


In [39]:
# Importing necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random

# Dataset containing pairs of synonyms
dataset = load_english_french_pairs("Dataset - English to French.docx")
english_vocab = Vocabulary()
french_vocab = Vocabulary()


for en, fr in dataset:
    english_vocab.add_sentence(en)
    french_vocab.add_sentence(fr)

# Special tokens for the start and end of sequences
SOS_token = 0  # Start Of Sequence Token
EOS_token = 1  # End Of Sequence Token

# Preparing the character to index mapping and vice versa
# These mappings will help convert characters to numerical format for the neural network
# 'SOS' and 'EOS' tokens are added at the start of the char_to_index dictionary
char_to_index = {"SOS": SOS_token, "EOS": EOS_token, **{char: i+2 for i, char in enumerate(sorted(list(set(''.join([word for pair in dataset for word in pair])))))}}
index_to_char = {i: char for char, i in char_to_index.items()}

class SynonymDataset(Dataset):
    """Custom Dataset class for handling synonym pairs."""
    def __init__(self, dataset, char_to_index):
        self.dataset = dataset
        self.char_to_index = char_to_index

    def __len__(self):
        # Returns the total number of synonym pairs in the dataset
        return len(self.dataset)

    def __getitem__(self, idx):
        # Retrieves a synonym pair by index, converts characters to indices,
        # and adds the EOS token at the end of each word.
        input_word, target_word = self.dataset[idx]
        input_tensor = torch.tensor([self.char_to_index[char] for char in input_word] + [EOS_token], dtype=torch.long)
        target_tensor = torch.tensor([self.char_to_index[char] for char in target_word] + [EOS_token], dtype=torch.long)
        return input_tensor, target_tensor

# Creating a DataLoader to batch and shuffle the dataset
synonym_dataset = SynonymDataset(dataset, char_to_index)
dataloader = DataLoader(synonym_dataset, batch_size=1, shuffle=True)

# Setting the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Encoder(nn.Module):
    """The Encoder part of the seq2seq model."""
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding layer
        self.lstm = nn.LSTM(hidden_size, hidden_size)  # LSTM layer

    def forward(self, input, hidden):
        # Forward pass for the encoder
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        # Initializes hidden state
        return (torch.zeros(1, 1, self.hidden_size, device=device),
                torch.zeros(1, 1, self.hidden_size, device=device))
    
class Decoder(nn.Module):
    """The Decoder part of the seq2seq model."""
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)  # Embedding layer
        self.lstm = nn.LSTM(hidden_size, hidden_size)  # LSTM layer
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
                             
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=device),
                torch.zeros(1, 1, self.hidden_size, device=device))

# Assuming all characters in the dataset + 'SOS' and 'EOS' tokens are included in char_to_index
input_size = len(char_to_index)
hidden_size = 12
output_size = len(char_to_index)

encoder = Encoder(input_size=len(char_to_index), hidden_size=256).to(device)
decoder = Decoder(hidden_size=256, output_size=len(char_to_index)).to(device)

# Set the learning rate for optimization
learning_rate = 0.01

# Initializing optimizers for both encoder and decoder with Stochastic Gradient Descent (SGD)
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=12):
    # Initialize encoder hidden state
    encoder_hidden = encoder.initHidden()

    # Clear gradients for optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Calculate the length of input and target tensors
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Initialize loss
    loss = 0

    # Encoding each character in the input
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

    # Decoder's first input is the SOS token
    decoder_input = torch.tensor([[char_to_index['SOS']]], device=device)

    # Decoder starts with the encoder's last hidden state
    decoder_hidden = encoder_hidden

    # Decoding loop
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # Choose top1 word from decoder's output
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # Detach from history as input

        # Calculate loss
        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
        if decoder_input.item() == char_to_index['EOS']:  # Stop if EOS token is generated
            break

    # Backpropagation
    loss.backward()

    # Update encoder and decoder parameters
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Return average loss
    return loss.item() / target_length

# Negative Log Likelihood Loss function for calculating loss
criterion = nn.NLLLoss()

# Set number of epochs for training
n_epochs = 41

# Training loop
for epoch in range(n_epochs):
    total_loss = 0
    for input_tensor, target_tensor in dataloader:
        # Move tensors to the correct device
        input_tensor = input_tensor[0].to(device)
        target_tensor = target_tensor[0].to(device)
        
        # Perform a single training step and update total loss
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss
    
    # Print loss every 10 epochs
    if epoch % 10 == 0:
       print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')

def evaluate_and_show_examples(encoder, decoder, dataloader, criterion, n_examples=5):
    # Switch model to evaluation mode
    encoder.eval()
    decoder.eval()
    
    total_loss = 0
    correct_predictions = 0
    
    # No gradient calculation
    with torch.no_grad():
        for i, (input_tensor, target_tensor) in enumerate(dataloader):
            # Move tensors to the correct device
            input_tensor = input_tensor[0].to(device)
            target_tensor = target_tensor[0].to(device)
            
            encoder_hidden = encoder.initHidden()

            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            loss = 0

            # Encoding step
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

            # Decoding step
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = []

            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                predicted_indices.append(topi.item())
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
                if decoder_input.item() == EOS_token:
                    break

            # Calculate and print loss and accuracy for the evaluation
            total_loss += loss.item() / target_length
            if predicted_indices == target_tensor.tolist():
                correct_predictions += 1

            # Optionally, print some examples
            if i < n_examples:
                predicted_string = ''.join([index_to_char[index] for index in predicted_indices if index not in (SOS_token, EOS_token)])
                target_string = ''.join([index_to_char[index.item()] for index in target_tensor if index.item() not in (SOS_token, EOS_token)])
                input_string = ''.join([index_to_char[index.item()] for index in input_tensor if index.item() not in (SOS_token, EOS_token)])
                
                print(f'Input: {input_string}, Target: {target_string}, Predicted: {predicted_string}')
        
        # Print overall evaluation results
        average_loss = total_loss / len(dataloader)
        accuracy = correct_predictions / len(dataloader)
        print(f'Evaluation Loss: {average_loss}, Accuracy: {accuracy}')

# Perform evaluation with examples
evaluate_and_show_examples(encoder, decoder, dataloader, criterion)

Epoch 0, Loss: 3.1153654334644485
Epoch 10, Loss: 2.50001520986509
Epoch 20, Loss: 2.4934052624489564
Epoch 30, Loss: 2.370223235299775
Epoch 40, Loss: 2.1259228833623993
Input: The rain falls gently, Target: La pluie tombe doucement,, Predicted: Ll c ait         eeeeeee,e
Input: The flowers bloom in spring, Target: Les fleurs fleurissent au printemps,, Predicted: Ils fite       eeee,e
Input: They play video games, Target: Ils jouent aux jeux vidéo,, Predicted: Ils porte         eeee,ee
Input: The baby cries, Target: Le bébé pleure,, Predicted: Ils ééééit      
Input: He turns off the light, Target: Il éteint la lumière,, Predicted: Il  oate         eeee,
Evaluation Loss: 2.109506796352628, Accuracy: 0.0


In [85]:
'''Problem 2 (30pts)
Repeat problem 1, this time extend the network with attention. Train the model on the entire dataset and evaluate it on the entire dataset. 
Report training loss, validation loss, and validation accuracy. Also, try some qualitative validation as well, 
asking the network to generate French translations for some English sentences. Also, compare the results against problem 1.
'''
import torch.nn.functional as F

# Load dataset from .docx file
def load_english_french_pairs(docx_path):
    doc = Document(docx_path)
    text = "\n".join([p.text for p in doc.paragraphs])
    english_to_french = []
    
    for line in text.split("\n"):
        if '", "' in line:
            en, fr = line.split('", "')
            en = en.replace('("', '').strip()
            fr = fr.replace('")', '').strip()
            english_to_french.append((en, fr))
            
    return english_to_french

# Load dataset
dataset = load_english_french_pairs("Dataset - English to French.docx")

# Vocabulary builder
class Vocabulary:
    def __init__(self):
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
    
    def add_sentence(self, sentence):
        for word in sentence.split():
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            index = len(self.word2index)
            self.word2index[word] = index
            self.index2word[index] = word

    def sentence_to_indices(self, sentence):
        return [self.word2index.get(word, self.word2index["<UNK>"]) for word in sentence.split()] + [self.word2index["<EOS>"]]

# Build vocabularies
english_vocab = Vocabulary()
french_vocab = Vocabulary()

for en, fr in dataset:
    english_vocab.add_sentence(en)
    french_vocab.add_sentence(fr)

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, pairs, english_vocab, french_vocab):
        self.pairs = pairs
        self.english_vocab = english_vocab
        self.french_vocab = french_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        en_sentence, fr_sentence = self.pairs[idx]
        en_indices = self.english_vocab.sentence_to_indices(en_sentence)
        fr_indices = self.french_vocab.sentence_to_indices(fr_sentence)
        
        return torch.tensor(en_indices), torch.tensor(fr_indices)

# Collate function for padding
def collate_fn(batch):
    en_batch = [item[0] for item in batch]
    fr_batch = [item[1] for item in batch]

    en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=english_vocab.word2index["<PAD>"])
    fr_batch = nn.utils.rnn.pad_sequence(fr_batch, batch_first=True, padding_value=french_vocab.word2index["<PAD>"])

    return en_batch, fr_batch

# DataLoader
train_dataset = TranslationDataset(dataset, english_vocab, french_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        
    def forward(self, hidden, encoder_outputs):
        """
        hidden: [batch_size, hidden_size]
        encoder_outputs: [batch_size, seq_len, hidden_size]
        """
        batch_size = encoder_outputs.shape[0]
        seq_len = encoder_outputs.shape[1]

        # Repeat hidden state across the sequence length
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)  # [batch_size, seq_len, hidden_size]

        # Concatenate hidden with encoder outputs
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [batch_size, seq_len, hidden_size]

        # Calculate attention scores
        v = self.v.repeat(batch_size, 1).unsqueeze(2)  # [batch_size, hidden_size, 1]
        attention_scores = torch.bmm(energy, v).squeeze(2)  # [batch_size, seq_len]

        # Softmax to normalize scores into probabilities
        attn_weights = F.softmax(attention_scores, dim=1)  # [batch_size, seq_len]

        return attn_weights

# Encoder with LSTM
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded)
        return output, hidden

    def initHidden(self, batch_size):
        return (torch.zeros(1, batch_size, hidden_size, device=device),
                torch.zeros(1, batch_size, hidden_size, device=device))

# Decoder with LSTM
class AttentionalDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, embedding_size):
        super(AttentionalDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size + hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # Concatenate with attention context
        self.softmax = nn.LogSoftmax(dim=1)

        # Attention module
        self.attention = Attention(hidden_size)

    def forward(self, x, hidden, encoder_outputs):
        """
        x: [batch_size]
        hidden: (h, c) - LSTM hidden and cell states
        encoder_outputs: [batch_size, seq_len, hidden_size]
        """
        embedded = self.embedding(x).unsqueeze(1)  # [batch_size, 1, embedding_size]

        # Attention weights
        attn_weights = self.attention(hidden[0][-1], encoder_outputs)  # [batch_size, seq_len]

        # Apply attention to encoder outputs
        attn_weights = attn_weights.unsqueeze(1)  # [batch_size, 1, seq_len]
        context = torch.bmm(attn_weights, encoder_outputs)  # [batch_size, 1, hidden_size]

        # Concatenate context with embedded input
        lstm_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, embedding_size + hidden_size]

        # Pass through LSTM
        output, hidden = self.lstm(lstm_input, hidden)  # Output: [batch_size, 1, hidden_size]

        # Combine output with context for final prediction
        output = self.fc(torch.cat((output.squeeze(1), context.squeeze(1)), dim=1))  # [batch_size, output_size]

        return self.softmax(output), hidden, attn_weights

# Model parameters
embedding_size = 256
hidden_size = 512

input_size = len(english_vocab.word2index)
output_size = len(french_vocab.word2index)

encoder = Encoder(input_size, embedding_size, hidden_size).to(device)
decoder = Decoder(hidden_size, output_size, embedding_size).to(device)

# Optimizers and loss function
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.NLLLoss(ignore_index=english_vocab.word2index["<PAD>"])

# Training epoch with teacher forcing
def train_epoch(encoder, decoder, train_loader, criterion, encoder_optimizer, decoder_optimizer, device, teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    total_loss = 0

    for en_tensor, fr_tensor in train_loader:
        en_tensor = en_tensor.to(device)
        fr_tensor = fr_tensor.to(device)

        batch_size = en_tensor.size(0)
        target_length = fr_tensor.size(1)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs, encoder_hidden = encoder(en_tensor)

        decoder_input = torch.full((batch_size,), french_vocab.word2index["<SOS>"], dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden

        loss = 0

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            for t in range(target_length):
                output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
                loss += criterion(output, fr_tensor[:, t])
                decoder_input = fr_tensor[:, t]  # Teacher forcing
        else:
            for t in range(target_length):
                output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

                # Output: [batch_size, vocab_size], Target: [batch_size]
                loss += criterion(output, fr_tensor[:, t])

                _, topi = output.topk(1)
                decoder_input = topi.squeeze().detach()

        loss = loss / target_length
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate_and_show_examples(encoder, decoder, dataloader, criterion, n_examples):
    encoder.eval()
    decoder.eval()

    total_loss = 0
    correct_predictions = 0  
    printed_examples = 0

    with torch.no_grad():
        for en_tensor, fr_tensor in dataloader:
            en_tensor = en_tensor.to(device)
            fr_tensor = fr_tensor.to(device)

            batch_size = en_tensor.size(0)
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs, encoder_hidden = encoder(en_tensor)

            decoder_input = torch.full((batch_size,), french_vocab.word2index["<SOS>"], dtype=torch.long, device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = [[] for _ in range(batch_size)]

            for t in range(fr_tensor.size(1)):
                output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_outputs)

                _, topi = output.topk(1)

                for b in range(batch_size):
                    predicted_indices[b].append(topi[b].item())

                decoder_input = topi.squeeze().detach()

            total_loss += criterion(output, fr_tensor[:, -1]).item()

            for batch_idx in range(batch_size):
                if printed_examples >= n_examples:
                    break

                predicted_words = [french_vocab.index2word[idx] for idx in predicted_indices[batch_idx] if idx != french_vocab.word2index["<EOS>"]]
                target_words = [french_vocab.index2word[idx.item()] for idx in fr_tensor[batch_idx] if idx.item() != french_vocab.word2index["<EOS>"]]

                print(f"Input: {' '.join(predicted_words)}")
                print(f"Target: {' '.join(target_words)}")
                print("-" * 30)

                printed_examples += 1

    print(f'Evaluation Loss: {total_loss / len(dataloader):.4f}')


attn_decoder = AttentionalDecoder(hidden_size, output_size, embedding_size).to(device)

# Optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(attn_decoder.parameters(), lr=0.001)

# Train and Evaluate
num_epochs = 10
for epoch in range(num_epochs):
    loss = train_epoch(encoder, attn_decoder, train_loader, criterion, encoder_optimizer, decoder_optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

evaluate_and_show_examples(encoder, attn_decoder, train_loader, criterion, 5)

Epoch 1, Loss: 5.2262
Epoch 2, Loss: 4.2964
Epoch 3, Loss: 3.7145
Epoch 4, Loss: 3.4998
Epoch 5, Loss: 3.3134
Epoch 6, Loss: 3.2353
Epoch 7, Loss: 3.0515
Epoch 8, Loss: 2.9319
Epoch 9, Loss: 2.6365
Epoch 10, Loss: 2.5879
Input: Il se le le
Target: Il gravit la montagne, <PAD> <PAD> <PAD> <PAD>
------------------------------
Input: Nous Nous un
Target: Nous cuisinons le dîner ensemble, <PAD> <PAD> <PAD>
------------------------------
Input: Ils se le le
Target: Elle nage dans l'océan, <PAD> <PAD> <PAD> <PAD>
------------------------------
Input: Ils se la la plage,
Target: Elle se promène le long de la plage,
------------------------------
Input: Elle une une
Target: Elle peint un tableau, <PAD> <PAD> <PAD> <PAD>
------------------------------
Evaluation Loss: 0.6533


In [None]:
'''Problem 3 (40pts)

Repeat problems 1 and 2, this time try to translate from French to English. Train the model on the entire dataset and evaluate it on the entire dataset. 
Report training loss, validation loss, and validation accuracy. 
Also, try some qualitative validation as well, asking the network to generate French translations for some English sentences. 
Which one seems to be more effective, French-to-English or English-to-French?'''