<a href="https://colab.research.google.com/github/myllanes/Introduction-to-Deep-Learning/blob/main/HW4_3_French_to_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Michael Yllanes
# Google Drive: mount the user's Drive at /content/drive so files stored
# there can be read through the local filesystem (needs the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import random
import ast  # For safely evaluating the file content
from sklearn.model_selection import train_test_split

# Install python-docx for reading .docx files
!pip install python-docx

# Location of the .docx dataset file inside the mounted Google Drive
file_path = '/content/drive/My Drive/Dataset_English_to_French.docx'

# Load the .docx with python-docx; `doc.paragraphs` is read further below
from docx import Document
doc = Document(file_path)


# Concatenate the text of every paragraph, one paragraph per line
text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)

# The dataset is stored in the document as a Python list literal of
# (english, french) tuples; locate its outermost brackets.
start_index = text.find('[')  # -1 when there is no opening bracket
end_index = text.rfind(']') + 1  # 0 when there is no closing bracket
if start_index == -1 or end_index <= start_index:
    # Without this check the slice below would silently produce garbage and
    # the parser would fail with an unrelated-looking error.
    raise ValueError("No '[...]' list literal found in the document text")
list_content = text[start_index:end_index]  # Extract the list content

# literal_eval only accepts Python literals, so nothing from the document
# is executed as code
dataset = ast.literal_eval(list_content)

# Swap each pair so French becomes the input and English the target
dataset = [(french, english) for english, french in dataset]

# Reserved indices that mark where a sequence starts and where it ends
SOS_token = 0  # Start Of Sequence Token
EOS_token = 1  # End Of Sequence Token

# Character vocabulary: every distinct character seen in either language
all_chars = {char for pair in dataset for word in pair for char in word}

# Indices 0 and 1 belong to the special tokens; real characters are numbered
# from 2 upwards in sorted order so the mapping is deterministic.
char_to_index = {"SOS": SOS_token, "EOS": EOS_token}
char_to_index.update(
    (char, position)
    for position, char in enumerate(sorted(all_chars), start=2)
)

# Reverse lookup used when turning predicted indices back into text
index_to_char = {position: char for char, position in char_to_index.items()}

# Custom Dataset class for French to English
class TranslationDataset(Dataset):
    """Character-level dataset of (input string, target string) pairs.

    Indexing returns two 1-D ``torch.long`` tensors holding the character
    indices of the input and the target, each terminated by ``EOS_token``.
    """

    def __init__(self, dataset, char_to_index):
        # dataset: indexable collection of (input, target) string pairs
        # char_to_index: mapping from each character to its integer index
        self.char_to_index = char_to_index
        self.dataset = dataset

    def __len__(self):
        """Return the number of pairs."""
        return len(self.dataset)

    def _encode(self, word):
        """Convert a string to an index tensor with EOS appended."""
        indices = [self.char_to_index[char] for char in word]
        indices.append(EOS_token)
        return torch.tensor(indices, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded (input, target) tensors for pair ``idx``."""
        source_word, translated_word = self.dataset[idx]
        return self._encode(source_word), self._encode(translated_word)

# Loader over every pair (training and validation together), shuffled,
# one pair per batch
translation_dataset = TranslationDataset(dataset, char_to_index)
dataloader = DataLoader(
    translation_dataset,
    shuffle=True,
    batch_size=1,
)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hold out 20% of the pairs for validation; the fixed seed keeps the split
# reproducible between runs
train_dataset, val_dataset = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

# Wrap both splits so they yield index tensors
train_translation_dataset = TranslationDataset(train_dataset, char_to_index)
val_translation_dataset = TranslationDataset(val_dataset, char_to_index)

# Training order is reshuffled each epoch; validation order stays fixed
train_dataloader = DataLoader(
    train_translation_dataset,
    shuffle=True,
    batch_size=1,
)
val_dataloader = DataLoader(
    val_translation_dataset,
    shuffle=False,
    batch_size=1,
)

# Encoder model
class Encoder(nn.Module):
    """LSTM encoder that consumes one token index per ``forward`` call.

    Args:
        input_size: Number of distinct token indices (embedding rows).
        en_out: Embedding dimension fed into the LSTM.
        hidden_size: Size of the LSTM hidden and cell states.
    """

    def __init__(self, input_size, en_out, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, en_out)
        self.lstm = nn.LSTM(en_out, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the LSTM by a single step.

        Args:
            input: Tensor holding a single token index.
            hidden: ``(h, c)`` state tuple from ``initHidden`` or a
                previous call.

        Returns:
            ``(output, hidden)`` exactly as returned by the LSTM.
        """
        # The LSTM is fed a (1, 1, en_out) tensor: one step, one sequence
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h, c)`` pair, each of shape (1, 1, hidden_size).

        The tensors are created on the same device and with the same dtype
        as the module's own weights, so the state always matches the module
        instead of depending on a module-level ``device`` variable.
        """
        reference = self.embedding.weight
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

# Decoder model
class Decoder(nn.Module):
    """LSTM decoder that emits log-probabilities for one token per call.

    Args:
        hidden_size: Embedding dimension and LSTM state size.
        output_size: Number of distinct token indices that can be predicted.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the decoder by one step.

        Args:
            input: Tensor holding the index of the previous token.
            hidden: ``(h, c)`` state tuple, e.g. the encoder's final state.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            (1, output_size) and ``hidden`` is the updated LSTM state.
        """
        # The LSTM is fed a (1, 1, hidden_size) tensor: one step, one sequence
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        # output[0] drops the step dimension before projecting to the vocabulary
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h, c)`` pair, each of shape (1, 1, hidden_size).

        The tensors are created on the same device and with the same dtype
        as the module's own weights, so the state always matches the module
        instead of depending on a module-level ``device`` variable.
        """
        reference = self.embedding.weight
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

# Model hyperparameters: encoder and decoder share one character vocabulary,
# so the input and output sizes are the same number
hidden_size = 256
input_size = output_size = len(char_to_index)

# Build the encoder first and the decoder second, each moved to `device`
encoder = Encoder(input_size, 64, hidden_size).to(device)
decoder = Decoder(hidden_size, output_size).to(device)

# One plain-SGD optimizer per network, both with the same learning rate;
# NLLLoss is applied to the log-probabilities produced by the decoder
learning_rate = 0.01
criterion = nn.NLLLoss()
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

# Training function
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=12):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token, then the decoder is unrolled
    greedily: each step is fed the decoder's own previous prediction, and
    decoding stops early once the decoder predicts EOS.

    Args:
        input_tensor: 1-D tensor of input token indices.
        target_tensor: 1-D tensor of target token indices.
        encoder: Encoder module providing ``initHidden`` and ``forward``.
        decoder: Decoder module returning ``(log_probs, hidden)``.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to each decoder output and target index.
        max_length: Accepted but not used by this function.

    Returns:
        The summed loss over the decoded steps divided by the target length.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode: only the state after the last input token is carried forward
    state = encoder.initHidden()
    for token in input_tensor:
        _, state = encoder(token.unsqueeze(0), state)

    # Decode: start from SOS and continue from the encoder's final state
    decoder_input = torch.tensor([[SOS_token]], device=device)
    loss = 0
    for expected in target_tensor:
        log_probs, state = decoder(decoder_input, state)
        loss += criterion(log_probs, expected.unsqueeze(0))

        # The most likely token becomes the next input; detach() keeps the
        # prediction itself out of the autograd graph
        _, best_index = log_probs.topk(1)
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    # Backpropagate once through the whole unrolled sequence, then update
    # both networks
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

# Training loop: every epoch makes one pass over the training pairs, then
# scores the validation pairs with greedy decoding and no gradient tracking.
n_epochs = 200
for epoch in range(n_epochs):
    # Training phase
    encoder.train()
    decoder.train()
    total_train_loss = 0
    for input_tensor, target_tensor in train_dataloader:
        # Element 0 of each batch is the single sequence to train on
        input_tensor = input_tensor[0].to(device)
        target_tensor = target_tensor[0].to(device)
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_train_loss += loss
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation phase
    encoder.eval()
    decoder.eval()
    total_val_loss = 0
    correct_predictions = 0  # Pairs whose whole prediction matched the target
    with torch.no_grad():
        for input_tensor, target_tensor in val_dataloader:
            input_tensor = input_tensor[0].to(device)
            target_tensor = target_tensor[0].to(device)
            encoder_hidden = encoder.initHidden()
            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)
            loss = 0

            # Encoding step: only the final state is needed by the decoder
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

            # Decoding step: start from SOS with the encoder's final state
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = []  # Predicted token indices, EOS included
            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                predicted_indices.append(topi.item())
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
                if decoder_input.item() == EOS_token:
                    break

            # Per-pair loss is normalised by the target length
            total_val_loss += loss.item() / target_length

            # A pair counts as correct only on an exact sequence match
            target_indices = target_tensor.tolist()
            if predicted_indices == target_indices:
                correct_predictions += 1

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct_predictions / len(val_dataloader)  # Validation accuracy

    # Report every 10th epoch. The validation accuracy computed above is
    # included so that it is not silently discarded.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Evaluation function
def evaluate_and_show_examples(encoder, decoder, dataloader, criterion, n_examples=5):
    """Greedy-decode every pair in ``dataloader`` and print a summary.

    Prints the input, target and predicted strings for the first
    ``n_examples`` pairs, then the average loss and the fraction of pairs
    whose predicted index sequence equals the target sequence exactly.

    Args:
        encoder: Encoder module; switched to eval mode.
        decoder: Decoder module; switched to eval mode.
        dataloader: Iterable of (input, target) batches that supports
            ``len()``; only element 0 of each batch is used.
        criterion: Loss applied to each decoder output and target index.
        n_examples: Number of leading pairs to print.

    Returns:
        None. An empty ``dataloader`` prints a notice instead of raising
        ``ZeroDivisionError``.
    """
    encoder.eval()
    decoder.eval()

    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for i, (input_tensor, target_tensor) in enumerate(dataloader):
            input_tensor = input_tensor[0].to(device)
            target_tensor = target_tensor[0].to(device)

            encoder_hidden = encoder.initHidden()

            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            loss = 0

            # Encoding step: only the final state is needed by the decoder
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

            # Decoding step: start from SOS with the encoder's final state
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = []

            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                predicted_indices.append(topi.item())
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
                if decoder_input.item() == EOS_token:
                    break

            # Per-pair loss is normalised by the target length
            total_loss += loss.item() / target_length
            if predicted_indices == target_tensor.tolist():
                correct_predictions += 1

            # Print some examples, leaving the special tokens out of the text
            if i < n_examples:
                predicted_string = ''.join([index_to_char[index] for index in predicted_indices if index not in (SOS_token, EOS_token)])
                target_string = ''.join([index_to_char[index.item()] for index in target_tensor if index.item() not in (SOS_token, EOS_token)])
                input_string = ''.join([index_to_char[index.item()] for index in input_tensor if index.item() not in (SOS_token, EOS_token)])

                print(f'Input: {input_string}, Target: {target_string}, Predicted: {predicted_string}')

        # Nothing to average over: report it instead of dividing by zero
        num_batches = len(dataloader)
        if num_batches == 0:
            print('Evaluation skipped: dataloader is empty')
            return

        # Print overall evaluation results
        average_loss = total_loss / num_batches
        accuracy = correct_predictions / num_batches
        print(f'Evaluation Loss: {average_loss}, Accuracy: {accuracy}')

# Perform evaluation with examples
# NOTE(review): `dataloader` wraps the full `dataset`, so pairs used for
# training are scored here too; confirm this is intended rather than
# `val_dataloader`.
evaluate_and_show_examples(encoder, decoder, dataloader, criterion)


Epoch 0, Training Loss: 3.1027, Validation Loss: 2.9236
Epoch 10, Training Loss: 2.2806, Validation Loss: 2.3457
Epoch 20, Training Loss: 2.2473, Validation Loss: 2.0112
Epoch 30, Training Loss: 2.1998, Validation Loss: 1.9105
Epoch 40, Training Loss: 2.1563, Validation Loss: 2.2342
Epoch 50, Training Loss: 2.0916, Validation Loss: 2.0487
Epoch 60, Training Loss: 2.0175, Validation Loss: 2.1800
Epoch 70, Training Loss: 2.1133, Validation Loss: 2.7490
Epoch 80, Training Loss: 1.9154, Validation Loss: 2.8147
Epoch 90, Training Loss: 1.8692, Validation Loss: 2.6164
Epoch 100, Training Loss: 1.7382, Validation Loss: 2.8467
Epoch 110, Training Loss: 1.7193, Validation Loss: 2.5161
Epoch 120, Training Loss: 1.6060, Validation Loss: 2.8747
Epoch 130, Training Loss: 1.4317, Validation Loss: 3.0369
Epoch 140, Training Loss: 1.1653, Validation Loss: 3.2203
Epoch 150, Training Loss: 0.7748, Validation Loss: 4.0242
Epoch 160, Training Loss: 0.4900, Validation Loss: 4.4135
Epoch 170, Training Loss:

# New Section