# Home Exercise 1 on Machine Translation

Implement a **sequence2sequence** model to translate English to Vietnamese. In this exercise, we will sequentially practice the steps to build a machine learning system for the machine translation task using a **seq2seq model**. These steps include:
- Downloading and preprocessing bilingual data
- Creating training data
- Building a seq2seq model with **attention**
- Visualizing attention data
- Translating new sentences on real-world data.

- **Data**: [IWSLT’15 English-Vietnamese](https://nlp.stanford.edu/projects/nmt/)  
  - **Train set**: `train.en` and `train.vi`  
  - **Validation set**: `tst2012.en` and `tst2012.vi`  
  - **Test set**: `tst2013.en` and `tst2013.vi`

**Note**: Submit only a **single Jupyter Notebook file** that can handle all tasks, including data downloading, preprocessing, model training, and model evaluation. *(Submissions that do not follow the guidelines will receive a score of 0.)*

## Grading Criteria

For valid submissions, scores will be assigned based on the **leaderboard ranking** (**strictly greater**):

- **Top 25%** → **10 points**
- **25% - 50%** → **9.0 points**
- **50% - 75%** → **8.0 points**
- **75% - 100%** → **7.0 points**


In [1]:
%pip install torch torchvision torchaudio torchtext nltk sacrebleu


Note: you may need to restart the kernel to use updated packages.




In [None]:
# Print the installed package metadata (version, location, dependencies) for
# torch and torchtext.
pip show torch torchtext


Note: you may need to restart the kernel to use updated packages.
Name: torch





Version: 2.5.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: e:\anaconda3\envs\ml_env_test\Lib\site-packages
Requires: filelock, fsspec, jinja2, networkx, setuptools, sympy, typing-extensions
Required-by: pgmpy, torchaudio, torchtext, torchvision
---
Name: torchtext
Version: 0.18.0
Summary: Text utilities, models, transforms, and datasets for PyTorch.
Home-page: https://github.com/pytorch/text
Author: PyTorch Text Team
Author-email: packages@pytorch.org
License: BSD
Location: e:\anaconda3\envs\ml_env_test\Lib\site-packages
Requires: numpy, requests, torch, tqdm
Required-by: 


: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import random
import numpy as np
import nltk
import sacrebleu

# Fetch NLTK's "punkt" data package (no use of it is visible in this part of
# the notebook).
nltk.download('punkt')
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# =====================================
# 📥 STEP 1: Download and Load Dataset
# =====================================
import os
import urllib.request

# Download English-Vietnamese data (IWSLT'15 train / validation / test splits).
urls = {
    "train.en": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en",
    "train.vi": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.vi",
    "tst2012.en": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.en",
    "tst2012.vi": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.vi",
    "tst2013.en": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2013.en",
    "tst2013.vi": "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2013.vi",
}

# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs("data", exist_ok=True)

for filename, url in urls.items():
    filepath = os.path.join("data", filename)
    if os.path.exists(filepath):
        continue  # already downloaded on a previous run
    print(f"Downloading {filename}...")
    # Download to a temporary name and rename only on success, so that an
    # interrupted transfer never leaves a truncated file that later runs would
    # mistake for a complete one.
    partial_path = filepath + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filepath)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
print("Download complete!")


Downloading data from https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi.tar.gz


Exception: URL fetch failure on https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi.tar.gz: 404 -- Not Found

In [None]:
# Tokenizers for English & Vietnamese, obtained from torchtext's get_tokenizer.
# NOTE(review): the "spacy" backend presumably needs the spaCy package plus the
# en_core_web_sm model, and the "moses" backend presumably needs sacremoses —
# confirm both are installed in the runtime environment.
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
vi_tokenizer = get_tokenizer("moses", language="vi")

# Read and tokenize sentences
def yield_tokens(filepath, tokenizer):
    """Lazily tokenize a UTF-8 text file, one result per line.

    Args:
        filepath: Path of the text file to read.
        tokenizer: Callable applied to each line after its surrounding
            whitespace has been stripped.

    Yields:
        Whatever ``tokenizer`` returns for each line, in file order.
    """
    with open(filepath, encoding="utf-8") as handle:
        yield from (tokenizer(raw_line.strip()) for raw_line in handle)

# Build vocabulary
def build_vocab(filepath, tokenizer, min_freq=1):
    """Build a torchtext vocabulary from every token in a text file.

    Args:
        filepath: Path of the UTF-8 text file whose lines are tokenized.
        tokenizer: Callable turning one line of text into a list of tokens.
        min_freq: Minimum number of occurrences a token needs to be kept.
            The default of 1 keeps every token.

    Returns:
        The vocabulary, with the specials ``<unk>``, ``<pad>``, ``<bos>`` and
        ``<eos>`` registered and ``<unk>`` as the default index.
    """
    vocab = build_vocab_from_iterator(
        yield_tokens(filepath, tokenizer),
        min_freq=min_freq,
        specials=["<unk>", "<pad>", "<bos>", "<eos>"],
    )
    # Without a default index, looking up a token that never occurred in the
    # training file raises instead of mapping to <unk>.
    vocab.set_default_index(vocab["<unk>"])
    return vocab

# One vocabulary per language, each built from its training file only.
en_vocab = build_vocab("data/train.en", en_tokenizer)
vi_vocab = build_vocab("data/train.vi", vi_tokenizer)

# Convert text to integer sequence
def encode_sentence(sentence, vocab, tokenizer):
    """Turn a sentence into vocabulary indices wrapped in <bos> ... <eos>.

    Args:
        sentence: Raw text to encode.
        vocab: Mapping from token to index; must contain "<bos>" and "<eos>".
        tokenizer: Callable splitting ``sentence`` into tokens.

    Returns:
        A new list: the "<bos>" index, one index per token, the "<eos>" index.
    """
    ids = [vocab["<bos>"]]
    ids.extend(vocab[tok] for tok in tokenizer(sentence))
    ids.append(vocab["<eos>"])
    return ids

# Padding sequences
def pad_sequence(seq, max_len, pad_idx):
    """Right-pad a list of indices with ``pad_idx`` up to ``max_len`` items.

    A sequence that already has ``max_len`` or more items comes back with its
    contents unchanged (it is never truncated). A new list is always returned.
    """
    missing = max_len - len(seq)
    # A non-positive repeat count yields an empty list, i.e. no padding.
    filler = [pad_idx] * missing
    return seq + filler

# Dataset Class
class TranslationDataset(Dataset):
    """Parallel English/Vietnamese corpus served as fixed-length index tensors.

    Line ``i`` of the English file is paired with line ``i`` of the Vietnamese
    file. Each item is encoded with <bos>/<eos>, cut down to ``max_len`` if it
    is longer, and padded up to ``max_len`` if it is shorter, so every tensor
    has the same length and the default DataLoader collation can stack them.

    Args:
        en_filepath: Path of the UTF-8 English file, one sentence per line.
        vi_filepath: Path of the UTF-8 Vietnamese file, one sentence per line.
        en_vocab: Token-to-index vocabulary for English.
        vi_vocab: Token-to-index vocabulary for Vietnamese.
        en_tokenizer: Callable splitting an English sentence into tokens.
        vi_tokenizer: Callable splitting a Vietnamese sentence into tokens.
        max_len: Length of every returned sequence, <bos> and <eos> included.

    Raises:
        ValueError: If ``max_len`` is below 2 or the two files do not contain
            the same number of lines.
    """

    def __init__(self, en_filepath, vi_filepath, en_vocab, vi_vocab, en_tokenizer, vi_tokenizer, max_len=50):
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2 to hold <bos> and <eos>, got {max_len}")
        self.en_sentences = self._read_lines(en_filepath)
        self.vi_sentences = self._read_lines(vi_filepath)
        # A length mismatch means the corpus is misaligned; fail early rather
        # than pair unrelated sentences or hit an IndexError mid-epoch.
        if len(self.en_sentences) != len(self.vi_sentences):
            raise ValueError(
                f"Parallel files differ in length: {en_filepath} has "
                f"{len(self.en_sentences)} lines, {vi_filepath} has "
                f"{len(self.vi_sentences)} lines"
            )
        self.en_vocab = en_vocab
        self.vi_vocab = vi_vocab
        self.en_tokenizer = en_tokenizer
        self.vi_tokenizer = vi_tokenizer
        self.max_len = max_len

    @staticmethod
    def _read_lines(filepath):
        """Read a UTF-8 file into a list of lines, closing the file afterwards."""
        with open(filepath, encoding="utf-8") as f:
            return f.read().strip().split("\n")

    @staticmethod
    def _truncate(seq, max_len):
        """Cut ``seq`` to ``max_len`` items, keeping its final item in place.

        The final item is kept because encoded sentences end with <eos>.
        """
        if len(seq) <= max_len:
            return seq
        return seq[:max_len - 1] + seq[-1:]

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        """Return the (English, Vietnamese) index tensors for pair ``idx``."""
        en_seq = encode_sentence(self.en_sentences[idx], self.en_vocab, self.en_tokenizer)
        vi_seq = encode_sentence(self.vi_sentences[idx], self.vi_vocab, self.vi_tokenizer)
        en_seq = self._truncate(en_seq, self.max_len)
        vi_seq = self._truncate(vi_seq, self.max_len)
        en_ids = pad_sequence(en_seq, self.max_len, self.en_vocab["<pad>"])
        vi_ids = pad_sequence(vi_seq, self.max_len, self.vi_vocab["<pad>"])
        return torch.tensor(en_ids), torch.tensor(vi_ids)

# Load dataset: wrap the training files in a TranslationDataset and serve it
# in shuffled mini-batches of 32 sentence pairs.
train_dataset = TranslationDataset("data/train.en", "data/train.vi", en_vocab, vi_vocab, en_tokenizer, vi_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [None]:
class Attention(nn.Module):
    """Additive attention over the outputs of a bidirectional encoder.

    Args:
        hidden_dim: Size H of the decoder hidden state. Encoder outputs are
            expected to have 2H features (forward and backward directions).
    """

    def __init__(self, hidden_dim):
        super().__init__()
        # Input is the decoder state (H) concatenated with one encoder output (2H).
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Score every source position against the current decoder state.

        Args:
            hidden: Decoder state, shape (batch, H).
            encoder_outputs: Encoder outputs, shape (batch, src_len, 2H).

        Returns:
            Attention weights of shape (batch, src_len); each row sums to 1.
        """
        src_len = encoder_outputs.size(1)
        # Repeat the decoder state along the source axis so it can be paired
        # with every encoder position.
        query = hidden.unsqueeze(1).expand(-1, src_len, -1)
        energy = torch.tanh(self.attn(torch.cat((query, encoder_outputs), dim=2)))
        scores = self.v(energy).squeeze(2)
        # Normalise over source positions (dim 1), not over the batch.
        return torch.softmax(scores, dim=1)


class Encoder(nn.Module):
    """Bidirectional LSTM encoder for batch-first token index tensors.

    Args:
        input_dim: Source vocabulary size.
        emb_dim: Embedding size.
        hidden_dim: LSTM hidden size H per direction.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # batch_first=True because callers pass (batch, src_len) tensors.
        self.lstm = nn.LSTM(emb_dim, hidden_dim, bidirectional=True, batch_first=True)
        # Merges the two directions' final states into one decoder-sized state.
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: Token indices, shape (batch, src_len).

        Returns:
            A pair ``(outputs, hidden)``: ``outputs`` has shape
            (batch, src_len, 2H); ``hidden`` has shape (1, batch, H) and is the
            initial state for the single-layer, unidirectional decoder.
        """
        embedded = self.embedding(src)
        outputs, (hidden, _cell) = self.lstm(embedded)
        # hidden is (2, batch, H): index -2 is the forward direction's final
        # state, index -1 the backward direction's.
        merged = torch.cat((hidden[-2], hidden[-1]), dim=1)
        hidden = torch.tanh(self.fc(merged)).unsqueeze(0)
        return outputs, hidden


class Decoder(nn.Module):
    """Single-step LSTM decoder with attention over the encoder outputs.

    Args:
        output_dim: Target vocabulary size.
        emb_dim: Embedding size.
        hidden_dim: LSTM hidden size H (encoder outputs carry 2H features).
    """

    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim * 2, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 3, output_dim)
        self.attention = Attention(hidden_dim)

    def forward(self, trg, hidden, encoder_outputs):
        """Predict the next target token for every sentence in the batch.

        Args:
            trg: Previous target token indices, shape (batch,).
            hidden: Either a hidden-state tensor of shape (1, batch, H), as
                returned by the encoder, or the ``(h, c)`` tuple returned by a
                previous call to this method.
            encoder_outputs: Encoder outputs, shape (batch, src_len, 2H).

        Returns:
            A pair ``(logits, state)``: ``logits`` has shape
            (batch, output_dim); ``state`` is the ``(h, c)`` tuple to pass back
            in as ``hidden`` on the next step, so the LSTM cell state carries
            across steps instead of being reset each time.
        """
        if isinstance(hidden, tuple):
            h, c = hidden
        else:
            # First step: only a hidden state is available, start the cell at zero.
            h, c = hidden, torch.zeros_like(hidden)

        embedded = self.embedding(trg).unsqueeze(1)                       # (batch, 1, E)
        attn_weights = self.attention(h[-1], encoder_outputs)            # (batch, src_len)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch, 1, 2H)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (h, c) = self.lstm(rnn_input, (h, c))
        logits = self.fc_out(torch.cat((output.squeeze(1), context.squeeze(1)), dim=1))
        return logits, (h, c)


In [None]:
# Build the models (embedding size 256, hidden size 512) on the selected device.
encoder = Encoder(len(en_vocab), 256, 512).to(device)
decoder = Decoder(len(vi_vocab), 256, 512).to(device)

params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = optim.Adam(params, lr=0.001)

pad_idx = vi_vocab["<pad>"]
# reduction="sum": with the default mean reduction, a timestep whose targets
# are all padding divides by zero and yields NaN, which poisons the whole
# batch loss. Summing and dividing by the real token count avoids that.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction="sum")

epochs = 10
for epoch in range(epochs):
    encoder.train()
    decoder.train()
    total_loss = 0.0
    for en_batch, vi_batch in train_loader:
        en_batch, vi_batch = en_batch.to(device), vi_batch.to(device)
        optimizer.zero_grad()
        encoder_outputs, hidden = encoder(en_batch)

        # Teacher forcing: feed the gold token t-1 and score the prediction of
        # token t.
        loss = 0
        for t in range(1, vi_batch.shape[1]):
            output, hidden = decoder(vi_batch[:, t-1], hidden, encoder_outputs)
            loss = loss + criterion(output, vi_batch[:, t])

        # Average over the non-padding target tokens (position 0 is <bos> and
        # is never predicted); clamp guards against an all-padding batch.
        n_tokens = (vi_batch[:, 1:] != pad_idx).sum().clamp(min=1)
        loss = loss / n_tokens

        loss.backward()
        # Recurrent nets are prone to exploding gradients; cap the global norm.
        torch.nn.utils.clip_grad_norm_(params, max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-token loss per batch so the figure is comparable
    # across dataset sizes and sequence lengths.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / max(len(train_loader), 1):.4f}")


In [None]:
def translate(sentence, max_len=50):
    """Greedily translate an English sentence into Vietnamese.

    Uses the module-level ``encoder``, ``decoder``, vocabularies, tokenizer
    and ``device``.

    Args:
        sentence: English text to translate.
        max_len: Length the source is padded to and the maximum number of
            target tokens generated.

    Returns:
        The generated Vietnamese tokens joined by single spaces, without the
        <bos> and <eos> markers.
    """
    en_seq = encode_sentence(sentence, en_vocab, en_tokenizer)
    en_ids = pad_sequence(en_seq, max_len, en_vocab["<pad>"])
    en_tensor = torch.tensor(en_ids, device=device).unsqueeze(0)

    bos_idx = vi_vocab["<bos>"]
    eos_idx = vi_vocab["<eos>"]
    vi_seq = [bos_idx]

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        encoder_outputs, hidden = encoder(en_tensor)
        for _ in range(max_len):
            prev_token = torch.tensor([vi_seq[-1]], device=device)
            output, hidden = decoder(prev_token, hidden, encoder_outputs)
            next_word = output.argmax(1).item()
            if next_word == eos_idx:
                break
            vi_seq.append(next_word)

    # vi_seq[0] is the <bos> seed, which is not part of the translation.
    return " ".join(vi_vocab.lookup_token(w) for w in vi_seq[1:])

print(translate("Hello, how are you?"))
