Q2. Train and evaluate two separate Seq2Seq Transformer models (10 marks)
1. English-to-French translation model
2. French-to-English translation model
3. Use english.txt and french.txt data from demo 3 folder

Tasks:

- Implement a Seq2Seq Transformer model for English-to-French translation
- Train a second model with the same architecture but for French-to-English translation using the opposite dataset.
- Insert novel sentences into the English-to-French model and collect the outputs.
- Feed the translated French sentences into your French-to-English model.
- Compare the outputs of the two models.

Setup (imports, device, random seeds)

In [25]:
# importing the usual PyTorch and other libraries
import math, os, re, random, io
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# Seed Python's RNG and PyTorch (CPU and every CUDA device) so runs are repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cuda


Load english.txt & french.txt

In [26]:
EN_PATH = "english.txt"
FR_PATH = "french.txt"


def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the UTF-8 file at `path`."""
    with io.open(path, "r", encoding="utf-8") as f:
        return [text for text in (ln.strip() for ln in f) if text]


en_lines = _read_nonblank_lines(EN_PATH)
fr_lines = _read_nonblank_lines(FR_PATH)

# Sentences are paired purely by position. Blank lines are removed from each
# file independently, so differing counts mean the pairing may have shifted;
# report that instead of truncating silently.
if len(en_lines) != len(fr_lines):
    print(f"WARNING: line counts differ (english={len(en_lines)}, french={len(fr_lines)}); "
          "truncating to the shorter file - check that the pairs are still aligned.")

# Keep only the prefix both files share so that every index refers to a pair.
N = min(len(en_lines), len(fr_lines))
if N == 0:
    raise ValueError(f"No sentence pairs found in {EN_PATH!r} / {FR_PATH!r}")
en_lines = en_lines[:N]
fr_lines = fr_lines[:N]

print("Total aligned pairs:", N)
print("Example:", en_lines[0], "|||", fr_lines[0])


Total aligned pairs: 154883
Example: Go. ||| Va !


Preprocessing + tokenization + vocabulary build

In [27]:
# Preprocessing :
# - lowercase
# - punctuation as separate tokens
# - split on whitespace

def basic_clean(s):
    """Lowercase `s` and put single spaces around every punctuation mark.

    Args:
        s: raw sentence text.

    Returns:
        The normalised string: lowercase, each punctuation character from the
        pattern below separated by spaces, whitespace runs collapsed to one
        space, and no leading or trailing whitespace.
    """
    s = s.lower()
    # Pad each punctuation character so that it becomes a token of its own.
    s = re.sub(r"([.,!?;:()\"“”'’\-])", r" \1 ", s)
    # Strip only after padding: the padding adds a space at either end
    # whenever the text starts or ends with punctuation.
    return re.sub(r"\s+", " ", s).strip()

def tokenize(s):
    """Return the whitespace-separated tokens of the cleaned form of `s`."""
    cleaned = basic_clean(s)
    return cleaned.split()

# Building vocabularies for both languages from the training text.
from collections import Counter

# Reserved vocabulary entries and the fixed ids they are given.
SPECIAL_TOKENS = dict(PAD="<pad>", SOS="<sos>", EOS="<eos>", UNK="<unk>")
PAD_ID = 0
SOS_ID = 1
EOS_ID = 2
UNK_ID = 3

def build_vocab(lines, max_vocab_size=20000):
    """Build token <-> id lookup tables from an iterable of raw sentences.

    The four special tokens take ids 0-3 (pad, sos, eos, unk, in that order);
    they are followed by the `max_vocab_size - 4` most frequent tokens.

    Returns:
        (stoi, itos): a dict mapping token -> id and a list mapping id -> token.
    """
    freq = Counter()
    for sentence in lines:
        freq.update(tokenize(sentence))

    itos = [SPECIAL_TOKENS[key] for key in ("PAD", "SOS", "EOS", "UNK")]
    itos.extend(token for token, _count in freq.most_common(max_vocab_size - 4))
    stoi = {token: idx for idx, token in enumerate(itos)}
    return stoi, itos

# Building English (src) and French (tgt) vocabs separately.
# Vocabulary size caps (special tokens included) for each side.
SRC_MAX_VOCAB = 20000
TGT_MAX_VOCAB = 20000

# English is the source here and French the target; each has its own tables.
src_stoi, src_itos = build_vocab(en_lines, max_vocab_size=SRC_MAX_VOCAB)
tgt_stoi, tgt_itos = build_vocab(fr_lines, max_vocab_size=TGT_MAX_VOCAB)

SRC_V, TGT_V = len(src_itos), len(tgt_itos)
print("SRC vocab size:", SRC_V, "TGT vocab size:", TGT_V)

def encode_src(sentence, stoi=src_stoi):
    """Encode an English source sentence as a 1-D tensor of token ids.

    Args:
        sentence: raw text; it is tokenised with `tokenize`.
        stoi: token -> id table. The default is bound to the English
            vocabulary at the moment this function is defined, so rebinding
            the global `src_stoi` later (the French-to-English section does)
            cannot silently switch this encoder to a different vocabulary.

    Returns:
        torch.long tensor with one id per token. Tokens missing from `stoi`
        map to UNK_ID. No <sos>/<eos> markers are added.
    """
    ids = [stoi.get(tok, UNK_ID) for tok in tokenize(sentence)]
    return torch.tensor(ids, dtype=torch.long)

def encode_tgt(sentence, stoi=tgt_stoi):
    """Encode a French target sentence as `<sos> ids... <eos>`.

    Args:
        sentence: raw text; it is tokenised with `tokenize`.
        stoi: token -> id table. The default is bound to the French vocabulary
            at the moment this function is defined, so rebinding the global
            `tgt_stoi` later (the French-to-English section does) cannot
            silently switch this encoder to a different vocabulary.

    Returns:
        torch.long tensor of length `len(tokens) + 2`. Tokens missing from
        `stoi` map to UNK_ID.
    """
    ids = [stoi.get(tok, UNK_ID) for tok in tokenize(sentence)]
    # The decoder is trained on sequences framed by <sos> ... <eos>.
    return torch.tensor([SOS_ID] + ids + [EOS_ID], dtype=torch.long)


SRC vocab size: 13728 TGT vocab size: 20000


Dataset + DataLoader

In [28]:
class EnFrDataset(Dataset):
    """Parallel corpus yielding (English source ids, French target ids) pairs."""

    def __init__(self, en_list, fr_list):
        self.en_list = en_list
        self.fr_list = fr_list

    def __len__(self):
        # The English list defines the dataset size.
        return len(self.en_list)

    def __getitem__(self, idx):
        # Sentences are encoded on every access rather than cached.
        english, french = self.en_list[idx], self.fr_list[idx]
        return encode_src(english), encode_tgt(french)

dataset = EnFrDataset(en_lines, fr_lines)

# Hold out 5% of the pairs for validation; the generator is seeded so the
# same split is produced on every run.
val_ratio = 0.05
val_size = int(val_ratio * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)
print("Train:", len(train_dataset), "Val:", len(val_dataset))

# Collate: pad to the longest in batch and create masks for Transformer.
def generate_square_subsequent_mask(sz):
    """Return the [sz, sz] boolean causal mask, created on the global `device`.

    Entries strictly above the main diagonal are True; nn.Transformer treats
    True in a boolean mask as "this position may not be attended to".
    """
    upper = torch.ones(sz, sz, device=device).triu(diagonal=1)
    return upper.to(torch.bool)

def collate_fn(batch):
    """Turn a list of (src_ids, tgt_ids) pairs into one padded batch.

    Args:
        batch: sequence of (1-D source id tensor, 1-D target id tensor) pairs.

    Returns:
        dict with
          "src":                  [B, S] source ids, padded with PAD_ID
          "tgt_in":               [B, T] decoder input (target minus last column)
          "tgt_out":              [B, T] decoder labels (target minus first column)
          "src_key_padding_mask": [B, S] bool, True where "src" is padding
          "tgt_key_padding_mask": [B, T] bool, True where "tgt_in" is padding
          "tgt_mask":             [T, T] bool causal mask
        Every tensor is created on the CPU; the training loop moves them to
        the compute device.
    """
    src_seqs, tgt_seqs = zip(*batch)

    src_pad = pad_sequence(src_seqs, batch_first=True, padding_value=PAD_ID)   # [B, S]
    tgt_pad = pad_sequence(tgt_seqs, batch_first=True, padding_value=PAD_ID)   # [B, T+1]

    # Teacher forcing: the decoder reads tokens 0..T-1 and predicts tokens 1..T.
    tgt_input = tgt_pad[:, :-1]
    tgt_output = tgt_pad[:, 1:]

    # Build the causal mask on the CPU. Touching CUDA inside a collate function
    # fails once the DataLoader uses worker processes (num_workers > 0).
    T = tgt_input.size(1)
    tgt_mask = torch.triu(torch.ones(T, T), diagonal=1).bool()

    return {
        "src": src_pad,
        "tgt_in": tgt_input,
        "tgt_out": tgt_output,
        "src_key_padding_mask": (src_pad == PAD_ID),
        "tgt_key_padding_mask": (tgt_input == PAD_ID),
        "tgt_mask": tgt_mask,
    }

BATCH_SIZE = 128

# Only the training split is shuffled; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)


Train: 147139 Val: 7744


Transformer Seq2Seq model, English -> French

In [29]:
# nn.Transformer layout with embeddings + sinusoidal positions.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to sequence-first embeddings.

    Args:
        d_model: embedding width; both even and odd widths are supported.
        max_len: number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(1)  # [max_len, 1, d_model]: broadcasts over the batch axis
        # Registered as a buffer: follows .to(device) but is not a trained parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return `x` ([S, B, d_model]) plus the encodings of positions 0..S-1.

        Raises:
            ValueError: if S exceeds the `max_len` the table was built with.
        """
        S = x.size(0)
        if S > self.pe.size(0):
            raise ValueError(
                f"sequence length {S} exceeds positional-encoding max_len {self.pe.size(0)}"
            )
        return x + self.pe[:S]

class TransformerSeq2Seq(nn.Module):
    """Encoder-decoder Transformer mapping source ids to target-vocabulary logits.

    Wraps nn.Transformer (used in its sequence-first layout) with separate
    source/target embeddings, one shared PositionalEncoding and a linear output
    projection. The public methods take batch-first id tensors.

    Args:
        src_vocab: size of the source vocabulary.
        tgt_vocab: size of the target vocabulary.
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.
        dim_ff: width of the feed-forward sub-layers.
        dropout: dropout probability inside nn.Transformer.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model=256, nhead=8, num_layers=3, dim_ff=512, dropout=0.1):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab, d_model, padding_idx=PAD_ID)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_ID)
        self.pos_enc   = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model, nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff,
            dropout=dropout, batch_first=False
        )
        self.generator = nn.Linear(d_model, tgt_vocab)

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # The Xavier pass above also overwrote the all-zero rows that
        # `padding_idx` had set up. Zero them again so <pad> embeds to the
        # zero vector (those rows receive no gradient, so they stay zero).
        with torch.no_grad():
            self.src_embed.weight[PAD_ID].zero_()
            self.tgt_embed.weight[PAD_ID].zero_()

    def forward(self, src, tgt_in, src_key_padding_mask, tgt_key_padding_mask, tgt_mask):
        """Compute next-token logits with teacher forcing.

        Args:
            src: [B, S] source ids.
            tgt_in: [B, T] decoder input ids.
            src_key_padding_mask: [B, S] bool, True at source padding.
            tgt_key_padding_mask: [B, T] bool, True at target padding.
            tgt_mask: [T, T] causal mask for decoder self-attention.

        Returns:
            [B, T, tgt_vocab] unnormalised logits.
        """
        # nn.Transformer was built with batch_first=False: switch to [seq, batch].
        src = src.transpose(0,1)
        tgt_in = tgt_in.transpose(0,1)

        src_emb = self.pos_enc(self.src_embed(src))
        tgt_emb = self.pos_enc(self.tgt_embed(tgt_in))

        memory = self.transformer.encoder(
            src_emb,
            src_key_padding_mask=src_key_padding_mask
        )
        out = self.transformer.decoder(
            tgt_emb, memory,
            tgt_mask=tgt_mask,  # [T,T]
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask
        )
        logits = self.generator(out)
        return logits.transpose(0,1)

    @torch.no_grad()
    def greedy_decode(self, src, max_len=60):
        """Greedily translate one sentence.

        Args:
            src: [1, S] source ids (a batch of exactly one sentence).
            max_len: maximum output length, counting the leading <sos>.

        Returns:
            List of ids starting with SOS_ID; it ends with EOS_ID unless
            `max_len` was reached first.

        Raises:
            ValueError: if `src` is not a [1, S] tensor.

        Decoding runs in eval mode; the module's previous train/eval mode is
        restored before returning.
        """
        if src.dim() != 2 or src.size(0) != 1:
            raise ValueError(
                f"greedy_decode expects src of shape [1, S], got {tuple(src.shape)}"
            )

        was_training = self.training
        self.eval()
        try:
            src_key_padding_mask = (src == PAD_ID)
            src_emb = self.pos_enc(self.src_embed(src.transpose(0,1)))
            memory = self.transformer.encoder(src_emb, src_key_padding_mask=src_key_padding_mask)

            ys = torch.tensor([[SOS_ID]], dtype=torch.long, device=src.device)
            for _ in range(max_len-1):
                cur_len = ys.size(1)
                tgt_emb = self.pos_enc(self.tgt_embed(ys.transpose(0,1)))
                # Build the causal mask on the input's device, not on a global
                # one, so decoding works wherever the model and input live.
                tgt_mask = torch.triu(
                    torch.ones(cur_len, cur_len, device=src.device), diagonal=1
                ).bool()
                out = self.transformer.decoder(
                    tgt_emb, memory,
                    tgt_mask=tgt_mask,
                    tgt_key_padding_mask=None,
                    memory_key_padding_mask=src_key_padding_mask
                )
                logits = self.generator(out)
                # Only the newest position decides the next token.
                next_id = logits[-1, 0].argmax().unsqueeze(0).unsqueeze(0)
                ys = torch.cat([ys, next_id], dim=1)
                if next_id.item() == EOS_ID:
                    break
            return ys.squeeze(0).tolist()
        finally:
            self.train(was_training)


Train (10 epochs)

In [30]:
# English -> French model together with its loss and optimiser.
model = TransformerSeq2Seq(
    SRC_V,
    TGT_V,
    d_model=256,
    nhead=8,
    num_layers=3,
    dim_ff=512,
    dropout=0.1,
)
model = model.to(device)
# Target positions holding PAD_ID do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

EPOCHS = 10

def run_epoch(data_loader, train=True):
    """Run one pass over `data_loader` with the global `model`.

    Args:
        data_loader: yields the batch dicts produced by `collate_fn`.
        train: when True the model is put in train mode and updated with the
            global `optimizer`; when False it is only evaluated.

    Returns:
        Mean cross-entropy per non-padding target token over the whole pass.
    """
    model.train(train)
    total_loss, total_tokens = 0.0, 0

    # Autograd is needed only for training; switching it off for evaluation
    # avoids building graphs that are never back-propagated.
    with torch.set_grad_enabled(train):
        for batch in data_loader:
            src = batch["src"].to(device)
            tgt_in = batch["tgt_in"].to(device)
            tgt_out = batch["tgt_out"].to(device)
            src_kpm = batch["src_key_padding_mask"].to(device)
            tgt_kpm = batch["tgt_key_padding_mask"].to(device)
            tgt_mask = batch["tgt_mask"].to(device)

            if train:
                optimizer.zero_grad()

            logits = model(src, tgt_in, src_kpm, tgt_kpm, tgt_mask)
            B, T, V = logits.shape
            loss = criterion(logits.reshape(B*T, V), tgt_out.reshape(B*T))

            if train:
                loss.backward()
                # Cap the global gradient norm at 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            # `loss` is a per-token mean for this batch; weight it by the token
            # count so the epoch figure is a true per-token average.
            tokens = (tgt_out != PAD_ID).sum().item()
            total_loss += loss.item() * tokens
            total_tokens += tokens

    return total_loss / max(1, total_tokens)

# Each epoch is one training pass followed by one validation pass.
for ep in range(1, EPOCHS + 1):
    train_loss = run_epoch(train_loader, train=True)
    val_loss = run_epoch(val_loader, train=False)
    print(
        f"Epoch {ep:02d} | train xent/token: {train_loss:.4f} | val xent/token: {val_loss:.4f}"
    )


Epoch 01 | train xent/token: 4.1678 | val xent/token: 3.0774
Epoch 02 | train xent/token: 2.6643 | val xent/token: 2.1589
Epoch 03 | train xent/token: 1.9062 | val xent/token: 1.5826
Epoch 04 | train xent/token: 1.4709 | val xent/token: 1.3302
Epoch 05 | train xent/token: 1.2171 | val xent/token: 1.2247
Epoch 06 | train xent/token: 1.0502 | val xent/token: 1.0987
Epoch 07 | train xent/token: 0.9325 | val xent/token: 1.0417
Epoch 08 | train xent/token: 0.8431 | val xent/token: 1.0068
Epoch 09 | train xent/token: 0.7748 | val xent/token: 0.9819
Epoch 10 | train xent/token: 0.7187 | val xent/token: 0.9674


Trying out English sentences on the transformer

In [31]:

# Novel English sentences used to probe the English -> French model.
examples = [
    "I love to play soccer",
    "I like cricket",
    "I am from Mars",
]

def ids_to_toks(ids, itos):
    """Map decoded ids to tokens, skipping <sos> and stopping at the first <eos>.

    Ids that fall beyond the end of `itos` are rendered as "<unk>".
    """
    words = []
    for token_id in ids:
        if token_id == EOS_ID:
            break
        if token_id != SOS_ID:
            words.append(itos[token_id] if token_id < len(itos) else "<unk>")
    return words

# Greedy-translate every English example and print it beside the French output.
model.eval()
for sample_en in examples:
    src_ids = encode_src(sample_en)
    src_ids = src_ids.unsqueeze(0).to(device)   # add the batch axis
    pred_ids = model.greedy_decode(src_ids, max_len=60)
    pred_toks = ids_to_toks(pred_ids, tgt_itos)
    print("EN:", sample_en)
    print("FR (greedy):", " ".join(pred_toks))
    print("-" * 50)


EN: I love to play soccer
FR (greedy): j ' adore jouer au football .
--------------------------------------------------
EN: I like cricket
FR (greedy): j ' aime le cricket .
--------------------------------------------------
EN: I am from Mars
FR (greedy): je viens de mars .
--------------------------------------------------


Below code blocks are for French to English

In [19]:
# Rebuilding vocabs in opposite direction (FR=src, EN=tgt)
import io, re, collections, torch, random

# Re-seed so this section is reproducible on its own.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

FR_PATH = "french.txt"
EN_PATH = "english.txt"


def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the UTF-8 file at `path`."""
    with io.open(path, "r", encoding="utf-8") as f:
        return [text for text in (ln.strip() for ln in f) if text]


fr_lines = _read_nonblank_lines(FR_PATH)
en_lines = _read_nonblank_lines(EN_PATH)

# Sentences are paired purely by position. Blank lines are removed from each
# file independently, so differing counts mean the pairing may have shifted;
# report that instead of truncating silently.
if len(fr_lines) != len(en_lines):
    print(f"WARNING: line counts differ (french={len(fr_lines)}, english={len(en_lines)}); "
          "truncating to the shorter file - check that the pairs are still aligned.")

N = min(len(fr_lines), len(en_lines))
if N == 0:
    raise ValueError(f"No sentence pairs found in {FR_PATH!r} / {EN_PATH!r}")
fr_lines, en_lines = fr_lines[:N], en_lines[:N]

# Define the text helpers only if an earlier cell has not already done so.
if "basic_clean" not in globals():
    def basic_clean(s):
        """Lowercase `s`, space out punctuation and normalise whitespace.

        The result has no leading or trailing whitespace.
        """
        # Pad each punctuation character so that it becomes a token of its own.
        s = re.sub(r"([.,!?;:()\"“”'’\-])", r" \1 ", s.lower())
        # Strip after padding: padding adds a space at either end whenever
        # the text starts or ends with punctuation.
        return re.sub(r"\s+", " ", s).strip()
if "tokenize" not in globals():
    def tokenize(s):
        """Return the whitespace-separated tokens of the cleaned form of `s`."""
        return basic_clean(s).split()

# Reserved vocabulary entries and the fixed ids they are given.
SPECIAL_TOKENS = {
    "PAD": "<pad>",
    "SOS": "<sos>",
    "EOS": "<eos>",
    "UNK": "<unk>",
}
PAD_ID = 0
SOS_ID = 1
EOS_ID = 2
UNK_ID = 3

def build_vocab(lines, max_vocab_size=20000):
    """Build token <-> id lookup tables from an iterable of raw sentences.

    The four special tokens take ids 0-3 (pad, sos, eos, unk, in that order);
    they are followed by the `max_vocab_size - 4` most frequent tokens.

    Returns:
        (stoi, itos): a dict mapping token -> id and a list mapping id -> token.
    """
    freq = collections.Counter()
    for sentence in lines:
        freq.update(tokenize(sentence))

    itos = [SPECIAL_TOKENS[key] for key in ("PAD", "SOS", "EOS", "UNK")]
    itos.extend(token for token, _count in freq.most_common(max_vocab_size - 4))
    stoi = {token: idx for idx, token in enumerate(itos)}
    return stoi, itos

# Direction is reversed in this section: French is the source, English the target.
src_stoi, src_itos = build_vocab(fr_lines, max_vocab_size=20000)
tgt_stoi, tgt_itos = build_vocab(en_lines, max_vocab_size=20000)

def encode_src_fr(s, stoi=src_stoi):
    """Encode a French source sentence as a 1-D tensor of token ids.

    Args:
        s: raw text; it is tokenised with `tokenize`.
        stoi: token -> id table. The default is bound to the French vocabulary
            at the moment this function is defined, so re-running a cell that
            rebinds the global `src_stoi` cannot silently switch this encoder
            to a different vocabulary.

    Returns:
        torch.long tensor with one id per token. Tokens missing from `stoi`
        map to UNK_ID. No <sos>/<eos> markers are added.
    """
    ids = [stoi.get(t, UNK_ID) for t in tokenize(s)]
    return torch.tensor(ids, dtype=torch.long)

def encode_tgt_en(s, stoi=tgt_stoi):
    """Encode an English target sentence as `<sos> ids... <eos>`.

    Args:
        s: raw text; it is tokenised with `tokenize`.
        stoi: token -> id table. The default is bound to the English
            vocabulary at the moment this function is defined, so re-running a
            cell that rebinds the global `tgt_stoi` cannot silently switch
            this encoder to a different vocabulary.

    Returns:
        torch.long tensor of length `len(tokens) + 2`. Tokens missing from
        `stoi` map to UNK_ID.
    """
    ids = [stoi.get(t, UNK_ID) for t in tokenize(s)]
    return torch.tensor([SOS_ID] + ids + [EOS_ID], dtype=torch.long)


In [20]:
# Dataset/DataLoader
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class FrEnDataset(Dataset):
    """Parallel corpus yielding (French source ids, English target ids) pairs."""

    def __init__(self, fr_list, en_list):
        self.fr = fr_list
        self.en = en_list

    def __len__(self):
        # The French list defines the dataset size.
        return len(self.fr)

    def __getitem__(self, i):
        # Sentences are encoded on every access rather than cached.
        french, english = self.fr[i], self.en[i]
        return encode_src_fr(french), encode_tgt_en(english)

dataset = FrEnDataset(fr_lines, en_lines)

# Hold out 5% of the pairs for validation; the generator is seeded so the
# same split is produced on every run.
val_size = int(0.05 * len(dataset))
train_size = len(dataset) - val_size
train_ds, val_ds = torch.utils.data.random_split(
    dataset,
    lengths=[train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

import torch
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def generate_square_subsequent_mask(sz):
    """Return the [sz, sz] boolean causal mask, created on the global `device`.

    Entries strictly above the main diagonal are True; nn.Transformer treats
    True in a boolean mask as "this position may not be attended to".
    """
    upper = torch.ones(sz, sz, device=device).triu(diagonal=1)
    return upper.to(torch.bool)

def collate_fn(batch):
    """Turn a list of (src_ids, tgt_ids) pairs into one padded batch.

    Args:
        batch: sequence of (1-D source id tensor, 1-D target id tensor) pairs.

    Returns:
        dict with
          "src":                  [B, S] source ids, padded with PAD_ID
          "tgt_in":               [B, T] decoder input (target minus last column)
          "tgt_out":              [B, T] decoder labels (target minus first column)
          "src_key_padding_mask": [B, S] bool, True where "src" is padding
          "tgt_key_padding_mask": [B, T] bool, True where "tgt_in" is padding
          "tgt_mask":             [T, T] bool causal mask
        Every tensor is created on the CPU; the training loop moves them to
        the compute device.
    """
    src_seqs, tgt_seqs = zip(*batch)
    src_pad = pad_sequence(src_seqs, batch_first=True, padding_value=PAD_ID)
    tgt_pad = pad_sequence(tgt_seqs, batch_first=True, padding_value=PAD_ID)

    # Teacher forcing: the decoder reads tokens 0..T-1 and predicts tokens 1..T.
    tgt_in, tgt_out = tgt_pad[:, :-1], tgt_pad[:, 1:]

    # Build the causal mask on the CPU. Touching CUDA inside a collate function
    # fails once the DataLoader uses worker processes (num_workers > 0).
    T = tgt_in.size(1)
    tgt_mask = torch.triu(torch.ones(T, T), diagonal=1).bool()

    return {
        "src": src_pad,
        "tgt_in": tgt_in,
        "tgt_out": tgt_out,
        "src_key_padding_mask": (src_pad == PAD_ID),
        "tgt_key_padding_mask": (tgt_in == PAD_ID),
        "tgt_mask": tgt_mask,
    }

BATCH_SIZE = 128

# Only the training split is shuffled; validation keeps a fixed order.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)


Train French -> English model (10 epochs)

In [21]:
# FR→EN model + training loop

# French -> English model with the same hyper-parameters as the English -> French one.
model_fr2en = TransformerSeq2Seq(
    len(src_itos),   # source vocabulary size
    len(tgt_itos),   # target vocabulary size
    d_model=256,
    nhead=8,
    num_layers=3,
    dim_ff=512,
    dropout=0.1,
)
model_fr2en = model_fr2en.to(device)

# Target positions holding PAD_ID do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.Adam(model_fr2en.parameters(), lr=3e-4)

def run_epoch(loader, train=True):
    """Run one pass over `loader` with the global `model_fr2en`.

    Args:
        loader: yields the batch dicts produced by `collate_fn`.
        train: when True the model is put in train mode and updated with the
            global `optimizer`; when False it is only evaluated.

    Returns:
        Mean cross-entropy per non-padding target token over the whole pass.
    """
    model_fr2en.train(train)
    total_loss, total_tok = 0.0, 0

    # Autograd is needed only for training; switching it off for evaluation
    # avoids building graphs that are never back-propagated.
    with torch.set_grad_enabled(train):
        for b in loader:
            src = b["src"].to(device)
            tgt_in = b["tgt_in"].to(device)
            tgt_out = b["tgt_out"].to(device)
            src_kpm = b["src_key_padding_mask"].to(device)
            tgt_kpm = b["tgt_key_padding_mask"].to(device)
            tgt_mask = b["tgt_mask"].to(device)

            if train:
                optimizer.zero_grad()

            logits = model_fr2en(src, tgt_in, src_kpm, tgt_kpm, tgt_mask)
            B, T, V = logits.shape
            loss = criterion(logits.reshape(B*T, V), tgt_out.reshape(B*T))

            if train:
                loss.backward()
                # Cap the global gradient norm at 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model_fr2en.parameters(), 1.0)
                optimizer.step()

            # `loss` is a per-token mean for this batch; weight it by the token
            # count so the epoch figure is a true per-token average.
            ntok = (tgt_out != PAD_ID).sum().item()
            total_loss += loss.item() * ntok
            total_tok  += ntok

    return total_loss / max(1, total_tok)

# train for 10 epochs
# Each epoch is one training pass followed by one validation pass.
for ep in range(1, 11):
    tr = run_epoch(train_loader, train=True)
    va = run_epoch(val_loader, train=False)
    print(
        f"[FR→EN] Epoch {ep:02d} | train xent/token {tr:.4f} | val {va:.4f}"
    )


[FR→EN] Epoch 01 | train xent/token 3.8952 | val 2.9306
[FR→EN] Epoch 02 | train xent/token 2.5262 | val 2.0142
[FR→EN] Epoch 03 | train xent/token 1.7919 | val 1.4849
[FR→EN] Epoch 04 | train xent/token 1.3642 | val 1.2248
[FR→EN] Epoch 05 | train xent/token 1.1105 | val 1.0983
[FR→EN] Epoch 06 | train xent/token 0.9478 | val 0.9972
[FR→EN] Epoch 07 | train xent/token 0.8335 | val 0.9402
[FR→EN] Epoch 08 | train xent/token 0.7453 | val 0.9093
[FR→EN] Epoch 09 | train xent/token 0.6755 | val 0.8753
[FR→EN] Epoch 10 | train xent/token 0.6197 | val 0.8574


Feed the given French line → print English output

In [22]:
# Feed French lines that were generated from English to French model and print English outputs

def ids_to_toks(ids, itos):
    """Map decoded ids to tokens, skipping <sos> and stopping at the first <eos>.

    Ids that fall beyond the end of `itos` are rendered as "<unk>".
    """
    words = []
    for token_id in ids:
        if token_id == EOS_ID:
            break
        if token_id != SOS_ID:
            words.append(itos[token_id] if token_id < len(itos) else "<unk>")
    return words

# French sentences copied from the English -> French model's printed output.
examples_fr = [
    "j ' adore jouer au football .",
    "j ' aime le cricket .",
    "je viens de mars .",
]

# Greedy-translate every French example and print it beside the English output.
model_fr2en.eval()
for sample_fr in examples_fr:
    src_ids = encode_src_fr(sample_fr)
    src_ids = src_ids.unsqueeze(0).to(device)   # add the batch axis
    pred_ids = model_fr2en.greedy_decode(src_ids, max_len=60)
    pred_toks = ids_to_toks(pred_ids, tgt_itos)
    print("FR:", sample_fr)
    print("EN:", " ".join(pred_toks))
    print("-" * 50)


FR: j ' adore jouer au football .
EN: i love to play football .
--------------------------------------------------
FR: j ' aime le cricket .
EN: i like cricket .
--------------------------------------------------
FR: je viens de mars .
EN: i ' m from march .
--------------------------------------------------


Compare the outputs of the two models.

Comparing the two models, I see that both translate the test sentences with the meaning largely preserved in each direction. For “I love to play soccer”, the English-to-French model gave “j ' adore jouer au football .”, and feeding that output into the French-to-English model returned “i love to play football .”. The word “soccer” came back as “football”, which is still correct but shows that a round trip does not necessarily reproduce the original wording. For the sentence about cricket, the round trip was almost perfect: “I like cricket” became “j ' aime le cricket .” and came back as “i like cricket .”. For the Mars example, the English-to-French model gave “je viens de mars .”, which is correct, but the French-to-English model returned “i ' m from march .”, mistaking “mars” for the month “March”. Because the preprocessing lowercases all text, the planet “Mars” and the month “mars” become the same French token, so the French-to-English model cannot tell them apart and most likely picks the reading it saw more often during training. Overall, the models capture the general meaning but can differ on specific word choices and named entities.