In [2]:
!pip install torchtext==0.17.2

Collecting torchtext==0.17.2
  Downloading torchtext-0.17.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torch==2.2.2 (from torchtext==0.17.2)
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2->torchtext==0.17.2)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2

In [4]:
!pip install torchdata==0.7.1

Collecting torchdata==0.7.1
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata
Successfully installed torchdata-0.7.1


In [7]:
#!pip uninstall torchdata -y
# !pip install portalocker>=2.0.0
# !pip install spacy
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm


In [1]:
import torch
import torchtext
print(f"PyTorch version: {torch.__version__}")
print(f"torchtext version: {torchtext.__version__}")


PyTorch version: 2.2.2+cu121
torchtext version: 0.17.2+cpu


In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Initialize transforms
token_transform_de_en = {}
vocab_transform_de_en = {}

In [3]:
token_transform_de_en[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform_de_en[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
# Cell 2: Token Generator and Special Tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for data_sample in data_iter:
        yield token_transform_de_en[language](data_sample[language_index[language]])

# Special tokens setup
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']


"""
Purpose: Defines special tokens used in the vocabulary for machine learning tasks with text data.
Special Tokens:
<unk>: "Unknown" token (represents words not in the vocabulary)
<pad>: Padding token (to make sequences the same length)
<bos>: "Beginning of Sequence"
<eos>: "End of Sequence"
"""

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    vocab_transform_de_en[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True
    )
    vocab_transform_de_en[ln].set_default_index(UNK_IDX)

In [4]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [5]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [6]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform_de_en[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform_de_en[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

de_en_transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                     NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in de_en_transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

de_en_transformer = de_en_transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(de_en_transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



In [18]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
# 2. Update the text_transform dictionary
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(
        lambda x: token_transform_de_en[ln](x),  # Use token_transform_de_en
        lambda x: vocab_transform_de_en[ln](x),  # Use vocab_transform_de_en
        tensor_transform
    )


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        # Use token_transform_de_en instead of token_transform
        src_tokens = token_transform_de_en[SRC_LANGUAGE](src_sample.rstrip("\n"))
        tgt_tokens = token_transform_de_en[TGT_LANGUAGE](tgt_sample.rstrip("\n"))

        # Use vocab_transform_de_en instead of vocab_transform
        src_indices = vocab_transform_de_en[SRC_LANGUAGE](src_tokens)
        tgt_indices = vocab_transform_de_en[TGT_LANGUAGE](tgt_tokens)

        src_tensor = tensor_transform(src_indices)
        tgt_tensor = tensor_transform(tgt_indices)

        src_batch.append(src_tensor)
        tgt_batch.append(tgt_tensor)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch


In [19]:
from torch.utils.data import DataLoader

# 3. Update train_epoch function
def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,
                      src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [20]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(de_en_transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(de_en_transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# 4. Update translate function to properly handle tokenization
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    # Tokenize the source sentence
    src_tokens = token_transform[SRC_LANGUAGE](src_sentence)
    # Convert tokens to indices
    src_indices = vocab_transform[SRC_LANGUAGE](src_tokens)
    # Add BOS/EOS and create tensor
    src = torch.cat([
        torch.tensor([BOS_IDX]),
        torch.tensor(src_indices),
        torch.tensor([EOS_IDX])
    ]).view(-1, 1)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX
    ).flatten()

    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(
        list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.344, Val loss: 4.107, Epoch time = 45.137s
Epoch: 2, Train loss: 3.760, Val loss: 3.309, Epoch time = 43.890s
Epoch: 3, Train loss: 3.157, Val loss: 2.886, Epoch time = 43.299s
Epoch: 4, Train loss: 2.767, Val loss: 2.639, Epoch time = 44.812s
Epoch: 5, Train loss: 2.477, Val loss: 2.439, Epoch time = 43.737s
Epoch: 6, Train loss: 2.247, Val loss: 2.305, Epoch time = 43.468s
Epoch: 7, Train loss: 2.055, Val loss: 2.208, Epoch time = 42.796s
Epoch: 8, Train loss: 1.893, Val loss: 2.114, Epoch time = 45.572s
Epoch: 9, Train loss: 1.754, Val loss: 2.053, Epoch time = 43.516s
Epoch: 10, Train loss: 1.628, Val loss: 2.007, Epoch time = 42.326s
Epoch: 11, Train loss: 1.519, Val loss: 1.961, Epoch time = 42.712s
Epoch: 12, Train loss: 1.419, Val loss: 1.955, Epoch time = 45.837s
Epoch: 13, Train loss: 1.330, Val loss: 1.969, Epoch time = 44.439s
Epoch: 14, Train loss: 1.245, Val loss: 1.973, Epoch time = 42.321s
Epoch: 15, Train loss: 1.173, Val loss: 1.931, Epoch time

In [21]:
from google.colab import drive
drive.mount('/content/drive')

torch.save({
    'model_state_dict': de_en_transformer.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'vocab_transform': vocab_transform_de_en,
    'token_transform': token_transform_de_en,
    'SRC_LANGUAGE': SRC_LANGUAGE,
    'TGT_LANGUAGE': TGT_LANGUAGE
}, '/content/drive/My Drive/Deep Learning/LAB2/de_en_transformer.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Test function to verify translations
def test_translations():
    # Load German to English model and transforms
    checkpoint_de_en = torch.load('/content/drive/My Drive/Deep Learning/LAB2/de_en_transformer.pth')
    de_en_token_transform = checkpoint_de_en['token_transform']
    de_en_vocab_transform = checkpoint_de_en['vocab_transform']

    de_en_transformer = Seq2SeqTransformer(
        NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
        NHEAD,
        len(de_en_vocab_transform['de']),
        len(de_en_vocab_transform['en']),
        FFN_HID_DIM
    ).to(DEVICE)
    de_en_transformer.load_state_dict(checkpoint_de_en['model_state_dict'])
    de_en_transformer.eval()

    # Function to translate German to English
    def translate_de_to_en(sentence):
        model = de_en_transformer
        src_tokens = de_en_token_transform['de'](sentence)
        src_indices = de_en_vocab_transform['de'](src_tokens)
        src = torch.cat([
            torch.tensor([BOS_IDX]),
            torch.tensor(src_indices),
            torch.tensor([EOS_IDX])
        ]).view(-1, 1)

        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

        tgt_tokens = greedy_decode(
            model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX
        ).flatten()

        return " ".join(de_en_vocab_transform['en'].lookup_tokens(
            list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

    # Test German to English translations
    test_sentences = [
        "Eine Gruppe von Menschen steht vor einem Iglu.",
        "Der Hund läuft im Park.",
        "Das Mädchen liest ein Buch."
    ]

    print("\nTesting German to English translations:")
    print("-" * 50)
    for sentence in test_sentences:
        translation = translate_de_to_en(sentence)
        print(f"German: {sentence}")
        print(f"English: {translation}")
        print("-" * 50)

# Run the tests
test_translations()


Testing German to English translations:
--------------------------------------------------
German: Eine Gruppe von Menschen steht vor einem Iglu.
English:  A group of people standing in front of an igloo 
--------------------------------------------------
German: Der Hund läuft im Park.
English:  The dog is running in the park . 
--------------------------------------------------
German: Das Mädchen liest ein Buch.
English:  The girl reads a book . 
--------------------------------------------------


## English to German Translation

In [29]:
# 1. Change language configuration
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List

# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

# Set device to CPU for now
DEVICE = torch.device('cpu')

SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'de'

# Initialize new transforms
token_transform_en_de = {}
vocab_transform_en_de = {}


In [30]:
# 2. Update tokenizers
# Update tokenizers
token_transform_en_de[SRC_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')
token_transform_en_de[TGT_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for data_sample in data_iter:
        yield token_transform_en_de[language](data_sample[language_index[language]])


# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

"""
Purpose: Defines special tokens used in the vocabulary for machine learning tasks with text data.
Special Tokens:
<unk>: "Unknown" token (represents words not in the vocabulary)
<pad>: Padding token (to make sequences the same length)
<bos>: "Beginning of Sequence"
<eos>: "End of Sequence"
"""
# Rebuild vocabularies
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    vocab_transform_en_de[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True
    )
    vocab_transform_en_de[ln].set_default_index(UNK_IDX)



In [31]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [32]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [33]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform_en_de[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform_en_de[TGT_LANGUAGE])

EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

en_de_transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                     NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in en_de_transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

en_de_transformer = en_de_transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(en_de_transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [34]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
# 2. Update the text_transform dictionary
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(
        lambda x: token_transform_en_de[ln](x),  # Use token_transform_de_en
        lambda x: vocab_transform_en_de[ln](x),  # Use vocab_transform_de_en
        tensor_transform
    )


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        # Use token_transform_de_en instead of token_transform
        src_tokens = token_transform_en_de[SRC_LANGUAGE](src_sample.rstrip("\n"))
        tgt_tokens = token_transform_en_de[TGT_LANGUAGE](tgt_sample.rstrip("\n"))

        # Use vocab_transform_de_en instead of vocab_transform
        src_indices = vocab_transform_en_de[SRC_LANGUAGE](src_tokens)
        tgt_indices = vocab_transform_en_de[TGT_LANGUAGE](tgt_tokens)

        src_tensor = tensor_transform(src_indices)
        tgt_tensor = tensor_transform(tgt_indices)

        src_batch.append(src_tensor)
        tgt_batch.append(tgt_tensor)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch


In [35]:
# 3. Update data loading
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [36]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(en_de_transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(en_de_transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# 4. Update translate function to properly handle tokenization
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    # Tokenize the source sentence
    src_tokens = token_transform[SRC_LANGUAGE](src_sentence)
    # Convert tokens to indices
    src_indices = vocab_transform[SRC_LANGUAGE](src_tokens)
    # Add BOS/EOS and create tensor
    src = torch.cat([
        torch.tensor([BOS_IDX]),
        torch.tensor(src_indices),
        torch.tensor([EOS_IDX])
    ]).view(-1, 1)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX
    ).flatten()

    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(
        list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.903, Val loss: 4.632, Epoch time = 49.157s
Epoch: 2, Train loss: 4.132, Val loss: 3.790, Epoch time = 50.875s
Epoch: 3, Train loss: 3.464, Val loss: 3.328, Epoch time = 51.711s
Epoch: 4, Train loss: 3.026, Val loss: 3.011, Epoch time = 49.667s
Epoch: 5, Train loss: 2.703, Val loss: 2.812, Epoch time = 49.020s
Epoch: 6, Train loss: 2.443, Val loss: 2.648, Epoch time = 50.823s
Epoch: 7, Train loss: 2.230, Val loss: 2.505, Epoch time = 50.288s
Epoch: 8, Train loss: 2.059, Val loss: 2.385, Epoch time = 49.015s
Epoch: 9, Train loss: 1.904, Val loss: 2.293, Epoch time = 52.595s
Epoch: 10, Train loss: 1.767, Val loss: 2.227, Epoch time = 55.751s
Epoch: 11, Train loss: 1.648, Val loss: 2.196, Epoch time = 52.722s
Epoch: 12, Train loss: 1.545, Val loss: 2.179, Epoch time = 50.825s
Epoch: 13, Train loss: 1.452, Val loss: 2.175, Epoch time = 49.305s
Epoch: 14, Train loss: 1.361, Val loss: 2.154, Epoch time = 49.125s
Epoch: 15, Train loss: 1.283, Val loss: 2.076, Epoch time

In [37]:
from google.colab import drive
drive.mount('/content/drive')

# Save to Google Drive
torch.save({
    'model_state_dict': en_de_transformer.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'vocab_transform': vocab_transform_en_de,
    'token_transform': token_transform_en_de,
    'SRC_LANGUAGE': SRC_LANGUAGE,
    'TGT_LANGUAGE': TGT_LANGUAGE
}, '/content/drive/My Drive/Deep Learning/LAB2/en_de_transformer.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Load English to German model and transforms
checkpoint_en_de = torch.load('/content/drive/My Drive/Deep Learning/LAB2/en_de_transformer.pth')
en_de_token_transform = checkpoint_en_de['token_transform']
en_de_vocab_transform = checkpoint_en_de['vocab_transform']
en_de_transformer = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
    NHEAD,
    len(en_de_vocab_transform['en']),
    len(en_de_vocab_transform['de']),
    FFN_HID_DIM
).to(DEVICE)
en_de_transformer.load_state_dict(checkpoint_en_de['model_state_dict'])
en_de_transformer.eval()

# Function to translate English to German
def translate_en_to_de(sentence):
    model = en_de_transformer
    src_tokens = en_de_token_transform['en'](sentence)
    src_indices = en_de_vocab_transform['en'](src_tokens)
    src = torch.cat([
        torch.tensor([BOS_IDX]),
        torch.tensor(src_indices),
        torch.tensor([EOS_IDX])
    ]).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX
    ).flatten()
    return " ".join(en_de_vocab_transform['de'].lookup_tokens(
        list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

# Test English to German translations
english_sentences = [
    "A group of people stands in front of an igloo.",
    "The dog is running in the park.",
    "The girl is reading a book."
]

print("\nTesting English to German translations:")
print("-" * 50)
for sentence in english_sentences:
    translation = translate_en_to_de(sentence)
    print(f"English: {sentence}")
    print(f"German: {translation}")
    print("-" * 50)




Testing English to German translations:
--------------------------------------------------
English: A group of people stands in front of an igloo.
German:  Eine Gruppe von Personen steht vor einem Labor in einem Labor . 
--------------------------------------------------
English: The dog is running in the park.
German:  Der Hund läuft im Park . 
--------------------------------------------------
English: The girl is reading a book.
German:  Das Mädchen liest ein Buch . 
--------------------------------------------------


In [40]:
# Load both models
# German to English model
checkpoint_de_en = torch.load('/content/drive/My Drive/Deep Learning/LAB2/de_en_transformer.pth')
de_en_token_transform = checkpoint_de_en['token_transform']
de_en_vocab_transform = checkpoint_de_en['vocab_transform']

de_en_transformer = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
    NHEAD,
    len(de_en_vocab_transform['de']),
    len(de_en_vocab_transform['en']),
    FFN_HID_DIM
).to(DEVICE)
de_en_transformer.load_state_dict(checkpoint_de_en['model_state_dict'])
de_en_transformer.eval()

def translate_de_to_en(sentence):
    model = de_en_transformer
    src_tokens = de_en_token_transform['de'](sentence)
    src_indices = de_en_vocab_transform['de'](src_tokens)
    src = torch.cat([
        torch.tensor([BOS_IDX]),
        torch.tensor(src_indices),
        torch.tensor([EOS_IDX])
    ]).view(-1, 1)

    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)

    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX
    ).flatten()

    return " ".join(de_en_vocab_transform['en'].lookup_tokens(
        list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

# Test sentences
test_sentences = [
    "A group of people stands in front of an igloo.",
    "The dog is running in the park.",
    "The girl is reading a book."
]

print("\nTesting Back Translation (English -> German -> English):")
print("-" * 70)

for i, original_en in enumerate(test_sentences, 1):
    print(f"\nTest Case {i}:")
    print(f"Original English: {original_en}")

    # Forward translation (EN -> DE)
    german = translate_en_to_de(original_en)
    print(f"German Translation: {german}")

    # Back translation (DE -> EN)
    back_to_english = translate_de_to_en(german)
    print(f"Back to English: {back_to_english}")

    # Calculate word preservation ratio
    original_words = set(original_en.lower().split())
    back_translated_words = set(back_to_english.lower().split())
    word_preservation = len(original_words.intersection(back_translated_words)) / len(original_words)

    print(f"Word Preservation Ratio: {word_preservation:.2f}")
    print("-" * 70)


Testing Back Translation (English -> German -> English):
----------------------------------------------------------------------

Test Case 1:
Original English: A group of people stands in front of an igloo.
German Translation:  Eine Gruppe von Personen steht vor einem Labor in einem Labor . 
Back to English:  A group of people in front of a lab in a lab . 
Word Preservation Ratio: 0.67
----------------------------------------------------------------------

Test Case 2:
Original English: The dog is running in the park.
German Translation:  Der Hund läuft im Park . 
Back to English:  The dog is running in the park . 
Word Preservation Ratio: 0.83
----------------------------------------------------------------------

Test Case 3:
Original English: The girl is reading a book.
German Translation:  Das Mädchen liest ein Buch . 
Back to English:  The girl is reading a book . 
Word Preservation Ratio: 0.83
----------------------------------------------------------------------


Here's a qualitative analysis of the back-translation results:

1. **Simple Perfect Translations:**
   - Test Case 2 and 3 show nearly perfect back-translation
   - "The dog is running in the park" and "The girl is reading a book"
   - Word preservation ratio: 0.83 (very high)
   - Maintains:
     - Exact meaning
     - Grammatical structure
     - All key information
   - These sentences work well because they:
     - Use common vocabulary
     - Have simple grammatical structures
     - Express concrete actions

2. **Complex Translation with Issues:**
   - Test Case 1 shows some degradation
   - Original: "A group of people stands in front of an igloo"
   - Final: "A group of people in front of a lab in a lab"
   - Word preservation ratio: 0.67 (moderate)
   - Issues identified:
     - "igloo" mistranslated as "Labor" (lab)
     - Unnecessary repetition in German ("in einem Labor in einem Labor")
     - Slight grammatical structure change (loses "stands")
     - Still maintains core subject and spatial relationship

3. **Overall Patterns:**
   - Strengths:
     - Excellent with common vocabulary
     - Maintains basic sentence structure
     - Perfect preservation of simple statements
     - High accuracy with everyday scenarios
   
   - Weaknesses:
     - Struggles with uncommon words (igloo → lab)
     - Can introduce redundancies
     - May lose some verbal elements in complex sentences

4. **Quality Metrics:**
   - Word preservation improves with:
     - Shorter sentences
     - Common vocabulary
     - Simple grammatical structures
   - Perfect preservation (0.83) for everyday scenarios
   - Lower preservation (0.67) for sentences with specialized terms

These results suggest the model is well-suited for common communications but may need improvement for handling specialized vocabulary or more complex sentence structures.
