In [1]:
!pip install torchdata



In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List

In [3]:
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

In [4]:
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

In [5]:
token_transform = {}
vocab_transform = {}

In [6]:
!pip install -U spacy



In [7]:
!pip install portalocker



In [8]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

2023-08-03 07:44:54.056200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-03 07:44:56.553455: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-03 07:44:56.553892: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-

In [9]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

In [10]:
token_transform

{'de': functools.partial(<function _spacy_tokenize at 0x7a930cc8a710>, spacy=<spacy.lang.de.German object at 0x7a923cd9ff40>),
 'en': functools.partial(<function _spacy_tokenize at 0x7a930cc8a710>, spacy=<spacy.lang.en.English object at 0x7a93079fd660>)}

In [11]:
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

In [12]:
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

In [13]:
vocab_transform['en'].get_stoi() # mapping of tokens to indices

{'zips': 10834,
 'zippered': 10833,
 'youngster': 10832,
 'yong': 10831,
 'yielding': 10830,
 'yawns': 10827,
 'yawing': 10826,
 'yak': 10825,
 'yacht': 10824,
 'wuth': 10822,
 'writhing': 10820,
 'wristwatch': 10818,
 'wrecks': 10814,
 'wrecked': 10813,
 'wrappers': 10812,
 'wrangled': 10810,
 'worshiping': 10807,
 'worshipers': 10806,
 'woodsy': 10802,
 'wonders': 10801,
 'wonderment': 10800,
 'wonder': 10798,
 'won': 10797,
 'wmoan': 10794,
 'wizards': 10793,
 'witnessing': 10792,
 'witnessed': 10791,
 'wireless': 10785,
 'windy': 10779,
 'windsurfing': 10777,
 'wildly': 10773,
 'wieners': 10769,
 'whitewater': 10767,
 'whites': 10766,
 'whiteboard': 10765,
 'whistling': 10764,
 'whips': 10762,
 'whine': 10761,
 'whereas': 10759,
 'wheelers': 10758,
 'whatever': 10756,
 'whales': 10755,
 'went': 10753,
 'wells': 10752,
 'welders': 10751,
 'weld': 10750,
 'weiner': 10749,
 'weightlifting': 10748,
 'weighs': 10746,
 'weighed': 10745,
 'week': 10743,
 'wedge': 10742,
 'website': 10740,

In [14]:
vocab_transform['en'].get_itos() # list of tokens

['<unk>',
 '<pad>',
 '<bos>',
 '<eos>',
 'a',
 '.',
 'A',
 'in',
 'the',
 'on',
 'is',
 'and',
 'man',
 'of',
 'with',
 ',',
 'woman',
 'are',
 'to',
 'Two',
 'at',
 'wearing',
 'people',
 'shirt',
 'white',
 'young',
 'black',
 'his',
 'an',
 'while',
 'blue',
 'red',
 'sitting',
 'girl',
 'dog',
 'boy',
 'men',
 'standing',
 'playing',
 'group',
 'street',
 'down',
 'walking',
 '-',
 'front',
 'her',
 'holding',
 'water',
 'by',
 'The',
 'up',
 'green',
 'women',
 'An',
 'one',
 'for',
 'looking',
 'outside',
 'child',
 'Three',
 'as',
 'little',
 'large',
 'through',
 'yellow',
 'brown',
 'two',
 'from',
 'hat',
 'ball',
 'their',
 'into',
 'person',
 'children',
 'next',
 'other',
 'dressed',
 'small',
 'out',
 'over',
 'building',
 'riding',
 'running',
 'People',
 'near',
 'jacket',
 'another',
 'around',
 'some',
 'sidewalk',
 'field',
 'orange',
 'beach',
 'crowd',
 'stands',
 'pink',
 'sits',
 'jumping',
 'behind',
 'table',
 'snow',
 'grass',
 'hair',
 'background',
 'stand',

In [15]:
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

Transformer is a Seq2Seq model introduced in “Attention is all you need” paper for solving machine translation tasks. Below, we will create a Seq2Seq network that uses Transformer. The network consists of three parts. First part is the embedding layer. This layer converts tensor of input indices into corresponding tensor of input embeddings. These embedding are further augmented with positional encodings to provide position information of input tokens to the model. The second part is the actual Transformer model. Finally, the output of the Transformer model is passed through linear layer that gives unnormalized probabilities for each token in the target language.

In [16]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

In [17]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
DEVICE

device(type='cuda')

In [19]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [20]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [21]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [22]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [23]:
def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [24]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [25]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [26]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [30]:
from timeit import default_timer as timer
import torch

NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")

filename = f"saved_model.pt"
torch.save(transformer.state_dict(), filename)

Epoch: 1, Train loss: 0.353, Val loss: 2.162, Epoch time = 46.399s
Epoch: 2, Train loss: 0.328, Val loss: 2.185, Epoch time = 45.341s
Epoch: 3, Train loss: 0.304, Val loss: 2.235, Epoch time = 46.431s
Epoch: 4, Train loss: 0.285, Val loss: 2.254, Epoch time = 45.167s
Epoch: 5, Train loss: 0.267, Val loss: 2.268, Epoch time = 46.409s
Epoch: 6, Train loss: 0.249, Val loss: 2.310, Epoch time = 45.485s
Epoch: 7, Train loss: 0.231, Val loss: 2.364, Epoch time = 45.259s
Epoch: 8, Train loss: 0.217, Val loss: 2.387, Epoch time = 46.500s
Epoch: 9, Train loss: 0.204, Val loss: 2.420, Epoch time = 45.088s
Epoch: 10, Train loss: 0.193, Val loss: 2.460, Epoch time = 46.352s
Epoch: 11, Train loss: 0.179, Val loss: 2.474, Epoch time = 45.424s
Epoch: 12, Train loss: 0.167, Val loss: 2.510, Epoch time = 45.361s
Epoch: 13, Train loss: 0.159, Val loss: 2.530, Epoch time = 46.208s
Epoch: 14, Train loss: 0.151, Val loss: 2.536, Epoch time = 45.034s
Epoch: 15, Train loss: 0.142, Val loss: 2.543, Epoch time

In [31]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

In [32]:
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [33]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

 A group of people standing in front of an igloo 


In [34]:
print(translate(transformer, "Die Leute fixieren das Dach eines Hauses."))

 The people are really receives the roof of a house 
