# seq to seq and attention
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [None]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
SOS_token = 0
EOS_token = 1

In [None]:
class Lang:
    """Vocabulary of one language: word/index lookups plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word that is added receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # The two marker tokens are already counted
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(" "):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; a word seen for the first time gets the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents: NFD-decompose ``s`` and drop every combining mark (category 'Mn')."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Return ``s`` lowercased, accent-free and reduced to letters, '!' and '?'.

    The first substitution puts a space in front of '.', '!' and '?'; the
    second replaces every run of other characters (which includes '.') with a
    single space. Leading and trailing whitespace is removed at the end.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated translation file and create empty vocabularies.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        reverse: When True, every pair is reversed and ``lang2`` becomes the
            input language; otherwise ``lang1`` is the input language.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects
        are still empty and ``pairs`` holds the normalized sentences of each
        line.
    """
    print("Reading lines...")

    path = "./data/english-lang-trans/%s-%s.txt" % (lang1, lang2)
    # Context manager so the file handle is closed once the text is read
    with open(path, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    pairs = [[normalizeString(s) for s in l.split("\t")] for l in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        # Without this branch the names below were unbound for reverse=False
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

In [None]:
# trim the data set to only relatively short and simple sentences.

# Upper bound (exclusive) on the number of tokens per sentence
MAX_LENGTH = 10

# Prefixes a sentence in position 1 of a pair must start with to be kept
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True when both sentences are short and p[1] has an accepted prefix."""
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return the list of pairs accepted by ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

In [None]:
# prepare dataset

def prepareData(lang1, lang2, reverse=False):
    """Load the sentence pairs, filter them and fill both vocabularies.

    Returns ``(input_lang, output_lang, pairs)`` where ``pairs`` contains only
    the pairs kept by ``filterPairs`` and both ``Lang`` objects have seen every
    kept sentence.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s senence pairs" % len(pairs))

    print("Counting words")
    for entry in pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words: ")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, pairs

In [None]:
# Build the vocabularies and the filtered pairs. reverse=True reverses each
# pair, so "fra" becomes the input language and "eng" the output language.
input_lang, output_lang, pairs = prepareData("eng", "fra", True)
# Print one randomly chosen pair as a sanity check
print(random.choice(pairs))

## seq2seq working flow
![seq2seq working flow](https://pytorch.org/tutorials/_images/encoder-network.png)

In [None]:
class EncoderRNN(nn.Module):
    """Encoder: embedding -> dropout -> batch-first GRU.

    ``forward`` returns the GRU's ``(output, hidden)`` pair for a batch of
    token-index sequences.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        token_vectors = self.embedding(input)
        regularized = self.dropout(token_vectors)
        # nn.GRU already returns the (output, hidden) tuple
        return self.gru(regularized)

![seq2seq encoding - decoding](https://pytorch.org/tutorials/_images/decoder-network.png)

**Understanding squeeze(-1) and Detach in Decoder Input Processing (Seq2seq with Teacher Forcing):**

In a seq2seq model with teacher forcing, `squeeze(-1)` and `detach` are used together during decoder input processing to ensure proper shape compatibility and memory optimization. Here's a detailed breakdown:

**1. Context and Teacher Forcing:**

- Seq2seq models translate between sequences (e.g., text-to-text, speech-to-text).
- Teacher forcing is a training technique where, at each decoding step, the decoder is fed the correct (ground-truth) previous token from the target sequence instead of its own previous prediction.

**2. Decoder Input Processing:**

   **a. Target Sequence Reshaping (unsqueeze(1))**

     - The target sequence (ground truth) typically has a shape `(batch_size, target_length)`.
     - For teacher forcing, the decoder needs input one element at a time during each decoding step.
     - To achieve this, `unsqueeze(1)` is used during training:

       ```python
       # Example target sequence
       target_sequence = torch.randint(1, 10, (batch_size, target_length))

       # Teacher forcing input for first decoding step
       teacher_forcing_input = target_sequence[:, 0].unsqueeze(1)
       ```

     - Explanation:
       - `target_sequence[:, 0]` extracts the first element (index 0) from each sequence in the batch, resulting in a tensor of shape `(batch_size,)`.
       - `unsqueeze(1)` inserts a new dimension of size 1 at dimension 1 (the column dimension). This creates a tensor with shape `(batch_size, 1)`, aligning with the expected decoder input format at the first step.

   **b. Decoder Output Reshaping and Detachment (squeeze(-1).detach())**

     - After each decoding step, the decoder produces one score per word of the output vocabulary for that step.
     - When no target sequence is supplied (i.e. without teacher forcing), the highest-scoring token has to be fed back as the next decoder input.
     - For a single step the decoder output has the shape `(batch_size, 1, output_size)`:

       - `batch_size`: Number of samples in the batch.
       - `1`: The single time step that was just decoded.
       - `output_size`: Size of the output vocabulary (one score per word).

     ```python
     # Decoder output after one decoding step: (batch_size, 1, output_size)
     decoder_output, decoder_hidden = decoder.forward_step(decoder_input, decoder_hidden)

     # Pick the highest-scoring token and use it as the next decoder input
     _, topi = decoder_output.topk(1)           # topi: (batch_size, 1, 1)
     decoder_input = topi.squeeze(-1).detach()  # (batch_size, 1)
     ```

     - Explanation:
       - `squeeze(-1)` removes the last dimension, and only when that dimension has size 1. `topk(1)` returns indices of shape `(batch_size, 1, 1)`; `squeeze(-1)` turns them into `(batch_size, 1)`, the same shape as the teacher-forcing input from step (a). It does not collapse or combine a larger dimension: if the last dimension has more than one element, `squeeze(-1)` leaves the tensor unchanged.
       - `.detach()` returns a tensor that is cut off from the computational graph, so the predicted token that is fed back is treated as a constant input and no gradient flows through that choice. This only applies to the branch without teacher forcing; with teacher forcing the next input comes from the ground-truth target instead.

**3. Key Points:**

- `unsqueeze(1)` prepares the target sequence for teacher forcing by creating an input with the correct shape for the first decoding step.
- `squeeze(-1).detach()` processes the decoder's predicted token indices by:
   - Removing the trailing size-1 dimension left by `topk(1)`, so the indices have the shape `(batch_size, 1)`.
   - Detaching them from the computational graph before they are fed back as the next decoder input (the branch used when teacher forcing is off).

**In essence, these operations ensure that the decoder receives the appropriate teacher forcing input and that the decoder output is properly shaped for loss calculation during training.**

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention.

    Decodes ``MAX_LENGTH`` steps starting from ``SOS_token``, using the
    encoder's final hidden state as the initial hidden state.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run the full decoding loop.

        Args:
            encoder_outputs: Encoder outputs; only the size of dimension 0 is
                read here, as the batch size.
            encoder_hidden: Initial hidden state for the GRU.
            target_tensor: Optional target indices. When given, column ``i``
                is fed as the input of step ``i + 1`` (teacher forcing);
                otherwise the decoder feeds back its own top-1 prediction.

        Returns:
            ``(log_probs, hidden, None)`` where ``log_probs`` is the
            per-step log-softmax concatenated along dimension 1. The
            trailing ``None`` keeps the return shape consistent with the
            attention decoder.
        """
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long,
                                    device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input,
                                                               decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Feed back the top-1 prediction; squeeze drops the size-1
                # dimension added by topk, detach cuts it from the graph
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        # The original referenced the misspelled name `decoer_outputs` here,
        # which raised NameError on every call
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        # return `None` for consistency in the training loop
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Decode one step: embedding -> ReLU -> GRU -> linear projection."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden
        

![](https://i.imgur.com/1152PYf.png)  
![](https://pytorch.org/tutorials/_images/attention-decoder-network.png)


In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: scores are ``Va(tanh(Wa(query) + Ua(keys)))``.

    ``forward`` returns the context (attention-weighted sum of ``keys``) and
    the softmax-normalized attention weights.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energy = torch.tanh(projected_query + projected_keys)

        # Va yields one score per key in a trailing size-1 dimension; move
        # that singleton to dimension 1 so bmm can weight the keys
        scores = self.Va(energy).squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Each step concatenates the embedded input token with the attention
    context and feeds the result to the GRU.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # GRU input is [embedded token ; context], hence 2 * hidden_size
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode MAX_LENGTH steps.

        Returns ``(log_probs, hidden, attentions)``: the per-step
        log-softmax outputs and attention weights, each concatenated along
        dimension 1, plus the final GRU hidden state.
        """
        n_batch = encoder_outputs.size(0)
        step_input = torch.empty(n_batch, 1, dtype=torch.long, device=device).fill_(SOS_token)
        hidden = encoder_hidden
        step_outputs, step_attentions = [], []
        teacher_forcing = target_tensor is not None

        for step in range(MAX_LENGTH):
            step_output, hidden, attn_weights = self.forward_step(
                step_input, hidden, encoder_outputs
            )
            step_outputs.append(step_output)
            step_attentions.append(attn_weights)

            if teacher_forcing:
                # Next input is the ground-truth token of this step
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the decoder's own top-1 prediction, detached
                # from the graph
                step_input = step_output.topk(1)[1].squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_outputs, dim=1), dim=-1)
        attentions = torch.cat(step_attentions, dim=1)
        return log_probs, hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step and return ``(output, hidden, attn_weights)``."""
        embedded = self.dropout(self.embedding(input))

        # Put the batch dimension of the hidden state first so it can serve
        # as the attention query
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((embedded, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)
        return self.out(gru_output), hidden, attn_weights

In [None]:
# preparing training data

def indexesFromSentence(lang, sentence):
    """Map each space-separated word of ``sentence`` to ``lang.word2index``.

    Raises KeyError for a word that is not in the vocabulary.
    """
    indexes = []
    for word in sentence.split(" "):
        indexes.append(lang.word2index[word])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a (1, n) long tensor of word indices ending with EOS_token."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)

def tensorsFromPair(pair):
    """Encode a pair with the module-level ``input_lang`` and ``output_lang`` vocabularies."""
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

def get_dataloader(batch_size):
    """Build the eng/fra training DataLoader.

    Every sentence is encoded as word indices followed by EOS_token and
    written into a zero-initialised row of width MAX_LENGTH.

    Returns ``(input_lang, output_lang, train_dataloader)``.
    """
    input_lang, output_lang, pairs = prepareData("eng", "fra", True)

    # NOTE: unused positions keep the value 0, which is also SOS_token
    shape = (len(pairs), MAX_LENGTH)
    input_ids = np.zeros(shape, dtype=np.int32)
    target_ids = np.zeros(shape, dtype=np.int32)

    for row, (source, target) in enumerate(pairs):
        source_ids = indexesFromSentence(input_lang, source) + [EOS_token]
        target_row = indexesFromSentence(output_lang, target) + [EOS_token]
        input_ids[row, :len(source_ids)] = source_ids
        target_ids[row, :len(target_row)] = target_row

    dataset = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    # RandomSampler draws the samples in random order
    loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    return input_lang, output_lang, loader

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
                decoder_optimizer, criterion):
    """Train for one pass over ``dataloader`` and return the mean batch loss.

    Each batch is decoded with the target passed to the decoder (teacher
    forcing), and both optimizers take one step per batch.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten to (N, classes) vs (N,) as expected by the criterion
        n_classes = decoder_outputs.size(-1)
        loss = criterion(decoder_outputs.view(-1, n_classes), target_tensor.view(-1))
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [None]:
# This is a helper function to print time elapsed and estimated time remaining given the current time and progress %.

import time
import math

def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for work started at ``since``.

    ``percent`` is the fraction of the work already done; the remaining time
    is a linear extrapolation of the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
          print_every=100, plot_every=100):
    """Train encoder and decoder for ``n_epochs`` epochs with Adam and NLLLoss.

    Prints the average loss every ``print_every`` epochs, records the average
    loss every ``plot_every`` epochs, and plots the recorded values at the end.
    """
    started_at = time.time()
    loss_history = []
    loss_since_print = 0  # Reset every print_every
    loss_since_plot = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        epoch_loss = train_epoch(train_dataloader, encoder, decoder,
                                 encoder_optimizer, decoder_optimizer, criterion)
        loss_since_print += epoch_loss
        loss_since_plot += epoch_loss

        if epoch % print_every == 0:
            average = loss_since_print / print_every
            loss_since_print = 0
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, average))

        if epoch % plot_every == 0:
            loss_history.append(loss_since_plot / plot_every)
            loss_since_plot = 0

    showPlot(loss_history)

> Change the backend to TkAgg, QtAgg, or WXAgg

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('QtAgg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line with a major y-axis tick every 0.2."""
    # A single plt.subplots() call creates both the figure and the axes; an
    # additional plt.figure() would only leave an empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Translate one sentence without teacher forcing.

    Returns ``(decoded_words, decoder_attn)``. Decoding stops at the first
    EOS token, which is reported as ``'<EOS>'``.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Index of the most likely word at every step
        decoded_ids = decoder_outputs.topk(1)[1].squeeze()

        decoded_words = []
        for idx in decoded_ids:
            token_id = idx.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])
    return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random pairs: '>' input, '=' reference, '<' model output."""
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        words, _attn = evaluate(encoder, decoder, sample[0], input_lang, output_lang)
        print('<', ' '.join(words))
        print('')

In [None]:
# actual train, evaluation

hidden_size = 128
batch_size = 32

# get_dataloader calls prepareData again, so the module-level input_lang and
# output_lang are rebound to the vocabularies returned here
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

# Model sizes come from the vocabulary sizes (n_words) of the two languages
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

# 80 epochs; print and record the average loss every 5 epochs
train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

In [None]:
# Put both modules in evaluation mode (turns their dropout layers off) before
# printing sample translations
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix: input words on the x-axis, output words on the y-axis."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    heatmap = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(heatmap)

    # Set up axes
    # NOTE(review): the leading '' presumably lines the labels up with the
    # ticks produced by MultipleLocator(1) below — confirm.
    column_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    row_labels = [''] + output_words
    ax.set_xticklabels(column_labels, rotation=90)
    ax.set_yticklabels(row_labels)

    # Show label at every tick
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate ``input_sentence`` with the module-level models, print it and plot the attention."""
    output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    # Keep only the attention rows of the words that were actually emitted
    used_rows = attentions[0, :len(output_words), :]
    showAttention(input_sentence, output_words, used_rows)


evaluateAndShowAttention('il n est pas aussi grand que son pere')

evaluateAndShowAttention('je suis trop fatigue pour conduire')

evaluateAndShowAttention('je suis desole si c est une question idiote')

evaluateAndShowAttention('je suis reellement fiere de vous')

# preprocess custom text dataset using torchtext 
https://pytorch.org/tutorials/beginner/torchtext_custom_dataset_tutorial.html

Read a dataset

Tokenize sentence

Apply transforms to sentence

Perform bucket batching

In [None]:
import spacy
import torchdata.datapipes as dp
import torchtext.transforms as T

from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm")
zh = spacy.load("zh_core_web_sm")
de = spacy.load("de_core_news_sm")
fr = spacy.load("fr_core_news_sm")

In [None]:
FILE_PATH = "data/deu-eng/deu.txt"
# Wrap the single path in a datapipe, open the file in binary mode, then parse
# it as tab-separated values with each row yielded as a tuple
data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe, mode="rb")
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter="\t", as_tuple=True)

In [None]:
def showSample(data_pipe):
    """Print the first element of ``data_pipe``; print nothing when it is empty."""
    iterator = iter(data_pipe)
    try:
        first = next(iterator)
    except StopIteration:
        return
    print(first)

In [None]:
showSample(data_pipe)

In [None]:
def removeAttribution(row):
    """
    Return only the first two columns of ``row``;
    any further elements are dropped
    """
    return row[0:2]

In [None]:
data_pipe = data_pipe.map(removeAttribution)

In [None]:
showSample(data_pipe)

In [None]:
# tokenization
def engTokenize(text):
    """
    Split an English text with the tokenizer of the `eng` spaCy model
    and return the token strings as a list
    """
    tokens = []
    for token in eng.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [None]:
def deTokenize(text):
    """
    Split a German text with the tokenizer of the `de` spaCy model
    and return the token strings as a list
    """
    tokens = []
    for token in de.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [None]:
# test for the tokenization
print(engTokenize("Hello world!!!"))
print(deTokenize("Hallo Welt!!!"))

In [None]:
print(engTokenize("Have a good day!!!"))
print(deTokenize("Haben Sie einen guten Tag!!!"))

In [None]:
# build vocabulary
def getTokens(data_iter, place):
    """
    Yield one token list per sentence pair in `data_iter`.
    `place=0` tokenizes the first (English) element of each pair; any other
    value tokenizes the second (German) element.
    """
    for english, german in data_iter:
        yield engTokenize(english) if place == 0 else deTokenize(german)

In [None]:
# Build the English vocabulary from the first element of every pair.
# min_freq=2 leaves out tokens seen only once; the four special tokens are
# placed first, so <pad>=0, <sos>=1, <eos>=2, <unk>=3.
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 0),
    min_freq=2,
    specials=["<pad>", "<sos>", "<eos>", "<unk>"],
    special_first=True
)
# Tokens that are not in the vocabulary map to the index of <unk>
source_vocab.set_default_index(source_vocab["<unk>"])

In [None]:
# Build the German vocabulary from the second element of every pair, with the
# same min_freq and special tokens as the source vocabulary
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
# Tokens that are not in the vocabulary map to the index of <unk>
target_vocab.set_default_index(target_vocab['<unk>'])

In [None]:
# test for vocab
print(source_vocab.get_itos()[:9])

In [None]:
# Numericalize sentences using vocabulary
# convert our sentences to corresponding indices
def getTransform(vocab):
    """
    Build the transform applied to a sequence of tokens: look every token up
    in `vocab`, then wrap the resulting indices with the <sos>/<eos> indices.
    """
    steps = [
        # converts the tokens to indices based on the given vocabulary
        T.VocabTransform(vocab=vocab),
        # Add <sos> at the beginning of each sentence. 1 because the specials
        # list used to build the vocabularies puts <sos> at index 1
        T.AddToken(1, begin=True),
        # Add <eos> at the end of each sentence. 2 because the specials list
        # used to build the vocabularies puts <eos> at index 2
        T.AddToken(2, begin=False),
    ]
    return T.Sequential(*steps)

In [None]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]
print(f"Some sentence = ", end="")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence = ", end="")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end="")

In [None]:
def applyTransform(sequence_pair):
    """
    Tokenize both sentences of a pair and convert them to index sequences
    using the source and target vocabularies respectively
    """
    source_ids = getTransform(source_vocab)(engTokenize(sequence_pair[0]))
    target_ids = getTransform(target_vocab)(deTokenize(sequence_pair[1]))
    return (source_ids, target_ids)

In [None]:
data_pipe = data_pipe.map(applyTransform)
temp_list = list(data_pipe)
print(temp_list[0])

In [None]:
index_to_string = source_vocab.get_itos()
# temp_list[0] is a (source indices, target indices) pair, so each half has to
# be decoded with its own vocabulary; decoding the target indices with the
# source vocabulary printed unrelated English tokens.
for inx_list, itos in zip(temp_list[0], (index_to_string, target_vocab.get_itos())):
    # One sentence per line, tokens separated by spaces
    print(" ".join(itos[inx] for inx in inx_list))

In [None]:
def sortBucket(bucket):
    """
    Return the pairs of `bucket` as a new list ordered by source length,
    with ties broken by target length.
    """
    def pair_lengths(pair):
        return (len(pair[0]), len(pair[1]))

    return sorted(bucket, key=pair_lengths)

In [None]:
# Group the transformed pairs into batches of 4. sortBucket is passed as the
# sort_key, so each bucket is ordered by (source length, target length).
# NOTE(review): presumably this keeps sentences of similar length in the same
# batch to limit padding — confirm against the bucketbatch documentation.
data_pipe = data_pipe.bucketbatch(
    batch_size = 4,
    batch_num = 5,
    bucket_num = 1,
    use_in_batch_shuffle=False,
    sort_key=sortBucket
)

In [None]:
len(list(data_pipe))

In [None]:
print(list(data_pipe)[0])

In [None]:
def separateSourceTarget(sequence_pairs):
    """
    Transpose a batch of pairs.
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    unzipped = tuple(zip(*sequence_pairs))
    sources, targets = unzipped
    return sources, targets

In [None]:
## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
print(list(data_pipe)[0])

In [None]:
# padding
def applyPadding(pair_of_sequences):
    """
    Convert the source and the target sequences of a batch to tensors,
    padding with value 0
    """
    sources = list(pair_of_sequences[0])
    source_tensor = T.ToTensor(0)(sources)
    targets = list(pair_of_sequences[1])
    target_tensor = T.ToTensor(0)(targets)
    return (source_tensor, target_tensor)

In [None]:
data_pipe = data_pipe.map(applyPadding)

In [None]:
#  index to string mapping to see how the sequence would look with tokens instead of indices

source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()


def showSomeTransformedSentences(data_pipe):
    """
    Print the sentences of one batch as tokens instead of indices.

    Only the first batch whose first source row ends with index 0 is shown,
    so that padding of shorter sentences is visible. Every row of that batch
    is printed, whatever the batch size (the previous hard-coded `range(4)`
    raised IndexError on a smaller batch).
    """
    for sources, targets in data_pipe:
        if sources[0][-1] != 0:
            continue  # Just to visualize padding of shorter sentences
        for source_ids, target_ids in zip(sources, targets):
            # Each token is preceded by one space, as in the original output
            source = "".join(" " + source_index_to_string[token] for token in source_ids)
            target = "".join(" " + target_index_to_string[token] for token in target_ids)
            print(f"Source: {source}")
            print(f"Target: {target}")
        break

In [None]:
showSomeTransformedSentences(data_pipe)

# Text classification with the torchtext library
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
`!pip install -U portalocker>=2.0.0`

In [None]:
import torch
import torch.nn as nn
from torchtext.datasets import AG_NEWS

In [None]:
train_iter = iter(AG_NEWS(split="train"))

In [None]:
for i in range(20):
    print(next(train_iter))

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
tokenizer = get_tokenizer("basic_english")
train_iter = AG_NEWS(split="train")

In [None]:
def yield_tokens(data_iter):
    """Yield the tokenized text of every (label, text) sample in ``data_iter``."""
    for _label, text in data_iter:
        tokens = tokenizer(text)
        yield tokens

In [None]:
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
vocab(['here', 'is', 'an', 'example'])

In [None]:
vocab(["hello", "gears", "of", "war"])

In [None]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map every token to its vocabulary index."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based label (int or numeric string) to a 0-based class id."""
    return int(x) - 1

In [None]:
text_pipeline('here is the an example')

In [None]:
label_pipeline('10')

In [None]:
from torch.utils.data import DataLoader

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### why offsets

Here is a concrete example with six sequences that illustrates how the offsets are computed with slicing and `cumsum`:

**Scenario:**

Imagine a batch containing six text sequences of varying lengths:

- Sequence 1: "This is the first sentence" (length 5)
- Sequence 2: "A shorter sentence" (length 4)
- Sequence 3: "Another example sentence" (length 5)
- Sequence 4: "This is sequence number four" (length 7)
- Sequence 5: "A very brief message" (length 3)
- Sequence 6: "The final sequence of the batch" (length 6)

**`collate_batch` Function:**

1. **Processing Sequences:**

   - Processes each sequence using `text_pipeline` (e.g., tokenization), resulting in processed tensors.

2. **Creating `offsets`:**

   - Builds a list that starts with `0` and then appends the length of each processed sequence:

     ```
     offsets = [0, 5, 4, 5, 7, 3, 6]
     ```

     - The leading `0` is the start of sequence 1; the remaining six entries are the lengths of sequences 1 to 6.

**Offsets Processing:**

1. **Slicing:**

   - `offsets[:-1] = [0, 5, 4, 5, 7, 3]`: This drops the last entry (the length of sequence 6), which is not needed to locate where any sequence starts.

2. **Converting to Tensor:**

   - `torch.tensor(offsets[:-1])`: Converts the sliced `offsets` list into a PyTorch tensor.

3. **Cumulative Sum:**

   - `.cumsum(dim=0)` turns the lengths into starting indices, because each sequence starts where the previous ones end:

     ```
     offsets_with_indices = torch.tensor([0, 5, 4, 5, 7, 3]).cumsum(dim=0)
     ```

     - `offsets_with_indices` becomes `tensor([0, 5, 9, 14, 21, 24])`.

**Understanding the Results:**

- `offsets_with_indices` now holds the absolute starting indices for each sequence within the combined `text_list` tensor:

  - Sequence 1: Starts at index `offsets_with_indices[0]` (0).
  - Sequence 2: Starts at index `offsets_with_indices[1]` (5).
  - Sequence 3: Starts at index `offsets_with_indices[2]` (9).
  - And so on...

**Key Points:**

- Slicing with `offsets[:-1]` drops the last sequence's length, so the cumulative sum stops at the start of the last sequence instead of ending with the total token count.
- The cumulative sum (`cumsum`) provides the correct starting indices for efficient processing of each sequence in the batch.

I hope this example with six sequences clarifies the concept of offsets, slicing, and `cumsum` in the context of handling variable-length text data during batch processing.

<u>The offset is a tensor of delimiters to represent the beginning index of the individual sequence in the text tensor. Label is a tensor saving the labels of individual text entries</u>

In [None]:
def collate_batch(batch):
    """Collate (label, text) samples into ``(labels, flat_text, offsets)``.

    All token tensors are concatenated into one 1-D tensor; ``offsets`` holds
    the position in that tensor at which each sample starts. All three
    tensors are moved to ``device``.
    """
    labels, token_tensors = [], []
    # Leading 0 = start of the first sample; then one length per sample
    lengths = [0]

    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))

        tokens = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_tensors.append(tokens)
        lengths.append(tokens.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Drop the last length, then accumulate lengths into start positions
    start_positions = torch.tensor(lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(token_tensors)

    return label_tensor.to(device), flat_text.to(device), start_positions.to(device)

In [None]:
# # about cumsum function

# import torch

# # Sample tensor
# tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])

# # Cumulative sum across rows (dim=0)
# cumulative_sum_rows = torch.cumsum(tensor, dim=0)
# print(cumulative_sum_rows)  # Output: tensor([[1, 2, 3], [5, 7, 9]])


In [None]:
# Re-create the training iterator and batch it 8 samples at a time without
# shuffling; collate_batch turns each batch into (labels, flat text, offsets)
train_iter = AG_NEWS(split="train")
dataloader = DataLoader(
    train_iter, batch_size=8,
    shuffle=False, collate_fn=collate_batch
)

The `offsets` parameter plays a crucial role in efficiently handling variable-length text sequences within the `embedding` layer (specifically, the `nn.EmbeddingBag` module). Here's a breakdown of how it works:

**`nn.EmbeddingBag` for Variable-Length Sequences:**

- The standard `nn.Embedding` module assumes all sequences have the same length.
- However, in text classification, sequences often have different lengths.
- `nn.EmbeddingBag` overcomes this limitation by efficiently processing variable-length sequences.

**How Offsets Help:**

1. **Combined Text List (`text`):**

   - Imagine a tensor `text` that combines all processed sequences (e.g., tokenized) from the batch into a single list.
   - This `text` tensor might contain all the tokens from all sequences, one after another.

2. **`offsets`:**

   - This parameter is a tensor (often created from a list) that provides the starting index for each sequence within the combined `text` tensor.

   - For example, `offsets = [0, 5, 9]` might indicate that:

     - Sequence 1 starts at index 0 in `text`.
     - Sequence 2 starts at index 5 in `text`.
     - Sequence 3 starts at index 9 in `text`.

**`embedding(text, offsets)` in Action:**

1. Leveraging Offsets:
   - The `embedding` layer (using `nn.EmbeddingBag`) uses `offsets` to identify the relevant subsequence for each sequence within the combined `text` tensor.
2. Embedding Calculation:
   - For each sequence, it extracts the corresponding subsequence from `text` based on the starting index provided by `offsets`.
   - This subsequence represents the sequence's tokens.
3. Embedding Bag Operation:
   - `nn.EmbeddingBag` then looks up the embedding vector of every token in the extracted subsequence and reduces them (by default, by averaging) into a single dense vector for that sequence.

**Benefits of `offsets`:**

- Enables efficient processing of variable-length sequences in a single batch.
- Reduces memory overhead compared to padding sequences to a fixed length.

**Further Explanation:**

- The `nn.EmbeddingBag` layer holds a learnable weight matrix that maps each word in the vocabulary to a dense embedding vector of size `embed_dim`; in this notebook it is randomly initialized and trained together with the classifier, not pre-trained.
- By looking up the indices of tokens in the subsequence (extracted using `offsets`), the `embedding` layer creates an embedding representation for the entire sequence.

**In essence, `offsets` act as a guide for the `embedding` layer to efficiently extract and process individual sequences within the combined `text` tensor, enabling effective text classification even with variable-length text data.**

In [None]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by a linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding bag and classifier head, then initialise them.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: size of each embedding vector.
            num_class: number of output classes.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices uniformly from [-0.5, 0.5]; zero the bias."""
        bound = 0.5
        # Order matters for reproducibility under a fixed seed:
        # embedding table first, classifier weights second.
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return one row of class scores per bag delimited by ``offsets``."""
        bag_vectors = self.embedding(text, offsets)
        logits = self.fc(bag_vectors)
        return logits

## The AG_NEWS dataset has four labels and therefore the number of classes is four.
1 : World   
2 : Sports    
3 : Business    
4 : Sci/Tec   

In [None]:
# One pass over the training split to count the distinct labels.
train_iter = AG_NEWS(split="train")
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)
emsize = 64  # embedding dimension
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [None]:
import time

In [None]:
def train(dataloader):
    """Train the global ``model`` for one pass over ``dataloader``.

    Each batch is unpacked as ``(label, text, offsets)``. Uses the
    module-level ``model``, ``optimizer``, ``criterion`` and ``epoch`` (the
    latter only for the log line). Every 500 batches (except batch 0) the
    running accuracy is printed and the running counters are reset.
    """
    model.train()
    log_interval = 500
    correct = seen = 0
    start_time = time.time()

    for batch_idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        logits = model(text, offsets)
        criterion(logits, label).backward()
        # Clip the global gradient norm to 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        correct += (logits.argmax(1) == label).sum().item()
        seen += label.size(0)

        if batch_idx == 0 or batch_idx % log_interval != 0:
            continue

        # Measured for parity with the original code; not part of the log line.
        elapsed = time.time() - start_time
        print(
            "| epoch {:3d} | {:5d} | {:5d} batches"
            "| accuracy {:8.3f}".format(
                epoch, batch_idx, len(dataloader), correct / seen
            )
        )

        correct = seen = 0
        start_time = time.time()


In [None]:
def evaluate(dataloader):
    """Return the accuracy of the global ``model`` over ``dataloader``.

    Each batch is unpacked as ``(label, text, offsets)``. The model is put in
    eval mode and gradients are disabled. No loss is computed: only the
    fraction of correct arg-max predictions is needed, so this function does
    not depend on ``criterion``.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    correct, seen = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            predicted_label = model(text, offsets)
            correct += (predicted_label.argmax(1) == label).sum().item()
            seen += label.size(0)
    return correct / seen

# `random_split` and `to_map_style_dataset`

In [None]:
# Demo: split range(10) by fractions (50% / 30% / 20%) with a seeded generator
# so the assignment of items to groups is reproducible.
# Imported here so this cell runs on its own; elsewhere in the file the import
# only appears in a later cell.
from torch.utils.data import random_split

generator = torch.Generator().manual_seed(40)
a = random_split(range(10), [0.5, 0.3, 0.2], generator=generator)
for group in a:
    for each in group:
        print(each)
    print("---")

In [None]:
# Demo: split range(10) by absolute lengths (3 and 7 items) with a seeded
# generator so the assignment of items to groups is reproducible.
# Imported here so this cell runs on its own; elsewhere in the file the import
# only appears in a later cell.
from torch.utils.data import random_split

generator = torch.Generator().manual_seed(40)
a = random_split(range(10), [3, 7], generator=generator)
for group in a:
    for each in group:
        print(each)
    print("---")

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 64  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() multiplies the learning rate by 0.1 (step size 1).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None  # best validation accuracy seen so far; None before the first epoch
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

# All three loaders share the same batching, shuffling and collation settings.
_loader_options = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

train_dataloader = DataLoader(split_train_, **_loader_options)
valid_dataloader = DataLoader(split_valid_, **_loader_options)
test_dataloader = DataLoader(test_dataset, **_loader_options)

separator = "-" * 59

# ``epoch`` must stay a module-level name: train() reads it for its log line.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)

    # Keep the latest accuracy as the reference unless it dropped below the
    # stored one; on a drop, decay the learning rate instead.
    if total_accu is None or not (total_accu > accu_val):
        total_accu = accu_val
    else:
        scheduler.step()

    print(separator)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print(separator)

In [None]:
# Final check: report the accuracy on the test dataloader.
print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader)
print("test accuracy {:8.3f}".format(accu_test))

In [None]:
# Maps the 1-based AG_NEWS label id (as returned by predict) to its category name.
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}


def predict(text, text_pipeline):
    """Classify ``text`` with the global ``model`` and return a 1-based label id.

    ``text_pipeline`` converts the raw string into a sequence of token ids.
    The whole input is treated as a single bag starting at offset 0. The
    arg-max class index is shifted by one to match ``ag_news_label`` keys.
    """
    token_ids = torch.tensor(text_pipeline(text))
    single_offset = torch.tensor([0])  # one sequence, starting at index 0
    with torch.no_grad():
        scores = model(token_ids, single_offset)
    return scores.argmax(1).item() + 1


# Sample article used to try out the trained classifier. The backslash
# continuations keep this a single string literal (the leading spaces of each
# continued line are part of the string).
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# predict() builds its input tensors on the CPU, so the model is moved there too.
model = model.to("cpu")

# Translate the predicted 1-based label id into its category name.
print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)])

# Language Translation with nn.Transformer and torchtext
https://www.statmt.org/wmt16/multimodal-task.html#task1


### torchtext.datasets

Datasets

- [Text Classification](https://pytorch.org/text/stable/datasets.html#text-classification)
  - [AG_NEWS](https://pytorch.org/text/stable/datasets.html#ag-news)
  - [AmazonReviewFull](https://pytorch.org/text/stable/datasets.html#amazonreviewfull)
  - [AmazonReviewPolarity](https://pytorch.org/text/stable/datasets.html#amazonreviewpolarity)
  - [CoLA](https://pytorch.org/text/stable/datasets.html#cola)
  - [DBpedia](https://pytorch.org/text/stable/datasets.html#dbpedia)
  - [IMDb](https://pytorch.org/text/stable/datasets.html#imdb)
  - [MNLI](https://pytorch.org/text/stable/datasets.html#mnli)
  - [MRPC](https://pytorch.org/text/stable/datasets.html#mrpc)
  - [QNLI](https://pytorch.org/text/stable/datasets.html#qnli)
  - [QQP](https://pytorch.org/text/stable/datasets.html#qqp)
  - [RTE](https://pytorch.org/text/stable/datasets.html#rte)
  - [SogouNews](https://pytorch.org/text/stable/datasets.html#sogounews)
  - [SST2](https://pytorch.org/text/stable/datasets.html#sst2)
  - [STSB](https://pytorch.org/text/stable/datasets.html#stsb)
  - [WNLI](https://pytorch.org/text/stable/datasets.html#wnli)
  - [YahooAnswers](https://pytorch.org/text/stable/datasets.html#yahooanswers)
  - [YelpReviewFull](https://pytorch.org/text/stable/datasets.html#yelpreviewfull)
  - [YelpReviewPolarity](https://pytorch.org/text/stable/datasets.html#yelpreviewpolarity)
- [Language Modeling](https://pytorch.org/text/stable/datasets.html#language-modeling)
  - [PennTreebank](https://pytorch.org/text/stable/datasets.html#penntreebank)
  - [WikiText-2](https://pytorch.org/text/stable/datasets.html#wikitext-2)
  - [WikiText103](https://pytorch.org/text/stable/datasets.html#wikitext103)
- [Machine Translation](https://pytorch.org/text/stable/datasets.html#machine-translation)
  - [IWSLT2016](https://pytorch.org/text/stable/datasets.html#iwslt2016)
  - [IWSLT2017](https://pytorch.org/text/stable/datasets.html#iwslt2017)
  - [Multi30k](https://pytorch.org/text/stable/datasets.html#multi30k)
- [Sequence Tagging](https://pytorch.org/text/stable/datasets.html#sequence-tagging)
  - [CoNLL2000Chunking](https://pytorch.org/text/stable/datasets.html#conll2000chunking)
  - [UDPOS](https://pytorch.org/text/stable/datasets.html#udpos)
- [Question Answer](https://pytorch.org/text/stable/datasets.html#question-answer)
  - [SQuAD 1.0](https://pytorch.org/text/stable/datasets.html#squad-1-0)
  - [SQuAD 2.0](https://pytorch.org/text/stable/datasets.html#squad-2-0)
- [Unsupervised Learning](https://pytorch.org/text/stable/datasets.html#unsupervised-learning)
  - [CC100](https://pytorch.org/text/stable/datasets.html#cc100)
  - [EnWik9](https://pytorch.org/text/stable/datasets.html#enwik9)

Python offers a typing module that provides various data type annotations to improve code readability and maintainability.
These annotations don't affect the code's functionality at runtime, but they act as hints for developers and static type checkers.

In [1]:
# typing, Iterable, and List
from typing import Iterable, List

def print_all_items(items: Iterable) -> None:
    """Print every element of ``items`` on its own line.

    Nothing is printed for an empty iterable.
    """
    iterator = iter(items)
    for element in iterator:
        print(element)

# Example usage
# The variable annotation documents the element type for readers and static
# type checkers; it does not change runtime behaviour.
my_list: List[int] = [1, 2, 3]
print_all_items(my_list)  # This works as the list is iterable

1
2
3


In [2]:
# data sourcing and processing


from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List



In [5]:
# Point the Multi30k "train" and "valid" downloads at GitHub-hosted archives.
# NOTE(review): presumably a workaround for the default URLs being unavailable - confirm.
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

In [6]:
# Translation direction: German ("de") source, English ("en") target.
SRC_LANGUAGE = "de"
TGT_LANGUAGE = "en"

In [7]:
# Per-language lookup tables keyed by language code; filled in by later cells.
token_transform = {}  # language -> tokenizer callable
vocab_transform = {}  # language -> vocabulary built from the training split

```shell
pip install -U torchdata
pip install -U spacy
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```

In [8]:
# source and target language tokenizer
# Both use spaCy pipelines, which must be downloaded beforehand
# (see the `python -m spacy download ...` commands above).
token_transform[SRC_LANGUAGE] = get_tokenizer("spacy", language="de_core_news_sm")
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

In [9]:
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the token list for the ``language`` side of every sample.

    Args:
        data_iter: iterable of samples indexable as ``sample[0]`` (source
            text) and ``sample[1]`` (target text).
        language: ``SRC_LANGUAGE`` or ``TGT_LANGUAGE``; selects both the
            sample element and the tokenizer from ``token_transform``.

    Yields:
        The output of the language's tokenizer for each sample, in order.

    Raises:
        KeyError: on first iteration if ``language`` is neither of the two
            configured languages or has no registered tokenizer.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    # Both lookups are loop-invariant, so resolve them once up front.
    sample_position = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[sample_position])

In [10]:
# Special symbols and indices
# UNK = unknown token, PAD = padding, BOS / EOS = beginning / end of sequence.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0,1,2,3

# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]

In [11]:
# ln - language name


for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # training data iterator (re-created for every language)
    train_iter = Multi30k(
        split="train", language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)
    )
    # create torchtext vocab object from the token lists of this language;
    # the special symbols are inserted first so they get indices 0..3
    token_lists = yield_tokens(train_iter, ln)
    vocab_transform[ln] = build_vocab_from_iterator(
        token_lists,
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )

# Set ``UNK_IDX`` as the default index. This index is returned when
# the token is not found. If not set, it throws ``RuntimeError``
# when the queried token is not found in the Vocabulary.
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    vocab = vocab_transform[ln]
    vocab.set_default_index(UNK_IDX)

# Seq2seq - transformer

In [12]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")

![](./img/pe.png)   


1. **Imagine a numbering system:** Assign a unique number (position index) to each word in the sequence, starting from 0 for the first word.
2. **Create a special table (embedding):** This table has the same size (dimensions) as the word embeddings used in the model. Each row in the table represents the positional encoding for a specific position index.
3. **Encode position using sine and cosine:**  Instead of directly using the position index, the encoding for each position is created using sine and cosine functions. The value of these functions depends on both the position index and the dimension of the word embedding. This creates a smooth and continuous representation of position.
4. **Why sine and cosine?** These functions have periodic properties, meaning their values repeat after a certain interval. This helps the model learn long-range dependencies in the sequence, even if the positions are far apart.
5. **Adding the encoding:** The positional encoding for each word (a row from the table) is added to the corresponding word embedding. This injects information about the word's position into the word representation.

**Think of it like adding directional arrows to each word:**

- The arrow's strength (amplitude) depends on the position index (using sine and cosine).
- The direction (up/down) alternates between even and odd embedding dimensions: even dimensions use sine, odd dimensions use cosine.
- By adding these arrows to the word embeddings, the model can learn not only the meaning of each word but also its relative position within the sentence.

**Key Points:**

- Positional encoding helps the model understand the order of words in a sequence.
- It uses sine and cosine functions to create a smooth and continuous representation of position.
- The encoding is added to the word embeddings, enriching them with positional information.  

$$PE_{(pos, 2i)} = sin \bigg(\frac{pos}{10000^{2i/d_{model}}}\bigg)$$     
$$PE_{(pos, 2i+1)} = cos \bigg(\frac{pos}{10000^{2i/d_{model}}}\bigg)$$

In [13]:
# visualization of PE

import math
import numpy as np

MAX_SEQ_LEN = 128  # maximum length of a sentence - position
d_model = 512  # word embedding (and positional encoding) dimensions

# pre-allocate the table with zeros: one row per position
PE = np.zeros((MAX_SEQ_LEN, d_model))

# Fill one (sin, cos) column pair at a time. There are d_model // 2 pairs
# because the encoding uses pairs of (sin, cos); the denominator depends only
# on the pair index, so it is computed once per pair.
for i in range(d_model // 2):
    denominator = 10000 ** ((2 * i) / d_model)
    for pos in range(MAX_SEQ_LEN):
        theta = pos / denominator
        PE[pos, 2 * i] = math.sin(theta)
        PE[pos, 2 * i + 1] = math.cos(theta)


PE[:6, :6]

array([[ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [ 0.84147098,  0.54030231,  0.82185619,  0.56969501,  0.8019618 ,
         0.59737533],
       [ 0.90929743, -0.41614684,  0.93641474, -0.35089519,  0.95814438,
        -0.28628544],
       [ 0.14112001, -0.9899925 ,  0.24508542, -0.96950149,  0.34278182,
        -0.93941504],
       [-0.7568025 , -0.65364362, -0.65716686, -0.75374513, -0.54860557,
        -0.83608129],
       [-0.95892427,  0.28366219, -0.99385478,  0.11069182, -0.99822869,
        -0.05949362]])

In [14]:
# Instead of using for-loop, we can take advantage of NumPy’s parallelizable operations 
# (inspired by the PyTorch tutorial https://pytorch.org/tutorials/beginner/transformer_tutorial.html)

# Column vector of positions so it broadcasts against the row of frequencies.
pos = np.arange(MAX_SEQ_LEN)[:, np.newaxis]
# np.arange(0, d_model, 2) generates the even integers from 0 to d_model - 2
div_term = np.exp(np.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

# One angle per (position, dimension pair); reused for both sin and cos.
angles = pos * div_term
PE[:, 0::2] = np.sin(angles)
PE[:, 1::2] = np.cos(angles)

# How to derive the NumPy vectorized form from the for loop

### 1. Formula-1 (for loop)
## $\theta = \frac{pos}{10000^{2i/d_{model}}} = pos \cdot 10000^{-2i/d_{model}}$

### 2. Formula-2 (vectorized)
## ${div-term} = {np.exp}\bigg({np.arange(0, d_{model}, 2)} \cdot \bigg(-\frac{math.log(10000.0)}{d_{model}}\bigg)\bigg)$

### 3. Understanding logarithms and exponentiation
## $e^{\log_e(a)} = a \quad \text{and} \quad \log_e(a^b) = b \cdot \log_e(a)$


### 4. rewrite the scaling factor of the first formula with e raised to the power of logarithm of 10000
## $10000^{-2i/d_{model}} = e^{\log_e\left(10000^{-2i/d_{model}}\right)} = e^{-\frac{2i}{d_{model}} \cdot \log_e(10000)}$


### 5. rewrite the second formula 
## For each even index $2i$ produced by `np.arange(0, d_model, 2)`: $2i \cdot \left(-\frac{math.log(10000.0)}{d_{model}}\right) = -\frac{2i}{d_{model}} \cdot \log_e(10000)$


### 6. raising `e` to power of both expressions

# $$e^{-\log(10000) \cdot \frac{2i}{d_{model}}} = 10000^{-2i/d_{model}}$$
# $$e^{2i \cdot \left(-\frac{math.log(10000.0)}{d_{model}}\right)} = 10000^{-2i/d_{model}}$$


Due to the properties of logarithms and exponentiation:

- The exponents in both expressions are the same quantity, $-\frac{2i}{d_{model}} \cdot \log_e(10000)$, because $\log_e(a^b) = b \cdot \log_e(a)$; raising `e` to a logarithm undoes the logarithm ($e^{\log_e(a)} = a$).
- The negative sign in the exponent turns the division by $10000^{2i/d_{model}}$ in Formula 1 into a multiplication by $10000^{-2i/d_{model}}$.

Therefore, raising `e` to either exponent yields the same result, $10000^{-2i/d_{model}}$. This is the scaling factor for the dimension pair `(2i, 2i+1)`, and `pos * div_term` reproduces `theta` from the for loop.



# https://ai.stackexchange.com/questions/41670/why-use-exponential-and-log-in-positional-encoding-of-transformer  


# $$PE_{(pos, 2i)} = sin(\frac{pos}{10000^{2i/d_{model}}})$$
# $$PE_{(pos, 2i+1)} = cos(\frac{pos}{10000^{2i/d_{model}}})$$  


# $\because x=math.exp (log_e(x)) \& log x^a = a log x$
# $\therefore \frac{pos}{10000^{2i/d_{model}}} = exp(log(pos) - \frac{2i}{d_{model}}log(10000))$


>But perhaps you are interested in the question as to why the positional encodings are of this fairly whacky form. My understanding of the main intuitions are that sines and cosines interact really nicely with translation due to their periodicity, and by using exponentially spaced 'frequencies' one can extract signals for interactions at a large number of different 'length scales'; see this excellent blog post and associated links for further explanation. But as I understand it, the main reason this clever idea is so widely used is because it works so well in practice.  https://kazemnejad.com/blog/transformer_architecture_positional_encoding/




# https://www.kaggle.com/code/lianghsunhuang/positional-encoding

In [15]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table with one row per position (up to ``maxlen``) is precomputed,
    with sine in the even columns and cosine in the odd columns, and stored
    as a buffer. ``forward`` adds the first ``token_embedding.size(0)`` rows
    to its input and applies dropout.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super().__init__()
        even_dims = torch.arange(0, emb_size, 2)
        # den - denominator term, i.e. 10000^(-2i/emb_size) computed via exp/log
        den = torch.exp(-even_dims * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)

        angles = pos * den  # (maxlen, number of even dims) via broadcasting
        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer (not a parameter). The extra middle axis
        # gives shape (maxlen, 1, emb_size) so it broadcasts over dim 1 of
        # the input.
        self.register_buffer("pos_embedding", table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positional rows)``."""
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])
    

Multiplying the embedding vectors by math.sqrt(d_model) essentially scales their initial values. This scaling helps address the issues mentioned above: Normalization: By dividing the variance of the initial values by d_model, the gradients tend to have a more manageable magnitude during backpropagation, improving learning efficiency. Activation Functions: Scaling the initial values ensures they are within a range where activation functions can operate effectively, allowing for more nuanced gradients during training. Alternative Initializations:

While math.sqrt(d_model) is a common scaling factor, it's not the only approach. Some researchers use other techniques like uniform initialization within a specific range or initialization based on pre-trained word embeddings from sources like Word2Vec or GloVe.

In [16]:
# converting tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the scaled embeddings; ``tokens`` is cast to int64 first."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale


```python
 def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None,
                src_is_causal: Optional[bool] = None, tgt_is_causal: Optional[bool] = None,
                memory_is_causal: bool = False) -> Tensor:
```

In [17]:
# Model definition
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built on ``torch.nn.Transformer``.

    Source and target token ids are embedded with separate ``TokenEmbedding``
    tables, a shared ``PositionalEncoding`` is added, the result is passed
    through the transformer, and ``generator`` projects the decoder output to
    target-vocabulary scores.
    """

    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        # Projects decoder outputs to one score per target-vocabulary entry.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        # One positional-encoding module is shared by source and target.
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout
        )

    def forward(
        self,
        src: Tensor,
        trg: Tensor,
        src_mask: Tensor,
        tgt_mask: Tensor,
        src_padding_mask: Tensor,
        tgt_padding_mask: Tensor,
        memory_key_padding_mask: Tensor,
    ):
        """Run the full encoder-decoder pass and return vocabulary scores.

        The masks are forwarded positionally to ``nn.Transformer.forward`` in
        the order (src_mask, tgt_mask, memory_mask=None,
        src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask).
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask,
            tgt_mask,
            None,  # memory mask
            src_padding_mask,
            tgt_padding_mask,
            memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Return the encoder output (memory) for ``src``."""
        return self.transformer.encoder(
            self.positional_encoding(
                self.src_tok_emb(src)  # token emb + positional emb
            ),
            src_mask,
        )

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Return the decoder output for ``tgt`` given encoder ``memory``.

        The target tokens are embedded and position-encoded exactly as in
        ``forward`` before being passed to the transformer decoder together
        with ``memory`` and ``tgt_mask``. The result is not yet projected by
        ``generator``.
        """
        return self.transformer.decoder(
            self.positional_encoding(
                self.tgt_tok_emb(tgt)  # token emb + positional emb
            ),
            memory,
            tgt_mask,
        )

During training, we need a subsequent word mask that will prevent the model from looking into the future words when making predictions  



**Square Subsequent Mask:**

- A square subsequent mask is a binary matrix (often of size `sequence_length x sequence_length`) used to ensure the model only attends to past or current positions (including itself) during self-attention.
- Here's how it works:
  - All elements on the main diagonal (i, i) of the mask are set to 1, allowing the model to attend to the word itself (attend to its own embedding).
  - All elements **above** the main diagonal (i, j where i < j) are set to 0, preventing the model from attending to future positions (words that come later in the sequence).
  - The elements **below** the main diagonal (i, j where i > j) can be set to 1 (depending on the specific implementation). This allows the model to attend to past positions (words that came earlier in the sequence).
  
```
| 1 | 0 | 0 | 0 | 0 | (Word 1 can attend to itself)
| 1 | 1 | 0 | 0 | 0 | (Word 2 can attend to Word 1 and itself)
| 1 | 1 | 1 | 0 | 0 | (Word 3 can attend to Word 1, Word 2, and itself)
| 1 | 1 | 1 | 1 | 0 | (Word 4 can attend to Word 1, Word 2, Word 3, and itself)
| 1 | 1 | 1 | 1 | 1 | (Word 5 can attend to all previous words and itself)
```


In [18]:
def generate_square_subsequent_mask(sz, device=None):
    """Return an additive causal attention mask of shape ``(sz, sz)``.

    Entry ``(i, j)`` is ``0.0`` when position ``i`` may attend to position
    ``j`` (``j <= i``) and ``-inf`` when ``j`` lies in the future (``j > i``).
    The mask is float32 and is meant to be added to the attention scores.
    Allowed positions are 0.0 (not 1.0), the usual convention for additive
    masks.

    Args:
        sz: sequence length (number of rows and columns).
        device: device for the mask; defaults to the module-level ``DEVICE``.
    """
    if device is None:
        device = DEVICE
    # Start from all -inf and keep only the strictly upper triangle; triu
    # zeroes everything on and below the main diagonal.
    blocked = torch.full((sz, sz), float("-inf"), device=device)
    return torch.triu(blocked, diagonal=1)

In [19]:
def create_mask(src, tgt):
    """Build the attention and padding masks for a source/target batch.

    Returns ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:

    * ``src_mask``: all-False boolean square matrix sized by ``src.shape[0]``.
    * ``tgt_mask``: causal mask from ``generate_square_subsequent_mask`` sized
      by ``tgt.shape[0]``.
    * ``*_padding_mask``: True where the token equals ``PAD_IDX``, with the
      first two axes swapped relative to the input.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # encoder side: an all-False mask
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    # decoder masking
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_is_pad = src == PAD_IDX
    tgt_is_pad = tgt == PAD_IDX
    src_padding_mask = src_is_pad.transpose(0, 1)
    tgt_padding_mask = tgt_is_pad.transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# subsequent mask test

In [20]:
# 10x10 matrix with ones strictly above the main diagonal (the "future"
# positions) and zeros elsewhere, stored as uint8.
all_ones = np.ones((10, 10))
subsequent_mask = np.triu(all_ones, k=1).astype('uint8')
subsequent_mask

array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint8)

In [21]:
# Comparing with 0 flips the matrix: True on and below the diagonal, False above it.
torch.from_numpy(subsequent_mask) == 0  # subtle way to yield decoder masking with bool

tensor([[ True, False, False, False, False, False, False, False, False, False],
        [ True,  True, False, False, False, False, False, False, False, False],
        [ True,  True,  True, False, False, False, False, False, False, False],
        [ True,  True,  True,  True, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])

In [22]:
# Step 1: upper-triangular ones compared to 1, then transposed: a boolean
# lower-triangular matrix (True on and below the diagonal).
mask = (torch.triu(torch.ones((10, 10), device=DEVICE)) == 1).transpose(0, 1)
mask

tensor([[ True, False, False, False, False, False, False, False, False, False],
        [ True,  True, False, False, False, False, False, False, False, False],
        [ True,  True,  True, False, False, False, False, False, False, False],
        [ True,  True,  True,  True, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True]],
       device='cuda:0')

In [23]:
# Step 2: convert to float and overwrite the False entries with -inf;
# the True entries are still 1.0 at this point.
mask1 = mask.float().masked_fill(mask == 0, float('-inf'))
mask1

tensor([[1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')

In [24]:
# Step 3: overwrite the entries that were True with 0.0, leaving 0.0 / -inf.
mask2 = mask1.masked_fill(mask == 1, float(0.0))
mask2

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0')

In [25]:
# The same three steps as one chained expression; the result is 0.0 on and
# below the diagonal and -inf above it.
mask = (torch.triu(torch.ones((10, 10), device=DEVICE)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask    
    

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0')

In [26]:
# Same boolean lower-triangular mask as above, built on the default device.
upper_ones = torch.triu(torch.ones(10, 10))
mask = upper_ones.eq(1).transpose(0, 1)
mask

tensor([[ True, False, False, False, False, False, False, False, False, False],
        [ True,  True, False, False, False, False, False, False, False, False],
        [ True,  True,  True, False, False, False, False, False, False, False],
        [ True,  True,  True,  True, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])

In [27]:
# Convert the boolean mask to float and overwrite the False entries with -inf;
# the True entries become 1.0.
mask = mask.float().masked_fill(mask==0, float("-inf"))
mask

tensor([[1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., -inf, -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., -inf],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [28]:
# ``mask`` is already a float tensor here: replace the remaining 1.0 entries
# with 0.0, leaving 0.0 / -inf.
mask = mask.masked_fill(mask == 1, float(0.0))
mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [29]:
# Seed first so that model construction and initialisation are reproducible.
torch.manual_seed(0)

# Vocabulary sizes come from the vocabularies built above.
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])

# Model / training hyperparameters
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
)

# Xavier-initialise every parameter with more than one dimension
# (1-D parameters keep their default initialisation).
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Padding positions in the target do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)



In [30]:
# collation
from torch.nn.utils.rnn import pad_sequence

# Chain several single-argument callables into one callable.
def sequential_transforms(*transforms):
    """Compose ``transforms`` into a single function applied left to right.

    Args:
        *transforms: Callables that each take one argument. The output of
            one is fed to the next.

    Returns:
        A function that runs its argument through every transform in order
        and returns the final result. With no transforms it returns its
        argument unchanged.
    """
    def func(txt_input):
        value = txt_input
        for apply_step in transforms:
            value = apply_step(value)
        return value

    return func

# Wrap a list of token indices with BOS/EOS and return it as an index tensor.
def tensor_transform(token_ids: List[int]):
    """Return ``[BOS_IDX, *token_ids, EOS_IDX]`` as a 1-D ``torch.long`` tensor.

    Args:
        token_ids: Vocabulary indices of one tokenised sentence. May be empty.

    Returns:
        A 1-D tensor with ``len(token_ids) + 2`` elements.
    """
    # The dtype is pinned explicitly: torch.tensor([]) defaults to float32,
    # so an empty sentence could otherwise yield a non-integer result that
    # cannot be used as embedding indices.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipelines that turn a raw string into a tensor of indices.
# Each pipeline runs three stages in order:
#   1. token_transform[ln]  - tokenization
#   2. vocab_transform[ln]  - numericalization
#   3. tensor_transform     - add BOS/EOS and create the tensor
text_transform = {}
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    text_transform[ln] = sequential_transforms(
        token_transform[ln],
        vocab_transform[ln],
        tensor_transform,
    )


# Collate raw (source, target) string pairs into two padded batch tensors.
def collate_fn(batch):
    """Convert a batch of sentence pairs into padded index tensors.

    Args:
        batch: Iterable of ``(src_sample, tgt_sample)`` string pairs.

    Returns:
        ``(src_batch, tgt_batch)`` as produced by ``pad_sequence`` with
        ``padding_value=PAD_IDX``. ``batch_first`` is left at its default, so
        the sequence dimension comes first.
    """
    # Trailing newlines are stripped before the text pipeline runs. Each pair
    # is encoded together so samples are processed in their original order.
    encoded_pairs = [
        (
            text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")),
            text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")),
        )
        for src_sample, tgt_sample in batch
    ]
    src_tensors = [pair[0] for pair in encoded_pairs]
    tgt_tensors = [pair[1] for pair in encoded_pairs]

    return (
        pad_sequence(src_tensors, padding_value=PAD_IDX),
        pad_sequence(tgt_tensors, padding_value=PAD_IDX),
    )

In [31]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    """Run one optimisation pass over the Multi30k ``train`` split.

    Args:
        model: Module called with ``(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, src_padding_mask)``. It is
            switched to training mode.
        optimizer: Optimizer stepped once per batch.

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    model.train()
    losses = 0
    num_batches = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Decoder input is the target without its last position; the expected
        # output (tgt_out below) is the target without its first position.
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # NOTE(review): src_padding_mask is passed again as the last argument,
        # presumably as the memory padding mask — confirm against the model's
        # forward signature.
        logits = model(src, tgt_input, src_mask, tgt_mask,
                       src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        # Flatten to (positions, vocab) vs (positions,) for the loss.
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        num_batches += 1

    # Batches are counted in the loop above. Materialising the dataloader
    # again just to take its length would tokenize and collate the whole
    # dataset a second time.
    return losses / num_batches


def evaluate(model):
    """Compute the mean per-batch loss on the Multi30k ``valid`` split.

    Args:
        model: Module called with the same seven arguments as in training.
            It is switched to evaluation mode.

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    model.eval()
    losses = 0
    num_batches = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    # No backward pass happens here, so autograd is disabled to avoid building
    # and holding a graph for every batch.
    with torch.no_grad():
        for src, tgt in val_dataloader:
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            # Decoder input drops the last position; the expected output
            # drops the first.
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)

            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
            num_batches += 1

    # Batches are counted in the loop rather than by iterating the dataloader
    # a second time.
    return losses / num_batches

In [32]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

# Train for NUM_EPOCHS epochs. Only the training pass is timed; validation
# runs after the timer has been read.
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()

    val_loss = evaluate(transformer)

    print(
        f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
        f"Epoch time = {(end_time - start_time):.3f}s"
    )


# Generate an output sequence by always taking the highest-scoring next token.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one target sequence for ``src``.

    Args:
        model: Object providing ``encode``, ``decode`` and ``generator``.
        src: Source index tensor; moved to ``DEVICE``.
        src_mask: Mask passed to ``model.encode``; moved to ``DEVICE``.
        max_len: Upper bound on the number of rows in the result, including
            the start symbol.
        start_symbol: Index placed in the first row of the result.

    Returns:
        A ``torch.long`` tensor of shape ``(k, 1)`` with ``k <= max_len``
        (for ``max_len >= 1``). Decoding stops early once ``EOS_IDX`` has
        been appended.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Pure inference: each chosen token is extracted with .item(), so nothing
    # returned carries gradients and autograd bookkeeping is wasted work.
    with torch.no_grad():
        # The encoder output does not change between steps, so it is moved to
        # DEVICE once instead of on every iteration.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)

        for _ in range(max_len - 1):
            tgt_mask = generate_square_subsequent_mask(ys.size(0)).type(torch.bool).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the output at the last decoded position is scored.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            # The appended row uses src's dtype and device, as before.
            next_token = torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_token], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


# Translate one source-language sentence into the target language.
def translate(model: torch.nn.Module, src_sentence: str):
    """Return the greedy translation of ``src_sentence`` as a string.

    Args:
        model: Model handed to ``greedy_decode``; switched to eval mode.
        src_sentence: Raw source-language sentence.

    Returns:
        The decoded target tokens joined by single spaces, with the literal
        ``"<bos>"`` and ``"<eos>"`` markers removed. The spaces that
        surrounded those markers are left in place.
    """
    model.eval()

    # Column vector of source indices: one row per token, a single column.
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    # All-False boolean mask over the source positions.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    # Allow the output to be up to 5 tokens longer than the encoded source.
    decoded = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX)
    token_ids = list(decoded.flatten().cpu().numpy())
    words = vocab_transform[TGT_LANGUAGE].lookup_tokens(token_ids)

    sentence = " ".join(words)
    for marker in ("<bos>", "<eos>"):
        sentence = sentence.replace(marker, "")
    return sentence



KeyboardInterrupt: 

In [None]:
# Translate one German sample sentence with the model's current weights.
sample_sentence = "Eine Gruppe von Menschen steht vor einem Iglu ."
print(translate(transformer, sample_sentence))