# seq to seq and attention

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [None]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
SOS_token = 0
EOS_token = 1

In [None]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"SOS", 1:"EOS"}
        self.n_words = 2
        
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
            
    def addWord(self, word):
        if word not in self.word2index:
            # if not SOS or EOS, then the index 
            # will start from 2
            self.word2index[word] = self.n_words
            # The first actual word with counting
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [None]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")
    
    lines = open("./data/english-lang-trans/%s-%s.txt" % (lang1, lang2),\
                encoding="utf-8").read().strip().split("\n")
    
    pairs = [[normalizeString(s) for s in l.split("\t")] for l in lines]
    
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    return input_lang, output_lang, pairs 

In [None]:
# trim the data set to only relatively short and simple sentences.

MAX_LENGTH=10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
# prepare dataset

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s senence pairs" % len(pairs))
    print("Counting words")
    
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
        
    print("Counted words: ")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [None]:
input_lang, output_lang, pairs = prepareData("eng", "fra", True)
print(random.choice(pairs))

## seq2seq working flow
![seq2seq working flow](https://pytorch.org/tutorials/_images/encoder-network.png)

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self,input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)
        
    
    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

![seq2seq encoding - decoding](https://pytorch.org/tutorials/_images/decoder-network.png)

**Understanding squeeze(-1) and Detach in Decoder Input Processing (Seq2seq with Teacher Forcing):**

In a seq2seq model with teacher forcing, `squeeze(-1)` and `detach` are used together during decoder input processing to ensure proper shape compatibility and memory optimization. Here's a detailed breakdown:

**1. Context and Teacher Forcing:**

- Seq2seq models translate between sequences (e.g., text-to-text, speech-to-text).
- Teacher forcing is a training technique where the model receives the correct target sequence as input at each decoding step, along with previously generated elements.

**2. Decoder Input Processing:**

   **a. Target Sequence Reshaping (unsqueeze(1))**

     - The target sequence (ground truth) typically has a shape `(batch_size, target_length)`.
     - For teacher forcing, the decoder needs input one element at a time during each decoding step.
     - To achieve this, `unsqueeze(1)` is used during training:

       ```python
       # Example target sequence
       target_sequence = torch.randint(1, 10, (batch_size, target_length))

       # Teacher forcing input for first decoding step
       teacher_forcing_input = target_sequence[:, 0].unsqueeze(1)
       ```

     - Explanation:
       - `target_sequence[:, 0]` extracts the first element (index 0) from each sequence in the batch, resulting in a tensor of shape `(batch_size,)`.
       - `unsqueeze(1)` inserts a new dimension of size 1 at dimension 1 (the column dimension). This creates a tensor with shape `(batch_size, 1)`, aligning with the expected decoder input format at the first step.

   **b. Decoder Output Reshaping and Detachment (squeeze(-1).detach())**

     - After each decoding step, the decoder generates an output.
     - We need to compare the decoder output with the corresponding element in the target sequence for calculating the loss during training.
     - However, the decoder output might have a shape `(batch_size, target_length, hidden_size)`:

       - `batch_size`: Number of samples in the batch.
       - `target_length`: Length of the target sequence.
       - `hidden_size`: Model's internal hidden state dimension (representing extracted features).

     ```python
     # Example decoder output after a decoding step
     decoder_output = model(decoder_input)  # Model processes decoder input

     # Process decoder output for loss calculation
     processed_decoder_output = decoder_output.squeeze(-1).detach()
     ```

     - Explanation:
       - `squeeze(-1)` removes the dimension of size 1 at the last dimension (dimension -1). This ensures the decoder output has the same shape `(batch_size, target_length)` as the target sequence, allowing for element-wise comparison during loss calculation. Essentially, it extracts the meaningful content from the last dimension (`hidden_size`) by combining it with the previous dimensions (`batch_size` and `target_length`).
       - `.detach()` detaches the processed decoder output from the computational graph. Since we're using the ground truth for teacher forcing and not backpropagating through the decoder output in this step, detaching saves memory and avoids unnecessary computations.

**3. Key Points:**

- `unsqueeze(1)` prepares the target sequence for teacher forcing by creating an input with the correct shape for the first decoding step.
- `squeeze(-1).detach()` processes the decoder output by:
   - Removing the unnecessary `hidden_size` dimension for element-wise comparison.
   - Detaching the output from the computational graph for teacher forcing efficiency.

**In essence, these operations ensure that the decoder receives the appropriate teacher forcing input and that the decoder output is properly shaped for loss calculation during training.**

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self,hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long,\
                                   device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        
        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input,
                                                               decoder_hidden)
            decoder_outputs.append(decoder_output)
            
            if target_tensor is not None:
                # Teacher forcing, feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # teacher forcing
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach() # detach from hist as input
                
        decoder_outputs = torch.cat(decoer_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        # return `None` for consistency in the training loop
        return decoder_outputs, decoder_hidden, None
    
    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden
        

![](https://i.imgur.com/1152PYf.png)  
![](https://pytorch.org/tutorials/_images/attention-decoder-network.png)


In [None]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [None]:
# preparing taining data

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(" ")]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData("eng", "fra", True)
    
    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    
    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids
        
        
    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))
    
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, 
                                  batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
# This is a helper function to print time elapsed and estimated time remaining given the current time and progress %.

import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

> Change the backend to TkAgg, QtAgg, or WXAgg

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('QtAgg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
# actual train, evaluation

hidden_size = 128
batch_size = 32

input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

In [None]:
def showAttention(input_sentence, output_words, attentions):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions[0, :len(output_words), :])


evaluateAndShowAttention('il n est pas aussi grand que son pere')

evaluateAndShowAttention('je suis trop fatigue pour conduire')

evaluateAndShowAttention('je suis desole si c est une question idiote')

evaluateAndShowAttention('je suis reellement fiere de vous')

# preprocess custom text dataset using torchtext 

Read a dataset

Tokenize sentence

Apply transforms to sentence

Perform bucket batching

In [14]:
import spacy
import torchdata.datapipes as dp
import torchtext.transforms as T

from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm")
zh = spacy.load("zh_core_web_sm")
de = spacy.load("de_core_news_sm")
fr = spacy.load("fr_core_news_sm")

In [15]:
FILE_PATH = "data/deu-eng/deu.txt"
data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe, mode="rb")
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter="\t", as_tuple=True)

In [16]:
def showSample(data_pipe):
    for sample in data_pipe:
        print(sample)
        break

In [17]:
showSample(data_pipe)

('Go.', 'Geh.', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)')


In [18]:
def removeAttribution(row):
    """
    Keeping the first two elements in a tuple
    Keeping the first and second columns
    """
    return row[:2]

In [20]:
data_pipe = data_pipe.map(removeAttribution)

In [21]:
showSample(data_pipe)

('Go.', 'Geh.')


In [23]:
# tokenization
def engTokenize(text):
    """
    tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]

In [24]:
def deTokenize(text):
    """
    Tokenize a German text and return a list of tokens
    """
    return [token.text for token in de.tokenizer(text)]

In [25]:
# test for the tokenization
print(engTokenize("Hello world!!!"))
print(deTokenize("Hallo Welt!!!"))

['Hello', 'world', '!', '!', '!']
['Hallo', 'Welt', '!', '!', '!']


In [26]:
print(engTokenize("Have a good day!!!"))
print(deTokenize("Haben Sie einen guten Tag!!!"))

['Have', 'a', 'good', 'day', '!', '!', '!']
['Haben', 'Sie', 'einen', 'guten', 'Tag', '!', '!', '!']


In [27]:
# build vocabulary
def getTokens(data_iter, place):
    """
    Function to yield tokens from an iterator. Since, our iterator contains
    tuple of sentences (source and target), `place` parameters defines for which
    index to return the tokens for. `place=0` for source and `place=1` for target
    """
    for english, german in data_iter:
        if place == 0:
            yield engTokenize(english)
        else:
            yield deTokenize(german)

In [28]:
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 0),
    min_freq=2,
    specials=["<pad>", "<sos>", "<eos>", "<unk>"],
    special_first=True
)
source_vocab.set_default_index(source_vocab["<unk>"])

In [30]:
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [33]:
# test for vocab
print(source_vocab.get_itos()[:9])

['<pad>', '<sos>', '<eos>', '<unk>', '.', 'I', 'Tom', 'to', 'you']


In [34]:
# Numericalize sentences using vocabulary
# convert our sentences to corresponding indices
def getTransform(vocab):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_transform = T.Sequential(
        # converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    return text_transform

In [47]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]
print(f"Some sentence = ", end="")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence = ", end="")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end="")

Some sentence = I changed.
Transformed sentence = [1, 5, 510, 4, 2]
<sos>Ichanged.<eos>

In [48]:
def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair
    """
    
    return (
        getTransform(source_vocab)(engTokenize(sequence_pair[0])),
        getTransform(target_vocab)(deTokenize(sequence_pair[1]))
    )

In [49]:
data_pipe = data_pipe.map(applyTransform)
temp_list = list(data_pipe)
print(temp_list[0])

([1, 617, 4, 2], [1, 743, 4, 2])


In [57]:
index_to_string = source_vocab.get_itos()
for inx_list in temp_list[0]:
#     print(index_to_string[index], end="")
    output = []
    
    for inx in inx_list:
        print(index_to_string[inx], end="")

<sos>Go.<eos><sos>present.<eos>

In [54]:
def sortBucket(bucket):
    """
    Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequence.
    """
    return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))

In [58]:
data_pipe = data_pipe.bucketbatch(
    batch_size = 4,
    batch_num = 5,
    bucket_num = 1,
    use_in_batch_shuffle=False,
    sort_key=sortBucket
)

In [60]:
len(list(data_pipe))

69473

In [59]:
print(list(data_pipe)[0])

[([1, 5393, 21, 4, 2], [1, 7035, 31, 24, 2]), ([1, 5393, 21, 4, 2], [1, 16189, 31, 24, 2]), ([1, 5, 440, 4, 2], [1, 7, 673, 4, 2]), ([1, 5, 440, 4, 2], [1, 7, 3256, 4, 2])]


In [62]:
def separateSourceTarget(sequence_pairs):
    """
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    sources,targets = zip(*sequence_pairs)
    return sources,targets

In [63]:
## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
print(list(data_pipe)[0])

(([1, 1972, 69, 4, 2], [1, 617, 99, 4, 2], [1, 1044, 21, 4, 2], [1, 6923, 23, 195, 2]), ([1, 2200, 28, 24, 2], [1, 743, 106, 24, 2], [1, 375, 12, 31, 24, 2], [1, 7, 1041, 381, 24, 2]))


In [64]:
# padding
def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply paddjing
    """
    return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))

In [65]:
data_pipe = data_pipe.map(applyPadding)

In [83]:
#  index to string mapping to see how the sequence would look with tokens instead of indices

source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()


def showSomeTransformedSentences(data_pipe):
    """
    Function to show how the sentences look like after applying all transforms.
    Here we try to print actual words instead of corresponding index
    """
    for sources,targets in data_pipe:
        if sources[0][-1] != 0:
            continue # Just to visualize padding of shorter sentences
        for i in range(4):
            source = ""
            for token in sources[i]:
                source += " " + source_index_to_string[token]
            target = ""
            for token in targets[i]:
                target += " " + target_index_to_string[token]
            print(f"Source: {source}")
            print(f"Target: {target}")
        break

In [84]:
showSomeTransformedSentences(data_pipe)

Source:  <sos> I spit . <eos> <pad>
Target:  <sos> Ich habe gespuckt . <eos>
Source:  <sos> I wept . <eos> <pad>
Target:  <sos> Ich habe geweint . <eos>
Source:  <sos> I 'm 19 . <eos>
Target:  <sos> Ich bin 19. <eos> <pad>
Source:  <sos> I 'm OK . <eos>
Target:  <sos> Mir geht's gut . <eos>


# Text classification with the torchtext library
`!pip install -U portalocker>=2.0.0``

In [106]:
import torch
from torchtext.datasets import AG_NEWS

In [107]:
train_iter = iter(AG_NEWS(split="train"))

In [109]:
for i in range(20):
    print(next(train_iter))

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")
(3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')
(3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.")
(3, 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.')
(3, 'Oil prices soar to all-time record, posing new menace t

In [111]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [126]:
tokenizer = get_tokenizer("basic_english")
train_iter = AG_NEWS(split="train")

In [115]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

In [116]:
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [117]:
vocab(['here', 'is', 'an', 'example'])

[475, 21, 30, 5297]

In [123]:
vocab(["hello", "gears", "of", "war"])

[12544, 9199, 6, 269]

In [127]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [128]:
text_pipeline('here is the an example')

[475, 21, 2, 30, 5297]

In [129]:
label_pipeline('10')

9

In [131]:
from torch.utils.data import DataLoader

In [132]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def collate_batch(batch):
    label_list, text_list, offsets = [],[],[0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)