# Full Transformer Architecture

In [1]:
import torch.nn as nn
import torch
import random
import os
import pandas as pd
from itertools import chain
import math

#### Read Data

In [2]:
# train_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Train_400.tsv",sep="\t")
# test_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Test_100.tsv",sep="\t")
# print(train_df.shape, test_df.shape)
# train_df.head()

In [3]:
jumbled_df = pd.read_csv("../../Datasets/Jumble_Unjumble/processed_jumbled.txt",sep="\t",header=None)
unjumbled_df = pd.read_csv("../../Datasets/Jumble_Unjumble/processed_unjumbled.txt",sep="\t",header=None)
jumbled_df.columns = ["jumbled_sentences"]
unjumbled_df.columns = ["unjumbled_sentences"]
df = pd.concat([jumbled_df,unjumbled_df],axis=1)
train_df = df.sample(frac=0.8, random_state=42) 
test_df = df.drop(train_df.index)
print(train_df.shape, test_df.shape)
train_df.head()

(32368, 2) (8092, 2)


Unnamed: 0,jumbled_sentences,unjumbled_sentences
32760,tools and a man gardening inside two holding a...,a man and two women are inside a greenhouse ho...
31413,meandering at people of the walkway stand . up...,people stand at the bottom of a meandering wal...
4325,standing a rock . on man view the shorts a out...,a man in shorts is standing on a rock looking ...
28232,to a a in on shirt little red a holds pole nea...,a little girl in a red shirt holds on to a pol...
28438,children two in <unk> play the melting .,two children play in the melting <unk> .


#### Preprocess Train and Test Data
1. Lowercasing, removing stopwords. <br>
2. Stemming, Lemmetization. <br>
3. Tokenization. <br>
4. Here, we are just doing tokenization by splitting on space.

In [4]:
class Preprocessor:
    def __init__(self):
        self.tokenize_on = " "
    
    def tokenize(self,text_string):
        '''
        text_string = "This is one sentence."
        returns token_list = ["This","is","one","sentence."]
        '''
        token_list = text_string.split(self.tokenize_on)
        return token_list

In [5]:
preprocessor = Preprocessor()
train_df["jumbled_sentences"] = train_df["jumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
train_df["unjumbled_sentences"] = train_df["unjumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
test_df["jumbled_sentences"] = test_df["jumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
test_df["unjumbled_sentences"] = test_df["unjumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
print(train_df.shape, test_df.shape)
train_df.head()

(32368, 2) (8092, 2)


Unnamed: 0,jumbled_sentences,unjumbled_sentences
32760,"[tools, and, a, man, gardening, inside, two, h...","[a, man, and, two, women, are, inside, a, gree..."
31413,"[meandering, at, people, of, the, walkway, sta...","[people, stand, at, the, bottom, of, a, meande..."
4325,"[standing, a, rock, ., on, man, view, the, sho...","[a, man, in, shorts, is, standing, on, a, rock..."
28232,"[to, a, a, in, on, shirt, little, red, a, hold...","[a, little, girl, in, a, red, shirt, holds, on..."
28438,"[children, two, in, <unk>, play, the, melting,...","[two, children, play, in, the, melting, <unk>,..."


#### Create X_Encoder, X_Decoder and Y
1. X denotes Input, Y denotes Output. <br>
2. X_encoder is the matrix of tokens in jumbled_sentences, each sentence suffixed by "eos" token. <br>
3. X_decoder is the matrix of tokens in unjumbled_sentences, each sentence prefixed by "sos" token. X_decoder is required because we want to do <b>Teacher Forcing</b>, which means we want to provide the correct current token to decoder to predict next token, instead of relying only on its own prediction. <br>
4. Y is the matrix of unjumbled_sentences, each sentence suffixed by "eos" token. <br>

5. Do this for both train and test data.

In [6]:
def get_Xe_Xd_Y(dataframe, sos_token, eos_token):
    jumbled_sentences = dataframe["jumbled_sentences"].tolist()
    unjumbled_sentences = dataframe["unjumbled_sentences"].tolist()
    X_encoder_tokens = [el + [eos_token] for el in jumbled_sentences]
    X_decoder_tokens = [[sos_token] + el for el in unjumbled_sentences]
    Y_tokens = [el + [eos_token] for el in unjumbled_sentences]
    return X_encoder_tokens, X_decoder_tokens, Y_tokens

unknown_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"
X_encoder_tokens_tr, X_decoder_tokens_tr, Y_tokens_tr = get_Xe_Xd_Y(train_df, sos_token, eos_token)
X_encoder_tokens_test, X_decoder_tokens_test, Y_tokens_test = get_Xe_Xd_Y(test_df, sos_token, eos_token)
print("X Encoder train length:",len(X_encoder_tokens_tr))
print("X Encoder test length:",len(X_encoder_tokens_test))
print("Sample X_encoder_train:",X_encoder_tokens_tr[0])
print("Sample X_decoder_train:",X_decoder_tokens_tr[0])
print("Sample Y_train:",Y_tokens_tr[0])

X Encoder train length: 32368
X Encoder test length: 8092
Sample X_encoder_train: ['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.', 'women', 'a', 'greenhouse', '', '<eos>']
Sample X_decoder_train: ['<sos>', 'a', 'man', 'and', 'two', 'women', 'are', 'inside', 'a', 'greenhouse', 'holding', 'gardening', 'tools', '.', '']
Sample Y_train: ['a', 'man', 'and', 'two', 'women', 'are', 'inside', 'a', 'greenhouse', 'holding', 'gardening', 'tools', '.', '', '<eos>']


#### Build Vocab
1. Generally, Vocab is created from both Encoder and Decoder Tokens, consider Senetence Translation for ex, where encoder and decoder tokens can be in different languages. <br>
2. We can create Vocab separately for Encder and Decoder tokens, or can create shared vocab. Shared Vocab is preferable though. <br>
3. Also, Vocab is generated from only Training Data. <br>
4. In this case, we are using only Encoder tokens to create Vocab because Decoder Tokens are the same. Also, we are using both Train and Test Dataset to create Vocab, as our datasize is small.

In [7]:
class VocabBuilder:
    def __init__(self,token_corpus,unknown_token=None,pad_token=None,sos_token=None,eos_token=None):
        '''
        token_corpus = ['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.']
        '''
        self.token_corpus = token_corpus
        self.unknown_token = unknown_token or "<unk>"
        self.pad_token = pad_token or "<pad>"
        self.sos_token = sos_token or "<sos>"
        self.eos_token = eos_token or "<eos>"
        self.word_to_index, self.index_to_word = self.get_vocabs()
                        
    def get_vocabs(self):
        word_to_index = {}
        index_count = 0
        all_unique_words = set(self.token_corpus).difference(set(
            [self.unknown_token, self.pad_token, self.sos_token, self.eos_token]
        ))
        word_to_index[self.unknown_token] = 0
        word_to_index[self.pad_token] = 1
        word_to_index[self.sos_token] = 2
        word_to_index[self.eos_token] = 3
        
        for index, word in enumerate(all_unique_words):
            word_to_index[word] = index + 4
        if self.pad_token not in word_to_index: word_to_index[self.pad_token] = index + 1
        if self.sos_token not in word_to_index: word_to_index[self.sos_token] = index + 2
        if self.eos_token not in word_to_index: word_to_index[self.eos_token] = index + 3
        if self.unknown_token not in word_to_index: word_to_index[self.unknown_token] = index + 4
        index_to_word = {v:k for k,v in word_to_index.items()}
        return word_to_index, index_to_word

In [8]:
token_corpus_1 = list(chain.from_iterable(X_encoder_tokens_tr)) # flattens a 2D list ot 1D
token_corpus_2 = list(chain.from_iterable(X_encoder_tokens_test))  # flattens a 2D list ot 1D
token_corpus = token_corpus_1 + token_corpus_2
print(token_corpus[:20])

['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.', 'women', 'a', 'greenhouse', '', '<eos>', 'meandering', 'at', 'people', 'of', 'the']


In [9]:
vocab_builder = VocabBuilder(token_corpus,unknown_token,pad_token,sos_token,eos_token)
print("WordToIndex Dict length:",len(vocab_builder.word_to_index))
print("IndexToWord Dict length:",len(vocab_builder.index_to_word))

WordToIndex Dict length: 5242
IndexToWord Dict length: 5242


#### Map X_encoder, X_decoder and Y using Vocab 

In [10]:
class Token_Index_Mapper:
    def __init__(self,token_to_index,index_to_token, unknown_token):
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.unknown_token = unknown_token
    
    def get_encoding(self,sentence):
        '''
        sentence must be a list of tokens.
        Ex: ["Climate","change","is","a","pressing","global","issue"]
        '''
        encoded_sentence = []
        for token in sentence:
            if token in self.token_to_index: encoded_sentence.append(self.token_to_index[token])
            else: encoded_sentence.append(self.token_to_index[self.unknown_token])
        return encoded_sentence
    
    def get_decoding(self,encoded_sentence):
        '''
        encoded_sentence must be a list of vocab indices.
        Ex: encoded_sentence = [24,21,4,1,..] 
        '''
        sentence = [self.index_to_token[index] for index in encoded_sentence]
        return " ".join(sentence)

In [11]:
def map_tokens_to_indices(token_index_mapper, max_sequence_length, token_matrix):
    index_matrix = []
    for el in token_matrix:
        el = el[:max_sequence_length] # truncate sentence to max_seq_length
        if len(el) < max_sequence_length:
            pad_tokens_to_append = max_sequence_length - len(el)
            el = el + [pad_token]*pad_tokens_to_append
        index_matrix.append(token_index_mapper.get_encoding(el))
    return index_matrix

In [12]:
max_sequence_length = 25
token_index_mapper = Token_Index_Mapper(vocab_builder.word_to_index, vocab_builder.index_to_word, unknown_token)
X_encoder_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_encoder_tokens_tr)
X_decoder_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_decoder_tokens_tr)
Y_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, Y_tokens_tr)

X_encoder_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_encoder_tokens_test)
X_decoder_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_decoder_tokens_test)
Y_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, Y_tokens_test)
print("X Encoder train length:",len(X_encoder_indices_tr))
print("X Encoder test length:",len(X_encoder_indices_test))
print("Sample X_encoder_train:",X_encoder_indices_tr[0])
print("Sample X_decoder_train:",X_decoder_indices_tr[0])
print("Sample Y_train:",Y_indices_tr[0])

X Encoder train length: 32368
X Encoder test length: 8092
Sample X_encoder_train: [986, 4094, 4391, 3020, 4636, 5047, 2764, 2011, 5124, 4821, 2055, 4391, 4671, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sample X_decoder_train: [2, 4391, 3020, 4094, 2764, 2055, 5124, 5047, 4391, 4671, 2011, 4636, 986, 4821, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sample Y_train: [4391, 3020, 4094, 2764, 2055, 5124, 5047, 4391, 4671, 2011, 4636, 986, 4821, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Define Model 
1. Encoder
2. Decoder

In [13]:
class EmbeddingClass(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length):
        super(EmbeddingClass, self).__init__()
        self.max_token_length = max_token_length
        self.tok_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(max_token_length, embedding_dim)
    
    def forward(self,token_ids, debug = False):
        # create position ids
        batch_size = token_ids.shape[0]
        position_id = range(0,self.max_token_length)
        position_ids = torch.tensor([position_id]*batch_size)
        
        token_embeddings = self.tok_embedding(token_ids)
        position_embeddings = self.pos_embedding(position_ids)
        
        embbeded_x = token_embeddings + position_embeddings
        
        if debug:
            print("----------Embedding Class------------")
            print("token_ids shape:",token_ids.shape)
            print("batch_size:", batch_size)
            print("position_ids shape:", position_ids.shape)
            print("token_embeddings shape:", token_embeddings.shape)
            print("position_embeddings shape:", position_embeddings.shape)
            print("embbeded_x shape:", embbeded_x.shape)
        
        return embbeded_x

In [14]:
class SelfAttention(nn.Module):
    def __init__(self, embedding_dim, head_dim):
        super(SelfAttention, self).__init__()
        self.head_dim = head_dim
        self.W_K = nn.Linear(embedding_dim, head_dim)
        self.W_Q = nn.Linear(embedding_dim, head_dim)
        self.W_V = nn.Linear(embedding_dim, head_dim)
    
    def forward(self, embedded_x1, embedded_x2, mask_matrix = None, debug = False):
        # Batch_Size = B, Seq_Length = L, Embedding_Dim = D, Head_Dim = H
        # embedded_x shape = (B X L X D)
        q = self.W_Q(embedded_x1) # q shape = (B X L X H)
        k = self.W_K(embedded_x2) # k shape = (B X L X H)
        v = self.W_V(embedded_x2) # v shape = (B X L X H)
        
        scores = torch.bmm(q, k.transpose(1,2))/math.sqrt(self.head_dim)
        # scores shape = (B X L X L)
        # torch.bmm() does the batch matrix multiplication:
        # input_1 = (b X n X m), input_2 = (b X m X p)
        # op = torch.zeros(b X n X p)
        # for each matrix (n X m) in input_1 and (m X p) in input_2, 
        # torch.bmm() will output an (n X p) matrix. 
        # This will be done 'b' times (i.e for all b matrices)
        
        if mask_matrix is not None and mask_matrix.shape[0] > 0:
            scores = scores.masked_fill(mask_matrix==0, float("-inf"))
        weights = torch.softmax(scores, dim=2) # (B X L X L)
        output = torch.bmm(weights, v) # (B X L X H)
        
        if debug:
            print("----------Self-Attention------------")
            print("q shape:", q.shape)
            print("k shape:", k.shape)
            print("v shape:", v.shape)
            if mask_matrix is not None and mask_matrix.shape[0] > 0: print("mask_matrix shape:", mask_matrix.shape)
            print("scores shape:", scores.shape)
            print("weights shape:", weights.shape)
            print("output shape:", output.shape)
        
        return output

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        self.sa_list = nn.ModuleList([
           SelfAttention(embedding_dim, self.head_dim) 
            for _ in range(self.num_heads)
        ])
        self.linear = nn.Linear(embedding_dim, embedding_dim)
    
    def forward(self, embedded_x1, embedded_x2, mask_matrix = None, debug = False):
        multiple_self_attn_op = [
            self_attn_head(embedded_x1, embedded_x2, mask_matrix, debug) 
            for self_attn_head in self.sa_list
        ]
        concatenated_op = torch.cat(multiple_self_attn_op,axis=2)
        linear_op = self.linear(concatenated_op)

        if debug:
            print("----------MHA------------")
            print("embedded_x1 shape:", embedded_x1.shape)
            print("embedded_x2 shape:", embedded_x2.shape)
            print("multiple_self_attn_op len:", len(multiple_self_attn_op))
            print("concatenated_op shape:", concatenated_op.shape)
            print("linear_op shape:", linear_op.shape)
        return linear_op

In [16]:
class FeedForward(nn.Module):
    def __init__(self,embedding_dim):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(embedding_dim, 4*embedding_dim)
        self.linear_2 = nn.Linear(4*embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(0.3)
        self.gelu = nn.GELU()
        
    def forward(self,mha_op, debug=False):
        linear_1_op = self.linear_1(mha_op)
        linear_1_op = self.gelu(linear_1_op)
        linear_2_op = self.linear_2(linear_1_op)
        linear_2_op = self.dropout(linear_2_op)
        if debug:
            print("----------Feed Forward------------")
            print("mha_op shape:", mha_op.shape)
            print("linear_1_op shape:", linear_1_op.shape)
            print("linear_2_op shape:", linear_2_op.shape)
        return linear_2_op

In [17]:
class EncoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(EncoderBlock, self).__init__()
        self.mha = MultiHeadAttention(embedding_dim, num_heads)
        self.ff_layer = FeedForward(embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
    
    def forward(self, embedded_x, mask_matrix, debug=False):
        mha_op = self.mha(embedded_x, embedded_x, mask_matrix, debug)
        layer_norm_op_1 = self.layer_norm(mha_op + embedded_x)
        ff_op = self.ff_layer(layer_norm_op_1, debug)
        layer_norm_op_2 = self.layer_norm(ff_op + layer_norm_op_1)
        
        if debug:
            print("----------EncoderBlock------------")
            print("embedded_x shape:", embedded_x.shape)
            print("mha_op shape:", mha_op.shape)
            print("layer_norm_op_1 shape:", layer_norm_op_1.shape)
            print("ff_op shape:", ff_op.shape)
            print("layer_norm_op_2 shape:", layer_norm_op_2.shape)
        return layer_norm_op_2

In [18]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length, num_heads):
        super(Encoder, self).__init__()
        self.embedding_obj = EmbeddingClass(
            vocab_size, embedding_dim, max_token_length
        )
        self.encoder_blocks = nn.ModuleList([
            EncoderBlock(embedding_dim, num_heads) for _ in range(6)
        ])
    
    def forward(self, token_ids, debug=False):
        embedded_x = self.embedding_obj(token_ids, debug)
        for i,block in enumerate(self.encoder_blocks):
            if i == 0: block_op  = block(embedded_x, None, debug)
            else:
                block_op  = block(block_op, None, debug)
        if debug:
            print("----------Encoder------------")
            print("token_ids shape:", token_ids.shape)
            print("block_op shape:", block_op.shape)
        return block_op

In [19]:
class DecoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(DecoderBlock, self).__init__()
        self.mha = MultiHeadAttention(embedding_dim, num_heads)
        self.ff_layer = FeedForward(embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
    
    def forward(self, embedded_x, encoder_op, mask_matrix, debug=False):
        masked_mha_op = self.mha(embedded_x, embedded_x, mask_matrix, debug)
        layer_norm_op_1 = self.layer_norm(masked_mha_op + embedded_x)
        
        mha_cross_attn_op = self.mha(layer_norm_op_1, encoder_op, None, debug)
        layer_norm_op_2 = self.layer_norm(mha_cross_attn_op + layer_norm_op_1)
        
        ff_op = self.ff_layer(layer_norm_op_2, debug)
        layer_norm_op_3 = self.layer_norm(ff_op + layer_norm_op_2)
        
        if debug:
            print("----------DecoderBlock------------")
            print("embedded_x shape:", embedded_x.shape)
            print("encoder_op shape:", encoder_op.shape)
            print("masked_mha_op shape:", masked_mha_op.shape)
            print("layer_norm_op_1 shape:", layer_norm_op_1.shape)
            print("mha_cross_attn_op shape:", mha_cross_attn_op.shape)
            print("layer_norm_op_2 shape:", layer_norm_op_2.shape)
            print("ff_op shape:", ff_op.shape)
            print("layer_norm_op_3 shape:", layer_norm_op_3.shape)
        return layer_norm_op_3

In [20]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length, num_heads):
        super(Decoder, self).__init__()
        self.embedding_obj = EmbeddingClass(
            vocab_size, embedding_dim, max_token_length
        )
        self.encoder_blocks = nn.ModuleList([
            DecoderBlock(embedding_dim, num_heads) for _ in range(6)
        ])
        self.mask_matrix = torch.tril(
            torch.ones(max_token_length, max_token_length)
        )
        self.linear = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, token_ids, encoder_op, debug=False):
        embedded_x = self.embedding_obj(token_ids, debug)
        for i,block in enumerate(self.encoder_blocks):
            if i == 0: block_op  = block(
                embedded_x, encoder_op, self.mask_matrix, debug
            )
            else:
                block_op  = block(
                    block_op, encoder_op, self.mask_matrix, debug
                )
        reshaped_output = block_op.reshape(
            -1,block_op.shape[2]
        )
        linear_op = self.linear(reshaped_output)
        if debug:
            print("----------Decoder------------")
            print("token_ids shape:", token_ids.shape)
            print("block_op shape:", block_op.shape)
            print("reshaped_output shape:", reshaped_output.shape)
            print("linear_op shape:", linear_op.shape)
        return linear_op

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__() 
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, encoder_input, decoder_input, debug=False): 
        encoder_outputs = self.encoder(encoder_input, debug) 
        decoder_outputs = self.decoder(decoder_input, encoder_outputs, debug)
        
        if debug:
            print("----------Seq2Seq------------")
            print("encoder_outputs shape:", encoder_outputs.shape)
            print("decoder_outputs shape:", decoder_outputs.shape)
            
        return decoder_outputs

In [40]:
def predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length):
    model.eval()
    sos_token = [token_index_mapper.token_to_index["<sos>"]] # 1st token as decoder input is <sos>.
    unjumbled_sentence = []
    with torch.no_grad():
        encoder_outputs = model.encoder(Xe_b)
        for i in range(max_sequence_length):
            if i == 0:
                decoder_input = sos_token*max_sequence_length
                decoder_op = model.decoder(torch.tensor([decoder_input]).to(device), encoder_outputs)
            else: 
                # After 1st time step, input to decoder is the predicted token of previous time step.
                # and hidden state input to decoder is the hidden state output of decoder of previous time step.
                
                # To get the predicted token of previous time step:
                # first, do the softmax on decoder_op of previous time step
                softmax_op = torch.softmax(decoder_op,axis=1) # decoder_op is (1 X Vocab_Size),
#                 print("Softmax op shape:",softmax_op.shape)
                # next, take the token with max probability
                # (softmax_op is also [1 X Vocab_Size], as we have taken softmax along axis=1, which
                # has simply converted the logits to probabilities.)
                # torch.argmax() returns a tensor([]). The list will contain as many elements as 0th dimension of softmax_op.
                # because we are taking argmax along axis = 1.
                # In this case, softmax_op has only 1 token in 0th dimension, so the list has only 1 element.
                # torch.tensor([]).tolist() gives out the []
                predicted_token = torch.argmax(softmax_op,axis=1).tolist()
                decoder_input[i] = predicted_token[i-1]
                decoder_op = model.decoder(
                    torch.tensor([decoder_input]).to(device), encoder_outputs
                )
                unjumbled_sentence.append(token_index_mapper.index_to_token[predicted_token[i-1]])
                if predicted_token[i-1] == token_index_mapper.token_to_index["<eos>"]: break
        print("_______________________________________")
        print(unjumbled_sentence)
        return unjumbled_sentence

### Sample Training And Prediction

In [23]:
INPUT_DIM = len(token_index_mapper.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(token_index_mapper.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 512 
DEC_EMB_DIM = 512 
NUM_HEADS = 8
device = "cpu"
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, max_sequence_length, NUM_HEADS) 
dec = Decoder(INPUT_DIM, DEC_EMB_DIM, max_sequence_length, NUM_HEADS) 
model = Seq2Seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(model.parameters()) 
criterion = nn.CrossEntropyLoss(ignore_index=token_index_mapper.token_to_index["<pad>"])

In [26]:
data_index = 6
batch_size = 5

model.train()
optimizer.zero_grad()
Xe_b = torch.tensor(X_encoder_indices_tr[data_index:data_index+batch_size]).to(device)
Xd_b = torch.tensor(X_decoder_indices_tr[data_index:data_index+batch_size]).to(device)
Y_b = torch.tensor(Y_indices_tr[data_index:data_index+batch_size]).to(device)
print(Xe_b.shape,Xd_b.shape,Y_b.shape)
output = model(Xe_b, Xd_b, debug=True)
loss = criterion(output, Y_b.view(-1))
loss.backward()
optimizer.step()

torch.Size([5, 25]) torch.Size([5, 25]) torch.Size([5, 25])
----------Embedding Class------------
token_ids shape: torch.Size([5, 25])
batch_size: 5
position_ids shape: torch.Size([5, 25])
token_embeddings shape: torch.Size([5, 25, 512])
position_embeddings shape: torch.Size([5, 25, 512])
embbeded_x shape: torch.Size([5, 25, 512])
----------Self-Attention------------
q shape: torch.Size([5, 25, 64])
k shape: torch.Size([5, 25, 64])
v shape: torch.Size([5, 25, 64])
scores shape: torch.Size([5, 25, 25])
weights shape: torch.Size([5, 25, 25])
output shape: torch.Size([5, 25, 64])
----------Self-Attention------------
q shape: torch.Size([5, 25, 64])
k shape: torch.Size([5, 25, 64])
v shape: torch.Size([5, 25, 64])
scores shape: torch.Size([5, 25, 25])
weights shape: torch.Size([5, 25, 25])
output shape: torch.Size([5, 25, 64])
----------Self-Attention------------
q shape: torch.Size([5, 25, 64])
k shape: torch.Size([5, 25, 64])
v shape: torch.Size([5, 25, 64])
scores shape: torch.Size([5, 

In [62]:
# Randomly select one sentence from Test Data to Predict.
data_index = random.randint(0,100)
# Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
Xe_b = torch.tensor([X_encoder_indices_test[data_index]]).to(device)
print("Test Jumbled sentence:",X_encoder_tokens_test[data_index])
print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
unjumbled_sentence = predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length)

Test Jumbled sentence: ['dog', 'out', 'the', 'in', 'is', 'brown', 'the', '.', 'snow', '', '<eos>']
Test Unjumbled sentence: ['the', 'brown', 'dog', 'is', 'out', 'in', 'the', 'snow', '.', '', '<eos>']
_______________________________________
['the', 'brown', 'dog', 'is', 'out', 'in', 'the', 'snow', '.', '', '<eos>']


## For Actual Training

In [35]:
device = "cpu" #torch.device("cuda:0")
batch_size = 50
INPUT_DIM = len(token_index_mapper.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(token_index_mapper.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 512 
DEC_EMB_DIM = 512 
NUM_HEADS = 8
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, max_sequence_length, NUM_HEADS) 
dec = Decoder(INPUT_DIM, DEC_EMB_DIM, max_sequence_length, NUM_HEADS) 
model = Seq2Seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5) 
criterion = nn.CrossEntropyLoss(ignore_index=token_index_mapper.token_to_index["<pad>"])
epochs = 2

In [51]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) 

In [56]:
for i in range(epochs):
    model.train()
    epoch_loss = 0
    for j in range(0,len(X_encoder_indices_tr),batch_size):
        optimizer.zero_grad()
        Xe_b = torch.tensor(X_encoder_indices_tr[j:j+batch_size]).to(device)
        Xd_b = torch.tensor(X_decoder_indices_tr[j:j+batch_size]).to(device)
        Y_b = torch.tensor(Y_indices_tr[j:j+batch_size]).to(device)
        op = model(Xe_b,Xd_b)
        loss = criterion(op,Y_b.reshape(-1))
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if j%500 == 0: print("Epoch:",i,"Batch:",j,"Loss:",batch_loss)
    print("______________________________________")
    print("Epoch Loss:",epoch_loss)
    
    # Randomly select one sentence from Test Data to Predict.
    data_index = random.randint(0,100)
    # Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
    Xe_b = torch.tensor([X_encoder_indices_test[data_index]]).to(device)
    print("Test Jumbled sentence:",X_encoder_tokens_test[data_index])
    print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
    unjumbled_sentence = predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length)
    print("_______________________________________")

Epoch: 0 Batch: 0 Loss: 0.6898346543312073
Epoch: 0 Batch: 500 Loss: 0.7354006171226501
Epoch: 0 Batch: 1000 Loss: 0.700539231300354
Epoch: 0 Batch: 1500 Loss: 0.6414951086044312
Epoch: 0 Batch: 2000 Loss: 0.7601133584976196
Epoch: 0 Batch: 2500 Loss: 0.6712945103645325
Epoch: 0 Batch: 3000 Loss: 0.6413605809211731
Epoch: 0 Batch: 3500 Loss: 0.6541795134544373
Epoch: 0 Batch: 4000 Loss: 0.6091183423995972
Epoch: 0 Batch: 4500 Loss: 0.6493122577667236
Epoch: 0 Batch: 5000 Loss: 0.4903520345687866
Epoch: 0 Batch: 5500 Loss: 0.594270646572113
Epoch: 0 Batch: 6000 Loss: 0.7431533932685852
Epoch: 0 Batch: 6500 Loss: 0.6674348711967468
Epoch: 0 Batch: 7000 Loss: 0.7684808373451233
Epoch: 0 Batch: 7500 Loss: 0.4872722327709198
Epoch: 0 Batch: 8000 Loss: 0.5335842967033386
Epoch: 0 Batch: 8500 Loss: 0.6324428915977478
Epoch: 0 Batch: 9000 Loss: 0.6997290253639221
Epoch: 0 Batch: 9500 Loss: 0.6009303331375122
Epoch: 0 Batch: 10000 Loss: 0.9210977554321289
Epoch: 0 Batch: 10500 Loss: 0.705772995