In [1]:
import torch.nn as nn
import torch
import random
import os
import pandas as pd
from itertools import chain

<h6> In this notebook, we will try to unjumble a sentence using Encoder-Decoder Architecture built using <br><br>
 Recurrent Networks like GRU, LSTM and Bi-directional LSTMs.</h6>
<h6> The Data is located here: ../../Datasets/Jumble_Unjumble/ </h6>

#### Read Data

In [2]:
# train_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Train_400.tsv",sep="\t")
# test_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Test_100.tsv",sep="\t")
# print(train_df.shape, test_df.shape)
# train_df.head()

In [3]:
jumbled_df = pd.read_csv("../../Datasets/Jumble_Unjumble/processed_jumbled.txt",sep="\t",header=None)
unjumbled_df = pd.read_csv("../../Datasets/Jumble_Unjumble/processed_unjumbled.txt",sep="\t",header=None)
jumbled_df.columns = ["jumbled_sentences"]
unjumbled_df.columns = ["unjumbled_sentences"]
df = pd.concat([jumbled_df,unjumbled_df],axis=1)
train_df = df.sample(frac=0.8, random_state=42) 
test_df = df.drop(train_df.index)
print(train_df.shape, test_df.shape)
train_df.head()

(32368, 2) (8092, 2)


Unnamed: 0,jumbled_sentences,unjumbled_sentences
32760,tools and a man gardening inside two holding a...,a man and two women are inside a greenhouse ho...
31413,meandering at people of the walkway stand . up...,people stand at the bottom of a meandering wal...
4325,standing a rock . on man view the shorts a out...,a man in shorts is standing on a rock looking ...
28232,to a a in on shirt little red a holds pole nea...,a little girl in a red shirt holds on to a pol...
28438,children two in <unk> play the melting .,two children play in the melting <unk> .


#### Preprocess Train and Test Data
1. Lowercasing, removing stopwords. <br>
2. Stemming, Lemmetization. <br>
3. Tokenization. <br>
4. Here, we are just doing tokenization by splitting on space.

In [4]:
class Preprocessor:
    def __init__(self):
        self.tokenize_on = " "
    
    def tokenize(self,text_string):
        '''
        text_string = "This is one sentence."
        returns token_list = ["This","is","one","sentence."]
        '''
        token_list = text_string.split(self.tokenize_on)
        return token_list

In [5]:
preprocessor = Preprocessor()
train_df["jumbled_sentences"] = train_df["jumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
train_df["unjumbled_sentences"] = train_df["unjumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
test_df["jumbled_sentences"] = test_df["jumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
test_df["unjumbled_sentences"] = test_df["unjumbled_sentences"].apply(lambda x: preprocessor.tokenize(x))
print(train_df.shape, test_df.shape)
train_df.head()

(32368, 2) (8092, 2)


Unnamed: 0,jumbled_sentences,unjumbled_sentences
32760,"[tools, and, a, man, gardening, inside, two, h...","[a, man, and, two, women, are, inside, a, gree..."
31413,"[meandering, at, people, of, the, walkway, sta...","[people, stand, at, the, bottom, of, a, meande..."
4325,"[standing, a, rock, ., on, man, view, the, sho...","[a, man, in, shorts, is, standing, on, a, rock..."
28232,"[to, a, a, in, on, shirt, little, red, a, hold...","[a, little, girl, in, a, red, shirt, holds, on..."
28438,"[children, two, in, <unk>, play, the, melting,...","[two, children, play, in, the, melting, <unk>,..."


#### Create X_Encoder, X_Decoder and Y
1. X denotes Input, Y denotes Output. <br>
2. X_encoder is the matrix of tokens in jumbled_sentences, each sentence suffixed by "eos" token. <br>
3. X_decoder is the matrix of tokens in unjumbled_sentences, each sentence prefixed by "sos" token. X_decoder is required because we want to do <b>Teacher Forcing</b>, which means we want to provide the correct current token to decoder to predict next token, instead of relying only on its own prediction. <br>
4. Y is the matrix of unjumbled_sentences, each sentence suffixed by "eos" token. <br>

5. Do this for both train and test data.

In [6]:
def get_Xe_Xd_Y(dataframe, sos_token, eos_token):
    jumbled_sentences = dataframe["jumbled_sentences"].tolist()
    unjumbled_sentences = dataframe["unjumbled_sentences"].tolist()
    X_encoder_tokens = [el + [eos_token] for el in jumbled_sentences]
    X_decoder_tokens = [[sos_token] + el for el in unjumbled_sentences]
    Y_tokens = [el + [eos_token] for el in unjumbled_sentences]
    return X_encoder_tokens, X_decoder_tokens, Y_tokens

unknown_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"
X_encoder_tokens_tr, X_decoder_tokens_tr, Y_tokens_tr = get_Xe_Xd_Y(train_df, sos_token, eos_token)
X_encoder_tokens_test, X_decoder_tokens_test, Y_tokens_test = get_Xe_Xd_Y(test_df, sos_token, eos_token)
print("X Encoder train length:",len(X_encoder_tokens_tr))
print("X Encoder test length:",len(X_encoder_tokens_test))
print("Sample X_encoder_train:",X_encoder_tokens_tr[0])
print("Sample X_decoder_train:",X_decoder_tokens_tr[0])
print("Sample Y_train:",Y_tokens_tr[0])

X Encoder train length: 32368
X Encoder test length: 8092
Sample X_encoder_train: ['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.', 'women', 'a', 'greenhouse', '', '<eos>']
Sample X_decoder_train: ['<sos>', 'a', 'man', 'and', 'two', 'women', 'are', 'inside', 'a', 'greenhouse', 'holding', 'gardening', 'tools', '.', '']
Sample Y_train: ['a', 'man', 'and', 'two', 'women', 'are', 'inside', 'a', 'greenhouse', 'holding', 'gardening', 'tools', '.', '', '<eos>']


#### Build Vocab
1. Generally, Vocab is created from both Encoder and Decoder Tokens, consider Senetence Translation for ex, where encoder and decoder tokens can be in different languages. <br>
2. We can create Vocab separately for Encder and Decoder tokens, or can create shared vocab. Shared Vocab is preferable though. <br>
3. Also, Vocab is generated from only Training Data. <br>
4. In this case, we are using only Encoder tokens to create Vocab because Decoder Tokens are the same. Also, we are using both Train and Test Dataset to create Vocab, as our datasize is small.

In [7]:
class VocabBuilder:
    def __init__(self,token_corpus,unknown_token=None,pad_token=None,sos_token=None,eos_token=None):
        '''
        token_corpus = ['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.']
        '''
        self.token_corpus = token_corpus
        self.unknown_token = unknown_token or "<unk>"
        self.pad_token = pad_token or "<pad>"
        self.sos_token = sos_token or "<sos>"
        self.eos_token = eos_token or "<eos>"
        self.word_to_index, self.index_to_word = self.get_vocabs()
                        
    def get_vocabs(self):
        word_to_index = {}
        index_count = 0
        all_unique_words = set(self.token_corpus).difference(set(
            [self.unknown_token, self.pad_token, self.sos_token, self.eos_token]
        ))
        word_to_index[self.unknown_token] = 0
        word_to_index[self.pad_token] = 1
        word_to_index[self.sos_token] = 2
        word_to_index[self.eos_token] = 3
        
        for index, word in enumerate(all_unique_words):
            word_to_index[word] = index + 4
        if self.pad_token not in word_to_index: word_to_index[self.pad_token] = index + 1
        if self.sos_token not in word_to_index: word_to_index[self.sos_token] = index + 2
        if self.eos_token not in word_to_index: word_to_index[self.eos_token] = index + 3
        if self.unknown_token not in word_to_index: word_to_index[self.unknown_token] = index + 4
        index_to_word = {v:k for k,v in word_to_index.items()}
        return word_to_index, index_to_word

In [8]:
token_corpus_1 = list(chain.from_iterable(X_encoder_tokens_tr)) # flattens a 2D list ot 1D
token_corpus_2 = list(chain.from_iterable(X_encoder_tokens_test))  # flattens a 2D list ot 1D
token_corpus = token_corpus_1 + token_corpus_2
print(token_corpus[:20])

['tools', 'and', 'a', 'man', 'gardening', 'inside', 'two', 'holding', 'are', '.', 'women', 'a', 'greenhouse', '', '<eos>', 'meandering', 'at', 'people', 'of', 'the']


In [9]:
vocab_builder = VocabBuilder(token_corpus,unknown_token,pad_token,sos_token,eos_token)
print("WordToIndex Dict length:",len(vocab_builder.word_to_index))
print("IndexToWord Dict length:",len(vocab_builder.index_to_word))

WordToIndex Dict length: 5242
IndexToWord Dict length: 5242


#### Map X_encoder, X_decoder and Y using Vocab 

In [10]:
class Token_Index_Mapper:
    def __init__(self,token_to_index,index_to_token, unknown_token):
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.unknown_token = unknown_token
    
    def get_encoding(self,sentence):
        '''
        sentence must be a list of tokens.
        Ex: ["Climate","change","is","a","pressing","global","issue"]
        '''
        encoded_sentence = []
        for token in sentence:
            if token in self.token_to_index: encoded_sentence.append(self.token_to_index[token])
            else: encoded_sentence.append(self.token_to_index[self.unknown_token])
        return encoded_sentence
    
    def get_decoding(self,encoded_sentence):
        '''
        encoded_sentence must be a list of vocab indices.
        Ex: encoded_sentence = [24,21,4,1,..] 
        '''
        sentence = [self.index_to_token[index] for index in encoded_sentence]
        return " ".join(sentence)

In [11]:
def map_tokens_to_indices(token_index_mapper, max_sequence_length, token_matrix):
    index_matrix = []
    for el in token_matrix:
        el = el[:max_sequence_length] # truncate sentence to max_seq_length
        if len(el) < max_sequence_length:
            pad_tokens_to_append = max_sequence_length - len(el)
            el = el + [pad_token]*pad_tokens_to_append
        index_matrix.append(token_index_mapper.get_encoding(el))
    return index_matrix

In [12]:
max_sequence_length = 25
token_index_mapper = Token_Index_Mapper(vocab_builder.word_to_index, vocab_builder.index_to_word, unknown_token)
X_encoder_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_encoder_tokens_tr)
X_decoder_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_decoder_tokens_tr)
Y_indices_tr = map_tokens_to_indices(token_index_mapper, max_sequence_length, Y_tokens_tr)

X_encoder_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_encoder_tokens_test)
X_decoder_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, X_decoder_tokens_test)
Y_indices_test = map_tokens_to_indices(token_index_mapper, max_sequence_length, Y_tokens_test)
print("X Encoder train length:",len(X_encoder_indices_tr))
print("X Encoder test length:",len(X_encoder_indices_test))
print("Sample X_encoder_train:",X_encoder_indices_tr[0])
print("Sample X_decoder_train:",X_decoder_indices_tr[0])
print("Sample Y_train:",Y_indices_tr[0])

X Encoder train length: 32368
X Encoder test length: 8092
Sample X_encoder_train: [3749, 4085, 3919, 4113, 957, 3966, 3698, 5212, 3551, 3762, 2538, 3919, 151, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sample X_decoder_train: [2, 3919, 4113, 4085, 3698, 2538, 3551, 3966, 3919, 151, 5212, 957, 3749, 3762, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sample Y_train: [3919, 4113, 4085, 3698, 2538, 3551, 3966, 3919, 151, 5212, 957, 3749, 3762, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Define Model - (UniDirectional + BatchFirst + SingleLayer + StateLess)  GRU, TeacherForcing Decoder

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super(Encoder, self).__init__()
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, debug=False):
        # X shape: (Batch_Size X Seq_Length)
        
        embedded = self.embedding(x)   
        # Embedded shape: (Batch_Size X Seq_Length X Embedding_Dim)
        
        dropped_out = self.dropout(embedded) 
        # DroppedOut shape: (Batch_Size X Seq_Length X Embedding_Dim)
        
        output, hidden = self.gru(dropped_out) 
        # Output shape: (Batch_Size X Seq_Length X D*Hidden_Dim), where D == 2 if Bi-directional, else 1. 
        # It contains h_t for all tokens, hence the Seq_Length in shape.
        # If more GRU layers are present, it contains h_t for all tokens but only from last layer.
        
        # Hidden shape: (D*NumLayers X Batch_Size X Hidden_Dim)
        # It contains the h_t for only last token of each sequence in the batch.
        # In case of multiple GRU layers:,
        # It can be thought of as a stack of (Batch_Size X Hidden_Dim) matrix, where each GRU Layer
        # is contributing 2 matrices (one for each direction in case of Bi-directional), 
        # such that the last layer's matrix is at the top of the stack and can be accessed using h_t[-1].
        
        hidden_unsqueezed = hidden[-1].unsqueeze(dim = 0)
        # This operation has no meaning for the UniDirectional + SingleLayer case, but helpful for other cases.
        # hidden[-1] shape = Batch_Size X Hidden_Dim, we are only taking the last layer's matrix.
        # hidden[-1].unsqueeze(dim = 0) shape: We are converting the 2-D matrix back to 3-D matrix,
        # with the first dimension (or the 0th dimension) == 1.
        
        if debug: 
            print("-----------Encoder----------:")
            print("Input Data shape:",x.shape)
            print("After Embedding Layer:",embedded.shape)
            print("After Dropout Layer:",dropped_out.shape)
            print("Outputs and hidden shape from GRU:",output.shape, hidden.shape)
            print("Unsqueezed hidden shape:", hidden_unsqueezed.shape)
            
        # Returning both the output and hidden state, but Decoder will only need hidden state.    
        return output, hidden_unsqueezed

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim,  batch_first=True) # Decoders are almost always unidirectional.
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
    def forward(self, x, encoder_hidden, debug=False):
        # X shape: (Batch_Size X Seq_Length)
        # X (decoder) is only required when we do Teacher Forcing. In this case we are doing 100% teacher forcing.
        
        embedded = self.embedding(x)
        # Embedded shape: (Batch_Size X Seq_Length X Embedding_Dim)
        
        output, hidden = self.gru(embedded, encoder_hidden)
        # Output shape: (Batch_Size X Seq_Length X D*Hidden_Dim), where D == 2 if Bi-directional, else 1.
        # Bi-directional Decoder does not make sense, except for very rare cases.
        # Hidden shape: (D*NumLayers X Batch_Size X Hidden_Dim)
        # It contains the h_t for only last token of each sequence in the batch.
        
        reshaped_output = output.reshape(-1,output.shape[2])
        # Reshaped_Output shape = (Batch_Size*Seq_Length X Hidden_Dim) [Basically a 2-D Matrix]
        # Since we need to calculate loss on each token of the batch, the output has to be 
        # reshaped into Batch_Size*Seq_Length X Hidden_Dim.
        # -1 in the reshape function tells Pytorch to figure out the size of Tensor by itself.
        # output.shape[2] is the Hidden_Dim (obvious from Output Shape: Batch_Size X Seq_Length X D*Hidden_Dim)
        # So, the reshape operation figures out the size of 0th dimension, 
        # given that size of 1st dimension ==  Hidden_Dim. 
        
        prediction = self.fc_out(reshaped_output)
        # prediction shape: (Batch_Size*Seq_Length X Vocab_Size)
        # For each input token, we get the vector of vocab size consisting of logits.
        # We take the softmax of each vector to convert vector of logits to vector of probabilities.
        # Then we take argmax of each vector to get the predicted token.
        
        if debug: 
            print("-----------Decoder----------:")
            print("Input Data shape, X:",x.shape, ", Encoder hidden state:", encoder_hidden.shape)
            print("After Embedding Layer:",embedded.shape)
            print("Outputs and hidden shape from GRU:",output.shape, hidden.shape)
            print("Reshaped Output:", reshaped_output.shape)
            print("After FC layer:", prediction.shape)
        return prediction, hidden
    

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__() 
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self, encoder_input, decoder_input, debug=False): 
        encoder_outputs, encoder_hidden = self.encoder(encoder_input, debug) 
        outputs, _ = self.decoder(decoder_input, encoder_hidden, debug)
        return outputs

In [14]:
def predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length):
    model.eval()
    sos_token = torch.tensor([[token_index_mapper.token_to_index["<sos>"]]]).to(device) # 1st token as decoder input is <sos>.
    unjumbled_sentence = []
    with torch.no_grad():
        encoder_op, encoder_hidden = model.encoder(Xe_b)
        for i in range(max_sequence_length):
            if i == 0: 
                # At 1st time step, input to decoder is the index of <sos> token.
                # And the hidden state input to decoder is Encoder's hidden state.
                decoder_op, decoder_hidden = model.decoder(sos_token,encoder_hidden)
            else: 
                # After 1st time step, input to decoder is the predicted token of previous time step.
                # and hidden state input to decoder is the hidden state output of decoder of previous time step.
                
                # To get the predicted token of previous time step:
                # first, do the softmax on decoder_op of previous time step
                softmax_op = torch.softmax(decoder_op,axis=1) # decoder_op is (1 X Vocab_Size),

                # next, take the token with max probability
                # (softmax_op is also [1 X Vocab_Size], as we have taken softmax along axis=1, which
                # has simply converted the logits to probabilities.)
                # torch.argmax() returns a tensor([]). The list will contain as many elements as 0th dimension of softmax_op.
                # because we are taking argamx along axis = 1.
                # In this case, softmax_op has only 1 token in 0th dimension, so the list has only 1 element.
                # torch.tensor([]).tolist() gives out the []
                predicted_token = torch.argmax(softmax_op,axis=1).tolist() 
                decoder_op, decoder_hidden = model.decoder(torch.tensor([predicted_token]).to(device),decoder_hidden)
                
                unjumbled_sentence.append(token_index_mapper.index_to_token[predicted_token[0]])
                if predicted_token[0] == token_index_mapper.token_to_index["<eos>"]: break
        print("_______________________________________")
        print(unjumbled_sentence)

### Sample Training And Prediction

In [15]:
INPUT_DIM = len(token_index_mapper.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(token_index_mapper.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 256 
DEC_EMB_DIM = 256 
HID_DIM = 512 
ENC_DROPOUT = 0.5 
device = "cpu"
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT) 
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM) 
model = Seq2Seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(model.parameters()) 
criterion = nn.CrossEntropyLoss(ignore_index=token_index_mapper.token_to_index["<pad>"])

In [16]:
data_index = 6
batch_size = 5

model.train()
optimizer.zero_grad()
Xe_b = torch.tensor(X_encoder_indices_tr[data_index:data_index+batch_size]).to(device)
Xd_b = torch.tensor(X_decoder_indices_tr[data_index:data_index+batch_size]).to(device)
Y_b = torch.tensor(Y_indices_tr[data_index:data_index+batch_size]).to(device)
print(Xe_b.shape,Xd_b.shape,Y_b.shape)
output = model(Xe_b, Xd_b, debug=True)
loss = criterion(output, Y_b.view(-1))
loss.backward()
optimizer.step()

torch.Size([5, 25]) torch.Size([5, 25]) torch.Size([5, 25])
-----------Encoder----------:
Input Data shape: torch.Size([5, 25])
After Embedding Layer: torch.Size([5, 25, 256])
After Dropout Layer: torch.Size([5, 25, 256])
Outputs and hidden shape from GRU: torch.Size([5, 25, 512]) torch.Size([1, 5, 512])
Unsqueezed hidden shape: torch.Size([1, 5, 512])
-----------Decoder----------:
Input Data shape, X: torch.Size([5, 25]) , Encoder hidden state: torch.Size([1, 5, 512])
After Embedding Layer: torch.Size([5, 25, 256])
Outputs and hidden shape from GRU: torch.Size([5, 25, 512]) torch.Size([1, 5, 512])
Reshaped Output: torch.Size([125, 512])
After FC layer: torch.Size([125, 5242])


In [17]:
# Randomly select one sentence from Test Data to Predict.
data_index = random.randint(0,100)
# Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
Xe_b = torch.tensor([X_encoder_indices_test[data_index]]).to(device)
print("Test Jumbled sentence:",X_encoder_tokens_test[data_index])
print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length)

Test Jumbled sentence: ['toddler', 'to', 'a', 'him', 'air', 'wetsuit', 'is', 'ready', 'in', 'catch', '.', 'throwing', 'and', 'the', 'up', 'is', 'a', 'in', 'man', 'a', '', '<eos>']
Test Unjumbled sentence: ['a', 'man', 'in', 'a', 'wetsuit', 'is', 'throwing', 'a', 'toddler', 'up', 'in', 'the', 'air', 'and', 'is', 'ready', 'to', 'catch', 'him', '.', '', '<eos>']
_______________________________________
['a', 'a', 'dog', 'dog', 'pedestrians', 'kiddie', 'tiles', 'homebase', 'costumed', 'brook', 'kayaker', 'crawling', 'hose', 'amused', 'doing', 'tuxedo', 'carnival', 'performs', 'collar', 'circular', 'weather', 'riverbank', 'brush', 'power']


## For Actual Training

In [18]:
device = "cpu" #torch.device("cuda:0")
batch_size = 50
INPUT_DIM = len(token_index_mapper.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(token_index_mapper.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 128 
DEC_EMB_DIM = 128 
HID_DIM = 500 
ENC_DROPOUT = 0.5 
DEC_DROPOUT = 0.5
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT) 
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM) 
model = Seq2Seq(enc, dec, device).to(device)
optimizer = torch.optim.Adam(model.parameters()) 
criterion = nn.CrossEntropyLoss(ignore_index=token_index_mapper.token_to_index["<pad>"])
epochs = 20

In [None]:
for i in range(epochs):
#     init_ht_for_encoder = model_encoder.init_hidden().to(device)
    model.train()
    epoch_loss = 0
    for j in range(0,len(X_encoder_indices_tr),batch_size):
        optimizer.zero_grad()
        Xe_b = torch.tensor(X_encoder_indices_tr[j:j+batch_size]).to(device)
        Xd_b = torch.tensor(X_decoder_indices_tr[j:j+batch_size]).to(device)
        Y_b = torch.tensor(Y_indices_tr[j:j+batch_size]).to(device)
        op = model(Xe_b,Xd_b)
        loss = criterion(op,Y_b.reshape(-1))
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if j%1000 == 0: print("Epoch:",i,"Batch:",j,"Loss:",batch_loss)
    print("______________________________________")
    print("Epoch Loss:",epoch_loss)
    
    # Randomly select one sentence from Test Data to Predict.
    data_index = random.randint(0,100)
    # Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
    Xe_b = torch.tensor([X_encoder_indices_test[data_index]]).to(device)
    print("Test Jumbled sentence:",X_encoder_tokens_test[data_index])
    print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
    predict_on_1_input(model, Xe_b, token_index_mapper, device, max_sequence_length)
    print("_______________________________________")

Epoch: 0 Batch: 0 Loss: 8.573908805847168
Epoch: 0 Batch: 1000 Loss: 4.552748680114746
Epoch: 0 Batch: 2000 Loss: 4.255152225494385
Epoch: 0 Batch: 3000 Loss: 4.000977993011475
Epoch: 0 Batch: 4000 Loss: 3.680941581726074
Epoch: 0 Batch: 5000 Loss: 3.532689094543457
