# Full Transformer Architecture

In [24]:
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import torch
import random
import os
import pandas as pd
from itertools import chain
import math
import time
from sklearn.model_selection import train_test_split
from create_seq2seq_data import Sequence2SequenceData

## Read Data

In [2]:
jumbled_file = "../../Datasets/Jumble_Unjumble/OriginalJumbled.txt"
unjumbled_file = "../../Datasets/Jumble_Unjumble/OriginalUnJumbled.txt"
full_data_object = Sequence2SequenceData(
    source_data_file=jumbled_file,
    target_data_file=unjumbled_file,
    extra_token_config={"unknown":"<unk>","pad":"<pad>","eos":"<eos>","sos":"<sos>"},
    max_sequence_length=25
)

### Printing full dataset, Vocab size

In [3]:
full_data_object.full_df.head()

Unnamed: 0,Source,Target
0,up entry climbing A in dress an a in way. of a...,A child in a pink dress is climbing up a set o...
1,into going building. wooden girl A a,A girl going into a wooden building.
2,climbing wooden a little girl into playhouse. A,A little girl climbing into a wooden playhouse.
3,to her playhouse. girl the A little stairs cli...,A little girl climbing the stairs to her playh...
4,wooden dress in cabin. into pink little girl A...,A little girl in a pink dress going into a woo...


In [4]:
print("Vocab size:",len(full_data_object.token_to_index))

Vocab size: 9018


### Printing raw tokens

In [5]:
print("X_Encoder token length:",len(full_data_object.X_encoder_tokens))
print("Sample X_Encoder tokens:",full_data_object.X_encoder_tokens[0])
print("Sample X_Decoder tokens:",full_data_object.X_decoder_tokens[0])
print("Sample Y tokens:",full_data_object.Y_tokens[0])

X_Encoder token length: 40460
Sample X_Encoder tokens: ['up', 'entry', 'climbing', 'a', 'in', 'dress', 'an', 'a', 'in', 'way', '.', 'of', 'a', 'child', 'is', 'stairs', 'pink', 'set', '<eos>']
Sample X_Decoder tokens: ['<sos>', 'a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '.']
Sample Y tokens: ['a', 'child', 'in', 'a', 'pink', 'dress', 'is', 'climbing', 'up', 'a', 'set', 'of', 'stairs', 'in', 'an', 'entry', 'way', '.', '<eos>']


### Printing token indices

In [6]:
print("X_Encoderindices length:",len(full_data_object.X_encoder_indices))
print("Sample X_Encoder indices length:",len(full_data_object.X_encoder_indices[0]))
print("Sample X_Encoder indices:",full_data_object.X_encoder_indices[0])
print("Sample X_Decoder indices:",full_data_object.X_decoder_indices[0])
print("Sample Y indices:",full_data_object.Y_indices[0])

X_Encoderindices length: 40460
Sample X_Encoder indices length: 25
Sample X_Encoder indices: [2175, 2287, 4175, 7903, 8081, 7063, 7323, 7903, 8081, 8430, 170, 5235, 7903, 5835, 2741, 6804, 1890, 4182, 2, 1, 1, 1, 1, 1, 1]
Sample X_Decoder indices: [3, 7903, 5835, 8081, 7903, 1890, 7063, 2741, 4175, 2175, 7903, 4182, 5235, 6804, 8081, 7323, 2287, 8430, 170, 1, 1, 1, 1, 1, 1]
Sample Y indices: [7903, 5835, 8081, 7903, 1890, 7063, 2741, 4175, 2175, 7903, 4182, 5235, 6804, 8081, 7323, 2287, 8430, 170, 2, 1, 1, 1, 1, 1, 1]


## Define Model 
1. Encoder
2. Decoder

In [7]:
class EmbeddingClass(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length):
        super(EmbeddingClass, self).__init__()
        self.max_token_length = max_token_length
        self.tok_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(max_token_length, embedding_dim)
    
    def forward(self,token_ids, debug = False):
        # create position ids
        batch_size = token_ids.shape[0]
        position_id = range(0,self.max_token_length)
        position_ids = torch.tensor([position_id]*batch_size)
        
        token_embeddings = self.tok_embedding(token_ids)
        position_embeddings = self.pos_embedding(position_ids)
        
        embbeded_x = token_embeddings + position_embeddings
        
        if debug:
            print("----------Embedding Class------------")
            print("token_ids shape:",token_ids.shape)
            print("batch_size:", batch_size)
            print("position_ids shape:", position_ids.shape)
            print("token_embeddings shape:", token_embeddings.shape)
            print("position_embeddings shape:", position_embeddings.shape)
            print("embbeded_x shape:", embbeded_x.shape)
        
        return embbeded_x

In [8]:
class SelfAttention(nn.Module):
    def __init__(self, embedding_dim, head_dim):
        super(SelfAttention, self).__init__()
        self.head_dim = head_dim
        self.W_K = nn.Linear(embedding_dim, head_dim)
        self.W_Q = nn.Linear(embedding_dim, head_dim)
        self.W_V = nn.Linear(embedding_dim, head_dim)
    
    def forward(self, embedded_x1, embedded_x2, mask_matrix = None, debug = False):
        # Batch_Size = B, Seq_Length = L, Embedding_Dim = D, Head_Dim = H
        # embedded_x shape = (B X L X D)
        q = self.W_Q(embedded_x1) # q shape = (B X L X H)
        k = self.W_K(embedded_x2) # k shape = (B X L X H)
        v = self.W_V(embedded_x2) # v shape = (B X L X H)
        
        scores = torch.bmm(q, k.transpose(1,2))/math.sqrt(self.head_dim)
        # scores shape = (B X L X L)
        # torch.bmm() does the batch matrix multiplication:
        # input_1 = (b X n X m), input_2 = (b X m X p)
        # op = torch.zeros(b X n X p)
        # for each matrix (n X m) in input_1 and (m X p) in input_2, 
        # torch.bmm() will output an (n X p) matrix. 
        # This will be done 'b' times (i.e for all b matrices)
        
        if mask_matrix is not None and mask_matrix.shape[0] > 0:
            scores = scores.masked_fill(mask_matrix==0, float("-inf"))
        weights = torch.softmax(scores, dim=2) # (B X L X L)
        output = torch.bmm(weights, v) # (B X L X H)
        
        if debug:
            print("----------Self-Attention------------")
            print("q shape:", q.shape)
            print("k shape:", k.shape)
            print("v shape:", v.shape)
            if mask_matrix is not None and mask_matrix.shape[0] > 0: print("mask_matrix shape:", mask_matrix.shape)
            print("scores shape:", scores.shape)
            print("weights shape:", weights.shape)
            print("output shape:", output.shape)
        
        return output

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        self.sa_list = nn.ModuleList([
           SelfAttention(embedding_dim, self.head_dim) 
            for _ in range(self.num_heads)
        ])
        self.linear = nn.Linear(embedding_dim, embedding_dim)
    
    def forward(self, embedded_x1, embedded_x2, mask_matrix = None, debug = False):
        multiple_self_attn_op = [
            self_attn_head(embedded_x1, embedded_x2, mask_matrix, debug) 
            for self_attn_head in self.sa_list
        ]
        concatenated_op = torch.cat(multiple_self_attn_op,axis=2)
        linear_op = self.linear(concatenated_op)

        if debug:
            print("----------MHA------------")
            print("embedded_x1 shape:", embedded_x1.shape)
            print("embedded_x2 shape:", embedded_x2.shape)
            print("multiple_self_attn_op len:", len(multiple_self_attn_op))
            print("concatenated_op shape:", concatenated_op.shape)
            print("linear_op shape:", linear_op.shape)
        return linear_op

In [10]:
class FeedForward(nn.Module):
    def __init__(self,embedding_dim):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(embedding_dim, 4*embedding_dim)
        self.linear_2 = nn.Linear(4*embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(0.3)
        self.gelu = nn.GELU()
        
    def forward(self,mha_op, debug=False):
        linear_1_op = self.linear_1(mha_op)
        linear_1_op = self.gelu(linear_1_op)
        linear_2_op = self.linear_2(linear_1_op)
        linear_2_op = self.dropout(linear_2_op)
        if debug:
            print("----------Feed Forward------------")
            print("mha_op shape:", mha_op.shape)
            print("linear_1_op shape:", linear_1_op.shape)
            print("linear_2_op shape:", linear_2_op.shape)
        return linear_2_op

In [11]:
class EncoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(EncoderBlock, self).__init__()
        self.mha = MultiHeadAttention(embedding_dim, num_heads)
        self.ff_layer = FeedForward(embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
    
    def forward(self, embedded_x, mask_matrix, debug=False):
        mha_op = self.mha(embedded_x, embedded_x, mask_matrix, debug)
        layer_norm_op_1 = self.layer_norm(mha_op + embedded_x)
        ff_op = self.ff_layer(layer_norm_op_1, debug)
        layer_norm_op_2 = self.layer_norm(ff_op + layer_norm_op_1)
        
        if debug:
            print("----------EncoderBlock------------")
            print("embedded_x shape:", embedded_x.shape)
            print("mha_op shape:", mha_op.shape)
            print("layer_norm_op_1 shape:", layer_norm_op_1.shape)
            print("ff_op shape:", ff_op.shape)
            print("layer_norm_op_2 shape:", layer_norm_op_2.shape)
        return layer_norm_op_2

In [12]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length, num_heads, num_layers):
        super(Encoder, self).__init__()
        self.embedding_obj = EmbeddingClass(
            vocab_size, embedding_dim, max_token_length
        )
        self.encoder_blocks = nn.ModuleList([
            EncoderBlock(embedding_dim, num_heads) for _ in range(num_layers)
        ])
    
    def forward(self, token_ids, debug=False):
        embedded_x = self.embedding_obj(token_ids, debug)
        for i,block in enumerate(self.encoder_blocks):
            if i == 0: block_op  = block(embedded_x, None, debug)
            else:
                block_op  = block(block_op, None, debug)
        if debug:
            print("----------Encoder------------")
            print("token_ids shape:", token_ids.shape)
            print("block_op shape:", block_op.shape)
        return block_op

In [13]:
class DecoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(DecoderBlock, self).__init__()
        self.mha = MultiHeadAttention(embedding_dim, num_heads)
        self.ff_layer = FeedForward(embedding_dim)
        self.layer_norm = nn.LayerNorm(embedding_dim)
    
    def forward(self, embedded_x, encoder_op, mask_matrix, debug=False):
        masked_mha_op = self.mha(embedded_x, embedded_x, mask_matrix, debug)
        layer_norm_op_1 = self.layer_norm(masked_mha_op + embedded_x)
        
        mha_cross_attn_op = self.mha(layer_norm_op_1, encoder_op, None, debug)
        layer_norm_op_2 = self.layer_norm(mha_cross_attn_op + layer_norm_op_1)
        
        ff_op = self.ff_layer(layer_norm_op_2, debug)
        layer_norm_op_3 = self.layer_norm(ff_op + layer_norm_op_2)
        
        if debug:
            print("----------DecoderBlock------------")
            print("embedded_x shape:", embedded_x.shape)
            print("encoder_op shape:", encoder_op.shape)
            print("masked_mha_op shape:", masked_mha_op.shape)
            print("layer_norm_op_1 shape:", layer_norm_op_1.shape)
            print("mha_cross_attn_op shape:", mha_cross_attn_op.shape)
            print("layer_norm_op_2 shape:", layer_norm_op_2.shape)
            print("ff_op shape:", ff_op.shape)
            print("layer_norm_op_3 shape:", layer_norm_op_3.shape)
        return layer_norm_op_3

In [14]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_token_length, num_heads, num_layers):
        super(Decoder, self).__init__()
        self.embedding_obj = EmbeddingClass(
            vocab_size, embedding_dim, max_token_length
        )
        self.encoder_blocks = nn.ModuleList([
            DecoderBlock(embedding_dim, num_heads) for _ in range(num_layers)
        ])
        self.mask_matrix = torch.tril(
            torch.ones(max_token_length, max_token_length)
        )
        self.linear = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, token_ids, encoder_op, debug=False):
        embedded_x = self.embedding_obj(token_ids, debug)
        for i,block in enumerate(self.encoder_blocks):
            if i == 0: block_op  = block(
                embedded_x, encoder_op, self.mask_matrix, debug
            )
            else:
                block_op  = block(
                    block_op, encoder_op, self.mask_matrix, debug
                )
        reshaped_output = block_op.reshape(
            -1,block_op.shape[2]
        )
        linear_op = self.linear(reshaped_output)
        if debug:
            print("----------Decoder------------")
            print("token_ids shape:", token_ids.shape)
            print("block_op shape:", block_op.shape)
            print("reshaped_output shape:", reshaped_output.shape)
            print("linear_op shape:", linear_op.shape)
        return linear_op

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, full_data_object, device):
        super(Seq2Seq, self).__init__() 
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.full_data_object = full_data_object
    
    def forward(self, encoder_input, decoder_input, teacher_forcing_ratio, debug=False): 
        # encoder_input shape: Batch_Size X Seq_Len
        # decoder_input shape: Batch_Size X Seq_Len
        
        encoder_outputs = self.encoder(encoder_input, debug)
        
        for i in range(self.full_data_object.max_sequence_length):
            if i == 0:
                # At 1st Time step, we need to pass <sos> token 
                # which is already provided in decoder_input.
                decoder_outputs = self.decoder(decoder_input, encoder_outputs, debug)
                # decoder_outputs shape: (Batch_Size*Seq_Len X Vocab_Size)
            else:
                softmax_op = torch.softmax(decoder_outputs,axis=1) # softmax_op is (Batch_Size*Seq_Len X Vocab_Size)
                pred_tokens = torch.argmax(softmax_op,axis=1) # pred_tokens is torch.tensor([]) of len = (Batch_Size*Seq_Len)
                reshaped_pred_tokens = pred_tokens.reshape(
                    -1,self.full_data_object.max_sequence_length
                ) # reshape to (Batch_Size X Seq_Len)
                
                teacher_force = torch.rand(1).item() < teacher_forcing_ratio
                if teacher_force:
                    decoder_outputs = self.decoder(decoder_input, encoder_outputs, debug=False)
                else:
                    # Replace i-th token in decoder_input with (i-1)th predicted_token
                    decoder_input[:,i] = reshaped_pred_tokens[:,i-1]
                    decoder_outputs = self.decoder(decoder_input, encoder_outputs, debug=False)
        if debug:
            print("----------Seq2Seq------------")
            print("encoder_outputs shape:", encoder_outputs.shape)
            print("decoder_outputs shape:", decoder_outputs.shape)
            
        return decoder_outputs

In [16]:
def predict_on_1_input(model, Xe_b, full_data_object, device):
    model.eval()
    sos_token = [full_data_object.token_to_index["<sos>"]]
    unjumbled_sentence = []
    with torch.no_grad():
        encoder_outputs = model.encoder(Xe_b)
        # At each time step, unlike RNN models, we can provide
        # full sequence to decoder, leveraging the masking functionality in the
        # self-attention of decoder.
        for i in range(full_data_object.max_sequence_length):
            if i == 0:
                # At 1st time step, we only have <sos> token,
                # so we provide one full sequence of <sos> tokens ([<sos>,<sos>,...,<sos>]) to decoder
                decoder_input = sos_token*full_data_object.max_sequence_length 
                decoder_op = model.decoder(torch.tensor([decoder_input]).to(device), encoder_outputs)
            else: 
                # After 1st time step, input to decoder is the predicted token of previous time step.
                
                # To get the predicted token of previous time step:
                # first, do the softmax on decoder_op of previous time step
                softmax_op = torch.softmax(decoder_op,axis=1) # decoder_op is (Seq_Len X Vocab_Size),

                # next, take the token with max probability
                predicted_token = torch.argmax(softmax_op,axis=1).tolist()
                # torch.argmax() returns a tensor([]), tolist() gives out the [].
                # The list will contain as many elements as 0th dimension of softmax_op.
                # as we are taking argmax along axis = 1.
                # In this case, softmax_op has (Seq_Len) tokens in 0th dimension, so the list has (Seq_Len) elements.
                
                decoder_input[i] = predicted_token[i-1]
                # At i-th time step, replace the <sos> token with (i-1)-th step's prediction
                decoder_op = model.decoder(
                    torch.tensor([decoder_input]).to(device), encoder_outputs
                )
                unjumbled_sentence.append(full_data_object.index_to_token[predicted_token[i-1]])
                if (
                    predicted_token[i-1] == full_data_object.token_to_index["<eos>"]
                    or
                    predicted_token[i-1] == full_data_object.token_to_index["."]
                ): break
        print("_______________________________________")
        print(unjumbled_sentence)
        return unjumbled_sentence

### Split the dataset into train and test sets

In [17]:
Xe_tokens_tr, Xe_tokens_test, Xd_tokens_tr, Xd_tokens_test, Y_tokens_tr, Y_tokens_test,\
Xe_indices_tr, Xe_indices_test, Xd_indices_tr, Xd_indices_test, Y_indices_tr, Y_indices_test= train_test_split(
    full_data_object.X_encoder_tokens, full_data_object.X_decoder_tokens, full_data_object.Y_tokens,
    full_data_object.X_encoder_indices, full_data_object.X_decoder_indices, full_data_object.Y_indices,
    test_size = 0.1
)
print(
    len(Xe_tokens_tr), len(Xe_tokens_test), 
    len(Xd_tokens_tr), len(Xd_tokens_test), 
    len(Y_tokens_tr), len(Y_tokens_test)
)

36414 4046 36414 4046 36414 4046


### Sample Training And Prediction

In [34]:
device = "cpu"
INPUT_DIM = len(full_data_object.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(full_data_object.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 512 
DEC_EMB_DIM = 512 
NUM_ENC_LAYER = 6
NUM_DEC_LAYER = 6
NUM_HEADS = 8
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, full_data_object.max_sequence_length, NUM_HEADS, NUM_ENC_LAYER) 
dec = Decoder(INPUT_DIM, DEC_EMB_DIM, full_data_object.max_sequence_length, NUM_HEADS, NUM_DEC_LAYER) 
model = Seq2Seq(enc, dec, full_data_object, device).to(device)
optimizer = torch.optim.Adam(model.parameters()) 
criterion = nn.CrossEntropyLoss(ignore_index=full_data_object.token_to_index["<pad>"])

In [39]:
data_index = 6
batch_size = 50

start_time = time.time()
model.train()
optimizer.zero_grad()
Xe_b = torch.tensor(Xe_indices_tr[data_index:data_index+batch_size]).to(device)
Xd_b = torch.tensor(Xd_indices_tr[data_index:data_index+batch_size]).to(device)
Y_b = torch.tensor(Y_indices_tr[data_index:data_index+batch_size]).to(device)
print(Xe_b.shape,Xd_b.shape,Y_b.shape)
output = model(Xe_b, Xd_b, 0.9, debug=True)
loss = criterion(output, Y_b.view(-1))
loss.backward()
optimizer.step()
end_time = time.time()
print("Time taken in seconds:",end_time-start_time)

torch.Size([50, 25]) torch.Size([50, 25]) torch.Size([50, 25])
----------Embedding Class------------
token_ids shape: torch.Size([50, 25])
batch_size: 50
position_ids shape: torch.Size([50, 25])
token_embeddings shape: torch.Size([50, 25, 512])
position_embeddings shape: torch.Size([50, 25, 512])
embbeded_x shape: torch.Size([50, 25, 512])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
score

----------Feed Forward------------
mha_op shape: torch.Size([50, 25, 512])
linear_1_op shape: torch.Size([50, 25, 2048])
linear_2_op shape: torch.Size([50, 25, 512])
----------DecoderBlock------------
embedded_x shape: torch.Size([50, 25, 512])
encoder_op shape: torch.Size([50, 25, 512])
masked_mha_op shape: torch.Size([50, 25, 512])
layer_norm_op_1 shape: torch.Size([50, 25, 512])
mha_cross_attn_op shape: torch.Size([50, 25, 512])
layer_norm_op_2 shape: torch.Size([50, 25, 512])
ff_op shape: torch.Size([50, 25, 512])
layer_norm_op_3 shape: torch.Size([50, 25, 512])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
mask_matrix shape: torch.Size([25, 25])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 

----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shape: torch.Size([50, 25, 64])
----------Self-Attention------------
q shape: torch.Size([50, 25, 64])
k shape: torch.Size([50, 25, 64])
v shape: torch.Size([50, 25, 64])
scores shape: torch.Size([50, 25, 25])
weights shape: torch.Size([50, 25, 25])
output shap

In [40]:
# Randomly select one sentence from Test Data to Predict.
data_index = random.randint(0,100)
# Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
Xe_b = torch.tensor([Xe_indices_test[data_index]]).to(device)
print("Test Jumbled sentence:",Xe_tokens_test[data_index])
print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
unjumbled_sentence = predict_on_1_input(model, Xe_b, full_data_object, device)

Test Jumbled sentence: ['fenced', 'with', 'area', 'animals', '.', 'farm', 'playing', 'two', 'grassy', 'children', 'in', 'a', 'in', '<eos>']
Test Unjumbled sentence: ['two', 'children', 'playing', 'in', 'a', 'fenced', 'in', 'grassy', 'area', 'with', 'farm', 'animals', '.', '<eos>']
_______________________________________
['a', 'man', 'and', 'woman', 'points', ',', 'to', 'put', 'put', 'put', 'put', 'put', 'her', 'friend', 'interviewed', 'to', 'put', 'her', 'hand', 'put', 'put', 'put', 'put', 'put']


## For Actual Training

In [41]:
device = "cpu" #torch.device("cuda:0")
batch_size = 50
INPUT_DIM = len(full_data_object.token_to_index) # Size of source vocabulary 
OUTPUT_DIM = len(full_data_object.token_to_index) # Size of target vocabulary 
ENC_EMB_DIM = 512 
DEC_EMB_DIM = 512
NUM_ENC_LAYER = 6
NUM_DEC_LAYER = 6
NUM_HEADS = 8
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, full_data_object.max_sequence_length, NUM_HEADS, NUM_ENC_LAYER) 
dec = Decoder(INPUT_DIM, DEC_EMB_DIM, full_data_object.max_sequence_length, NUM_HEADS, NUM_DEC_LAYER) 
model = Seq2Seq(enc, dec, full_data_object,device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(ignore_index=full_data_object.token_to_index["<pad>"])
epochs = 5
steps_per_epoch = len(Xe_indices_tr)//batch_size + 1
print("Steps Per Epoch:",steps_per_epoch)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, epochs=epochs, steps_per_epoch=steps_per_epoch, max_lr=1e-3)

Steps Per Epoch: 729


In [None]:
teacher_forcing = 1.0
for i in range(epochs):
    teacher_forcing /= (i+1)
    print("Teacher forcing ratio for epoch {0} is {1}".format(i,teacher_forcing))
    model.train()
    epoch_loss = 0
    for j in range(0,len(Xe_indices_tr),batch_size):
        optimizer.zero_grad()
        Xe_b = torch.tensor(Xe_indices_tr[j:j+batch_size]).to(device)
        Xd_b = torch.tensor(Xe_indices_tr[j:j+batch_size]).to(device)
        Y_b = torch.tensor(Y_indices_tr[j:j+batch_size]).to(device)
        op = model(Xe_b,Xd_b, teacher_forcing)
        loss = criterion(op,Y_b.reshape(-1))
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if j%500 == 0: print("Epoch:",i,"Batch:",j,"Loss:",batch_loss)
        scheduler.step() # OneCycleLR has to take a step after every batch.
    print("______________________________________")
    print("Epoch Loss:",epoch_loss)
    
    # Randomly select one sentence from Test Data to Predict.
    data_index = random.randint(0,100)
    # Since its only 1 sentence, we need to convert into a 2-D list before sending it to torch.tensor()
    Xe_b = torch.tensor([Xe_indices_test[data_index]]).to(device)
    print("Test Jumbled sentence:",Xe_tokens_test[data_index])
    print("Test Unjumbled sentence:", Y_tokens_test[data_index]) 
    unjumbled_sentence = predict_on_1_input(model, Xe_b, full_data_object, device)
    print("_______________________________________")

Teacher forcing ratio for epoch 0 is 1.0
Epoch: 0 Batch: 0 Loss: 9.263079643249512
Epoch: 0 Batch: 500 Loss: 6.893364906311035
Epoch: 0 Batch: 1000 Loss: 6.652351379394531
Epoch: 0 Batch: 1500 Loss: 6.297564506530762
Epoch: 0 Batch: 2000 Loss: 6.205170154571533
Epoch: 0 Batch: 2500 Loss: 5.979619979858398
Epoch: 0 Batch: 3000 Loss: 5.5181403160095215
Epoch: 0 Batch: 3500 Loss: 5.453036785125732
Epoch: 0 Batch: 4000 Loss: 5.256730079650879
Epoch: 0 Batch: 4500 Loss: 5.124663829803467
Epoch: 0 Batch: 5000 Loss: 5.095658779144287
Epoch: 0 Batch: 5500 Loss: 5.089066505432129
Epoch: 0 Batch: 6000 Loss: 4.757066249847412
Epoch: 0 Batch: 6500 Loss: 4.592711448669434
Epoch: 0 Batch: 7000 Loss: 4.648641109466553
Epoch: 0 Batch: 7500 Loss: 4.4117302894592285
Epoch: 0 Batch: 8000 Loss: 4.340845108032227
Epoch: 0 Batch: 8500 Loss: 4.301884651184082
Epoch: 0 Batch: 9000 Loss: 4.1593780517578125
Epoch: 0 Batch: 9500 Loss: 4.153980731964111
Epoch: 0 Batch: 10000 Loss: 3.9925413131713867
Epoch: 0 Batc