In [1]:
import torch.nn as nn
import torch
import random
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

<h6> In this notebook, we will try to unjumble a sentence using Encoder-Decoder + Attention Architecture built using <br><br>
 Recurrent Networks like GRU, LSTM and Bi-directional LSTMs.</h6>
<h6> The Data is located here: ../../Datasets/Jumble_Unjumble/ </h6>

#### Read Data

In [2]:
train_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Train_400.tsv",sep="\t")
test_df = pd.read_csv("../../Datasets/Jumble_Unjumble/Test_100.tsv",sep="\t")
print(train_df.shape, test_df.shape)
train_df.head()

(400, 2) (130, 2)


Unnamed: 0,jumbled_sentences,unjumbled_sentences
0,Climate change a issue requiring and global im...,Climate change is a pressing global issue requ...
1,The rise. temperatures is gases causing to gre...,The increase in greenhouse gases is causing gl...
2,significantly the to levels. contributes dioxi...,Deforestation contributes significantly to the...
3,vital carbon for are Renewable energy emission...,Renewable energy sources are vital for reducin...
4,of power energy. Solar renewable wind and sour...,"Solar and wind power are clean, renewable sour..."


#### Build Vocab using jumbled_sentences of Train + Test dataset. Ideally only Train dataset should be used.

In [3]:
class VocabBuilder:
    def __init__(self,text_corpus,unknown_token=None,pad_token=None,sos_token=None,eos_token=None):
        '''
        text_corpus = "This is first sentence. This is second sentence. This is another sentence"
        '''
        self.text_corpus = text_corpus
        self.unknown_token = unknown_token or "<unk>"
        self.pad_token = pad_token or "<pad>"
        self.sos_token = sos_token or "<sos>"
        self.eos_token = eos_token or "<eos>"
        self.word_to_index, self.index_to_word = self.get_vocabs()
                        
    def get_vocabs(self):
        word_to_index = {}
        index_count = 0
        all_unique_words = set(self.text_corpus.split(" "))
        for index, word in enumerate(all_unique_words):
            word_to_index[word] = index
        word_to_index[self.pad_token] = index + 1
        word_to_index[self.unknown_token] = index + 2
        word_to_index[self.sos_token] = index + 3
        word_to_index[self.eos_token] = index + 4
        index_to_word = {v:k for k,v in word_to_index.items()}
        return word_to_index, index_to_word

In [4]:
text_corpus_1 = " ".join(train_df["jumbled_sentences"].tolist())
text_corpus_2 = " ".join(test_df["jumbled_sentences"].tolist())
text_corpus = text_corpus_1 + " " + text_corpus_2
print(text_corpus[:200])

Climate change a issue requiring and global immediate is action. sustained pressing The rise. temperatures is gases causing to greenhouse increase global in significantly the to levels. contributes di


In [5]:
unknown_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"
vocab_builder = VocabBuilder(text_corpus,unknown_token,pad_token,sos_token,eos_token)
print("WordToIndex Dict length:",len(vocab_builder.word_to_index))
print("IndexToWord Dict length:",len(vocab_builder.index_to_word))

WordToIndex Dict length: 1521
IndexToWord Dict length: 1521


#### Create X_Encoder, X_Decoder and Y
X_encoder is the matrix of words in jumbled_sentences, each sentence suffixed by "eos" token <br>
X_decoder is the matrix of unjumbled_sentences, each sentence prefixed by "sos" token <br>
Y is the matrix of unjumbled_sentences, each sentence suffixed by "eos" token <br>

Do this for both train and test data

In [6]:
def get_Xe_Xd_Y(dataframe, sos_token, eos_token):
    jumbled_sentences = dataframe["jumbled_sentences"].tolist()
    unjumbled_sentences = dataframe["unjumbled_sentences"].tolist()
    X_encoder_words = [el.split(" ") + [eos_token] for el in jumbled_sentences]
    X_decoder_words = [[sos_token] + el.split(" ") for el in unjumbled_sentences]
    Y_words = [el.split(" ") + [eos_token] for el in unjumbled_sentences]
    return X_encoder_words, X_decoder_words, Y_words

X_encoder_words_tr, X_decoder_words_tr, Y_words_tr = get_Xe_Xd_Y(train_df, sos_token, eos_token)
X_encoder_words_test, X_decoder_words_test, Y_words_test = get_Xe_Xd_Y(test_df, sos_token, eos_token)
print("X Encoder train length:",len(X_encoder_words_tr))
print("X Encoder test length:",len(X_encoder_words_test))
print("Sample X_encoder_train:",X_encoder_words_tr[0])
print("Sample X_decoder_train:",X_decoder_words_tr[0])
print("Sample Y_train:",Y_words_tr[0])

X Encoder train length: 400
X Encoder test length: 130
Sample X_encoder_train: ['Climate', 'change', 'a', 'issue', 'requiring', 'and', 'global', 'immediate', 'is', 'action.', 'sustained', 'pressing', '<eos>']
Sample X_decoder_train: ['<sos>', 'Climate', 'change', 'is', 'a', 'pressing', 'global', 'issue', 'requiring', 'immediate', 'and', 'sustained', 'action.']
Sample Y_train: ['Climate', 'change', 'is', 'a', 'pressing', 'global', 'issue', 'requiring', 'immediate', 'and', 'sustained', 'action.', '<eos>']


#### Map X_encoder, X_decoder and Y using Vocab

In [7]:
class Word_Index_Mapper:
    def __init__(self,word_to_index,index_to_word, unknown_token):
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.unknown_token = unknown_token
    
    def get_encoding(self,sentence):
        '''
        sentence must be a list of words.
        Ex: ["Climate","change","is","a","pressing","global","issue"]
        '''
        encoded_sentence = []
        for word in sentence:
            if word in self.word_to_index: encoded_sentence.append(self.word_to_index[word])
            else: encoded_sentence.append(self.word_to_index[self.unknown_token])
        return encoded_sentence
    
    def get_decoding(self,encoded_sentence):
        '''
        encoded_sentence must be a list of vocab indices.
        Ex: encoded_sentence = [24,21,4,1,..] 
        '''
        sentence = [self.index_to_word[index] for index in encoded_sentence]
        return " ".join(sentence)

In [8]:
def map_words_to_indices(word_index_mapper, max_sequence_length, word_matrix):
    index_matrix = []
    for el in word_matrix:
        el = el[:max_sequence_length]
        if len(el) < max_sequence_length:
            pad_tokens_to_append = max_sequence_length - len(el)
            el = el + [pad_token]*pad_tokens_to_append
        index_matrix.append(word_index_mapper.get_encoding(el))
    return index_matrix

In [9]:
max_sequence_length = 25
word_index_mapper = Word_Index_Mapper(vocab_builder.word_to_index, vocab_builder.index_to_word, unknown_token)
X_encoder_indices_tr = map_words_to_indices(word_index_mapper, max_sequence_length, X_encoder_words_tr)
X_decoder_indices_tr = map_words_to_indices(word_index_mapper, max_sequence_length, X_decoder_words_tr)
Y_indices_tr = map_words_to_indices(word_index_mapper, max_sequence_length, Y_words_tr)
X_encoder_indices_test = map_words_to_indices(word_index_mapper, max_sequence_length, X_encoder_words_test)
X_decoder_indices_test = map_words_to_indices(word_index_mapper, max_sequence_length, X_decoder_words_test)
Y_indices_test = map_words_to_indices(word_index_mapper, max_sequence_length, Y_words_test)
print("X Encoder train length:",len(X_encoder_indices_tr))
print("X Encoder test length:",len(X_encoder_indices_test))
print("Sample X_encoder_train:",X_encoder_indices_tr[0])
print("Sample X_decoder_train:",X_decoder_indices_tr[0])
print("Sample Y_train:",Y_indices_tr[0])

X Encoder train length: 400
X Encoder test length: 130
Sample X_encoder_train: [490, 1269, 668, 400, 338, 952, 164, 794, 781, 838, 179, 43, 1520, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517]
Sample X_decoder_train: [1519, 490, 1269, 781, 668, 43, 164, 400, 338, 794, 952, 179, 838, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517]
Sample Y_train: [490, 1269, 781, 668, 43, 164, 400, 338, 794, 952, 179, 838, 1520, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517]


#### Unknown token statistics - must be 0, as both Train and Test dataset has been used for vocab creation

In [10]:
X_test_temp = []
unknown_token_counts = 0
for el in X_decoder_indices_test:
    temp_list = word_index_mapper.get_decoding(el)
    unknown_token_counts += temp_list.count(unknown_token)
    X_test_temp.append(temp_list)
print(unknown_token_counts)
print(X_test_temp[4])

0
<sos> The youth are increasingly active in climate advocacy. <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


### Define Model

In [11]:
class UnjumbleEncoderModel(nn.Module):
    def __init__(self,vocab_size,embedding_dim,num_lstm_layers,hidden_size,make_bidirectional,debug):
        super().__init__()
        self.debug = debug
        self.bidirectional = make_bidirectional
        self.num_lstm_layers = num_lstm_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(input_size=embedding_dim,hidden_size=hidden_size,dropout=0.5,
                            num_layers=num_lstm_layers,bidirectional=make_bidirectional,batch_first=True)
        
    def forward(self,x,h):
        if self.debug: 
            print("_______________________________")
            print("\t\tEncoder\t\t")
            print("_______________________________")
        if self.debug: print("Before starting: x Shape:",x.shape,"Prev State Shape",h.shape)
        
        x = self.embedding(x)
        x = self.relu(x)
        if self.debug: print("Embedding, x Shape:",x.shape)
        
        op,ht = self.gru(x,h)
        if self.debug: print("GRU, op Shape:",op.shape,"ht shape",ht.shape)
        
        if self.bidirectional: 
            ht_for_decoder = torch.cat((ht[-1],ht[-2]),axis=1)
            ht_for_decoder = ht_for_decoder.unsqueeze(0)
        else: ht_for_decoder = ht[-1].unsqueeze(0)
        if self.debug: print("ht for decoder shape",ht_for_decoder.shape)
            
        return op,ht,ht_for_decoder
    
    def init_hidden(self):
        first_param = self.num_lstm_layers
        if self.bidirectional: first_param *= 2
        return torch.zeros(first_param, 1, self.hidden_size)

class UnjumbleBahadnauAttention(nn.Module):
    def __init__(self,attention_neurons,debug):
        super().__init__()
        self.debug = debug
        self.rnn = nn.RNNCell(input_size=attention_neurons,hidden_size=attention_neurons,bias=False)
        self.linear = nn.Linear(in_features = attention_neurons, out_features = 1)
        self.softmax = nn.Softmax(dim=0)
    
    def forward(self,op_from_enoder,st_minus_one_from_decoder):
        
        # Reshape the op_from_enoder from (batch_size,seq_length,lstm_neurons) to (batch_size*seq_length,lstm_neurons)
        # And reshape st_minus_one_from_decoder from (1,batch_size,lstm_neurons) to (batch_size,lstm_neurons)
        # And repeat st_minus_one_from_decoder to seq_length times to get (batch_size*seq_length,lstm_neurons)
        if self.debug: 
            print("_______________________________")
            print("\t\tAttention\t\t")
            print("_______________________________")
        seq_length = op_from_enoder.shape[1]
        op_from_enoder = op_from_enoder.reshape(-1,op_from_enoder.shape[2])
        st_minus_one_from_decoder = st_minus_one_from_decoder[-1]
        st_minus_one_from_decoder = st_minus_one_from_decoder.repeat(seq_length,1)
        if self.debug: print("Shape of op_from_encoder:",op_from_enoder.shape,
                             "Shape of st_minus_one_from_decoder:",st_minus_one_from_decoder.shape)
            
        rnn_op = self.rnn(op_from_enoder,st_minus_one_from_decoder)
        if self.debug: print("RNN Cell Op:",rnn_op.shape)
            
        linear_op = self.linear(rnn_op)
        if self.debug: print("Linear Op:",linear_op.shape)
            
        softmax_op = self.softmax(linear_op)
        if self.debug: print("Softmax Op:",softmax_op.shape)
            
        ct = torch.sum(torch.mul(op_from_enoder,softmax_op),dim=0).unsqueeze(0)
        if self.debug: print("Weighted Averaged h vectors:",ct.shape)
        
        return ct,softmax_op
        

class UnjumbleDecoderModel(nn.Module):
    def __init__(self,model_attention,vocab_size,embedding_dim,num_lstm_layers,
                 hidden_size,make_bidirectional,debug):
        super().__init__()
        self.debug = debug
        self.model_attention = model_attention
        self.bidirectional = make_bidirectional
        self.num_lstm_layers = num_lstm_layers
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        self.relu = nn.ReLU()
        self.gru_input_size = embedding_dim  + hidden_size
        self.gru = nn.GRU(input_size=self.gru_input_size,hidden_size=hidden_size,
                            num_layers=num_lstm_layers,bidirectional=make_bidirectional,batch_first=True)
        self.in_features = hidden_size*2 if make_bidirectional else hidden_size
        self.linear = nn.Linear(in_features=self.in_features, out_features=vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
    
    def forward(self,x,s0_from_encoder,op_from_encoder_for_attn):
        if self.debug: 
            print("_______________________________")
            print("\t\tDecoder\t\t")
            print("_______________________________")
        if self.debug: print("Before starting: x Shape:",x.shape," s0_from_encoder Shape:",s0_from_encoder.shape)
        
        x = self.embedding(x)
        x = self.relu(x)
        if self.debug: print("Embedding, x Shape:",x.shape)
        
        
        seq_length = x.shape[1]
        if self.debug: print("Sequence Length:",seq_length)
        
        all_timestep_op = []
        for i in range(seq_length):
            if i == 0: 
                ct,softmax_op = self.model_attention(op_from_encoder_for_attn,s0_from_encoder)
                concatenated_x = torch.cat((x[0][i].unsqueeze(0),ct),axis=1)
                if self.debug: print("concatenated_x shape:",concatenated_x.shape)
                gru_op,ht = self.gru(concatenated_x.unsqueeze(0),s0_from_encoder)
                
            else: 
                ct,softmax_op = self.model_attention(op_from_encoder_for_attn,ht)
                concatenated_x = torch.cat((x[0][i].unsqueeze(0),ct),axis=1)
                if self.debug: print("concatenated_x shape:",concatenated_x.shape)
                gru_op,ht = self.gru(concatenated_x.unsqueeze(0),ht)
            
            all_timestep_op.append(gru_op)
            if self.debug:
                print("---------------------------------")
                print("GRU_OP:",gru_op.shape,"Ht:",ht.shape)
                print("---------------------------------")
        
        gru_final_op = torch.cat(all_timestep_op,axis=1)
        if self.debug: print("GRU, Final Shape:",gru_final_op.shape,"ht shape",ht.shape)
            
        # Resizing caption for Linear Layer
        gru_final_op = gru_final_op.reshape(-1,gru_final_op.shape[2])
        if self.debug: print("Reshaping gru_final_op Shape:",gru_final_op.shape)
        
        linear_op = self.linear(gru_final_op)
        if self.debug: print("Linear linear_op Shape:",linear_op.shape)
        
        op = self.log_softmax(linear_op)
        if self.debug: print("log_softmax op Shape:",op.shape)
            
        if self.debug:print("_______________________________\n\n")
            
        return op,ht,softmax_op


In [12]:
device = "cpu" #torch.device("cuda:0")
hidden_size_encoder = 400
hidden_size_decoder = hidden_size_encoder
model_encoder = UnjumbleEncoderModel(
    vocab_size=len(word_index_mapper.word_to_index),embedding_dim=300,num_lstm_layers=2,
    hidden_size=hidden_size_encoder,make_bidirectional=True,debug=True
).to(device)
if model_encoder.bidirectional: hidden_size_decoder = 2*hidden_size_encoder
model_attention = UnjumbleBahadnauAttention(hidden_size_decoder,debug=True).to(device)
model_decoder = UnjumbleDecoderModel(
    model_attention = model_attention,
    vocab_size=len(word_index_mapper.word_to_index),embedding_dim=300,num_lstm_layers=1,
    hidden_size=hidden_size_decoder,make_bidirectional=False,debug=True
).to(device)
loss_fn = nn.NLLLoss()
optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),lr=0.003)
optimizer_decoder = torch.optim.Adam(model_decoder.parameters(),lr=0.003)

In [13]:
data_index = 6

init_ht_for_encoder = model_encoder.init_hidden().to(device)
model_encoder.train()
model_decoder.train()

optimizer_encoder.zero_grad()
optimizer_decoder.zero_grad()
Xe_b = torch.tensor([X_encoder_indices_tr[data_index]]).to(device)
Xd_b = torch.tensor([X_decoder_indices_tr[data_index]]).to(device)
Y_b = torch.tensor([Y_indices_tr[data_index]]).to(device)
print(Xe_b.shape,Xd_b.shape,Y_b.shape)

op_from_encoder,ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
op,_,_ = model_decoder(Xd_b,ht_for_decoder,op_from_encoder)
ht = ht.detach()
loss = loss_fn(op,Y_b.reshape(-1))
loss.backward()
optimizer_encoder.step()
optimizer_decoder.step()

print("---------------------------------------------")

optimizer_encoder.zero_grad()
optimizer_decoder.zero_grad()
Xe_b = torch.tensor([X_encoder_indices_tr[data_index+1]]).to(device)
Xd_b = torch.tensor([X_decoder_indices_tr[data_index+1]]).to(device)
Y_b = torch.tensor([Y_indices_tr[data_index+1]]).to(device)
print(Xe_b.shape,Xd_b.shape,Y_b.shape)
op_from_encoder,ht,ht_for_decoder = model_encoder(Xe_b,ht)
op,_,_ = model_decoder(Xd_b,ht_for_decoder,op_from_encoder)
ht = ht.detach()
loss = loss_fn(op,Y_b.reshape(-1))
loss.backward()
optimizer_encoder.step()
optimizer_decoder.step()

torch.Size([1, 25]) torch.Size([1, 25]) torch.Size([1, 25])
_______________________________
		Encoder		
_______________________________
Before starting: x Shape: torch.Size([1, 25]) Prev State Shape torch.Size([4, 1, 400])
Embedding, x Shape: torch.Size([1, 25, 300])
GRU, op Shape: torch.Size([1, 25, 800]) ht shape torch.Size([4, 1, 400])
ht for decoder shape torch.Size([1, 1, 800])
_______________________________
		Decoder		
_______________________________
Before starting: x Shape: torch.Size([1, 25])  s0_from_encoder Shape: torch.Size([1, 1, 800])
Embedding, x Shape: torch.Size([1, 25, 300])
Sequence Length: 25
_______________________________
		Attention		
_______________________________
Shape of op_from_encoder: torch.Size([25, 800]) Shape of st_minus_one_from_decoder: torch.Size([25, 800])
RNN Cell Op: torch.Size([25, 800])
Linear Op: torch.Size([25, 1])
Softmax Op: torch.Size([25, 1])
Weighted Averaged h vectors: torch.Size([1, 800])
concatenated_x shape: torch.Size([1, 1100])
---

---------------------------------------------
torch.Size([1, 25]) torch.Size([1, 25]) torch.Size([1, 25])
_______________________________
		Encoder		
_______________________________
Before starting: x Shape: torch.Size([1, 25]) Prev State Shape torch.Size([4, 1, 400])
Embedding, x Shape: torch.Size([1, 25, 300])
GRU, op Shape: torch.Size([1, 25, 800]) ht shape torch.Size([4, 1, 400])
ht for decoder shape torch.Size([1, 1, 800])
_______________________________
		Decoder		
_______________________________
Before starting: x Shape: torch.Size([1, 25])  s0_from_encoder Shape: torch.Size([1, 1, 800])
Embedding, x Shape: torch.Size([1, 25, 300])
Sequence Length: 25
_______________________________
		Attention		
_______________________________
Shape of op_from_encoder: torch.Size([25, 800]) Shape of st_minus_one_from_decoder: torch.Size([25, 800])
RNN Cell Op: torch.Size([25, 800])
Linear Op: torch.Size([25, 1])
Softmax Op: torch.Size([25, 1])
Weighted Averaged h vectors: torch.Size([1, 800])
c

In [24]:
def predict(model_encoder,model_decoder,
            X_encoder_indices_test,X_decoder_indices_test, X_encoder_words_test, X_decoder_words_test, 
            word_index_mapper, device):
    data_index = random.randint(0,100)
    Xe_b = torch.tensor([X_encoder_indices_test[data_index]]).to(device)
    print(X_encoder_words_test[data_index],Xe_b)
    print(X_decoder_words_test[data_index])
    
    model_encoder.eval()
    model_decoder.eval()
    with torch.no_grad():
        softmax_ops = []
        init_ht_for_encoder = model_encoder.init_hidden().to(device)
        op_from_encoder,ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
        sos_word = torch.tensor([[word_index_mapper.word_to_index["<sos>"]]]).to(device)
        op,ht,softmax_op = model_decoder(sos_word,ht_for_decoder,op_from_encoder)
        softmax_ops.append([round(float(el),3) for el in softmax_op.cpu()])
        unjumbled_sentence = []
        for i in range(25):
            predicted_word = torch.argmax(op,axis=1).tolist()
#             print("Predicted .....................",predicted_word)
            unjumbled_sentence.append(word_index_mapper.index_to_word[predicted_word[0]])
            if predicted_word[0] == word_index_mapper.word_to_index["<eos>"]: break
            op,ht,softmax_op = model_decoder(torch.tensor([predicted_word]).to(device),ht,op_from_encoder)
            softmax_ops.append([round(float(el),3) for el in softmax_op.cpu()])
        print("_______________________________________")
        print(unjumbled_sentence)
        print("attention weights")
#         df = pd.DataFrame(softmax_ops)
#         if df.shape[0] == len(unjumbled_sentence):
#             jumbled_words = X_encoder_words_test[data_index]
#             df.columns = jumbled_words
#             df.index = unjumbled_sentence
#             plt.figure(figsize=(16,4))
#             sns.heatmap(df, annot=True)
#             plt.show()
        print("_______________________________________")

In [25]:
predict(model_encoder,model_decoder,X_encoder_indices_test,X_decoder_indices_test, X_encoder_words_test, X_decoder_words_test, 
            word_index_mapper, device)

['dating', 'connects', 'people', 'for', 'relationships.', 'Online', 'looking', '<eos>'] tensor([[ 800,  348,  686,  339, 1044, 1435,  438, 1520, 1517, 1517, 1517, 1517,
         1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517, 1517,
         1517]])
['<sos>', 'Online', 'dating', 'connects', 'people', 'looking', 'for', 'relationships.']
_______________________________________
['Individuals', 'can', 'help', 'by', 'reducing', 'their', 'carbon', 'footprints.', '<eos>']
attention weights
_______________________________________


## For Actual Training

In [45]:
device = "cpu" # torch.device("cuda:0")
hidden_size_encoder = 400
hidden_size_decoder = hidden_size_encoder
model_encoder = UnjumbleEncoderModel(
    vocab_size=len(word_index_mapper.word_to_index),embedding_dim=300,num_lstm_layers=2,
    hidden_size=hidden_size_encoder,make_bidirectional=True,debug=False
).to(device)
if model_encoder.bidirectional: hidden_size_decoder = 2*hidden_size_encoder
model_attention = UnjumbleBahadnauAttention(hidden_size_decoder,debug=False).to(device)
model_decoder = UnjumbleDecoderModel(
    model_attention = model_attention,
    vocab_size=len(word_index_mapper.word_to_index),embedding_dim=300,num_lstm_layers=1,
    hidden_size=hidden_size_decoder,make_bidirectional=False,debug=False
).to(device)
loss_fn = nn.NLLLoss()
optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),lr=0.0003)
optimizer_decoder = torch.optim.Adam(model_decoder.parameters(),lr=0.0003)

In [46]:
epochs = 5
# init_ht_for_encoder = model_encoder.init_hidden().to(device)
for i in range(epochs):
    model_encoder.train()
    model_decoder.train()
    epoch_loss = 0
    for j in range(0,len(X_encoder_indices_tr)):
        optimizer_encoder.zero_grad()
        optimizer_decoder.zero_grad()
        Xe_b = torch.tensor([X_encoder_indices_tr[j]]).to(device)
        Xd_b = torch.tensor([X_decoder_indices_tr[j]]).to(device)
        Y_b = torch.tensor([Y_indices_tr[j]]).to(device)
        init_ht_for_encoder = model_encoder.init_hidden().to(device)
        op_from_encoder,ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
        op,_,_ = model_decoder(Xd_b,ht_for_decoder,op_from_encoder)
        ht = ht.detach()
#         init_ht_for_encoder = ht
        loss = loss_fn(op,Y_b.reshape(-1))
        loss.backward()
        optimizer_encoder.step()
        optimizer_decoder.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if j%20 == 0: print("Epoch:",i,"Batch:",j,"Loss:",batch_loss)
#     init_ht_for_encoder = ht
    print("______________________________________")
    print("Epoch Loss:",epoch_loss)
    predict(model_encoder,model_decoder,X_encoder_indices_test,X_decoder_indices_test, X_encoder_words_test, X_decoder_words_test, 
            word_index_mapper, device)
    print("_______________________________________")

Epoch: 0 Batch: 0 Loss: 7.331637382507324
Epoch: 0 Batch: 20 Loss: 3.2774887084960938
Epoch: 0 Batch: 40 Loss: 2.9809961318969727
Epoch: 0 Batch: 60 Loss: 2.643310308456421
Epoch: 0 Batch: 80 Loss: 3.0020956993103027
Epoch: 0 Batch: 100 Loss: 2.5291919708251953
Epoch: 0 Batch: 120 Loss: 2.2877607345581055
Epoch: 0 Batch: 140 Loss: 2.2846059799194336
Epoch: 0 Batch: 160 Loss: 2.7650551795959473
Epoch: 0 Batch: 180 Loss: 1.7087280750274658
Epoch: 0 Batch: 200 Loss: 2.5276002883911133
Epoch: 0 Batch: 220 Loss: 2.230926752090454
Epoch: 0 Batch: 240 Loss: 2.3627145290374756
Epoch: 0 Batch: 260 Loss: 1.9801985025405884
Epoch: 0 Batch: 280 Loss: 2.3858814239501953
Epoch: 0 Batch: 300 Loss: 2.4429879188537598
Epoch: 0 Batch: 320 Loss: 2.580000638961792
Epoch: 0 Batch: 340 Loss: 2.4296698570251465
Epoch: 0 Batch: 360 Loss: 2.132462501525879
Epoch: 0 Batch: 380 Loss: 1.4336600303649902
______________________________________
Epoch Loss: 886.1695392131805
['and', 'diagnosis', 'Telemedicine', 'remo

In [None]:
def predict_on_whole_val(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi):
    model_encoder.eval()
    model_decoder.eval()
    accuracy_tuple_list = []  #[(jumbed_sent,unjumbled_sent,predicted_sent,hard,soft,word_count),...,]
    with torch.no_grad():
        for data_index in range(len(Xval_e)):
            if data_index % 50 == 0: print(data_index,end = ' ')
            Xe_b = torch.tensor([encoder_encode_decode.get_encoding(Xval_e[data_index])]).to(device)

            init_ht_for_encoder = model_encoder.init_hidden().to(device)
            op_from_encoder,ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
            sos_word = torch.tensor([[decoder_wtoi["<sos>"]]]).to(device)
            op,ht,softmax_op = model_decoder(sos_word,ht_for_decoder,op_from_encoder)
            unjumbled_sentence = []
            for i in range(25):
                predicted_word = torch.argmax(op,axis=1).tolist()
                unjumbled_sentence.append(decoder_itow[predicted_word[0]])
                if predicted_word[0] == decoder_wtoi["<eos>"]: break
                op,ht,softmax_op = model_decoder(torch.tensor([predicted_word]).to(device),ht,op_from_encoder)
                
            hard_accuracy = 1 if " ".join(unjumbled_sentence) == Yval[data_index] else 0
            word_count = len(set(Yval[data_index].split()))
            soft_accuracy = len(set(unjumbled_sentence).intersection(set(Yval[data_index].split())))/word_count
            accuracy_tuple_list.append(
                (Xval_e[data_index],Yval[data_index]," ".join(unjumbled_sentence),hard_accuracy,soft_accuracy,word_count)
            )
    return accuracy_tuple_list

In [None]:
accuracy_tuple_list = predict_on_whole_val(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi)
df = pd.DataFrame(accuracy_tuple_list)
df.columns = ["jumbled_sent","unjumbled_sent","prediction","hard_accuracy","soft_accuracy","word_count"]
print(df.shape,df['hard_accuracy'].sum())
df.head(10)

In [None]:
a = sum((df['soft_accuracy']*df['word_count']).tolist())
b = df['word_count'].sum()
a,b,a/b

In [None]:
df[df['soft_accuracy']==1].shape, df[df['soft_accuracy']==1].shape[0]/df.shape[0]

In [None]:
def predict_on_test(model_encoder_new,model_decoder_new,encoder_encode_decode,decoder_itow,decoder_wtoi):
    model_encoder_new.eval()
    model_decoder_new.eval()
    accuracy_tuple_list = []  #[(jumbed_sent,unjumbled_sent,predicted_sent,hard,soft,word_count),...,]
    with torch.no_grad():
        for data_index in range(len(Xtest_e)):
            if data_index % 50 == 0: print(data_index,end = ' ')
            Xe_b = torch.tensor([encoder_encode_decode.get_encoding(Xtest_e[data_index])]).to(device)
            init_ht_for_encoder = model_encoder_new.init_hidden().to(device)
            op_from_encoder,ht,ht_for_decoder = model_encoder_new(Xe_b,init_ht_for_encoder)
            sos_word = torch.tensor([[decoder_wtoi["<sos>"]]]).to(device)
            op,ht,softmax_op = model_decoder_new(sos_word,ht_for_decoder,op_from_encoder)
            unjumbled_sentence = []
            for i in range(25):
                predicted_word = torch.argmax(op,axis=1).tolist()
                unjumbled_sentence.append(decoder_itow[predicted_word[0]])
                if predicted_word[0] == decoder_wtoi["<eos>"]: break
                op,ht,softmax_op = model_decoder_new(torch.tensor([predicted_word]).to(device),ht,op_from_encoder)
            hard_accuracy = 1 if " ".join(unjumbled_sentence) == Ytest[data_index] else 0
            word_count = len(set(Ytest[data_index].split()))
            soft_accuracy = len(set(unjumbled_sentence).intersection(set(Ytest[data_index].split())))/word_count
            accuracy_tuple_list.append(
                (Xtest_e[data_index],Ytest[data_index]," ".join(unjumbled_sentence),hard_accuracy,soft_accuracy,word_count)
            )
    return accuracy_tuple_list

In [None]:
Xtest_e = [
    "is eating . Nitish apple <eos>",
    "is city my favorite New York. <eos>",
    "a a and dog are man woods walking through the . <eos>"
]
Ytest = [
    "Nitish is eating apple . <eos>",
    "New York is my faorite city. <eos>",
    "a man and a dog are walking through the woods . <eos>"
]

In [None]:
accuracy_tuple_list_test = predict_on_test(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi)
df_test = pd.DataFrame(accuracy_tuple_list_test)
df_test.columns = ["jumbled_sent","unjumbled_sent","prediction","hard_accuracy","soft_accuracy","word_count"]
df_test.head(10)

## Save the models

#### Attention Model was not required to be saved explicitly as it is a part of the decoder model only.
#### Additionally the word_to_index and index_to_word dictionary and get_encoding funtion will be required for inference

In [None]:
torch.save(model_encoder.state_dict(), "SavedModels/Jumble/encoder_model.pt")
torch.save(model_attention.state_dict(), "SavedModels/Jumble/attention_model.pt")
torch.save(model_decoder.state_dict(), "SavedModels/Jumble/decoder_model.pt")

# torch.save(model_encoder,"SavedModels/Jumble/encoder_model.pt")
# torch.save(model_attention,"SavedModels/Jumble/attention_model.pt")
# torch.save(model_decoder,"SavedModels/Jumble/decoder_model.pt")

## Load Saved Models

In [None]:
device = torch.device("cuda:0")
hidden_size_encoder = 400
encoder_encode_decode = EncodeDecode(encoder_wtoi,encoder_itow,pad_token,unknown_token)
decoder_encode_decode = EncodeDecode(decoder_wtoi,decoder_itow,pad_token,unknown_token)
loaded_encoder_model = UnjumbleEncoderModel(
    vocab_size=len(encoder_wtoi),embedding_dim=300,num_lstm_layers=2,
    hidden_size=hidden_size_encoder,make_bidirectional=True,debug=False
).to(device)
loaded_encoder_model.load_state_dict(torch.load("SavedModels/Jumble/encoder_model.pt"))
loaded_encoder_model.eval()

if loaded_encoder_model.bidirectional: hidden_size_decoder = 2*hidden_size_encoder
loaded_attention_model = UnjumbleBahadnauAttention(hidden_size_decoder,debug=False).to(device)
loaded_attention_model.load_state_dict(torch.load("SavedModels/Jumble/attention_model.pt"))
loaded_attention_model.eval()

loaded_decoder_model = UnjumbleDecoderModel(model_attention = loaded_attention_model,
    vocab_size=len(encoder_wtoi),embedding_dim=300,num_lstm_layers=1,
    hidden_size=hidden_size_decoder,make_bidirectional=False,debug=False
).to(device)
loaded_decoder_model.load_state_dict(torch.load("SavedModels/Jumble/decoder_model.pt"))
loaded_decoder_model.eval()


# loaded_encoder_model = torch.load("SavedModels/Jumble/encoder_model.pt")
# loaded_decoder_model = torch.load("SavedModels/Jumble/decoder_model.pt")
# model_attention = torch.load("SavedModels/Jumble/attention_model.pt")

In [None]:
accuracy_tuple_list_test = predict_on_test(loaded_encoder_model,loaded_decoder_model,encoder_encode_decode,decoder_itow,decoder_wtoi)
df_test = pd.DataFrame(accuracy_tuple_list_test)
df_test.columns = ["jumbled_sent","unjumbled_sent","prediction","hard_accuracy","soft_accuracy","word_count"]
df_test.head(10)