In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import sys
import unicodedata
import re
import random
from typing import Optional

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchmetrics.classification import Accuracy
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import time
import math
from tqdm import tqdm

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Create Vocab class for input and output language


In [4]:
# Reserved vocabulary indices. Lang.index2word is pre-populated with these two
# entries, so real words are numbered from 2 upwards.
SOS_token = 0  # start-of-sequence id; the decoders below use it as their first input token
EOS_token = 1  # end-of-sequence id; appended to every encoded sentence by tensorFromSentence

class Lang:
    """
    Vocabulary for a single language.

    Lang(name: str)

    Attributes:
        name {str}: Label of the language (e.g. 'eng').
        word2index {dict}: Maps each known word to its vocabulary index.
        index2word {dict}: Maps each vocabulary index back to its word;
            indices 0 and 1 are reserved for "SOS" and "EOS".
        word2count {dict}: Maps each known word to how often it was added.
        n_words {int}: Vocabulary size, including the two reserved entries.
    Methods:
        addSentence(sentence: str):
            Splits the sentence on single spaces and registers every token.
        addWord(word: str):
            Registers one word: assigns the next free index the first time
            it is seen, otherwise only increments its count.
    """
    def __init__(self, name):
        self.name = name
        self.word2index, self.word2count = {}, {}
        # The reserved markers occupy the first two indices.
        self.index2word = dict(enumerate(("SOS", "EOS")))
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        if word in self.word2index:
            # Already known: only the frequency changes.
            self.word2count[word] += 1
            return
        next_index = self.n_words
        self.word2index[word] = next_index
        self.word2count[word] = 1
        self.index2word[next_index] = word
        self.n_words = next_index + 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining marks removed.

    The string is NFD-decomposed and every character in Unicode category
    'Mn' (non-spacing mark) is dropped, so accented letters lose their
    accents. Characters that have no such decomposition are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalise one sentence for vocabulary building.

    Steps: lowercase and trim, strip accents via unicodeToAscii, insert a
    space before each of '.', '!' and '?', then collapse every run of
    characters other than letters, '!' and '?' into a single space.
    Because '.' is not in the kept set, periods end up removed.

    Returns the cleaned string with surrounding whitespace stripped.
    """
    punctuation = re.compile(r"([.!?])")
    disallowed = re.compile(r"[^a-zA-Z!?]+")

    text = unicodeToAscii(s.lower().strip())
    text = punctuation.sub(r" \1", text)
    text = disallowed.sub(r" ", text)
    return text.strip()

## Read languages 

In [6]:
def readLangs(lang1, lang2, reverse=False):
    """Load the tab-separated sentence file for a language pair.

    Reads 'data/<lang1>-<lang2>.txt' (UTF-8), splits it into lines, splits
    each line on tabs and normalises every field with normalizeString.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        reverse: When truthy, each pair is reversed and lang2 becomes the
            input language.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and
        the list of normalised sentence pairs.

    Raises:
        OSError: if the data file cannot be opened.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the
    # handle, which the previous chained open().read() never did.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Language instances for vocabulary utilizations
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

## Filter data


In [7]:
# Upper bound (exclusive) on the number of space-separated tokens a sentence
# may have to pass filterPair.
MAX_LENGTH = 30

# Prefixes the English side of a pair must start with to pass filterPair.
# The apostrophe-less forms ("i m ", "he s ") match text after normalizeString,
# which replaces apostrophes with a space.
# NOTE(review): some entries end with a space and some do not ("he is",
# "you are", ...), so the latter also match longer words — confirm intent.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p,reverse):
    """Decide whether a sentence pair is kept for training.

    A pair is kept when both sentences have fewer than MAX_LENGTH
    space-separated tokens and the English sentence starts with one of
    eng_prefixes.

    Args:
        p: Sequence whose first two items are the sentences of the pair.
        reverse: Truthy when the pairs were reversed by readLangs, i.e. the
            English sentence is p[1]; otherwise it is p[0].

    Returns:
        bool: True if the pair passes both checks.
    """
    # Truthiness instead of `reverse is not True`, so values such as 1 also
    # select the reversed layout.
    english_idx = 1 if reverse else 0
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[english_idx].startswith(eng_prefixes)

def filterPairs(pairs,reverse):
    """Return a new list with only the pairs accepted by filterPair.

    `reverse` is forwarded unchanged to filterPair for every pair.
    """
    kept = []
    for candidate in pairs:
        if filterPair(candidate, reverse):
            kept.append(candidate)
    return kept

In [8]:
# Scratch check that ndarray.tolist() yields a plain Python list.
# NOTE(review): numpy is already imported as `np` in the first cell, and no
# later visible cell uses this result — the cell looks safe to delete.
import numpy as np
np.array([1,2,3]).tolist()

[1, 2, 3]

## Read and preprocess the text file: build sentence pairs from the whole dataset and add their words to the vocabularies

In [9]:




def prepareData(lang1, lang2, reverse=False,test_size=None):
    """Read, filter and index a language-pair dataset.

    Loads the pairs with readLangs, keeps only those accepted by
    filterPairs, and adds every remaining sentence to the matching
    vocabulary. Progress is reported with print.

    Args:
        lang1, lang2: Language names forwarded to readLangs.
        reverse: Forwarded to readLangs and filterPairs.
        test_size: Accepted but never read inside this function.

    Returns:
        (input_lang, output_lang, pairs) with both vocabularies populated
        and `pairs` reduced to the filtered list.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs, reverse)

    print("Counting words...")
    for kept in kept_pairs:
        source_sentence, target_sentence = kept[0], kept[1]
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)

    return input_lang, output_lang, kept_pairs

# Build the module-level vocabularies and pair list. reverse=True makes
# 'fra' the input language and 'eng' the output language. The fourth
# argument (test_size=1000) is accepted by prepareData but not used by it.
# tensorFromPair below reads these module-level input_lang / output_lang.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True,1000)
# Show one random pair as a sanity check.
print(random.choice(pairs))


Reading lines...
Read 135842 sentence pairs
Counting words...
Counted words:
fra 5173
eng 3388
['je suis completement confuse', 'i am totally confused']


## Tokenize data, create tensors of the data and create dataset and data loader


In [10]:
# Re-declared with the same value as in the filtering cell. get_dataloader
# uses it as the padded width of every encoded sentence and
# DecoderRNN.forward uses it as the number of decoding steps.
MAX_LENGTH=30
########################################################################################################
  
def indexesFromSentence(lang,sentence):
    """Translate a sentence into a list of vocabulary indices.

    The sentence is split on single spaces and every token is looked up in
    lang.word2index.

    Raises:
        KeyError: if a token is missing from the vocabulary.
    """
    vocab = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(vocab[token])
    return indexes

def tensorFromSentence(lang,sentence):
    """Encode a sentence as a (1, n_tokens + 1) long tensor on `device`.

    The token indices come from indexesFromSentence and EOS_token is
    appended as the final element.
    """
    token_ids = indexesFromSentence(lang,sentence) + [EOS_token]
    # Wrapping the list once yields the leading batch dimension of size 1.
    return torch.tensor([token_ids], dtype=torch.long,device=device)

def tensorFromPair(pair):
    """Encode both sentences of a pair with tensorFromSentence.

    pair[0] is encoded with the module-level `input_lang` and pair[1] with
    the module-level `output_lang`.

    Returns:
        (input_tensor, output_tensor)
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )


####################################################################################################################################################


def get_dataloader(lang1,lang2,batch_size,test_ratio=0.1):
    """Build train/test DataLoaders of padded index tensors.

    The dataset is prepared with prepareData(lang1, lang2, True), so the
    pairs are reversed and lang2 is the input language. Every sentence is
    encoded, terminated with EOS_token and right-padded with zeros to
    MAX_LENGTH (note that 0 is also SOS_token).

    Args:
        lang1, lang2: Language names forwarded to prepareData.
        batch_size: Batch size of both loaders.
        test_ratio: Fraction of the encoded pairs held out for testing.

    Returns:
        (input_lang, output_lang, train_dataloader, test_dataloader)
    """
    input_lang,output_lang,pairs=prepareData(lang1,lang2,True)

    # Encode with the vocabularies built just above rather than with the
    # module-level ones, so the indices always match the returned Lang objects.
    encoded=[]
    for idx,(inp,trg) in enumerate(pairs):
        try:
            src_ids=indexesFromSentence(input_lang,inp)+[EOS_token]
            trg_ids=indexesFromSentence(output_lang,trg)+[EOS_token]
        except KeyError:
            # Best effort: report and drop the pair instead of keeping an
            # all-zero row in the dataset.
            print(f"error at {idx}th pair")
            print(inp,trg)
            continue
        encoded.append((src_ids,trg_ids))

    N=len(encoded)
    input_ids=torch.zeros(size=(N,MAX_LENGTH),dtype=torch.long)
    output_ids=torch.zeros(size=(N,MAX_LENGTH),dtype=torch.long)
    for row,(src_ids,trg_ids) in enumerate(encoded):
        input_ids[row,:len(src_ids)]=torch.tensor(src_ids,dtype=torch.long)
        output_ids[row,:len(trg_ids)]=torch.tensor(trg_ids,dtype=torch.long)

    test_size=int(N*test_ratio)

    # A permutation gives a split without replacement: the test set has
    # exactly test_size distinct rows and the remainder is the training set.
    shuffled=np.random.permutation(N)
    test_idx=shuffled[:test_size]
    train_idx=shuffled[test_size:]

    train_data=TensorDataset(input_ids[train_idx].to(device),
                             output_ids[train_idx].to(device)
                             )
    test_data=TensorDataset(input_ids[test_idx].to(device),
                             output_ids[test_idx].to(device)
                             )

    train_sampler=RandomSampler(train_data)
    train_dataloader=DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

    test_sampler=RandomSampler(test_data)
    test_dataloader=DataLoader(test_data,sampler=test_sampler,batch_size=batch_size)

    return input_lang,output_lang,train_dataloader,test_dataloader



## MODEL BUILDING 

### Vanilla

In [11]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The feature dimension is split into `heads` chunks of size
    input_dim // heads. The same query/key/value projections
    (head_dim -> head_dim) are applied to every head, and the concatenated
    heads are mixed by a final input_dim -> input_dim linear layer.

    Args:
        input_dim: Feature size of the inputs; must be divisible by `heads`.
        heads: Number of attention heads.
    """
    def __init__(self,input_dim,heads):
        super().__init__()
        assert input_dim%heads==0, "input_dim must be divisible by heads"
        head_dim=input_dim//heads
        self.heads=heads
        self.head_dim=head_dim

        self.query=nn.Linear(head_dim,head_dim)
        self.key=nn.Linear(head_dim,head_dim)
        self.value=nn.Linear(head_dim,head_dim)
        self.fc_out=nn.Linear(input_dim,input_dim)

    def forward(self,queries,keys,values):
        """Attend from `queries` over `keys`/`values`.

        Args:
            queries: (N, Q, input_dim)
            keys:    (N, K, input_dim)
            values:  (N, K, input_dim); same length as keys.

        Returns:
            Tensor of shape (N, Q, input_dim).
        """
        N=queries.shape[0]
        ql,kl,vl=queries.shape[1],keys.shape[1],values.shape[1]   #Q,K,V
        queries=queries.reshape(N,ql,self.heads,self.head_dim).transpose(1,2) #NHQD
        keys=keys.reshape(N,kl,self.heads,self.head_dim).transpose(1,2)       #NHKD
        # Built from `values`; the previous code reshaped `keys` here.
        values=values.reshape(N,vl,self.heads,self.head_dim).transpose(1,2)   #NHVD
        queries_=self.query(queries)
        keys_=self.key(keys)
        values_=self.value(values)

        # torch.matmul broadcasts over the (N, H) batch dims; torch.bmm only
        # accepts 3-D inputs.
        attention=torch.matmul(queries_,keys_.transpose(-1,-2))               #NHQD . NHDK -> NHQK

        attention_norm=torch.softmax(attention/(self.head_dim**0.5),dim=-1)
        attention_out=torch.matmul(attention_norm,values_).transpose(1,2)     #NHQK . NHVD -> NHQD -> NQHD (V=K)
        out=self.fc_out(attention_out.reshape(N,ql,self.heads*self.head_dim))
        return out
        

# class TransformerBlock(nn.Module):
#     def __init__(self,input_dim,heads,expansion,dropout):
#         self.expansion=input_dim*expansion
#         self.attention=SelfAttention(input_dim,heads)
#         self.dropout=nn.Dropout(dropout)
#         self.norm1=nn.LayerNorm(input_dim)
#         self.norm2=nn.LayerNorm(input_dim)
#         self.expansion_layer=nn.Sequential(
#             nn.Linear(input_dim,self.expansion),
#             nn.ReLU(),
#             nn.Linear(expansion,input_dim)
#         )   

#     def forward(self,query,key,value,mask):
#         attention=self.attention(query,key,value)



class EncoderRNN(nn.Module):
    """Single-layer GRU encoder over embedded token ids.

    Args:
        input_size: Used as the vocabulary size when `vocab_size` is not
            given; otherwise unused.
        hidden_size: Embedding width and GRU hidden width.
        dropout_p: Dropout probability applied to the embeddings.
        vocab_size: Number of rows of the embedding table. Defaults to
            `input_size` (the previous default of None made nn.Embedding fail).
    """
    def __init__(self,input_size,hidden_size,dropout_p=0.1,vocab_size=None):
        super(EncoderRNN,self).__init__()
        if vocab_size is None:
            vocab_size=input_size
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(vocab_size,hidden_size)
        self.gru=nn.GRU(hidden_size,hidden_size,batch_first=True)
        self.dropout=nn.Dropout(dropout_p)

    def forward(self,input):
        """Encode a batch of token ids.

        Args:
            input: Long tensor of shape (N, L).

        Returns:
            (out, hidden_state): GRU outputs (N, L, hidden_size) and the
            final hidden state (1, N, hidden_size).
        """
        embedded=self.dropout(self.embedding(input))
        out,hidden_state=self.gru(embedded)
        return out,hidden_state


class DecoderRNN(nn.Module):
    """GRU decoder without attention.

    Args:
        output_size: Number of output classes of the final linear layer.
        hidden_size: Embedding width and GRU hidden width.
        vocab_size: Number of rows of the embedding table. Defaults to
            `output_size` (the previous default of None made nn.Embedding fail).
    """
    def __init__(self,output_size,hidden_size,vocab_size=None):
        super(DecoderRNN,self).__init__()
        if vocab_size is None:
            vocab_size=output_size
        self.embedding=nn.Embedding(vocab_size,hidden_size)
        self.gru=nn.GRU(hidden_size,hidden_size,batch_first=True)
        self.out=nn.Linear(hidden_size,output_size)

    def forward_step(self,input,hidden):
        """Run one decoding step.

        Args:
            input: Long tensor (N, 1) with the current input token ids.
            hidden: GRU hidden state (1, N, hidden_size).

        Returns:
            (output, hidden): unnormalised scores (N, 1, output_size) and
            the updated hidden state.
        """
        embedded=self.embedding(input)
        output,hidden=self.gru(embedded,hidden)
        output=self.out(output)
        return output,hidden

    def forward(self,encoder_outputs,encoder_hidden,target_tensor=None):
        """Decode MAX_LENGTH steps starting from SOS_token.

        Args:
            encoder_outputs: Only its batch size and device are read.
            encoder_hidden: Initial decoder hidden state.
            target_tensor: Optional (N, >=MAX_LENGTH) targets. When given,
                step i+1 is fed target_tensor[:, i] (teacher forcing);
                otherwise the arg-max prediction of the previous step is fed.

        Returns:
            (log_probs, hidden, None): log-softmax scores of shape
            (N, MAX_LENGTH, output_size), the final hidden state, and None.
        """
        N=encoder_outputs.size(0)
        # Allocate on the encoder's device instead of the module-level
        # `device`, so the decoder follows wherever its inputs live.
        decoder_input=torch.empty(N,1,dtype=torch.long,device=encoder_outputs.device).fill_(SOS_token)
        decoder_hidden=encoder_hidden
        decoder_outputs=[]
        for i in range(MAX_LENGTH):
            gru_out,decoder_hidden=self.forward_step(decoder_input,decoder_hidden)
            decoder_outputs.append(gru_out)
            if target_tensor is not None:
                decoder_input=target_tensor[:,i].unsqueeze(1)
            else:
                _,topi=gru_out.topk(1)
                decoder_input=topi.squeeze(-1).detach()

        decoder_outputs=torch.cat(decoder_outputs,dim=1)
        decoder_outputs=F.log_softmax(decoder_outputs,dim=-1)
        return decoder_outputs,decoder_hidden,None
    

    

        
                



        
        


### Bahdanau Attention Implementation

In [14]:
class EncoderBiRNN(nn.Module):
    """Bidirectional GRU encoder that is stepped one token at a time.

    Each time step is fed to the GRU as a length-1 sequence together with
    the hidden state carried over from the previous step, and the hidden
    state after every step is collected.

    Args:
        vocab_size: Number of rows of the embedding table.
        num_layers: Number of stacked GRU layers.
        input_size: Embedding width (GRU input width).
        hidden_size: GRU hidden width per direction.
    """
    def __init__(self,vocab_size,num_layers,input_size,hidden_size):
        super(EncoderBiRNN,self).__init__()
        self.embedding=nn.Embedding(vocab_size,input_size)
        self.BiRNN=nn.GRU(input_size,hidden_size, num_layers=num_layers,bidirectional=True,batch_first=True)

    def forward(self,x,hidden=None):
        """Encode a batch of token ids.

        Args:
            x: Long tensor (N, max_L).
            hidden: Optional initial GRU hidden state.

        Returns:
            (outputs, hiddens):
            outputs (N, max_L, 2*hidden_size) — per-step GRU outputs;
            hiddens (max_L, num_layers*2, N, hidden_size) — hidden state
            after every step.
        """
        _,seq_len=x.shape
        embedded=self.embedding(x)                      # (N, max_L, input_size)
        step_outputs,step_hiddens=[],[]
        for t in range(seq_len):
            step_out,hidden=self.BiRNN(embedded[:,t:t+1],hidden)
            step_outputs.append(step_out)
            step_hiddens.append(hidden)
        outputs=torch.cat(step_outputs,dim=1)           # (N, max_L, 2*hidden_size)
        hiddens=torch.stack(step_hiddens,dim=0)         # (max_L, num_layers*2, N, hidden_size)
        return outputs,hiddens
    
    #encoding_dict={directions=enc_directions,
        # enc_hidden_size=enc_hidden_size,
        # dec_hidden_size=enc_hidden_size,
        # enc_num_layers=enc_num_layers,
        # dec_num_layers=dec_num_layers}

        


class BahdanauAttention(nn.Module):
    """Additive (Bahdanau) attention over all collected encoder hidden states.

    Args:
        encoding_dict: Must provide 'enc_hidden_size', 'dec_hidden_size'
            and 'dec_num_layers'.
    """
    def __init__(self,encoding_dict):
        super(BahdanauAttention,self).__init__()
        self.encoding_dict=encoding_dict
        self.Wa=nn.Linear(self.encoding_dict['dec_hidden_size']*self.encoding_dict['dec_num_layers'],self.encoding_dict['dec_hidden_size'])
        self.Ua=nn.Linear(self.encoding_dict["enc_hidden_size"],self.encoding_dict['dec_hidden_size'])
        self.Va=nn.Linear(self.encoding_dict["dec_hidden_size"],1)

    def forward(self,query,keys):
        """Compute the context vector for one decoding step.

        Args:
            query: Decoder hidden state (dec_num_layers, N, dec_hidden).
            keys: Encoder hidden states
                (max_L, enc_nlayers*directions, N, enc_hidden_size).

        Returns:
            (context, weights): context (N, 1, enc_hidden_size) and
            attention weights (N, 1, max_L*enc_nlayers*directions).
        """
        #  [dec_num_layers,N,dec_hidden]-> [N,dec_num_layers,dec_hidden]-> [N,1,dec_num_layers*dec_hidden]
        query=query.transpose(0,1)
        query=query.reshape(query.size(0),1,query.size(1)*query.size(2))
        #  [max_L,(enc_nlayers*directions),N,enc_hidden_size] -> [max_L*(enc_nlayers*directions),N,enc_hidden_size] -> [N,max_L*(enc_nlayers*directions),enc_hidden_size]
        keys=keys.reshape(-1,keys.size(2),keys.size(3)).transpose(0,1)

        #  Wa(query): [N, 1, dec_hidden]; Ua(keys): [N, M, dec_hidden] with M = max_L*(enc_nlayers*directions)
        #  The sum broadcasts the query over M; Va reduces dec_hidden to a single score -> [N, M, 1]
        scores=self.Va(torch.tanh(self.Wa(query)+self.Ua(keys)))
        #  [N, M, 1] -> [N, 1, M]. A transpose keeps the batch dimension even
        #  when N == 1 (a bare squeeze() would drop it and break torch.bmm).
        scores=scores.transpose(1,2)

        #softmax(weights)
        weights=F.softmax(scores,dim=-1)
        #  [N,1,M] @ [N,M,enc_hidden_size] -> [N,1,enc_hidden_size]
        context=torch.bmm(weights,keys)

        return context,weights

class BahdanauDecoder(nn.Module):
    """GRU decoder that attends over encoder states with BahdanauAttention.

    Args:
        vocab_size: Output vocabulary size (embedding rows and output classes).
        hidden_size: Decoder embedding and hidden width.
        num_layers: Number of stacked GRU layers in the decoder.
        encoding_dict: Must provide 'enc_hidden_size'. The decoder stores a
            copy extended with 'dec_hidden_size' and 'dec_num_layers'; the
            caller's dict is left untouched.
    """
    def __init__(self,vocab_size,hidden_size,num_layers,encoding_dict):
        super(BahdanauDecoder,self).__init__()
        # Copy first so adding the decoder keys does not mutate the caller's dict.
        self.encoding_dict=dict(encoding_dict)
        self.encoding_dict['dec_hidden_size']=hidden_size
        self.encoding_dict['dec_num_layers']=num_layers
        self.embedding=nn.Embedding(vocab_size,hidden_size)                            # Embedding table: [vocab_size, dec_hidden]
        self.attention=BahdanauAttention(self.encoding_dict)                           # Inputs: [previous decoder hidden state, all encoder hidden states] | Outputs: [context, attention weights]
        self.gru=nn.GRU(
            input_size=self.encoding_dict['enc_hidden_size']+self.encoding_dict['dec_hidden_size'],  # context concatenated with the input embedding
            hidden_size=self.encoding_dict['dec_hidden_size'],
            num_layers=self.encoding_dict['dec_num_layers'],
            batch_first=True
            )
        self.fcout=nn.Linear(self.encoding_dict['dec_hidden_size'],vocab_size)

    def forward(self,encoder_hiddens,target_tensor=None):
        """Decode one token per encoder time step.

        Args:
            encoder_hiddens: (max_L, enc_nlayers*directions, N, enc_hidden_size).
            target_tensor: Optional (N, >=max_L) targets. When given, step
                i+1 is fed target_tensor[:, i] (teacher forcing); otherwise
                the arg-max prediction of the previous step is fed.

        Returns:
            (decoder_outputs, attention_weights): log-softmax scores
            (N, max_L, vocab_size) and the per-step attention weights
            concatenated along dim 1.
        """
        n_steps,N=encoder_hiddens.size(0),encoder_hiddens.size(2)
        # Allocate on the encoder's device rather than the module-level `device`.
        run_device=encoder_hiddens.device
        decoder_input=torch.empty((N,1),dtype=torch.long,device=run_device).fill_(SOS_token)
        decoder_outputs=[]    #output score cache
        attention_weights=[]  #attention weights cache
        decoder_hidden=torch.zeros((self.encoding_dict['dec_num_layers'],N,self.encoding_dict['dec_hidden_size']),device=run_device)
        for i in range(n_steps):
            decoder_output,decoder_hidden,attn_weight=self.forward_step(decoder_input,encoder_hiddens,decoder_hidden)
            # `is not None`: comparing a tensor with != is not an identity test.
            if target_tensor is not None:
                decoder_input=target_tensor[:,i].unsqueeze(1)               #Teacher forcing with ground-truth inputs
            else:
                _,topi=decoder_output.topk(1)                               #Predicted output becomes the next input
                decoder_input=topi.squeeze(-1).detach()

            decoder_outputs.append(decoder_output)
            attention_weights.append(attn_weight)
        decoder_outputs=F.log_softmax(torch.cat(decoder_outputs,dim=1),dim=-1)
        attention_weights=torch.cat(attention_weights,dim=1)
        return decoder_outputs,attention_weights

    def forward_step(self,input,encoder_states,decoder_hidden):
        """Run one decoding step.

        Returns:
            (decoder_output, decoder_hidden, weights): scores
            (N, 1, vocab_size), the updated GRU hidden state and the
            attention weights of this step.
        """
        embedded=self.embedding(input)          #[N,1] -> [N,1,dec_hidden]
        context,weights=self.attention(query=decoder_hidden,keys=encoder_states) # -> [N,1,enc_hidden_size] ; [N,1,max_L*(enc_nlayers*directions)]
        nn_inp=torch.cat([context,embedded],dim=-1)                         # [N,1,enc_hidden_size+dec_hidden]
        decoder_output,decoder_hidden=self.gru(nn_inp,decoder_hidden)       # -> [N,1,dec_hidden] ; [dec_num_layers,N,dec_hidden]
        decoder_output=self.fcout(decoder_output)                           # [N,1,dec_hidden] -> [N,1,vocab_size]
        return decoder_output,decoder_hidden,weights

### Luong Attention Implementation

In [15]:

class EncoderLSTM(nn.Module):
    """LSTM encoder returning the top-layer hidden state of every time step.

    Args:
        vocab_size: Number of rows of the embedding table.
        input_size: Embedding width (LSTM input width).
        hidden_size: LSTM hidden width.
        num_layers: Number of stacked LSTM layers.
    """
    def __init__(self,vocab_size,input_size,hidden_size,num_layers):
        super(EncoderLSTM,self).__init__()
        self.embedding=nn.Embedding(vocab_size,input_size)
        self.lstm=nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
    def forward(self,x,state_cell=None):
        """Encode a batch of token ids.

        Args:
            x: Long tensor (N, max_L).
            state_cell: Optional initial (hidden, cell) tuple; a falsy value
                means "start from the LSTM's default zero state".

        Returns:
            (None, hiddens): hiddens has shape (N, max_L, hidden_size) and
            holds the last layer's hidden state at every time step.
        """
        N,max_L=x.shape
        initial_state=state_cell if state_cell else None
        # The LSTM output sequence is exactly the last layer's hidden state
        # at each step, so a single call replaces stepping token by token
        # and stacking state_cell[0][-1] manually.
        hiddens,_=self.lstm(self.embedding(x),initial_state)
        return None,hiddens                  # encoder_hidden_state (last layer) [N, MAX_L, encoder_hidden_dims]

# encoding_dict={
#     'enc_hidden_size':enc_hidden_size,
#     'enc_num_layers': enc_num_layers,
#     'dec_hidden_size':dec_hidden_size,
#     "dec_num_layers":dec_num_layers
#     
#     
# }

class dotAttention(nn.Module):
    """Luong 'dot' score: plain dot product of decoder and encoder states.

    The constructor argument is ignored; it exists only so all score
    variants can be built with the same call.
    """
    def __init__(self,_):
        super(dotAttention,self).__init__()

    def forward(self,dec_hidden_state,enc_hidden_states):
        """Attend over the encoder states.

        Args:
            dec_hidden_state: (N, 1, H)
            enc_hidden_states: (N, L, H)

        Returns:
            (context, attn_w): context (N, 1, H) and weights (N, 1, L).
        """
        scores=torch.bmm(enc_hidden_states,dec_hidden_state.transpose(-1,-2))   # N,L,H @ N,H,1 -> N,L,1
        attn_w=F.softmax(scores.transpose(-1,-2),dim=-1)                        # N,1,L
        context=torch.bmm(attn_w,enc_hidden_states)                             # N,1,L @ N,L,H -> N,1,H
        return context,attn_w
        
class generalDotAttention(nn.Module):
    """Luong 'general' score: dot product after a learned projection.

    Encoder states are projected from 'enc_hidden_size' to
    'dec_hidden_size' by W_a before being scored against the decoder state.

    Args:
        encoding_dict: Must provide 'enc_hidden_size' and 'dec_hidden_size'.
    """
    def __init__(self,encoding_dict):
        super(generalDotAttention,self).__init__()
        self.W_a=nn.Linear(encoding_dict['enc_hidden_size'],encoding_dict['dec_hidden_size'])

    def forward(self,dec_hidden_state,enc_hidden_states):
        """Attend over the encoder states.

        Args:
            dec_hidden_state: (N, 1, dH)
            enc_hidden_states: (N, L, eH)

        Returns:
            (context, attn_w): context (N, 1, eH) and weights (N, 1, L).
        """
        projected_enc=self.W_a(enc_hidden_states)                               # N,L,eH -> N,L,dH
        scores=torch.bmm(projected_enc,dec_hidden_state.transpose(-1,-2))       # N,L,dH @ N,dH,1 -> N,L,1
        attn_w=F.softmax(scores.transpose(-1,-2),dim=-1)                        # N,1,L
        context=torch.bmm(attn_w,enc_hidden_states)                             # N,1,L @ N,L,eH -> N,1,eH
        return context,attn_w
        
class concatAttention(nn.Module):
    """Luong 'concat' score: v_a^T tanh(W_a [h_dec ; h_enc]).

    Args:
        encoding_dict: Must provide 'enc_hidden_size' and 'dec_hidden_size'.
    """
    def __init__(self,encoding_dict):
        super(concatAttention,self).__init__()
        self.W_a=nn.Linear(encoding_dict['enc_hidden_size']+encoding_dict['dec_hidden_size'],encoding_dict['dec_hidden_size'])
        self.v_a=nn.Linear(encoding_dict['dec_hidden_size'],1)
    def forward(self,dec_hidden_state,enc_hidden_states):
        """Attend over the encoder states.

        Args:
            dec_hidden_state: (N, 1, dH)
            enc_hidden_states: (N, L, eH)

        Returns:
            (context, attn_w): context (N, 1, eH) and weights (N, 1, L).
        """
        # The debug print of the input shapes that ran on every forward
        # call has been removed.
        # Repeat the decoder state along L so it can be paired with every encoder state.
        dec_expanded=dec_hidden_state.expand(-1,enc_hidden_states.size(1),-1)  #N,1,dH -> N,L,dH
        concat_=torch.cat((dec_expanded,enc_hidden_states),dim=-1)             #N,L,dH c N,L,eH -> N,L,dH+eH
        merged=torch.tanh(self.W_a(concat_))                                   #N,L,dH+eH -> N,L,dH
        attn_w=F.softmax(self.v_a(merged).transpose(-1,-2),dim=-1)             #N,L,1 -> softmax(N,1,L)
        context=torch.bmm(attn_w,enc_hidden_states)                            #N,1,L @ N,L,eH
        return context,attn_w                     #context: N,1,eH ; attn_w:N,1,L


class LuongAttention(nn.Module):
    """Luong attention with selectable score function and global/local mode.

    Args:
        encoding_dict: Forwarded to the score module; must provide
            'dec_hidden_size' (and whatever the chosen variant reads).
        max_L_d: Optional (max_L, d) tuple for local attention: the source
            length used to scale the predicted position and the half-width
            of the attention window. Required when type_ == 'local'.
        variant: 'concat', 'general' or 'dot'.
        type_: 'local' enables predictive local attention; any other value
            uses global attention.

    Raises:
        KeyError: if `variant` is not one of the known score functions.
        ValueError: if type_ == 'local' but max_L_d is not given.
        Exception: if d is greater than max_L.

    The previous local path could not run: it read encoding_dict['max_L']
    instead of the max_L from max_L_d, fed V_p a tensor whose width was the
    batch size, and sliced the encoder states with a float tensor. It is
    re-implemented here following local-p attention from Luong et al. (2015).
    """
    def __init__(self,encoding_dict:dict,max_L_d:Optional[tuple]=None,variant:str='general',type_:Optional[str]='global'):
        super(LuongAttention,self).__init__()
        self.attn_={'concat':concatAttention,'general':generalDotAttention,'dot':dotAttention}
        self.attention=self.attn_[variant](encoding_dict)
        self.type_=type_
        if type_=='local' and not max_L_d:
            raise ValueError("local attention requires max_L_d=(max_L, d)")
        if max_L_d:
            max_L,d=max_L_d
            if d>max_L:
                raise Exception("Sorry, but local span cannot be greater than the max sequence length")
            self.seq_len=max_L
            self.d=d
            # p_t = S * sigmoid(v_p^T tanh(W_p h_t))
            self.W_p=nn.Linear(encoding_dict['dec_hidden_size'],encoding_dict['dec_hidden_size'])
            self.V_p=nn.Linear(encoding_dict['dec_hidden_size'],1)


    def local_p(self,decoder_hidden_state):
        """Predict the aligned source position for each batch element.

        Args:
            decoder_hidden_state: (N, 1, dH)

        Returns:
            Float tensor (N, 1, 1) with values in (0, seq_len).
        """
        ratio=torch.sigmoid(self.V_p(torch.tanh(self.W_p(decoder_hidden_state))))
        return self.seq_len*ratio

    def forward(self,dec_hidden_state,enc_hidden_states):
        """Compute context and attention weights.

        Args:
            dec_hidden_state: (N, 1, dH)
            enc_hidden_states: (N, L, eH)

        Returns:
            (context, attn_weights): context (N, 1, eH) and weights
            (N, 1, L). In local mode the weights are zero outside the
            window [p - d, p + d] and scaled by a Gaussian centred on p.
        """
        context,attn_weights=self.attention(dec_hidden_state,enc_hidden_states)
        if self.type_!='local':
            return context,attn_weights

        p=self.local_p(dec_hidden_state)                                            # N,1,1
        positions=torch.arange(enc_hidden_states.size(1),device=enc_hidden_states.device,dtype=p.dtype).view(1,1,-1)
        offset=positions-p                                                          # N,1,L
        # Restrict the softmax to the window by masking and renormalising;
        # a mask (rather than a slice) lets every batch element have its own p.
        window=(offset.abs()<=self.d).to(attn_weights.dtype)
        local_weights=attn_weights*window
        local_weights=local_weights/local_weights.sum(dim=-1,keepdim=True).clamp_min(1e-12)
        # Gaussian favouring positions near p, with sigma = d/2 as in the paper.
        sigma=max(self.d/2,1e-6)
        local_weights=local_weights*torch.exp(-offset.pow(2)/(2*sigma**2))
        context=torch.bmm(local_weights,enc_hidden_states)                          # N,1,L @ N,L,eH -> N,1,eH
        return context,local_weights
    

class LuongDecoder(nn.Module):
    """LSTM decoder with Luong attention.

    Per step: the input embedding goes through the LSTM, the top-layer
    hidden state h_t attends over the encoder states, and the attentional
    state tanh(W_c [c_t ; h_t]) is projected to vocabulary scores.

    Args:
        vocab_size: Output vocabulary size (embedding rows and output classes).
        hidden_size: Decoder embedding and hidden width.
        num_layers: Number of stacked LSTM layers.
        attn_variant: Score function passed to LuongAttention.
        attn_type: Attention type passed to LuongAttention.
        encoding_dict: Must provide 'enc_hidden_size'. A copy extended with
            'dec_hidden_size' and 'dec_num_layers' is used internally; the
            caller's dict is left untouched.
    """
    def __init__(self,vocab_size,hidden_size,num_layers,attn_variant,attn_type,encoding_dict):
        super(LuongDecoder,self).__init__()
        # Copy first so adding the decoder keys does not mutate the caller's dict.
        encoding_dict=dict(encoding_dict)
        encoding_dict['dec_hidden_size']=hidden_size
        encoding_dict['dec_num_layers']=num_layers
        self.embedding=nn.Embedding(vocab_size,hidden_size)
        self.lstm=nn.LSTM(input_size=hidden_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
        self.attention=LuongAttention(encoding_dict,variant=attn_variant,type_=attn_type)
        self.W_c=nn.Linear(encoding_dict["enc_hidden_size"]+encoding_dict["dec_hidden_size"],hidden_size)
        self.fcout=nn.Linear(hidden_size,vocab_size)

    def forward(self,enc_hidden_states,target_tensor=None):
        """Decode one token per encoder time step.

        Args:
            enc_hidden_states: (N, max_L, eH)
            target_tensor: Optional (N, >=max_L) targets. When given, step
                i+1 is fed target_tensor[:, i] (teacher forcing); otherwise
                the arg-max prediction of the previous step is fed.

        Returns:
            (dec_outputs, attn_weights): log-softmax scores
            (N, max_L, vocab_size) and the per-step attention weights
            concatenated along dim 1.
        """
        N,max_L=enc_hidden_states.shape[:2]
        # Allocate on the encoder's device rather than the module-level `device`.
        decoder_input=torch.empty((N,1),dtype=torch.long,device=enc_hidden_states.device).fill_(SOS_token)
        dec_hidden_cell=None
        dec_outputs,attn_weights=[],[]
        for i in range(max_L):
            decoder_output,dec_hidden_cell,attn_w=self.forward_step(decoder_input,enc_hidden_states,dec_hidden_cell)
            # `is None`: comparing a tensor with == is not an identity test.
            if target_tensor is None:
                _,topi=decoder_output.topk(1)                               # N,1,vocab_size -> N,1,1
                decoder_input=topi.squeeze(-1).detach()                     # N,1,1 -> N,1
            else:
                decoder_input=target_tensor[:,i].unsqueeze(1)               # N -> N,1
            dec_outputs.append(decoder_output)
            attn_weights.append(attn_w)
        dec_outputs=F.log_softmax(torch.cat(dec_outputs,dim=1),dim=-1)
        attn_weights=torch.cat(attn_weights,dim=1)
        return dec_outputs,attn_weights


    def forward_step(self,input,enc_hidden_states,dec_hidden_cell=None):
        """Run one decoding step.

        Args:
            input: Long tensor (N, 1) with the current input token ids.
            enc_hidden_states: (N, max_L, eH)
            dec_hidden_cell: Previous (hidden, cell) tuple, or None for the
                LSTM's default zero state.

        Returns:
            (decoder_output, dec_hidden_cell, attn_w): scores
            (N, 1, vocab_size), the updated LSTM state and the attention
            weights of this step.
        """
        embedded=self.embedding(input)
        # nn.LSTM treats a None state as "start from zeros".
        _,dec_hidden_cell=self.lstm(embedded,dec_hidden_cell)
        dec_hidden_state=dec_hidden_cell[0][-1].unsqueeze(1)      #(hidden,cell) -> hidden: Nlayers,N,dH -> N,dH ->  N,1,dH
        context,attn_w=self.attention(dec_hidden_state,enc_hidden_states)
        # Luong et al. combine the context with the decoder state h_t; the
        # previous code concatenated the raw input embedding instead, so the
        # LSTM output reached the prediction only through the attention weights.
        context_cat=torch.cat([context,dec_hidden_state],dim=-1)    # N,1,eH ; N,1,dH -> N,1,eH+dH
        decoder_output=torch.tanh(self.W_c(context_cat))            # N,1,eH+dH -> N,1,dH
        decoder_output=self.fcout(decoder_output)                   # N,1,dH -> N,1,vocab_size
        return decoder_output,dec_hidden_cell,attn_w

        

            

        

## Training and Plotting Utils

In [None]:
def showPlot(points):
    """Plot a sequence of loss values against their index.

    Args:
        points: Sequence of numbers; element i is drawn at x = i.

    The y axis gets a major tick every 0.2 units.
    """
    # plt.subplots() already creates the figure; the extra plt.figure()
    # call only left an empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Values go on the y axis (x defaults to the index). The previous call
    # plot(points, range(len(points))) put the losses on the x axis.
    ax.plot(points)
    plt.show()


def eval_train(encoder,decoder,data_loader,accuracy):
    """Return the mean per-sample accuracy of the model over a data loader.

    Both modules are switched to eval mode for the duration of the call and
    back to train mode afterwards, even if evaluation raises.

    Args:
        encoder: Callable module returning (_, encoder_hidden) for a batch.
        decoder: Callable module returning (preds, attn_weights) for the
            encoder hidden states (called without targets).
        data_loader: Iterable of (input, targets) batches.
        accuracy: Callable scoring one (pred, target) sample; its result
            must be convertible with float().

    Returns:
        float: sum of per-sample scores divided by the number of samples,
        or 0.0 when the loader yields no samples.
    """
    encoder.eval(),decoder.eval()
    accuracy_=0.0
    n_eval=0
    try:
        with torch.no_grad():
            for input_,targets in data_loader:
                _,encoder_hidden=encoder(input_)
                preds,attn_weights=decoder(encoder_hidden)
                for pred,target in zip(preds,targets):
                    accuracy_=accuracy_+accuracy(pred,target)
                    n_eval+=1
    finally:
        # Always restore training mode so a failed evaluation does not leave
        # dropout and similar layers disabled for subsequent training.
        encoder.train(),decoder.train()
    if n_eval==0:
        # Empty loader: avoid dividing by zero.
        return 0.0
    return float(accuracy_)/n_eval


def train_bahdanau_luong_epoch(encoder,decoder,dataloader,criterion,encoder_optimizer,decoder_optimizer):
    """Train encoder and decoder for one pass over `dataloader`.

    For every (x, y) batch: gradients are cleared, the encoder's second
    return value is decoded with teacher forcing on y, the loss is computed
    on the flattened predictions and targets, and both optimizers step.

    Returns:
        Mean batch loss (sum of loss.item() divided by len(dataloader)).
    """
    optimizers=(encoder_optimizer,decoder_optimizer)
    running_loss=0
    for x,y in tqdm(dataloader):
        for optimizer in optimizers:
            optimizer.zero_grad()

        _,encoder_hidden=encoder(x)
        decoder_outputs,_=decoder(encoder_hidden,target_tensor=y)

        # Flatten to (N*L, vocab) scores against (N*L,) targets.
        n_classes=decoder_outputs.size(-1)
        flat_scores=decoder_outputs.reshape(-1,n_classes)
        loss=criterion(flat_scores,y.view(-1))
        loss.backward()

        for optimizer in optimizers:
            optimizer.step()

        running_loss+=loss.item()
    return running_loss / len(dataloader)

def train_bahdanau_luong(epochs,encoder,decoder,train_dataloader,test_dataloader,lr,print_every,plot_every):
    """Train an attention encoder/decoder pair and plot the loss curve.

    Creates one Adam optimizer per module, an NLLLoss criterion and a
    top-2 multiclass Accuracy metric sized from decoder.fcout. After every
    epoch the model is evaluated on `test_dataloader`.

    Args:
        epochs: Number of epochs to run.
        encoder, decoder: Modules to train; the decoder must expose `fcout`.
        train_dataloader, test_dataloader: Batches of (input, target).
        lr: Learning rate of both optimizers.
        print_every: Print the averaged loss and latest validation accuracy
            every this many epochs.
        plot_every: Record one averaged loss point every this many epochs.

    Returns:
        None; the recorded points are passed to showPlot.
    """
    decoder_optimizer=torch.optim.Adam(params=decoder.parameters(),lr=lr)
    encoder_optimizer=torch.optim.Adam(params=encoder.parameters(),lr=lr)
    criterion=nn.NLLLoss()
    accuracy=Accuracy(task="multiclass",num_classes=decoder.fcout.out_features,top_k=2).to(device)

    plot_losses,print_losses=[],[]
    plot_loss_total=print_loss_total=0

    for epoch in range(1,epochs+1):
        avg_loss=train_bahdanau_luong_epoch(encoder,decoder,train_dataloader,criterion,encoder_optimizer,decoder_optimizer)
        val_acc=eval_train(encoder,decoder,test_dataloader,accuracy)
        plot_loss_total+=avg_loss
        print_loss_total+=avg_loss

        if epoch%print_every==0:
            mean_print_loss=print_loss_total/print_every
            print_losses.append(mean_print_loss)
            print(f"Epoch {epoch} / {epochs} :  Loss {mean_print_loss}   |   Validation Accuracy {val_acc}")
            print_loss_total=0

        if epoch%plot_every==0:
            plot_losses.append(plot_loss_total/plot_every)
            plot_loss_total=0

    showPlot(plot_losses)
    

## Testing 

In [77]:
# Luong setup: a 2-layer LSTM encoder and a 1-layer LSTM decoder with
# global 'general' attention, both with hidden width 512.
enc_hidden_size=512
enc_num_layers=2
# The decoder extends this dict with its own 'dec_hidden_size' / 'dec_num_layers'.
encoding_dict={'enc_hidden_size':enc_hidden_size,"enc_num_layers":enc_num_layers}
# Batch size 25; get_dataloader always prepares the data reversed, so 'fra'
# is the input language. This rebinds the module-level input_lang / output_lang.
input_lang,output_lang,train_data,test_data=get_dataloader('eng','fra',25)
enlstm=EncoderLSTM(vocab_size=input_lang.n_words,input_size=256,hidden_size=enc_hidden_size,num_layers=enc_num_layers).to(device)
dclstm=LuongDecoder(vocab_size=output_lang.n_words,hidden_size=512,num_layers=1,attn_variant='general',attn_type='global',encoding_dict=encoding_dict).to(device)

Reading lines...
Read 135842 sentence pairs
Counting words...
Counted words:
fra 5173
eng 3388


In [82]:
# NOTE(review): `enbahdanau` / `decbahdanau` are created in the cell that
# appears after this one (execution count 81 vs. 82 here), so this cell
# fails on a fresh top-to-bottom run — move that cell above this one.
# tqdm is already imported in the first cell; this re-import is redundant.
from tqdm import tqdm
train_bahdanau_luong(epochs=10,encoder=enbahdanau,decoder=decbahdanau,train_dataloader=train_data,test_dataloader=test_data,lr=0.002,print_every=1,plot_every=1)

100%|██████████| 498/498 [00:38<00:00, 12.92it/s]


Epoch 1 / 100 :  Loss 0.6604812208667816
Epoch 1 / 100 :  Val Accuracy 0.8900459605008135


100%|██████████| 498/498 [00:38<00:00, 12.87it/s]


Epoch 2 / 100 :  Loss 0.3170104822719911
Epoch 2 / 100 :  Val Accuracy 0.9072993747308097


100%|██████████| 498/498 [00:38<00:00, 12.86it/s]


Epoch 3 / 100 :  Loss 0.17152406062826095
Epoch 3 / 100 :  Val Accuracy 0.9281772303909839


  5%|▌         | 26/498 [00:02<00:37, 12.70it/s]


KeyboardInterrupt: 

In [81]:
# Bahdanau setup: a 1-layer bidirectional GRU encoder and a 1-layer GRU
# decoder, both with hidden width 512.
enc_hidden_size=512
# NOTE(review): recorded in encoding_dict only; the encoder below is built
# with num_layers=1 and nothing visible reads 'enc_num_layers'.
enc_num_layers=2
# Fall back to the CPU when CUDA is unavailable instead of hard-coding 'cuda',
# matching the device selection in the first cell.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoding_dict={'enc_hidden_size':enc_hidden_size,"enc_num_layers":enc_num_layers,'directions':2}
enbahdanau=EncoderBiRNN(vocab_size=input_lang.n_words,num_layers=1,input_size=256,hidden_size=512).to(device)
decbahdanau=BahdanauDecoder(vocab_size=output_lang.n_words,hidden_size=512,num_layers=1,encoding_dict=encoding_dict).to(device)



In [108]:
# Inspect the fifth filtered pair (index 4 of the returned `pairs` list).
# NOTE(review): the recorded output below includes a "Trimmed to ..." line
# that the prepareData shown in this notebook does not print, so the output
# presumably comes from an earlier version of that function — re-run to refresh.
prepareData('eng','fra',reverse=True)[2][4]

Reading lines...
Read 135842 sentence pairs
Trimmed to 13056 sentence pairs
Counting words...
Counted words:
fra 5161
eng 3378


['je suis en forme', 'i m fit']

In [133]:
# Target are to be padded
# nn.CTCLoss demo in three target layouts. In every case the input must hold
# log-probabilities normalised over the class dimension (the last one), so
# log_softmax is applied with dim=-1 throughout; the first and third
# examples previously used dim=-2, which normalised over the batch and time
# axes respectively.

# Targets are to be padded
T = 50      # Input sequence length
C = 20      # Number of classes (including blank)
N = 16      # Batch size
S = 30      # Target sequence length of longest target in batch (padding length)
S_min = 10  # Minimum target length, for demonstration purposes
# Initialize random batch of input vectors, for *size = (T,N,C)
input = torch.randn(T, N, C).log_softmax(-1).detach().requires_grad_()
# Initialize random batch of targets (0 = blank, 1:C = classes)
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
ctc_loss = nn.CTCLoss()
loss = ctc_loss(input, target, input_lengths, target_lengths)
loss.backward()
# Targets are to be un-padded
T = 50      # Input sequence length
C = 20      # Number of classes (including blank)
N = 16      # Batch size
# Initialize random batch of input vectors, for *size = (T,N,C)
input = torch.randn(T, N, C).log_softmax(-1).detach().requires_grad_()
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
# Initialize random batch of targets (0 = blank, 1:C = classes)
target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.long)
ctc_loss = nn.CTCLoss()
loss = ctc_loss(input, target, input_lengths, target_lengths)
loss.backward()
# Targets are to be un-padded and unbatched (effectively N=1)
T = 50      # Input sequence length
C = 20      # Number of classes (including blank)
# Initialize random batch of input vectors, for *size = (T,C)
input = torch.randn(T, C).log_softmax(-1).detach().requires_grad_()
input_lengths = torch.tensor(T, dtype=torch.long)
# Initialize random batch of targets (0 = blank, 1:C = classes)
target_lengths = torch.randint(low=1, high=T, size=(), dtype=torch.long)
target = torch.randint(low=1, high=C, size=(target_lengths,), dtype=torch.long)
ctc_loss = nn.CTCLoss()
loss = ctc_loss(input, target, input_lengths, target_lengths)
loss.backward()

In [135]:
# Scratch check: N random integers in [1, T), the same call used for
# target_lengths in the un-padded CTC example. Relies on T and N left over
# from the previous cell.
torch.randint(low=1, high=T, size=(N,), dtype=torch.long)


tensor([ 6, 37, 34, 48, 16, 40, 21, 37,  2,  1, 29, 31, 40, 47, 27, 38])