In [None]:
import re
import torch
import string
import random
import unicodedata
import pandas as pd
from io import open
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Every tensor and model created below is placed on this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Reserved vocabulary indices shared by both languages.
# SOS_token is the first input fed to the decoder; EOS_token is appended to
# every sentence tensor and stops decoding once it is predicted.
SOS_token = 0
EOS_token = 1

In [None]:
class Lang:
    """Vocabulary of one language: word<->index lookup tables plus word counts.

    The 'SOS' and 'EOS' markers are registered up front with the indices
    SOS_token and EOS_token; every other word receives the next free index in
    first-seen order.

    Attributes:
        name:    label of the language (e.g. 'eng').
        W2I:     dict mapping word -> index.
        I2W:     dict mapping index -> word.
        W2C:     dict mapping word -> number of times it was added.
        n_words: vocabulary size, i.e. the next index to hand out.
    """

    def __init__(self,name):
        self.name = name
        self.W2I = {'SOS':SOS_token,'EOS':EOS_token}
        self.I2W = {SOS_token:'SOS',EOS_token:'EOS'}
        self.W2C = {}
        self.n_words = 2  # the two reserved markers are already counted

    def addSentence(self,s):
        """Add every whitespace-separated word of sentence `s` to the vocabulary."""
        # split() without an argument drops empty tokens, so repeated, leading
        # or trailing spaces no longer register '' as a vocabulary word. This
        # also matches the tokenisation used when sentences are turned into
        # index lists.
        for word in s.split():
            self.addWord(word)

    def addWord(self,w):
        """Register word `w` (assigning a new index if unseen) and bump its count."""
        if w not in self.W2I:
            self.W2I[w] = self.n_words
            self.I2W[self.n_words] = w
            self.n_words+=1
        # .get() keeps this safe for the reserved 'SOS'/'EOS' entries, which
        # exist in W2I from the start but have no count yet.
        self.W2C[w] = self.W2C.get(w,0)+1

    def printAllWords(self):
        """Print every known word, one per line, in insertion order."""
        for word in list(self.W2I.keys()):
            print(word)

In [None]:
# L = Lang('Eng')
# L.addWord('NLP')
# L.addSentence('How are you today')
# L.printAllWords()

SOS
EOS
NLP
How
are
you
today


In [None]:
def u2a(s):
    """Strip diacritics from `s`.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' (non-spacing mark) is dropped.
    """
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) != 'Mn')

def normalizeString(s):
    """Lower-case, strip accents, and tokenise punctuation in `s`.

    Steps:
      1. lower-case, trim, and remove diacritics via u2a;
      2. put a space in front of '.', '!' and '?' so each becomes its own
         whitespace-separated token;
      3. collapse every run of characters other than letters and '.', '!', '?'
         into a single space;
      4. trim, so a trailing non-letter run does not leave a dangling space.

    Returns the normalised string.
    """
    s = u2a(s.lower().strip())
    # The replacement must contain the leading space; r'\1' alone substitutes
    # the punctuation with itself and leaves it glued to the preceding word.
    s = re.sub(r'([.!?])',r' \1',s)
    s = re.sub(r'[^a-zA-Z.!?]+',r' ',s)
    return s.strip()

def prepareData(I,O,P):
    """Populate both vocabularies from the sentence pairs and measure the longest sentence.

    Args:
        I: vocabulary object fed with the first sentence of every pair.
        O: vocabulary object fed with the second sentence of every pair.
        P: iterable of pairs; element 0 and element 1 of each pair are strings.

    Returns:
        (I, O, longest) where `longest` is the largest whitespace-separated
        word count found in any sentence on either side (0 when P is empty).
        The count covers words only.
    """
    longest = 0
    for entry in P:
        I.addSentence(entry[0])
        O.addSentence(entry[1])
        source_words = len(entry[0].split())
        target_words = len(entry[1].split())
        if source_words > longest:
            longest = source_words
        if target_words > longest:
            longest = target_words
    return I,O,longest

def readLangs(path='/content/drive/MyDrive/ENG_to_FRENCH.txt'):
    """Read the tab-separated parallel corpus and return empty vocabularies plus the pairs.

    Args:
        path: location of the UTF-8 text file; each line holds the sentences of
            one pair separated by tabs. Defaults to the original Drive path.

    Returns:
        (input_lang, output_lang, pairs): two freshly created Lang objects
        named 'eng' and 'fra' (not yet populated), and a list with one entry
        per line, each entry being the list of normalised tab-separated fields.
    """
    # The context manager closes the file handle as soon as the text is read.
    with open(path,encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    input_lang = Lang('eng')
    output_lang = Lang('fra')
    return input_lang, output_lang,pairs

# Load the corpus: I and O start as empty vocabularies, P is the list of
# normalised sentence pairs (P is also sampled from later during training).
I,O,P = readLangs()
# Fill both vocabularies from P and record the longest sentence length in words.
input_lang,output_lang,MAX_LENGTH = prepareData(I,O,P)

In [None]:
# Corpus summary. MAX_LENGTH counts whitespace-separated words only; it does
# not include the EOS token that tensorFromSentence appends.
# n_words includes the two reserved SOS/EOS entries.
print("The Maximum Length of the Sequence: ", MAX_LENGTH)
print("The Vocabulary Size of English Language: ",input_lang.n_words)
print("The Vocabulary Size of French Language: ",output_lang.n_words)
# output_lang.printAllWords()

The Maximum Length of the Sequence:  59
The Vocabulary Size of English Language:  20753
The Vocabulary Size of French Language:  29481


In [None]:
class EncoderRNN(nn.Module):
    """Token-at-a-time encoder: an embedding followed by a bidirectional GRU.

    Args:
        vocabSize:   number of rows in the embedding table.
        hidden_size: embedding width and GRU hidden width (per direction).
    """

    def __init__(self,vocabSize,hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.E = nn.Embedding(vocabSize,hidden_size)
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self,input,hidden):
        """Encode one token.

        The embedding is reshaped to (1, 1, -1) before entering the GRU, so
        each call processes a single step. Returns the GRU's (output, hidden).
        """
        step_input = self.E(input).view(1,1,-1)
        return self.gru(step_input,hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (2, 1, hidden_size)."""
        shape = (2,1,self.hidden_size)
        return torch.zeros(shape,device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over a fixed-size window of encoder outputs.

    Args:
        hidden_size: embedding width and GRU hidden width.
        vocabSize:   size of the output vocabulary (embedding rows and logits).
        max_length:  number of encoder positions the attention layer scores;
            defaults to the module-level MAX_LENGTH at class-definition time.
    """

    def __init__(self,hidden_size,vocabSize,max_length = MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = vocabSize
        self.max_length = max_length
        self.E = nn.Embedding(self.output_size,self.hidden_size)
        # Attention scores are computed from [embedding ; hidden] (2 * hidden_size).
        self.attn = nn.Linear(2*self.hidden_size,self.max_length)
        # Input is [embedding ; attended context] (hidden_size + 2 * hidden_size).
        self.attn_combine = nn.Linear(3*self.hidden_size,self.hidden_size)
        self.gru = nn.GRU(self.hidden_size,self.hidden_size)
        self.out = nn.Linear(self.hidden_size,self.output_size)

    def forward(self,input,hidden,encoder_outputs):
        """Decode one step.

        Returns (log_probs, hidden, attn_w): log-softmax scores over the
        output vocabulary, the updated GRU hidden state, and the softmax
        attention weights over the max_length encoder positions.
        """
        embedded = self.E(input).view(1,1,-1)

        # Score every encoder position from the current embedding and hidden state.
        query = torch.cat((embedded[0],hidden[0]),dim=1)
        attn_w = F.softmax(self.attn(query),dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_w.unsqueeze(0),encoder_outputs.unsqueeze(0))

        merged = torch.cat((embedded[0],context[0]),dim=1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_output,hidden = self.gru(gru_input,hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]),dim=1)
        return log_probs,hidden,attn_w

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size)."""
        shape = (1,1,self.hidden_size)
        return torch.zeros(shape,device=device)

In [None]:
def indexesFromSentence(lang,s):
    """Map each whitespace-separated word of `s` to its index in `lang.W2I`.

    Raises KeyError if a word is missing from the vocabulary.
    """
    lookup = lang.W2I
    indices = []
    for token in s.split():
        indices.append(lookup[token])
    return indices

def tensorFromSentence(lang,s):
    """Convert sentence `s` into a column tensor of word indices terminated by EOS.

    Returns a long tensor of shape (number_of_words + 1, 1) on `device`.
    """
    token_ids = indexesFromSentence(lang,s) + [EOS_token]
    flat = torch.tensor(token_ids,dtype=torch.long,device=device)
    return flat.view(-1,1)

def tensorsFromPair(pair):
    """Build the (input, target) index tensors for one sentence pair.

    pair[0] is encoded with the input vocabulary and pair[1] with the output
    vocabulary.
    """
    return (
        tensorFromSentence(input_lang,pair[0]),
        tensorFromSentence(output_lang,pair[1]),
    )

In [None]:
def train(input_tensor,target_tensor,encoder,decoder,
         encoder_optimizer,decoder_optimizer,loss_fn,
         max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    The input is encoded token by token, then the decoder is unrolled starting
    from SOS, always feeding back its own top-1 prediction, until the target
    is exhausted or EOS is predicted.

    Args:
        input_tensor:  column tensor of input word indices (EOS-terminated).
        target_tensor: column tensor of target word indices (EOS-terminated).
        encoder, decoder: the two networks being trained.
        encoder_optimizer, decoder_optimizer: their optimisers.
        loss_fn: criterion applied to (decoder log-probabilities, target index).
        max_length: number of encoder positions kept for attention; must match
            the decoder's attention width.

    Returns:
        The loss averaged over the decoding steps that were actually scored
        (0.0 for an empty target).
    """
    target_length= target_tensor.size(0)
    if target_length == 0:
        # Nothing to score: avoid calling backward() on a plain int.
        return 0.0

    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Input tensors carry an extra EOS token, so the longest sentence can be
    # one position longer than max_length. Clamp instead of indexing past the
    # end of encoder_outputs.
    input_length = min(input_tensor.size(0),max_length)
    encoder_outputs = torch.zeros(max_length,2*encoder.hidden_size,device=device)
    loss = 0
    for ei in range(input_length):
        encoder_output,encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
        # encoder_output is (1, 1, 2*hidden): forward and backward halves
        # already sit side by side, so the row can be stored directly.
        encoder_outputs[ei] = encoder_output[0,0]

    decoder_input = torch.tensor([[SOS_token]],device=device)
    # Seed the decoder with the forward-direction hidden state, shape (1, 1, hidden).
    decoder_hidden = encoder_hidden[:1]

    steps = 0
    for di in range(target_length):
        decoder_output,decoder_hidden,_ = decoder(
            decoder_input,decoder_hidden,encoder_outputs)
        loss+=loss_fn(decoder_output,target_tensor[di])
        steps+=1
        # Feed the model's own best guess back in as the next input.
        _,topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    # Average over the steps that contributed to the loss; dividing by the
    # full target length would under-report it whenever decoding stops early.
    return loss.item()/steps

In [None]:
def trainIters(encoder,decoder,n_iters,lr=0.001,print_every=1000):
    """Train the encoder/decoder on `n_iters` randomly sampled sentence pairs.

    Args:
        encoder, decoder: networks to optimise (each gets its own SGD optimiser).
        n_iters: number of single-pair training steps to run.
        lr: SGD learning rate for both optimisers.
        print_every: report the running average loss every this many steps
            (and always on the last step). Values below 1 are treated as 1.

    The pairs are drawn with replacement from the module-level list P and the
    criterion is NLLLoss.
    """
    print_every = max(1,int(print_every))
    totalLoss = 0
    encoder_optimizer = optim.SGD(encoder.parameters(),lr=lr)
    decoder_optimizer = optim.SGD(decoder.parameters(),lr=lr)
    training_pairs = [tensorsFromPair(random.choice(P)) for _ in range(n_iters)]
    loss_fn = nn.NLLLoss()
    for step,(input_tensor,target_tensor) in enumerate(training_pairs,start=1):
        totalLoss+=train(input_tensor,target_tensor,encoder,decoder,
                         encoder_optimizer,decoder_optimizer,loss_fn)
        # The printed value is the mean loss over all steps so far, not a
        # total; printing only periodically keeps the cell output readable.
        if step % print_every == 0 or step == n_iters:
            print("The Average Loss after Iteration Number",step, "is :",totalLoss/step)

In [None]:
# Width shared by the embeddings and GRU hidden states of both networks.
hidden_size = 128
# Encoder reads the input vocabulary, decoder emits the output vocabulary.
encoder = EncoderRNN(input_lang.n_words,hidden_size).to(device)
decoder = DecoderRNN(hidden_size,output_lang.n_words).to(device)
# 50000 single-pair training steps with the default learning rate.
trainIters(encoder,decoder,50000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The Total Loss of Iteration Number 45001 is : 3.965365466202088
The Total Loss of Iteration Number 45002 is : 3.965336482670357
The Total Loss of Iteration Number 45003 is : 3.9653796201634948
The Total Loss of Iteration Number 45004 is : 3.9653464126596836
The Total Loss of Iteration Number 45005 is : 3.9653280872672325
The Total Loss of Iteration Number 45006 is : 3.9653246247859224
The Total Loss of Iteration Number 45007 is : 3.965296539092481
The Total Loss of Iteration Number 45008 is : 3.965323448961112
The Total Loss of Iteration Number 45009 is : 3.9653699314252795
The Total Loss of Iteration Number 45010 is : 3.9653186774854454
The Total Loss of Iteration Number 45011 is : 3.965302994586932
The Total Loss of Iteration Number 45012 is : 3.965275793997312
The Total Loss of Iteration Number 45013 is : 3.965233570979726
The Total Loss of Iteration Number 45014 is : 3.965225745353433
The Total Loss of Iteration Numbe