#Chapter 26

Translation

The final chapter

In [1]:
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self,name):
        self.name = name
        self.W2I = {'SOS':SOS_token,'EOS':EOS_token}
        self.I2W = {SOS_token:'SOS',EOS_token:'EOS'}
        self.W2C = {}
        self.n_words = 2
    def addSentence(self,s):
        for word in s.split(' '):
            self.addWord(word)
    def addWord(self,w):
        if w not in self.W2I:
            self.W2I[w] = self.n_words
            self.W2C[w] = 1
            self.I2W[self.n_words] = w
            self.n_words+=1
        else:
            self.W2C[w]+=1
    def printAllWords(self):
        words = list(self.W2I.keys())
        for word in words:
            print(word)

L = Lang('Eng')
L.addWord('NLP')
L.addSentence('How are you today')
L.printAllWords()

SOS
EOS
NLP
How
are
you
today


In [3]:
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r'([.!?])',r'\1',s)
    s = re.sub(r'[^a-zA-Z.!?]+',r' ',s)
    return s

print(normalizeString('asfdieojo98793259'))

#read file
def readLangs():
    lines = open('../data/eng-fra.txt',encoding='utf-8').read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    input_lang = Lang('eng')
    output_lang = Lang('fra')
    return input_lang, output_lang,pairs

#read in file and build pairs
I,O,P = readLangs()


#video26_5

def prepareData(I,O,P):
    MAX_LENGTH = 0
    for pair in P:
        I.addSentence(pair[0])
        O.addSentence(pair[1])
        MAX_LENGTH = max(MAX_LENGTH,len(pair[0].split()),len(pair[1].split()))
    return I,O,MAX_LENGTH

input_lang,output_lang,MAX_LENGTH = prepareData(I,O,P)

print(MAX_LENGTH,input_lang.n_words,output_lang.n_words)

#output_lang.printAllWords()
print(random.choice(P))
pairs = P



class EncoderRNN(nn.Module):
    def __init__(self,vocabSize,hidden_size):
        super(EncoderRNN,self).__init__()
        self.hidden_size = hidden_size
        self.E = nn.Embedding(vocabSize,hidden_size)
        self.gru = nn.GRU(hidden_size,hidden_size,
                          batch_first=True,bidirectional=True)
    def forward(self,input,hidden):
        emb = self.E(input).view(1,1,-1)
        output,hidden = self.gru(emb,hidden)
        return output,hidden
    def initHidden(self):
        return torch.zeros(2,1,self.hidden_size,device=device)

class DecoderRNN(nn.Module):
    def __init__(self,hidden_size,vocabSize,max_length = MAX_LENGTH):
        super(DecoderRNN,self).__init__()
        self.hidden_size = hidden_size
        self.output_size = vocabSize
        self.max_length = max_length
        self.E = nn.Embedding(self.output_size,self.hidden_size)
        self.attn = nn.Linear(self.hidden_size*2,self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size*3,self.hidden_size)
        self.gru = nn.GRU(self.hidden_size,self.hidden_size)
        self.out = nn.Linear(self.hidden_size,self.output_size)
    
    def forward(self,input,hidden,encoder_outputs):
        emb = self.E(input).view(1,1,-1)
        attn_w = F.softmax(self.attn(torch.cat((emb[0],hidden[0]),1)),dim=1)
        attn_A = torch.bmm(attn_w.unsqueeze(0),
                          encoder_outputs.unsqueeze(0))
        
        output = torch.cat((emb[0],attn_A[0]),1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output,hidden = self.gru(output,hidden)
        output = F.log_softmax(self.out(output[0]),dim=1)
        return output,hidden,attn_w
    def initHidden(self):
        return torch.zeros(1,1,self.hidden_size,device=device)

def indexesFromSentence(lang,s):
    return[lang.W2I[w] for w in s.split()]

def tensorFromSentence(lang,s):
    idx = indexesFromSentence(lang,s)
    idx.append(EOS_token)
    return torch.tensor(idx,dtype=torch.long,device=device).view(-1,1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang,pair[0])
    output_tensor = tensorFromSentence(output_lang,pair[1])
    return (input_tensor,output_tensor)

def train(input_tensor,target_tensor,encoder,decoder,
         encoder_optimizer,decoder_optimizer,loss_fn,
         max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length= target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length,2*encoder.hidden_size,device=device)
    loss = 0
    for ei in range(input_length):
        encoder_output,encoder_hidden = encoder(
            input_tensor[ei],encoder_hidden)
        out_reshaped = encoder_output.view(1,1,2,encoder.hidden_size)
        out_fwd = out_reshaped[:,:,0,:]
        out_bck = out_reshaped[:,:,1,:]
        encoder_outputs[ei] = torch.cat((out_fwd[0,0],out_bck[0,0]),0)
    decoder_input = torch.tensor([[SOS_token]],device=device)
    h_reshaped = encoder_hidden.view(1,2,1,encoder.hidden_size)
    decoder_hidden = h_reshaped[:,0,:,:]
    
    for di in range(target_length):
        decoder_output,decoder_hidden,decoder_attention = decoder(
            decoder_input,decoder_hidden,encoder_outputs
        )
        topv,topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        loss+=loss_fn(decoder_output,target_tensor[di])
        if decoder_input.item() == EOS_token:
            break
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()/target_length

def trainIters(encoder,decoder,n_iters,lr=0.001):
    totalLoss = 0
    encoder_optimizer = optim.SGD(encoder.parameters(),lr=lr)
    decoder_optimizer = optim.SGD(decoder.parameters(),lr=lr)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                     for i in range(n_iters)]
    loss_fn = nn.NLLLoss()
    for iter in range(n_iters):
        training_pair = training_pairs[iter]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        loss = train(input_tensor,target_tensor,encoder,decoder,
                        encoder_optimizer,decoder_optimizer,loss_fn
                    )
        totalLoss+=loss
        print(totalLoss/(iter+1))
    
hidden_size = 128
encoder = EncoderRNN(input_lang.n_words,hidden_size).to(device)
decoder = DecoderRNN(hidden_size,output_lang.n_words).to(device)
trainIters(encoder,decoder,100)

def evaluate(encoder,decoder,s,max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang,s)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_length,2*encoder.hidden_size,device=device)
        
        for ei in range(input_length):
            encoder_output,encoder_hidden = encoder(
                input_tensor[ei],encoder_hidden)
            out_reshaped = encoder_output.view(1,1,2,encoder.hidden_size)
            out_fwd = out_reshaped[:,:,0,:]
            out_bck = out_reshaped[:,:,1,:]
            encoder_outputs[ei] = torch.cat((out_fwd[0,0],out_bck[0,0]),0)
        decoder_input = torch.tensor([[SOS_token]],device=device)
        h_reshaped = encoder_hidden.view(1,2,1,encoder.hidden_size)
        decoder_hidden = h_reshaped[:,0,:,:]
        
        decoded_words = []
        decoder_att = torch.zeros(max_length,max_length)
        
        for di in range(max_length):
            decoder_output,decoder_hidden,decoder_attention = decoder(
                decoder_input,decoder_hidden,encoder_outputs
            )
            decoder_att[di] = decoder_attention.data
            topv,topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.I2W[topi.item()])
            decoder_input = topi.squeeze().detach()
        return decoded_words

asfdieojo 
59 20753 29481
['the young guy wants to drink.', 'le jeune homme veut boire.']


    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    


10.267714182535807
10.299850781758625
10.299460559421115
10.313449827829997
10.316380938575382
10.310496006314716
10.310403727836349
10.308142578034174
10.303793020097036
10.30086672101702
10.298527592910832
10.298486091977074
10.29776371547154
10.293174478959063
10.29390272231329
10.289335240636552
10.285893972915092
10.287917304669737
10.285961242905234
10.287569214957099
10.28461181744426
10.28210162373332
10.28239305863469
10.28013473465329
10.280115512666246
10.2805825455867
10.281644534111582
10.28085302874074
10.279765979931792
10.281262133058416
10.2803134407438
10.278222553975997
10.275736368591248
10.27371424604642
10.272897722279255
10.27180066120936
10.271480692242372
10.270016587882376
10.270371762762396
10.17915883143399
10.180819356008932
10.183444075786575
10.185545972755417
10.188252231978721
10.191026136304432
10.19301643104728
10.19477826526894
10.195190627136846
10.170095403592459
10.124903592613459
10.126893239421705
10.080827144043493
10.083099855785132
10.0860744

In [4]:
print(evaluate(encoder,decoder,pairs[0][0]),pairs[0][1])

['meritez', 'surfez', 'comprendrais.', 'ramenerai', 'singapour.', '<EOS>'] va !
