<a href="https://colab.research.google.com/github/peuape/machine_translation/blob/main/eng_fra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bleu

Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting efficiency (from bleu)
  Downloading efficiency-2.0-py3-none-any.whl.metadata (2.5 kB)
Downloading efficiency-2.0-py3-none-any.whl (32 kB)
Building wheels for collected packages: bleu
  Building wheel for bleu (setup.py) ... [?25l[?25hdone
  Created wheel for bleu: filename=bleu-0.3-py3-none-any.whl size=5780 sha256=eefd0240a9298dfc6ba4067613c14dc34dc538b5b59d482dbe6142de8e1b5994
  Stored in directory: /root/.cache/pip/wheels/4e/9f/09/3d45ccd4ce42bc796c1f0b960037e30f40b953458d3868b6f3
Successfully built bleu
Installing collected packages: efficiency, bleu
Successfully installed bleu-0.3 efficiency-2.0


In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np

from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [3]:
from tqdm import tqdm

In [4]:
import matplotlib.pyplot as plt

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
"""
1. Create a class Lang for preprocessing
attributes:
    str name: language name ("eng" or "fra")
    dict word2index:
    dict word2count:
    dict index2word: Already has "SOS" and "EOS" as tokens
    int n_words

methods:
    addWord(word):
        updates the attributes of Lang.


    addSentence(sentence):
        params: str sentence: input/output sentence. Assume theyve already been normalised
        Registers new words in the given sentence into Lang with addWord.
"""
SOS_token = 0
EOS_token = 1
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"SOS", 1:"EOS"}
        self.n_words = 2
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)

In [7]:
"""
2. Create a function to normalise training data by
 -1 turning them into ascii to remove accents.
 -2 normalising them

unicodeToAscii:
    params: str s: unicode string
    return: str : ascii string

normalizeString:
    params: str s: unicode string
    return normalised string
    Converts unicode into ascii, lowercases, trims, converts .!? into " \1" and removes non_letter characters.
    Note that as a result of replacing special characters with a space, there might be sentences ending with a whitespace, which is undesirable.
"""

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r'([.!?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z]', r" ", s)
    return s.strip()


In [8]:
"""
3. Create a function to read data into Lang(create vocabularies)

readLang:
    params:
        str lang1, lang2: language names
        bool reverse = False
    return:
        Lang imput_lang, output_lang
        list[list[string]] pairs

    -1 Read the file line by line.
    -2 Create pairs
    -3 Store them in Lang

"""

def readLang(lang1, lang2, reverse=False):
    print("Reading lines...")

    with open(f"{lang1}-{lang2}.txt") as f:
        lines = f.read().strip().split("\n")
        pairs = [[normalizeString(s) for s in line.split("\t")] for line in lines]




    if reverse:
        pairs = [list(reversed(p)) for p in pairs]

        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs


In [9]:
MAX_LENGTH = 10
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[0].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [10]:
"""
4 Preprocess data.
    -1 Read data into Lang using readLangs
    -2 Filter out long sentences with filterPairs
    -3 Fill the vocabulary in the Lang class

prepareData:
    params:
        str lang1, lang2
        bool reverse=False
    return
        Lang input_lang, output_lang
        list[list[string]] pairs
"""

def prepareData(lang1, lang2, reverse=False):
    print("Reading data...")
    input_lang, output_lang, pairs = readLang("eng", "fra")
    print(f"Read {len(pairs)} sentences. Filtering pairs...")
    pairs = filterPairs(pairs)
    print(f"Trimmed to {len(pairs)} pairs. Creating vocabularies...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print("Created vocabularies.")
    print(f"{lang1}: {input_lang.n_words} words")
    print(f"{lang2}: {output_lang.n_words} words")
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData("eng", "fra")
pairs[:10], input_lang.word2index["go"], output_lang.word2index["va"]

Reading data...
Reading lines...
Read 135842 sentences. Filtering pairs...
Trimmed to 11358 pairs. Creating vocabularies...
Created vocabularies.
eng: 2981 words
fra: 4576 words


([['i m ok', 'je vais bien'],
  ['i m ok', 'ca va'],
  ['i m fat', 'je suis gras'],
  ['i m fat', 'je suis gros'],
  ['i m fit', 'je suis en forme'],
  ['i m hit', 'je suis touche'],
  ['i m hit', 'je suis touchee'],
  ['i m ill', 'je suis malade'],
  ['i m sad', 'je suis triste'],
  ['i m shy', 'je suis timide']],
 648,
 6)

In [None]:
"""
5 Create pytorch classes
This nml model is a seq2seq model utilising GRU and attention.
The rough architecture is as follows.

EncoderRNN:
params:
    int input_size:dict size
    int hidden_size: size of each embedding vector
    int dropout_p = 0.1

attributes:
    hidden_size, embedding, gru, dropout

method:
    forward:
        params;

"""
#bahdanau encoder
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        output = self.dropout(self.embedding(input))
        output, hidden = self.gru(output)
        return output, hidden




In [None]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
    def forward(self, query, keys):
        score = self.Va(torch.tanh(self.Wa(query)+self.Ua(keys)))
        score = score.squeeze(2).unsqueeze(1)
        weights = F.softmax(score, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(p= dropout_p)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2*hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_output, encoder_hidden, target_tensor=None):
        batch_size = encoder_output.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        output_list = []
        attention_list = []
        for i in  range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(decoder_input, decoder_hidden,encoder_output)
            output_list.append(decoder_output)
            attention_list.append(attn_weights)

            if target_tensor==None:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()
            else:
                decoder_input = target_tensor[:,i].unsqueeze(1)

        decoder_output = torch.cat(output_list, dim=1)
        decoder_output = F.log_softmax(decoder_output, dim=-1)
        attention = torch.cat(attention_list, dim=1)
        return decoder_output, decoder_hidden, attention


    def forward_step(self, input, hidden, encoder_output):
        query = hidden.permute(1,0,2)
        context, weights = self.attention(query, encoder_output)
        embedded = self.dropout(self.embedding(input))
        input_gru = torch.cat((embedded, context), dim=2)
        decoder_output, decoder_hidden = self.gru(input_gru, hidden)
        decoder_output = self.out(decoder_output)
        return decoder_output, decoder_hidden, weights


In [69]:
#luong attention
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True)
    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        hidden, (last_hidden, last_cell) = self.lstm(embedded)
        return hidden, (last_hidden, last_cell)

class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.1):
        super(DecoderLSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.lstmcell = nn.LSTMCell(hidden_size, hidden_size)
        self.Vd = nn.Linear(3*hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.He = nn.Linear(2*hidden_size, hidden_size)
        self.Ce = nn.Linear(2*hidden_size, hidden_size)
        self.Ha = nn.Linear(2*hidden_size, hidden_size)

    def forward(self, encoder_outputs, encoder_hidden, encoder_cell, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, dtype=torch.long, device=device).fill_(SOS_token)
        encoder_outputs_projected = self.Ha(encoder_outputs)
        encoder_outputs_permuted = torch.permute(encoder_outputs, (0,2,1))
        encoder_hidden = torch.cat((encoder_hidden[0], encoder_hidden[1]), dim=-1)
        encoder_cell = torch.cat((encoder_cell[0], encoder_cell[1]), dim=-1)
        decoder_hidden = self.He(encoder_hidden)
        decoder_cell = self.Ce(encoder_cell)
        output_list = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, decoder_cell = self.step(decoder_input, decoder_hidden, decoder_cell, encoder_outputs_permuted, encoder_outputs_projected)
            output_list.append(decoder_output.unsqueeze(1))
            if target_tensor == None:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

            else:
                decoder_input = target_tensor[:,i]


        decoder_output = torch.cat(output_list, dim=1)
        decoder_output = F.log_softmax(decoder_output, dim=-1)

        return decoder_output, decoder_hidden, decoder_cell




    def step(self, input, decoder_hidden, decoder_cell, encoder_outputs_permuted, encoder_outputs_projected):
        embedded = self.dropout(self.embedding(input))
        (decoder_hidden, decoder_cell) = self.lstmcell(embedded, (decoder_hidden, decoder_cell))
        attention_scores = torch.bmm(encoder_outputs_projected, decoder_hidden.unsqueeze(-1))
        attention_scores = F.softmax(attention_scores, dim=1)
        attention = torch.bmm(encoder_outputs_permuted, attention_scores)
        decoder_output = torch.cat((torch.squeeze(attention, dim=-1), decoder_hidden), dim=1)
        decoder_output = self.Vd(decoder_output)
        decoder_output = self.dropout(F.tanh(decoder_output))
        decoder_output = self.output(decoder_output)
        return decoder_output, decoder_hidden, decoder_cell



In [33]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(" ")]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [71]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
                 decoder_optimizer, criterion,attention="luong"):
    total_loss = 0
    for data in tqdm(dataloader):
        input_tensor, target_tensor = data
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        if attention=="bahdanau":
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)
        elif attention=="luong":
            encoder_outputs, (encoder_hidden, encoder_cell) = encoder(input_tensor)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, encoder_cell, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [35]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


In [36]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [37]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=1e-3,
          print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs+1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [38]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [39]:
hidden_size = 128
batch_size=32

input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

Reading data...
Reading lines...
Read 135842 sentences. Filtering pairs...
Trimmed to 11358 pairs. Creating vocabularies...
Created vocabularies.
eng: 2981 words
fra: 4576 words


In [72]:
encoder = EncoderLSTM(input_lang.n_words, hidden_size).to(device)
decoder = DecoderLSTM(output_lang.n_words, hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

100%|██████████| 355/355 [00:06<00:00, 58.41it/s]
100%|██████████| 355/355 [00:07<00:00, 48.17it/s]
100%|██████████| 355/355 [00:06<00:00, 57.03it/s]
100%|██████████| 355/355 [00:06<00:00, 53.65it/s]
100%|██████████| 355/355 [00:06<00:00, 56.49it/s]


0m 32s (- 8m 8s) (5 6%) 1.9903


100%|██████████| 355/355 [00:06<00:00, 54.96it/s]
100%|██████████| 355/355 [00:06<00:00, 54.34it/s]
100%|██████████| 355/355 [00:06<00:00, 56.81it/s]
100%|██████████| 355/355 [00:06<00:00, 52.13it/s]
100%|██████████| 355/355 [00:06<00:00, 58.33it/s]


1m 4s (- 7m 33s) (10 12%) 1.0614


100%|██████████| 355/355 [00:06<00:00, 53.11it/s]
100%|██████████| 355/355 [00:06<00:00, 58.75it/s]
100%|██████████| 355/355 [00:06<00:00, 52.76it/s]
100%|██████████| 355/355 [00:06<00:00, 58.75it/s]
100%|██████████| 355/355 [00:06<00:00, 52.91it/s]


1m 36s (- 7m 0s) (15 18%) 0.6494


100%|██████████| 355/355 [00:06<00:00, 58.60it/s]
100%|██████████| 355/355 [00:06<00:00, 52.75it/s]
100%|██████████| 355/355 [00:06<00:00, 58.12it/s]
100%|██████████| 355/355 [00:06<00:00, 52.92it/s]
100%|██████████| 355/355 [00:06<00:00, 58.90it/s]


2m 8s (- 6m 25s) (20 25%) 0.4258


100%|██████████| 355/355 [00:06<00:00, 53.19it/s]
100%|██████████| 355/355 [00:06<00:00, 58.79it/s]
100%|██████████| 355/355 [00:06<00:00, 52.60it/s]
100%|██████████| 355/355 [00:06<00:00, 57.91it/s]
100%|██████████| 355/355 [00:06<00:00, 53.60it/s]


2m 40s (- 5m 53s) (25 31%) 0.3071


100%|██████████| 355/355 [00:06<00:00, 56.10it/s]
100%|██████████| 355/355 [00:06<00:00, 54.36it/s]
100%|██████████| 355/355 [00:06<00:00, 52.53it/s]
100%|██████████| 355/355 [00:06<00:00, 57.59it/s]
100%|██████████| 355/355 [00:06<00:00, 52.57it/s]


3m 13s (- 5m 22s) (30 37%) 0.2443


100%|██████████| 355/355 [00:06<00:00, 57.80it/s]
100%|██████████| 355/355 [00:06<00:00, 52.08it/s]
100%|██████████| 355/355 [00:06<00:00, 58.54it/s]
100%|██████████| 355/355 [00:06<00:00, 52.43it/s]
100%|██████████| 355/355 [00:06<00:00, 58.80it/s]


3m 45s (- 4m 49s) (35 43%) 0.2058


100%|██████████| 355/355 [00:06<00:00, 52.96it/s]
100%|██████████| 355/355 [00:06<00:00, 58.50it/s]
100%|██████████| 355/355 [00:06<00:00, 52.67it/s]
100%|██████████| 355/355 [00:06<00:00, 58.88it/s]
100%|██████████| 355/355 [00:06<00:00, 53.11it/s]


4m 17s (- 4m 17s) (40 50%) 0.1845


100%|██████████| 355/355 [00:06<00:00, 58.75it/s]
100%|██████████| 355/355 [00:06<00:00, 52.35it/s]
100%|██████████| 355/355 [00:06<00:00, 57.23it/s]
100%|██████████| 355/355 [00:06<00:00, 52.99it/s]
100%|██████████| 355/355 [00:06<00:00, 56.41it/s]


4m 49s (- 3m 45s) (45 56%) 0.1724


100%|██████████| 355/355 [00:06<00:00, 54.80it/s]
100%|██████████| 355/355 [00:06<00:00, 52.99it/s]
100%|██████████| 355/355 [00:06<00:00, 57.61it/s]
100%|██████████| 355/355 [00:07<00:00, 49.22it/s]
100%|██████████| 355/355 [00:06<00:00, 55.84it/s]


5m 22s (- 3m 13s) (50 62%) 0.1627


100%|██████████| 355/355 [00:07<00:00, 50.29it/s]
100%|██████████| 355/355 [00:07<00:00, 49.96it/s]
100%|██████████| 355/355 [00:06<00:00, 52.25it/s]
100%|██████████| 355/355 [00:07<00:00, 50.09it/s]
100%|██████████| 355/355 [00:07<00:00, 50.14it/s]


5m 57s (- 2m 42s) (55 68%) 0.1566


100%|██████████| 355/355 [00:07<00:00, 48.96it/s]
100%|██████████| 355/355 [00:06<00:00, 53.66it/s]
100%|██████████| 355/355 [00:07<00:00, 49.88it/s]
100%|██████████| 355/355 [00:06<00:00, 56.88it/s]
100%|██████████| 355/355 [00:06<00:00, 51.58it/s]


6m 31s (- 2m 10s) (60 75%) 0.1506


100%|██████████| 355/355 [00:06<00:00, 57.97it/s]
100%|██████████| 355/355 [00:07<00:00, 49.99it/s]
100%|██████████| 355/355 [00:06<00:00, 57.85it/s]
100%|██████████| 355/355 [00:06<00:00, 51.77it/s]
100%|██████████| 355/355 [00:06<00:00, 55.01it/s]


7m 4s (- 1m 37s) (65 81%) 0.1480


100%|██████████| 355/355 [00:06<00:00, 54.28it/s]
100%|██████████| 355/355 [00:07<00:00, 50.43it/s]
100%|██████████| 355/355 [00:06<00:00, 52.46it/s]
100%|██████████| 355/355 [00:06<00:00, 53.39it/s]
100%|██████████| 355/355 [00:06<00:00, 58.58it/s]


7m 37s (- 1m 5s) (70 87%) 0.1441


100%|██████████| 355/355 [00:06<00:00, 53.08it/s]
100%|██████████| 355/355 [00:06<00:00, 58.70it/s]
100%|██████████| 355/355 [00:06<00:00, 52.79it/s]
100%|██████████| 355/355 [00:06<00:00, 58.46it/s]
100%|██████████| 355/355 [00:06<00:00, 52.49it/s]


8m 9s (- 0m 32s) (75 93%) 0.1419


100%|██████████| 355/355 [00:06<00:00, 56.51it/s]
100%|██████████| 355/355 [00:07<00:00, 49.30it/s]
100%|██████████| 355/355 [00:06<00:00, 50.73it/s]
100%|██████████| 355/355 [00:06<00:00, 50.95it/s]
100%|██████████| 355/355 [00:07<00:00, 50.55it/s]


8m 44s (- 0m 0s) (80 100%) 0.1407


In [81]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang, attention="luong"):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        if attention=="luong":
            encoder_outputs, (encoder_hidden, encoder_cell) = encoder(input_tensor)
            decoder_outputs, decoder_hidden, decoder_cell = decoder(encoder_outputs, encoder_hidden, encoder_cell)
        else:
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words

In [83]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')


In [88]:
#bahdanau
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> we re punctual
= nous sommes ponctuels
< nous sommes confrontes a un probleme <EOS>

> we re going to search the whole ship
= nous allons fouiller la totalite du bateau
< nous allons fouiller la totalite du bateau <EOS>

> we re almost ready
= nous sommes presque pretes
< nous sommes presque prets  un film <EOS>

> he is amusing himself by playing video games
= il s amuse en jouant aux jeux videos
< il s amuse en jouant aux jeux videos <EOS>

> we re all very good players
= nous sommes tous de tres bons joueurs
< nous sommes toutes de tres bonnes joueuses <EOS>

> you re preaching to the choir
= vous prechez des convaincues
< tu preches une convaincue de l ecole <EOS>

> they re not coming
= elles ne viennent pas
< ils ne vont pas nous chercher <EOS>

> you re too slow
= vous etes trop lentes
< tu es trop lente pour moi <EOS>

> he s still alive
= il est toujours en vie
< il est encore au lit <EOS>

> we re the last
= nous sommes les derniers
< nous sommes les derniers le dernier esp

In [None]:
#bahdanau
prediction_list = [evaluate(encoder,decoder,sentence, input_lang, output_lang)[:-1] for sentence in [pair[0] for pair in pairs]]
target_list = [pair[1].split(" ") for pair in pairs]

In [None]:
#bahdanau
prediction_list = [" ".join(prediction_list[i]) for i in range(len(prediction_list))]
target_list = [" ".join(target_list[i]) for i in range(len(target_list))]

In [None]:
from bleu import list_bleu

In [None]:
print("Bleu score of Bandanau encoder-decoder:", list_bleu([target_list], prediction_list))

[Info] Starting to run this command now: perl /tmp/tmp_bleu/multi-bleu-detok.perl /tmp/tmp_bleu/ref_dtk0.txt < /tmp/tmp_bleu/hyp_dtk0.txt 
Bleu score of Bandanau encoder-decoder: 73.26


In [84]:
#luong
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> i m not in good shape now
= je ne suis pas en forme maintenant
< je ne suis pas en forme maintenant <EOS>

> she s not my type
= elle n est pas mon genre
< elle n est pas du tout genre a mon genre

> i am moving next month
= je demenage le mois prochain
< je demenage le mois prochain <EOS>

> i m so sorry that i lied to you
= je suis si desole de t avoir menti
< je suis si desole de t avoir menti <EOS>

> you are as white as a sheet
= vous etes blanc comme un drap
< tu es blanc comme un cachet d aspirine <EOS>

> you re extroverted
= vous etes extravertie
< tu es un critique de bureau <EOS>

> you re out of booze
= c est maree basse
< tu es hors de son affaire <EOS>

> i m very tired
= je suis fourbu
< je suis tres fatigue par tres fatigue <EOS>

> i m not easily impressed
= je ne suis pas facilement impressionne
< je ne suis pas facilement impressionne a ce sujet <EOS>

> we re not friends anymore
= nous ne sommes plus amies
< nous ne sommes plus amies de tous <EOS>



In [89]:
#luong attention
prediction_list = [evaluate(encoder,decoder,sentence, input_lang, output_lang)[:-1] for sentence in [pair[0] for pair in pairs]]
target_list = [pair[1].split(" ") for pair in pairs]
prediction_list = [" ".join(prediction_list[i]) for i in range(len(prediction_list))]
target_list = [" ".join(target_list[i]) for i in range(len(target_list))]

In [92]:
print("Bleu score of Luong encoder-decoder:", list_bleu([target_list], prediction_list))

[Info] Starting to run this command now: perl /tmp/tmp_bleu/multi-bleu-detok.perl /tmp/tmp_bleu/ref_dtk0.txt < /tmp/tmp_bleu/hyp_dtk0.txt 
Bleu score of Luong encoder-decoder: 40.83


In [91]:
#luong
prediction_list[:10], target_list[:10]

(['je crains d avoir a tom',
  'je crains d avoir a tom',
  'je suis content que tu aies aime la tete',
  'je suis content que tu aies aime la tete',
  'je suis en route pour forme',
  'je suis petite par l anglais pour moi',
  'je suis petite par l anglais pour moi',
  'je suis sure d etre un bon malade',
  'je suis triste d etre triste a aies',
  'je suis timide d etre difficile'],
 ['je vais bien',
  'ca va',
  'je suis gras',
  'je suis gros',
  'je suis en forme',
  'je suis touche',
  'je suis touchee',
  'je suis malade',
  'je suis triste',
  'je suis timide'])