# Assignment 7

Train a Transformer model for Machine Translation from Russian to English.  
Dataset: http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz   
Make all source and target text to lower case.  
Use following tokenization for english:  
```
import sentencepiece as spm

...
spm.SentencePieceTrainer.Train('--input=data/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

...
TGT.build_vocab(..., min_freq=5)
...

```
Score: corpus-bleu `nltk.translate.bleu_score.corpus_bleu`  
Use last 1000 sentences for model evalutation (test dataset).  
Use your target sequence tokenization for BLEU score.  
Use max_len=50 for sequence prediction.  


Hint: You may consider much smaller model, than shown in the example.  

Baselines:  
[4 point] BLEU = 0.05  
[6 point] BLEU = 0.10  
[9 point] BLEU = 0.15  

[1 point] Share weights between target embeddings and output dense layer. Notice, they have the same shape.


Readings:
1. BLUE score how to https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
1. Transformer code and comments http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [2]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85


In [3]:
import re
import math
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from torchtext import datasets, data
import sentencepiece as spm


BATCH_SIZE = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

# Data Preprocessing

In [4]:
!wget http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
!tar -xf /content/training-parallel-nc-v13.tgz

--2020-02-23 18:27:52--  http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz
Resolving data.statmt.org (data.statmt.org)... 129.215.197.184
Connecting to data.statmt.org (data.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113157482 (108M) [application/x-gzip]
Saving to: ‘training-parallel-nc-v13.tgz’


2020-02-23 18:28:05 (9.14 MB/s) - ‘training-parallel-nc-v13.tgz’ saved [113157482/113157482]



In [5]:
def preprocess_data(text):
    ''' making text lowercased and cleaning it a bit'''
    text = text.lower()
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    text = re.sub('\\xa0', ' ', text)
    text = re.sub(r'\\u0027', "'", text)
    return text

# tokenize english 
with open('/content/training-parallel-nc-v13/news-commentary-v13.ru-en.en') as f:
    with open('/content/text.en', 'w') as out:
            out.write(preprocess_data(f.read()))
        
spm.SentencePieceTrainer.Train('--input=/content/text.en --model_prefix=bpe_en --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

# tokenize russian
with open('/content/training-parallel-nc-v13/news-commentary-v13.ru-en.ru') as f:
    with open('/content/text.ru', 'w') as out:
            out.write(preprocess_data(f.read()))
        
spm.SentencePieceTrainer.Train('--input=/content/text.ru --model_prefix=bpe_ru --vocab_size=32000 --character_coverage=0.98 --model_type=bpe')

True

In [0]:
tok_ru = spm.SentencePieceProcessor()
tok_ru.load('bpe_ru.model')

tok_en = spm.SentencePieceProcessor()
tok_en.load('bpe_en.model')

SRC = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_ru.encode_as_pieces(x),
    batch_first=True,
)

TGT = data.Field(
    fix_length=50,
    init_token='<s>',
    eos_token='</s>',
    lower=True,
    tokenize = lambda x: tok_en.encode_as_pieces(x),
    batch_first=True,
)

fields = (('src', SRC), ('tgt', TGT))

In [7]:
with open('/content/text.ru') as f:
    src_snt = list(map(str.strip, f.readlines()))
    
with open('/content/text.en') as f:
    tgt_snt = list(map(str.strip, f.readlines()))
    
examples = [data.Example.fromlist(x, fields) for x in tqdm_notebook(zip(src_snt, tgt_snt),total=len(tgt_snt))]
test = data.Dataset(examples[-1000:], fields)
train, valid = data.Dataset(examples[:-1000], fields).split(0.9)

HBox(children=(IntProgress(value=0, max=235159), HTML(value='')))




In [8]:
print('src: ' + " ".join(train.examples[100].src))
print('tgt: ' + " ".join(train.examples[100].tgt))

src: ▁дело ▁в ▁том , ▁что ▁израиль ▁всегда ▁может ▁рассчитывать ▁на ▁поддержку ▁сша , ▁особенно ▁на ▁таких ▁общественных ▁мероприятиях , ▁как ▁выступлениях ▁израильского ▁лидера ▁в ▁конгрессе , ▁и ▁нынешние ▁события ▁только ▁подтвердили ▁предположение ▁многих ▁людей ▁во ▁всем ▁мире ▁о ▁том , ▁что ▁израиль ▁и ▁сша ▁близки ▁как ▁си ам ские ▁близне цы .
tgt: ▁the ▁fact ▁that ▁israel ▁could ▁always ▁count ▁on ▁us ▁backing , ▁especially ▁on ▁such ▁public ▁occasions ▁as ▁congressional ▁speeches ▁by ▁an ▁israeli ▁leader , ▁only ▁confirmed ▁the ▁assumption ▁of ▁many ▁people ▁around ▁the ▁world ▁that ▁israel ▁and ▁the ▁us ▁are ▁ j oined ▁like ▁siam ese ▁twins .


In [9]:
len(train), len(valid), len(test)

(210743, 23416, 1000)

In [0]:
TGT.build_vocab(train, min_freq=5)
SRC.build_vocab(train, min_freq=5)

## Batch

In [0]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [0]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data).detach()
        return tgt_mask

# Model

In [0]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        nn.Linear(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

## From transformer.py

In [0]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator # Linear layer
        
    def forward(self, batch):
        src, trg = batch.src, batch.trg
        src_mask, trg_mask = batch.src_mask, batch.trg_mask
        "Take in and process masked src and target sequences."
        #return self.decode(self.encode(src, src_mask), src_mask,
        #                    tgt, tgt_mask)
        return self.decode(trg, trg_mask, self.encode(src, src_mask), src_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, tgt, tgt_mask, memory, src_mask):
    #def decode(self, memory, src_mask, tgt, tgt_mask):
        #return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
        x = self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
        x = self.generator(x)
        return x

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [0]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [0]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [0]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [0]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [0]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [0]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [0]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * np.sqrt(self.d_model)

In [0]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].clone().detach()
        return self.dropout(x)

## End

In [0]:
class BucketIteratorWrapper(DataLoader):
    __initialized = False

    def __init__(self, iterator: data.Iterator):
#         super(BucketIteratorWrapper,self).__init__()
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized = True

    def __iter__(self):
        return map(
            lambda batch: Batch(batch.src, batch.tgt, pad=TGT.vocab.stoi['<pad>']),
            self.batch_sampler.__iter__()
        )

    def __len__(self):
        return len(self.batch_sampler)
    
class MyCriterion(nn.Module):
    def __init__(self, pad_idx):
        super(MyCriterion, self).__init__()
        self.pad_idx = pad_idx
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_idx)
        
    def forward(self, x, target):
        x = x.contiguous().permute(0,2,1)
        ntokens = (target != self.pad_idx).data.sum()
        
        return self.criterion(x, target) / ntokens

## Optimizer

In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

## End

In [0]:
torch.cuda.empty_cache()

batch_size = BATCH_SIZE
num_epochs = 10

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), 
                                              batch_sizes=(batch_size, batch_size, batch_size), 
                                  sort_key=lambda x: len(x.src),
                                  shuffle=True,
                                  device=DEVICE,
                                  sort_within_batch=False)
                                  
train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
test_iter = BucketIteratorWrapper(test_iter)

model = make_model(len(SRC.vocab), len(TGT.vocab), N=1)
model = model.to(DEVICE)
criterion = MyCriterion(pad_idx=SRC.vocab.stoi['<pad>'])
criterion.cuda()

optimizer = NoamOpt(model.src_embed[0].d_model, 1, 4000, torch.optim.Adam(model.parameters(),  lr=0, betas=(0.9, 0.98), eps=1e-9))
# share weights
#https://github.com/pytorch/examples/blob/master/word_language_model/model.py#L28
model.tgt_embed[0].lut.weight = model.generator.weight

In [36]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len(train_iter))
    counter = 0
    for batch in data_iter:
        
        out = model.forward(batch)
        loss = criterion(out, batch.trg_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

        optimizer.step()
        #optimizer.zero_grad()
        
        total_loss += loss
        data_iter.set_postfix(loss = loss.item())
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len(valid_iter))
    counter = 0
    for batch in data_iter:
        
        out = model.forward(batch)
        loss = criterion(out, batch.trg_y).item()
        
        total_loss += loss
        data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss


for epoch in range(num_epochs):
    model.train()
    loss = train_epoch(train_iter, model, criterion).item()
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(valid_iter, model, criterion)
        #scheduler.step(loss)
        print('valid', loss)

# Если лосс стартует от 2х, то это я запустила дообучение модели.

HBox(children=(IntProgress(value=0, max=824), HTML(value='')))

train 2.0379111766815186


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))

valid 2.4104169684907664


HBox(children=(IntProgress(value=0, max=824), HTML(value='')))

train 1.9522501230239868


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))

valid 2.4084081079648887


HBox(children=(IntProgress(value=0, max=824), HTML(value='')))

KeyboardInterrupt: ignored

In [0]:
def beam_search(model, src, src_mask, max_len=25, k=5):
    """
    src : tensor size (1,50)
    src_mask : tensor with True if not pad token
    """    

    tgt = torch.tensor(TGT.vocab.stoi['<s>']).unsqueeze(0).unsqueeze(0).to(DEVICE) # 1,1
    beam = [(tgt, 0)]

    src_encoded = model.encode(src,src_mask) # 64x50x512 batch_size, max_len, model_dim

    for i in range(max_len):
        candidates = []
        candidates_proba = []

        for snt, snt_proba in beam:
            if snt[0][-1] == TGT.vocab.stoi['</s>']:
                candidates.append(snt)
                candidates_proba.append(snt_proba)
            else:
                tgt_mask = subsequent_mask(snt.size(-1)).type_as(src.data).detach().to(DEVICE)
                proba = model.decode(snt, tgt_mask, src_encoded, src_mask)[0][i] # torch.Size([64, 49, 28285])[0][i] 
                best_k = torch.argsort(-proba)[:k].tolist()
                proba = proba.tolist()
                
                for tok in best_k:
                    to_concatenate = torch.tensor(tok).unsqueeze(0).unsqueeze(0).to(DEVICE)
                    candidates.append(torch.cat([snt, to_concatenate],dim=1))
                    candidates_proba.append(snt_proba + np.log(proba[tok]))

        best_candidates = np.argsort(-np.array(candidates_proba))[:k]
        beam = [(candidates[j], candidates_proba[j]) for j in best_candidates]
    return beam

In [66]:
model.eval()
with torch.no_grad():
    for i, batch in enumerate(valid_iter):
        src = batch.src[:1]
        src_key_padding_mask = src != SRC.vocab.stoi["<pad>"]
        beam = beam_search(model, src, src_key_padding_mask)
        
        seq = []
        for i in range(1, src.size(1)):
            sym = SRC.vocab.itos[src[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_ru.decode_pieces(seq)
        print("\nSource:", seq)
        
        print("Translation:")
        for pred, pred_proba in beam:                
            seq = []
            for i in range(1, pred.size(1)):
                sym = TGT.vocab.itos[pred[0, i]]
                if sym == "</s>": break
                seq.append(sym)
            seq = tok_en.decode_pieces(seq)
            print(f"pred {pred_proba:.2f}:", seq)
                
        seq = []
        for i in range(1, batch.trg.size(1)):
            sym = TGT.vocab.itos[batch.trg[0, i]]
            if sym == "</s>": break
            seq.append(sym)
        seq = tok_en.decode_pieces(seq)
        print("Target:", seq)
        break


Source: сосуществование
Translation:
pred 23.62: co-exide coexistence
pred 23.45: co-exide coexist
pred 23.43: co-exide coezy
pred 23.41: co-exide coexy
pred 23.33: co-exide coexists
Target: coexistence


In [0]:
from nltk.translate.bleu_score import corpus_bleu
from nltk import translate

In [0]:
hypotheses = [] # real
references = [] # what we predicted
h_2 = []
r_2 = []

model.eval()
with torch.no_grad():
    for batch in test_iter:        
        for ind, element in enumerate(batch.src):
            src = batch.src[ind:ind+1] # 1,50
            src_mask = src != SRC.vocab.stoi["<pad>"]
            trg = batch.trg[ind:ind+1][0] # 1,50
            beam = beam_search(model, src, src_mask)
            for pred, pred_proba in beam[:1]:  # we take with the highest prob              
                to_h = []
                to_r = []
                tokens = pred[0]

                for i in range(1, len(tokens)):
                    decoded  = TGT.vocab.itos[tokens[i]]
                    if decoded == '</s>': break
                    to_h.append(decoded)
                for i in range(1, len(trg)):
                    decoded = TGT.vocab.itos[trg[i]]
                    if decoded == '</s>': break
                    to_r.append(decoded)
                
                hypotheses.append(to_h)
                references.append([to_r])
                h_2.append(tok_en.decode_pieces(to_h).split())
                r_2.append([tok_en.decode_pieces(to_r).split()])

In [68]:
# Оценка просто по токенам (как в sentencepiece)
corpus_bleu(references, hypotheses, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

0.21028987415192593

In [69]:
# Оценка по декодированным токенам и затем сплитанутым
corpus_bleu(r_2, h_2, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

0.15520692253194338

In [71]:
hypotheses_argmax = [] # real
references_argmax = [] # what we predicted

model.eval()
with torch.no_grad():
    for batch in test_iter:        
        pred = model(batch)
        scores = torch.softmax(model(batch), dim=-1)
        sents = torch.argmax(scores, dim=-1)
        hypotheses_argmax.extend([[TGT.vocab.itos[idx] for idx in sent] for sent in sents])
        references_argmax.extend([[[TGT.vocab.itos[idx] for idx in sent]] for sent in batch.trg_y])

# cleaning from <pad> and </s> tokens
new_h = []
new_r = []

for h in hypotheses_argmax:
    while '<pad>' in h:
        h.remove('<pad>')
    while '</s>' in h:
        h.remove('</s>')
    new_h.append(h)

for r in references_argmax:
    for r2 in r:
        while '<pad>' in r2:
            r2.remove('<pad>')
        while '</s>' in r2:
            r2.remove('</s>')
        new_r.append([r2])
print(len(new_h), len(new_r))
print(new_h[0])
print(new_r[0])

1000 1000
['▁china', '’', 's', '▁contradictions']
[['▁china', '’', 's', '▁contradictions']]


In [72]:
# Просто аргмакс оценка
corpus_bleu(new_r, new_h, 
            smoothing_function=translate.bleu_score.SmoothingFunction().method3,
            auto_reweigh=True
           )

0.17001525043592144