In [None]:
!pip install -U torch==1.7.1 torchtext==0.8.1

# Reload environment
exit()

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import random
import math
import time
import pandas as pd
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

<torch._C.Generator at 0x2afdf7f3fad0>

In [2]:
sequences = []
labels = []

with open('Aug-Train.txt') as f:
    lines = [line.rstrip() for line in f]
    for i in range(len(lines)):
        if i % 4 == 2:
            sequences.append(lines[i])
        if i % 4 == 3:
            labels.append(lines[i])

In [3]:
seq_new = []
lab_new = []
for i in range(len(sequences)):
    if len(sequences[i]) <= 350:
        seq_new.append(sequences[i])
        lab_new.append(labels[i])

In [4]:
len(seq_new)

25799

In [5]:
for i in range(len(lab_new)):
    lab_new[i] = lab_new[i].replace("0", "U").replace("1", "B")

In [6]:
training_data = pd.DataFrame()
training_data['seq'] = seq_new
training_data['br'] = lab_new
training_data.to_csv('raw_train.csv', index=False)


In [7]:
sequences_test = []
labels_test = []

with open('Val.txt') as f:
    lines = [line.rstrip() for line in f]
    for i in range(len(lines)):
        if i % 4 == 2:
            sequences_test.append(lines[i])
        if i % 4 == 3:
            labels_test.append(lines[i])


In [8]:
seq_test = []
lab_test = []
for i in range(len(sequences_test)):
    if len(sequences_test[i]) <= 350:
        seq_test.append(sequences_test[i])
        lab_test.append(labels_test[i])

In [9]:
len(seq_test)

550

In [10]:
for i in range(len(lab_test)):
    lab_test[i] = lab_test[i].replace("0", "U").replace("1", "B")
    
testing_data = pd.DataFrame()
testing_data['seq'] = seq_test
testing_data['br'] = lab_test
testing_data.to_csv('dataset_test.csv', index=False)

In [11]:
def tokenize_input(text):
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    stoi = { ch:i for i,ch in enumerate(chars) }
    encode = [stoi[c] for c in text] # encoder: take a amino acid string, output a list of integers
    return encode

def tokenize_output(text):
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    stoi = { ch:i for i,ch in enumerate(chars) }
    encode = [stoi[c] for c in text] # encoder: take a amino acid string, output a list of integers
    return encode

In [12]:
SRC = Field(tokenize = tokenize_input, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            batch_first = True)

TRG = Field(tokenize = tokenize_output, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            batch_first = True)



In [13]:
VAL_RATIO = 0.1

def prepare_csv(seed=999):
    df_train = pd.read_csv("raw_train.csv")
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv(
        "dataset_train.csv", index=False)
    df_train.iloc[idx[:val_size], :].to_csv(
        "dataset_val.csv", index=False)
prepare_csv()

In [14]:
fields = {'seq':('i', SRC), 
          'br':('o', TRG)}
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = r'',
                                        train = 'dataset_train.csv',
                                        validation = 'dataset_val.csv',
                                        test = 'dataset_test.csv',
                                        format = 'csv',
                                        fields = fields, 
                                        skip_header = False)



In [15]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of test examples: {len(test_data.examples)}")

Number of training examples: 23220
Number of validation examples: 2579
Number of test examples: 550


In [16]:
SRC.build_vocab(train_data, min_freq = 1)
TRG.build_vocab(train_data, min_freq = 1)
print(f"Unique tokens in input vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in output vocabulary: {len(TRG.vocab)}")

Unique tokens in input vocabulary: 24
Unique tokens in output vocabulary: 6


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.device_count() if torch.cuda.device_count() else 0)
BATCH_SIZE = 4

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data,test_data),
     batch_size = BATCH_SIZE,
     sort_key = lambda x: len(x.i),
    sort_within_batch=True,
     device = device)

cuda
4




In [18]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 400):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len]
        #src_mask = [batch size, src len]
        
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src


class EncoderLayer(nn.Module):
    def __init__(self,
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src len, hid dim]
        #src_mask = [batch size, src len]
                
        #self attention
        _src, _ = self.self_attention(src, src, src, src_mask) # multi-head( query, key, value, mask = None)
        
        #dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        #positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        #src = [batch size, src len, hid dim]
        
        return src

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim   # in paper, 512
        self.n_heads = n_heads   # in paper, 8
        self.head_dim = hid_dim // n_heads  # in paper, 512 // 8 = 64
         
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)  # sqrt(64)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)  #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous() 
        
        
        
        x = x.view(batch_size, -1, self.hid_dim) #x = [batch size, query len, n heads, head dim]
        
        x = self.fc_o(x) #x = [batch size, query len, hid dim]
                
        return x, attention
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
class Decoder(nn.Module):
    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 400):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

class DecoderLayer(nn.Module):
    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 
                                                                     pf_dim, 
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        #src = [batch size, src len]
        
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        #src_mask = [batch size, 1, 1, src len]

        return src_mask
    
    def make_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask

    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [69]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.2
DEC_DROPOUT = 0.2

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, ENC_DROPOUT, device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [70]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [71]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,485,574 trainable parameters


In [72]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [73]:
model.apply(initialize_weights);

In [74]:
LEARNING_RATE = 0.005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [75]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [76]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.i
        trg = batch.o
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [77]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.i
            trg = batch.o

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [78]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [79]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-aug-model2.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 6m 16s
	Train Loss: 0.239 | Train PPL:   1.270
	 Val. Loss: 0.248 |  Val. PPL:   1.281
Epoch: 02 | Time: 6m 17s
	Train Loss: 0.236 | Train PPL:   1.266
	 Val. Loss: 0.237 |  Val. PPL:   1.267
Epoch: 03 | Time: 7m 8s
	Train Loss: 0.235 | Train PPL:   1.265
	 Val. Loss: 0.235 |  Val. PPL:   1.265
Epoch: 04 | Time: 7m 42s
	Train Loss: 0.234 | Train PPL:   1.264
	 Val. Loss: 0.235 |  Val. PPL:   1.265
Epoch: 05 | Time: 6m 21s
	Train Loss: 0.235 | Train PPL:   1.264
	 Val. Loss: 0.235 |  Val. PPL:   1.266
Epoch: 06 | Time: 6m 19s
	Train Loss: 0.234 | Train PPL:   1.264
	 Val. Loss: 0.236 |  Val. PPL:   1.266
Epoch: 07 | Time: 7m 5s
	Train Loss: 0.235 | Train PPL:   1.264
	 Val. Loss: 0.236 |  Val. PPL:   1.266
Epoch: 08 | Time: 7m 17s
	Train Loss: 0.235 | Train PPL:   1.264
	 Val. Loss: 0.236 |  Val. PPL:   1.266
Epoch: 09 | Time: 6m 26s
	Train Loss: 0.235 | Train PPL:   1.264
	 Val. Loss: 0.235 |  Val. PPL:   1.265
Epoch: 10 | Time: 6m 44s
	Train Loss: 0.235 | Train PPL: 

In [80]:
model.load_state_dict(torch.load('tut6-aug-model2.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 0.204 | Test PPL:   1.226 |


In [86]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len):
    
    model.eval()


    tokens = [token for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [87]:
def display_attention(sentence, translation, attention, n_heads = 8, n_rows = 4, n_cols = 2):
    
    assert n_rows * n_cols == n_heads
    
    fig = plt.figure(figsize=(15,25))
    
    for i in range(n_heads):
        
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        
        _attention = attention.squeeze(0)[i].cpu().detach().numpy()

        cax = ax.matshow(_attention, cmap='bone')

        ax.tick_params(labelsize=12)
        ax.set_xticklabels(['']+['<sos>']+[t.lower() for t in sentence]+['<eos>'], 
                           rotation=45)
        ax.set_yticklabels(['']+translation)

        ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
        ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    plt.close()

In [88]:
example_idx = 8

src = vars(train_data.examples[example_idx])['i']
trg = vars(train_data.examples[example_idx])['o']

print(f'src = {src}')
print(f'trg = {trg}')
print(len(trg))

src = [5, 12, 9, 5, 15, 12, 3, 4, 0, 0, 13, 0, 13, 0, 9, 0, 0, 13, 0, 0, 0, 0, 0, 6, 0, 0, 13, 0, 6, 14, 3, 14, 11, 3, 4, 12, 3, 2, 12, 3, 4, 3, 0, 17, 17, 14, 13, 0, 3, 9, 0, 7, 3, 14, 1, 7, 4, 12, 3, 14, 7, 19, 13, 5, 15, 15, 5, 15, 19, 4, 17, 8, 2, 12, 13, 5, 14, 7, 7, 0, 17, 4, 8, 12, 8, 11, 3, 3, 12, 19, 5, 6, 9, 11, 12, 8, 18, 16, 8, 18, 9, 13, 8, 4, 5, 14, 2, 1, 9, 17, 9, 11, 13, 5, 19, 9, 15, 3, 0, 5, 0, 15, 9, 17, 2, 13, 8, 9, 3, 9, 11, 7, 17, 12, 14, 16, 8, 17, 17, 19, 9, 0, 15, 3, 16, 4, 11, 19, 15, 0, 7, 2, 14, 17, 8, 15, 14, 5, 9, 12, 12, 8, 17, 5, 15, 4, 13, 9, 4, 17, 3, 5, 19, 8, 2, 0, 2, 19, 18, 9, 14, 14, 4, 3, 0, 3, 12, 9, 12, 3, 11, 16, 11, 14, 13, 9, 9, 9, 13, 4, 3, 14, 9, 17, 17, 9, 2, 19, 7, 7, 14, 11, 16, 2, 14, 5, 11, 2, 11, 18, 9, 7, 8, 19, 2, 1, 12, 17, 7, 8, 17, 0, 0, 7, 2, 11, 5, 9, 0, 4, 12, 9, 8, 6, 12, 2, 15, 18, 14, 0, 19, 12, 4, 19, 18, 0, 18, 9, 12, 13, 0, 8, 17, 12, 4, 15, 13, 3, 7, 8, 2, 9, 7, 9, 12, 8, 7, 15, 2, 12, 11, 4, 17, 8, 2, 9, 3, 3, 2, 9, 1

In [89]:
translation, attention = translate_sentence(src, SRC, TRG, model, device, len(src))

In [90]:
example_idx = 10

src = vars(test_data.examples[example_idx])['i']
trg = vars(test_data.examples[example_idx])['o']

print(f'src = {src}')
print(f'trg = {trg}')

src = [9, 16, 2, 11, 12, 5, 10, 16, 9, 12, 9, 4, 4, 1, 3, 0, 10, 10, 11, 10, 0, 12, 12, 3, 6, 1, 7, 9, 16, 10, 7, 5, 4, 3, 1, 13, 12, 12, 8, 12, 2, 6, 8, 14, 12, 0, 7, 13, 8, 1, 14, 16, 8, 13, 8, 9, 1, 10, 12, 0, 11, 4, 11, 10, 4, 0, 17, 8, 13, 18, 13, 7, 7, 3, 6, 15, 11, 1, 10, 16, 12, 10, 4, 16, 16, 3, 17, 10, 12, 18, 2, 1, 0, 8, 10, 13, 0, 17, 12, 16, 18, 4, 16, 11, 11, 2, 6, 6, 16, 4, 6, 6, 4, 16, 2, 15, 13, 17, 4, 13, 16, 9, 4, 7, 15, 13, 6, 8, 1, 0, 8, 0, 15, 8, 14, 3, 10, 18, 11, 13, 13, 0, 2, 18, 3, 14, 4, 2, 8, 2, 15, 3, 8, 8, 9, 0, 13, 1, 2, 12, 1, 1, 11, 8, 10, 8, 7, 4, 14, 3, 0, 4, 0, 9, 4, 18, 4, 12, 3, 9, 11, 14, 14, 18, 7, 12, 18, 0, 16, 1, 3, 14, 4, 1, 4, 5, 6, 10, 8, 17, 1, 11, 16, 1, 0, 6, 4, 14, 16, 0, 10, 18, 3, 7, 0, 5, 4, 17, 16, 7, 4, 1, 12, 16, 0, 16, 9, 0, 10, 4, 12, 0, 11, 4, 8, 11, 10, 4, 3, 7, 15, 7, 18, 14, 6, 14, 12, 8, 0, 0, 0, 4, 8, 15, 11, 12, 12, 11, 8, 4, 10, 5, 12, 12, 0, 14, 8, 8, 13, 8, 1, 16, 4, 15, 4, 18, 12, 18, 17, 18, 4, 8, 11, 10, 3, 18, 15, 

In [91]:
translation, attention = translate_sentence(src, SRC, TRG, model, device, len(src))

print(f'predicted trg = {translation}')

predicted trg = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [67]:
def translate_sentence_vectorized(src_tensor, src_field, trg_field, model, device, max_len=50):
    assert isinstance(src_tensor, torch.Tensor)

    model.eval()
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
    # enc_src = [batch_sz, src_len, hid_dim]

    trg_indexes = [[trg_field.vocab.stoi[trg_field.init_token]] for _ in range(len(src_tensor))]
    # Even though some examples might have been completed by producing a <eos> token
    # we still need to feed them through the model because other are not yet finished
    # and all examples act as a batch. Once every single sentence prediction encounters
    # <eos> token, then we can stop predicting.
    translations_done = [0] * len(src_tensor)
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        pred_tokens = output.argmax(2)[:,-1]
        for i, pred_token_i in enumerate(pred_tokens):
            trg_indexes[i].append(pred_token_i)
            if pred_token_i == trg_field.vocab.stoi[trg_field.eos_token]:
                translations_done[i] = 1
        if all(translations_done):
            break

    # Iterate through each predicted example one by one;
    # Cut-off the portion including the after the <eos> token
    pred_sentences = []
    for trg_sentence in trg_indexes:
        pred_sentence = []
        for i in range(1, len(trg_sentence)):
            if trg_sentence[i] == trg_field.vocab.stoi[trg_field.eos_token]:
                break
            pred_sentence.append(trg_field.vocab.itos[trg_sentence[i]])
        pred_sentences.append(pred_sentence)

    return pred_sentences, attention

In [92]:
from torchtext.data.metrics import bleu_score

def calculate_bleu_alt(iterator, src_field, trg_field, model, device):
    trgs = []
    pred_trgs = []
    with torch.no_grad():
        for batch in iterator:
            src = batch.i
            trg = batch.o
            _trgs = []
            for sentence in trg:
                tmp = []
                # Start from the first token which skips the <start> token
                print(sentence)
                for i in sentence[1:]:
                    # Targets are padded. So stop appending as soon as a padding or eos token is encountered
                    if i == trg_field.vocab.stoi[trg_field.eos_token] or i == trg_field.vocab.stoi[trg_field.pad_token]:
                        break
                    tmp.append(trg_field.vocab.itos[i])
                _trgs.append([tmp])
            trgs += _trgs
            pred_trg, _ = translate_sentence_vectorized(src, src_field, trg_field, model, device)
            pred_trgs += pred_trg
    return pred_trgs, trgs, bleu_score(pred_trgs, trgs)

In [93]:
bleu_score = calculate_bleu_alt(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')

1


TypeError: 'int' object is not subscriptable