# Контекст

- модель кодировщик-декодировщик (последовательность-в-последовательность)
- длина входа и выхода может отличаться
- мы представим только данные с таким началом (для сокращения времени обучения):
    ("i", "am"), ("i", "'m"), 
    ("he", "is"), ("he", "'s"),
    ("she", "is"), ("she", "'s"),
    ("you", "are"), ("you", "'re"),
    ("we", "are"), ("we", "'re"),
    ("they", "are"), ("they", "'re")

---

Данные представлены в формате:


 ['go.', 'марш!'],
 ['go.', 'иди.'],
 ['go.', 'идите.'],
 ['hi.', 'здравствуйте.'],
 ['hi.', 'привет!'],
 ['hi.', 'хай.'],
 ['hi.', 'здрасте.'],
 ['hi.', 'здоро́во!'],
 ['hi.', 'приветик!'],
 ['run!', 'беги!'],
 ['run!', 'бегите!'],
 ['run.', 'беги!']




# Импорты

In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string
import warnings

from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook


warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# Чтение и подготовка данных

In [2]:
with open('rus.txt') as file:
    lines = file.readlines()

lines = [line.replace('\n', '').lower().split('\t')[:-1] for line in lines]

In [3]:
lines[0:10]

[['go.', 'марш!'],
 ['go.', 'иди.'],
 ['go.', 'идите.'],
 ['hi.', 'здравствуйте.'],
 ['hi.', 'привет!'],
 ['hi.', 'хай.'],
 ['hi.', 'здрасте.'],
 ['hi.', 'здоро́во!'],
 ['hi.', 'приветик!'],
 ['run!', 'беги!']]

In [4]:
data = []
for english_sentence, russian_sentence in lines:
    data.append({'english_tokens': word_tokenize(english_sentence, language = 'english'),
                 'russian_tokens': word_tokenize(russian_sentence, language = 'russian')})
data[:10]

[{'english_tokens': ['go', '.'], 'russian_tokens': ['марш', '!']},
 {'english_tokens': ['go', '.'], 'russian_tokens': ['иди', '.']},
 {'english_tokens': ['go', '.'], 'russian_tokens': ['идите', '.']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['здравствуйте', '.']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['привет', '!']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['хай', '.']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['здрасте', '.']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['здоро́во', '!']},
 {'english_tokens': ['hi', '.'], 'russian_tokens': ['приветик', '!']},
 {'english_tokens': ['run', '!'], 'russian_tokens': ['беги', '!']}]

In [5]:
filter_phrases = (
    ("i", "am"), ("i", "'m"), 
    ("he", "is"), ("he", "'s"),
    ("she", "is"), ("she", "'s"),
    ("you", "are"), ("you", "'re"),
    ("we", "are"), ("we", "'re"),
    ("they", "are"), ("they", "'re")
)

In [6]:
data_easy = []
for double_token in data:
    if tuple(double_token['english_tokens'][:2]) in filter_phrases:
        data_easy.append(double_token)
        

print(len(data_easy))
data_easy[:-10]

33941


[{'english_tokens': ['i', "'m", '19', '.'],
  'russian_tokens': ['мне', 'девятнадцать', 'лет', '.']},
 {'english_tokens': ['i', "'m", 'ok', '.'],
  'russian_tokens': ['со', 'мной', 'всё', 'в', 'порядке', '.']},
 {'english_tokens': ['i', "'m", 'ok', '.'],
  'russian_tokens': ['у', 'меня', 'всё', 'хорошо', '.']},
 {'english_tokens': ['i', "'m", 'ok', '.'],
  'russian_tokens': ['я', 'в', 'порядке', '.']},
 {'english_tokens': ['i', "'m", 'ok', '.'],
  'russian_tokens': ['у', 'меня', 'всё', 'в', 'порядке', '.']},
 {'english_tokens': ['i', "'m", 'up', '.'],
  'russian_tokens': ['я', 'встал', '.']},
 {'english_tokens': ['i', "'m", 'up', '.'],
  'russian_tokens': ['я', 'встала', '.']},
 {'english_tokens': ['i', "'m", 'tom', '.'],
  'russian_tokens': ['я', 'том', '.']},
 {'english_tokens': ['i', "'m", 'bad', '.'],
  'russian_tokens': ['я', 'плохой', '.']},
 {'english_tokens': ['i', "'m", 'bad', '.'],
  'russian_tokens': ['я', 'плохая', '.']},
 {'english_tokens': ['i', "'m", 'fat', '.'],
  'russ

In [7]:
for i in range(len(data_easy)):
    data_easy[i] = {'english_tokens': " ".join(data_easy[i]['english_tokens']),
                    'russian_tokens': " ".join(data_easy[i]['russian_tokens'])}

print(len(data_easy))
data_easy

33941


[{'english_tokens': "i 'm 19 .", 'russian_tokens': 'мне девятнадцать лет .'},
 {'english_tokens': "i 'm ok .", 'russian_tokens': 'со мной всё в порядке .'},
 {'english_tokens': "i 'm ok .", 'russian_tokens': 'у меня всё хорошо .'},
 {'english_tokens': "i 'm ok .", 'russian_tokens': 'я в порядке .'},
 {'english_tokens': "i 'm ok .", 'russian_tokens': 'у меня всё в порядке .'},
 {'english_tokens': "i 'm up .", 'russian_tokens': 'я встал .'},
 {'english_tokens': "i 'm up .", 'russian_tokens': 'я встала .'},
 {'english_tokens': "i 'm tom .", 'russian_tokens': 'я том .'},
 {'english_tokens': "i 'm bad .", 'russian_tokens': 'я плохой .'},
 {'english_tokens': "i 'm bad .", 'russian_tokens': 'я плохая .'},
 {'english_tokens': "i 'm fat .", 'russian_tokens': 'я толстый .'},
 {'english_tokens': "i 'm fat .", 'russian_tokens': 'я толстая .'},
 {'english_tokens': "i 'm fit .", 'russian_tokens': 'я в форме .'},
 {'english_tokens': "i 'm hit !", 'russian_tokens': 'меня ударили !'},
 {'english_tokens

In [8]:
data_easy = pd.DataFrame(data_easy)
data_easy

Unnamed: 0,english_tokens,russian_tokens
0,i 'm 19 .,мне девятнадцать лет .
1,i 'm ok .,со мной всё в порядке .
2,i 'm ok .,у меня всё хорошо .
3,i 'm ok .,я в порядке .
4,i 'm ok .,у меня всё в порядке .
...,...,...
33936,i am staying with my uncle for the time being ...,"я пока живу у моего дяди , но позже я перееду ..."
33937,i 'm a non-native speaker of english and reali...,"я не являюсь носителем английского языка , и я..."
33938,i 'm afraid i 'll have to disappoint you . i d...,"боюсь , мне придётся вас разочаровать . я боль..."
33939,i 'm not kidding you . some people actually be...,я не шучу . некоторые люди действительно верят...


In [14]:
data_easy = data_easy.sample(frac = 1, ignore_index = True, random_state = 42)
data_easy.head()

Unnamed: 0,english_tokens,russian_tokens
0,i 'm glad i was n't there .,"я рад , что меня там не было ."
1,i 'm really tired and want to go to bed early .,я очень устал и хочу лечь спать пораньше .
2,they are all very hungry .,они все очень голодны .
3,you 're the only one who can solve this problem .,"вы единственный , кто может решить эту задачу ."
4,i 'm tom 's mother .,я мама тома .


In [15]:
n_train = int(len(data_easy)*0.7)
n_val = int(len(data_easy)*0.15)
n_test = int(len(data_easy)*0.15)

data_easy[:n_train]['split'] = 'train'
data_easy

Unnamed: 0,english_tokens,russian_tokens
0,i 'm glad i was n't there .,"я рад , что меня там не было ."
1,i 'm really tired and want to go to bed early .,я очень устал и хочу лечь спать пораньше .
2,they are all very hungry .,они все очень голодны .
3,you 're the only one who can solve this problem .,"вы единственный , кто может решить эту задачу ."
4,i 'm tom 's mother .,я мама тома .
...,...,...
33936,he 's not supposed to be here .,"его тут , по идее , быть не должно ."
33937,i 'm not buying your story .,я вашим россказням не верю .
33938,she is hostile to me .,она враждебно ко мне настроена .
33939,i 'm not as gullible as i used to be .,"я уже не так доверчив , как раньше ."


In [38]:
data_easy['split'] = None

data_easy.loc[(data_easy[:n_train].index), 'split'] = 'train'
data_easy.loc[(data_easy[n_train:n_train+n_val].index), 'split'] = 'val'
data_easy.loc[(data_easy[n_train+n_val:].index), 'split'] = 'test'

data_easy['split'].value_counts()

split
train    23758
test      5092
val       5091
Name: count, dtype: int64

In [39]:
data_easy.head()

Unnamed: 0,english_tokens,russian_tokens,split
0,i 'm glad i was n't there .,"я рад , что меня там не было .",train
1,i 'm really tired and want to go to bed early .,я очень устал и хочу лечь спать пораньше .,train
2,they are all very hungry .,они все очень голодны .,train
3,you 're the only one who can solve this problem .,"вы единственный , кто может решить эту задачу .",train
4,i 'm tom 's mother .,я мама тома .,train


In [40]:
data_easy.to_csv('data.csv', index = False)

# Vocabulary

In [137]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,english_tokens,russian_tokens,split
0,i 'm glad i was n't there .,"я рад , что меня там не было .",train
1,i 'm really tired and want to go to bed early .,я очень устал и хочу лечь спать пораньше .,train
2,they are all very hungry .,они все очень голодны .,train
3,you 're the only one who can solve this problem .,"вы единственный , кто может решить эту задачу .",train
4,i 'm tom 's mother .,я мама тома .,train


In [138]:
def create_vocabs(column_name):

    vocab_all_tokens = [stroka.split(' ') for stroka in data[column_name]]
    vocab_tokens = []
    for sublist in vocab_all_tokens:
        vocab_tokens.extend(sublist)
    vocab_tokens_unique = sorted(set(vocab_tokens))
    print('Количество уникальных токенов в {} словаре: {}'.format(column_name, len(vocab_tokens_unique)))
    vocab_tokens_unique = ['<MASK>'] + vocab_tokens_unique + ['<UNK>', '<BEGIN>', '<END>']
    vocab_tokens_unique = dict(zip(vocab_tokens_unique, np.arange(0, len(vocab_tokens_unique))))

    return vocab_tokens, vocab_tokens_unique

## english_vocabulary

In [139]:
spisok_all_eng_tokens, vocab_eng_tokens_unique =\
    create_vocabs('english_tokens')

Количество уникальных токенов в english_tokens словаре: 4688


## russian_vocabulary

In [140]:
spisok_all_rus_tokens, vocab_rus_tokens_unique =\
    create_vocabs('russian_tokens')

Количество уникальных токенов в russian_tokens словаре: 11605


# Подсчет максимальных длин последовательностей

In [141]:
max_source_len = 0
for text in data['english_tokens']:
    text_len = len(text.split(' '))
    if text_len > max_source_len:
        max_source_len = text_len

max_source_len += 2
max_source_len

24

In [142]:
max_target_len = 0
for text in data['russian_tokens']:
    text_len = len(text.split(' '))
    if text_len > max_target_len:
        max_target_len = text_len

max_target_len += 2
max_target_len

25

# Dataset

In [143]:
def text_vectorizer(
        stroka,
        mode_sequence
):
    if mode_sequence == 'source':
        vocab = vocab_eng_tokens_unique
        max_len = max_source_len
    else:
        vocab = vocab_rus_tokens_unique
        max_len = max_target_len

    stroka = [vocab.get(token, '<UNK>') for token in stroka.split()]
    stroka = [vocab.get('<BEGIN>')] + stroka + [vocab.get('<END>')]
    
    
    if mode_sequence == 'source':
        source = np.zeros(max_len, dtype = np.int64)
        for index, token_value in enumerate(stroka):
            source[index] = token_value
        return source, len(stroka)

    if mode_sequence == 'target':
        x_target = np.zeros(max_len, dtype = np.int64)
        y_target = np.zeros(max_len, dtype = np.int64)
        for index, token_value in enumerate(stroka[:-1]):
            x_target[index] = token_value

        for index, token_value in enumerate(stroka[1:]):
            y_target[index] = token_value

        return x_target, y_target


In [144]:
class NMT_Eng__Rus_DATASET(Dataset):
    def __init__(self, mode):

        self.df = data[data['split'] == mode]
        self.df_size = len(self.df)

    def __len__(self):
        return self.df_size
    
    def __getitem__(self, index):

        row = self.df.iloc[index]

        x_source, x_source_length = text_vectorizer(
            stroka = row['english_tokens'],
            mode_sequence = 'source'
        )

        x_target, y_target = text_vectorizer(
            stroka = row['russian_tokens'],
            mode_sequence = 'target'
        )

    
        return {"x_source": x_source, 
                "x_target": x_target,
                "y_target": y_target, 
                "x_source_length": x_source_length,
                "строка текста eng": row['english_tokens'],
                "строка текста rus": row['russian_tokens']}    

In [145]:
train_dataset = NMT_Eng__Rus_DATASET(mode = 'train')
val_dataset = NMT_Eng__Rus_DATASET(mode = 'val')
test_dataset = NMT_Eng__Rus_DATASET(mode = 'test')


train_dataset[0]

{'x_source': array([4690, 2090,    7, 1796, 2090, 4502, 2713, 4171,   12, 4691,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0], dtype=int64),
 'x_target': array([11607, 11557,  8285,     4, 11308,  4329, 10092,  4911,   706,
            6,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int64),
 'y_target': array([11557,  8285,     4, 11308,  4329, 10092,  4911,   706,     6,
        11608,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int64),
 'x_source_length': 10,
 'строка текста eng': "i 'm glad i was n't there .",
 'строка текста rus': 'я рад , что меня там не было .'}

# функция упаковки батчей для PackedSequence Pytorch

In [146]:
def generate_nmt_batches(dataset, batch_size, shuffle=True):
    
    dataloader = DataLoader(
        dataset = dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = True)

    for data_dict in dataloader:
        lengths = data_dict['x_source_length'].numpy()
        # получение индексов отсортированных длин
        sorted_length_indices = lengths.argsort()[::-1].tolist()
        
        out_data_dict = {}
        for name, tensor in data_dict.items():
            if (name == 'строка текста eng') or (name == 'строка текста rus'):
                continue
            else:
                # новый батч с отсортированными длинами
                out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)


        yield out_data_dict

# Model

## Encoder

In [167]:
class NMTEncoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        """
        Args:
            num_embeddings (int): длина словаря английских токенов
            embedding_size (int): размерность вложения
            rnn_hidden_size (int): размер скрытого слоя RNN
        """
        super(NMTEncoder, self).__init__()
    
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)
        self.biGRU = nn.GRU(embedding_size, rnn_hidden_size, bidirectional=True, batch_first=True)
    
    def forward(self, x_source, x_lengths):
        """
        Args:
            x_source (torch.Tensor): вход с размерами (batch, seq_size)
            x_lengths (torch.Tensor): матрица длин текущего батча
        Returns:
            a tuple: x_unpacked (torch.Tensor), x_biGRU_h (torch.Tensor)
                x_unpacked.shape = (batch, seq_size, rnn_hidden_size * 2)
                x_biGRU_h.shape = (batch, rnn_hidden_size * 2)
        """
        # вложение входной последовательности
        x_embedded = self.source_embedding(x_source)
        # создаем PackedSequence; x_packed.data.shape=(number_items, embeddign_size)
        x_packed = pack_padded_sequence(x_embedded,
                                        x_lengths.detach().cpu().numpy(), 
                                        batch_first=True)
        
        # x_birnn_h.shape = (num_rnn, batch_size, feature_size)
        x_biGRU_out, x_biGRU_h  = self.biGRU(x_packed)
        # permute to (batch_size, num_rnn, feature_size)
        x_biGRU_h = x_biGRU_h.permute(1, 0, 2)
        
        # flatten features; reshape to (batch_size, num_rnn * feature_size)
        #  (recall: -1 takes the remaining positions, 
        #           flattening the two RNN hidden vectors into 1)
        x_biGRU_h = x_biGRU_h.contiguous().view(x_biGRU_h.size(0), -1)
        
        x_unpacked, _ = pad_packed_sequence(x_biGRU_out, batch_first=True)
        
        return x_unpacked, x_biGRU_h

## Decoder

### Функция внимания

In [169]:
def verbose_attention(encoder_state_vectors, query_vector):
    """Описательная версия механизма нейронного внимания
    
    Args:
        encoder_state_vectors (torch.Tensor): 3dim tensor from bi-GRU in encoder
        query_vector (torch.Tensor): hidden состояние в decoder GRUcell
    Returns:
        
    """

    batch_size, num_vectors, vector_size = encoder_state_vectors.size() # 128, 16, 32*2
    vector_scores = torch.sum(encoder_state_vectors * query_vector.view(batch_size, 1, vector_size), 
                              dim=2)
    vector_probabilities = F.softmax(vector_scores, dim=1)
    weighted_vectors = encoder_state_vectors * vector_probabilities.view(batch_size, num_vectors, 1)
    context_vectors = torch.sum(weighted_vectors, dim=1)
    return context_vectors, vector_probabilities, vector_scores

### Декодер

In [170]:
class NMTDecoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        """
        Args:
            num_embeddings (int): размер таргет словаря
            embedding_size (int): размер вложения
            rnn_hidden_size (int): размер слоя rnn
            bos_index(int): <BEGIN> индекс
        """
        super(NMTDecoder, self).__init__()
        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings, 
                                             embedding_dim=embedding_size, 
                                             padding_idx=0)
        self.gru_cell = nn.GRUCell(embedding_size + rnn_hidden_size, 
                                   rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.classifier = nn.Linear(rnn_hidden_size * 2, num_embeddings)
        self.bos_index = bos_index
    
    def _init_indices(self, batch_size):
        """ return the BEGIN-OF-SEQUENCE index vector """
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index
    
    def _init_context_vectors(self, batch_size):
        """ return a zeros vector for initializing the context """
        return torch.zeros(batch_size, self._rnn_hidden_size)
            
    def forward(self, encoder_state, initial_hidden_state, target_sequence):
        """The forward pass of the model
        
        Args:
            encoder_state (torch.Tensor): out энкодера
            initial_hidden_state (torch.Tensor): аоследний скрытый слой энкодера
            target_sequence (torch.Tensor): таргет тензор
        Returns:
            output_vectors (torch.Tensor): предикты
        """    
        # смена батч-длина ---> длина-батч
        target_sequence = target_sequence.permute(1, 0)
        output_sequence_size = target_sequence.size(0)

        # инициализация первого скрытого слоя в декодере (из энкодера)
        h_t = self.hidden_map(initial_hidden_state)

        batch_size = encoder_state.size(0)
        # нули (батч, rnn_hidden_size)
        context_vectors = self._init_context_vectors(batch_size)
        # вектор <BEGIN>-индекса размером батч 
        y_t_index = self._init_indices(batch_size)
        
        h_t = h_t.to(encoder_state.device)
        y_t_index = y_t_index.to(encoder_state.device)
        context_vectors = context_vectors.to(encoder_state.device)

        output_vectors = []
        self._cached_p_attn = []
        self._cached_ht = []
        self._cached_decoder_state = encoder_state.cpu().detach().numpy()
        
        
        for i in range(output_sequence_size): # output_sequence_size = 25
            y_t_index = target_sequence[i] #  target_sequence[0] = 11607, 11607, .... (индекс-<BEGIN>)
                
            # Шаг 1: вложение текущего вектора и конкатенация с нулями (батч, rnn_hidden_size) dim = 1
            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)
            
            # Шаг 2: GRU
            h_t = self.gru_cell(rnn_input, h_t)
            self._cached_ht.append(h_t.cpu().detach().numpy())
            
            # Шаг 3. Используем текущий скрытый статус для отслеживания состояния кодировщика.
            context_vectors, p_attn, _ = verbose_attention(encoder_state_vectors=encoder_state, 
                                                           query_vector=h_t)
            
            # сохранение вероятностей внимания
            self._cached_p_attn.append(p_attn.cpu().detach().numpy())
            
            # Шаг 4. Используем текущие скрытые и контекстные векторы, чтобы сделать прогноз до следующего слова.
            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            score_for_y_t_index = self.classifier(F.dropout(prediction_vector, 0.3))
            
            # сохранение оценок прогноза
            output_vectors.append(score_for_y_t_index)
            
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)
        
        return output_vectors

## result model

In [171]:
class NMTModel(nn.Module):
    """ The Neural Machine Translation Model """
    def __init__(self, source_vocab_size, source_embedding_size, 
                 target_vocab_size, target_embedding_size, encoding_size, 
                 target_bos_index):
        """
        Args:
            source_vocab_size (int): len(eng_vocab)
            source_embedding_size (int): 100
            target_vocab_size (int): len(rus_vocab)
            target_embedding_size (int): 100
            encoding_size (int): rnn_hidden_size
        """
        super(NMTModel, self).__init__()
        self.encoder = NMTEncoder(num_embeddings=source_vocab_size, 
                                  embedding_size=source_embedding_size,
                                  rnn_hidden_size=encoding_size)
        decoding_size = encoding_size * 2
        self.decoder = NMTDecoder(num_embeddings=target_vocab_size, 
                                  embedding_size=target_embedding_size, 
                                  rnn_hidden_size=decoding_size,
                                  bos_index=target_bos_index)
    
    def forward(self, x_source, x_source_lengths, target_sequence):
        """The forward pass of the model
        
        Args:
            x_source (torch.Tensor): x_in
                x_source.shape =  (batch, vectorizer.max_source_length)
            x_source_lengths torch.Tensor): x_length
            target_sequence (torch.Tensor): x_target
        Returns:
            decoded_states (torch.Tensor): предикты
        """
        encoder_state, final_hidden_states = self.encoder(x_source, x_source_lengths)
        decoded_states = self.decoder(encoder_state=encoder_state, 
                                      initial_hidden_state=final_hidden_states, 
                                      target_sequence=target_sequence)
        return decoded_states

# Training loop

### Нормализация тензоров

In [172]:
def normalize_sizes(y_pred, y_true):
    """Нормализация тензоров
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

### Расчет accuracy

In [173]:
def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid 

### Расчет потерь

In [174]:
def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

### Входные данные для цикла

In [198]:
model = NMTModel(
     source_vocab_size = len(vocab_eng_tokens_unique),
     source_embedding_size = 100,
     target_vocab_size = len(vocab_rus_tokens_unique),
     target_embedding_size = 100,
     encoding_size = 32,
     target_bos_index = vocab_rus_tokens_unique.get('<BEGIN>')
)

num_epoches = 100
optimizer = optim.Adam(model.parameters(), lr = 0.0005)

stop_diff_value = 0.00001

### Цикл

In [199]:
list_train_loss = []
list_val_loss = []

list_train_acc = []
list_val_acc = []


model = model.to(device)

for epoch_index in range(1, num_epoches+1):
       
        
        batch_generator = generate_nmt_batches(dataset = train_dataset, 
                                               batch_size = 128)
        running_loss = 0.0
        running_acc = 0.0

        # TRAIN
        model.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):
            
            optimizer.zero_grad()

            y_pred = model(batch_dict['x_source'].to(device), 
                           batch_dict['x_source_length'].to(device), 
                           batch_dict['x_target'].to(device))

            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index = 0)

            loss.backward()
            optimizer.step()
            
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index = 0)
            running_acc += (acc_t - running_acc) / (batch_index + 1)
        
        list_train_loss.append(running_loss)
        list_train_acc.append(running_acc)

        # EVAL
        batch_generator = generate_nmt_batches(dataset = val_dataset, 
                                               batch_size = 128,
                                               shuffle = False)
        running_loss = 0.0
        running_acc = 0.0

        model.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            y_pred = model(batch_dict['x_source'].to(device), 
                           batch_dict['x_source_length'].to(device), 
                           batch_dict['x_target'].to(device))

            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index = 0)

            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index = 0)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

        list_val_loss.append(running_loss)
        list_val_acc.append(running_acc)

        # EARLY_STOPPING
        if (epoch_index % 5 == 0):
            back_loss = list_val_loss[epoch_index-5]
            now_loss = list_val_loss[epoch_index-1]

            if back_loss - now_loss <= stop_diff_value:
                print('Ранняя остановка')
                print('на {} эпохе: val_loss = {}'.format(epoch_index-5, back_loss))
                print('на {} эпохе: val_loss = {}'.format(epoch_index, now_loss))
                print('Разница = {}, что меньше чем stop_diff_value = {}'.format(back_loss - now_loss, stop_diff_value))
                break

        if (epoch_index == 1) or (epoch_index % 10 == 0):
            print(' {:03d}/{:03d} Epoch || train_loss: {:.3f} || val_loss {:.3f} || train_acc {:.3f} || val_acc {:.3f} ||'.format(
                epoch_index, num_epoches, list_train_loss[epoch_index-1], list_val_loss[epoch_index-1], list_train_acc[epoch_index-1], list_val_acc[epoch_index-1]
            ))

 001/100 Epoch || train_loss: 5.596 || val_loss 4.411 || train_acc 0.330 || val_acc 0.400 ||
 010/100 Epoch || train_loss: 2.698 || val_loss 3.113 || train_acc 0.557 || val_acc 0.546 ||
 020/100 Epoch || train_loss: 1.907 || val_loss 2.756 || train_acc 0.630 || val_acc 0.588 ||
 030/100 Epoch || train_loss: 1.476 || val_loss 2.655 || train_acc 0.688 || val_acc 0.609 ||
 040/100 Epoch || train_loss: 1.218 || val_loss 2.611 || train_acc 0.728 || val_acc 0.618 ||
Ранняя остановка
на 45 эпохе: val_loss = 2.610802931663318
на 50 эпохе: val_loss = 2.624696120237692
Разница = -0.013893188574373827, что меньше чем stop_diff_value = 1e-05


In [208]:
torch.save(model.state_dict(), 'model.pth')

# Проверка на тестовой выборке

In [435]:
torch.manual_seed(42)

batch_generator = generate_nmt_batches(dataset = test_dataset, 
                                       batch_size = 128,
                                       shuffle = False)

### Создаем словари индекс-токен

In [436]:
vocab_rus_index_token = {}
for token, index in vocab_rus_tokens_unique.items():
    vocab_rus_index_token[index] = token


vocab_eng_index_token = {}
for token, index in vocab_eng_tokens_unique.items():
    vocab_eng_index_token[index] = token

### Цикл проверки

In [437]:
model.eval()
spisok_to_data = []

for batch_index, batch_dict in enumerate(batch_generator):
    
            y_pred = model(batch_dict['x_source'].to(device), 
                           batch_dict['x_source_length'].to(device), 
                           batch_dict['x_target'].to(device))
            
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index = 0)
            running_acc += (acc_t - running_acc) / (batch_index + 1)


            batch_test = y_pred
            eng = batch_dict['x_source']

            ########
            for i_batch in range(y_pred.size(0)):
                    # RUS
                    rus_tensor_logits = y_pred[i_batch]
                    probas_rus = torch.softmax(rus_tensor_logits, dim = 1)
                    indexes_of_rus_vocab = torch.argmax(probas_rus, dim = 1)
                    
                    rus_sentence = []

                    for index in indexes_of_rus_vocab.detach().cpu().numpy():
                        token = vocab_rus_index_token.get(index)
                        if token == '<END>':
                                break
                        else:
                                rus_sentence.append(token)
                        
                        rus_stroka = ' '.join(rus_sentence)
                        
                    # ENG
                    eng_sentence = []
                    eng = batch_dict['x_source']

                    for index in eng[i_batch].detach().cpu().numpy()[1:]:
                        token = vocab_eng_index_token.get(index)
                        if (token == '<END>') or (token == '<MASK>'):
                                break
                        else:
                                eng_sentence.append(token)

                        eng_stroka = ' '.join(eng_sentence)
                    
                    spisok_to_data.append(
                            {'eng': eng_stroka,
                             'rus': rus_stroka}
                    )

### Вывод результата

In [438]:
pd.DataFrame(spisok_to_data)

Unnamed: 0,eng,rus
0,i 'm not going to force you to do anything you...,"я не собираюсь говорить тебя делать этого , чт..."
1,i 'm hungry because i have n't eaten anything ...,"я голодный , , потому что не не не . . было ."
2,you 're wanted on the phone . it 's from tom .,с за тому том
3,you are n't the only one who does n't like tom .,"не не единственный , кто я знаком том ."
4,"i 'm not sure , but perhaps tom is already dead .","я не так , что том , кто не не ."
...,...,...
4987,i 'm energetic .,я энергична .
4988,they 're amazing .,они поразительны .
4989,i 'm enthusiastic .,я дрожу надежд .
4990,i am sick .,я болен .
