# RNN & Attention: HW

Привет! Это твоё домашнее задание: сделать модель, которая может переводить тексты с немецкого языка в англиский. Для обучения будет использоваться датасет [wmt-14](https://huggingface.co/datasets/wmt14). Для проверки будет использоваться BLEU на тестовой выборке и 10 примеров перевода вашей модели. В этом ноутбуке есть скелет для обучения модели трансформера. Но вы можете пользоваться и RNN, если вы считаете что можете обучить её под эту задачу. Главное -- получить `submission.yaml`, используя нейросети.

**!Внимание!** В этой домашней работе нельзя пользоваться библиотекой `transformers`.

In [1]:
import sys
import nltk
import gc
import tqdm
import pickle

In [2]:
import torch
import torch.nn as nn
import nltk
import einops
import evaluate
import math

from datasets import load_dataset

In [3]:
bleu = evaluate.load("bleu")

# Данные

В этой части подготовьте данные для обучения. Не забудьте добавить "BOS", "EOS" и "UNK" токены в ваши словари.

In [50]:
wmt14 = load_dataset("wmt14", "de-en")

Found cached dataset wmt14 (/home/alex/.cache/huggingface/datasets/wmt14/de-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
tokenizer = nltk.WordPunctTokenizer()
lemmatizer = nltk.WordNetLemmatizer()

def tokenize_pipeline(sentence):
    tokens = tokenizer.tokenize(sentence)
    return [token for token in tokens if token.isalpha()]

In [5]:
tokenized_en = (
    [tokenize_pipeline(sentence["en"]) for sentence in wmt14["train"]["translation"]] +
    [tokenize_pipeline(sentence["en"]) for sentence in wmt14["validation"]["translation"]] +
    [tokenize_pipeline(sentence["en"]) for sentence in wmt14["test"]["translation"]]
)

tokenized_de = (
    [tokenize_pipeline(sentence["de"]) for sentence in wmt14["train"]["translation"]] +
    [tokenize_pipeline(sentence["de"]) for sentence in wmt14["validation"]["translation"]] +
    [tokenize_pipeline(sentence["de"]) for sentence in wmt14["test"]["translation"]]
)

In [8]:
all_tokenized_en_words = {word for words in tokenized_en for word in words}
all_tokenized_de_words = {word for words in tokenized_de for word in words}

In [9]:
en_words_to_ids = {word: idx + 16 for idx, word in enumerate(all_tokenized_en_words)}
de_words_to_ids = {word: idx + 16 for idx, word in enumerate(all_tokenized_de_words)}

In [5]:
max_len = 25

### Определим класс датасета

In [6]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, words_to_ids_en, words_to_ids_de, dataset, max_len=64):
        
        def tokenize_sentence(example):
            return {"tokens": tokenizer(example)}
        
        def convert_word_to_ids_en(example):
            return list(words_to_ids_en[token] for token in example['tokens'])
        
        def convert_word_to_ids_de(example):
            return list(words_to_ids_de[token] for token in example['tokens'])
        
        dataset_en, dataset_de = [], []
        
        for item in dataset:
            dataset_en.append(item['en'])
            dataset_de.append(item['de'])
        
        
        
        
        dataset_en = list(map(tokenize_sentence, dataset_en))
        self.dataset_en = list(map(convert_word_to_ids_en, dataset_en))
        
        del words_to_ids_en
        gc.collect()
        
        dataset_de = list(map(tokenize_sentence, dataset_de))
        self.dataset_de = list(map(convert_word_to_ids_de, dataset_de))
        
        del words_to_ids_de
        gc.collect()
        
        
        self.max_len = max_len
    
    def __len__(self):
        return len(self.dataset_de)
    
    def __getitem__(self, index):
        tokens_ids_en = self.dataset_en[index]
        tokens_ids_de = self.dataset_de[index]
        
        
        if len(tokens_ids_en) < max_len:
            tokens_ids_en = [1] + tokens_ids_en + [2 for _ in range(max_len - len(tokens_ids_en))]
            
        if len(tokens_ids_de) < max_len:
            tokens_ids_de = [1] + tokens_ids_de + [2 for _ in range(max_len - len(tokens_ids_de))]
            
            
        return tokens_ids_de[:max_len], tokens_ids_en[:max_len]
        
        
        

### Сохраним нужные нам словари

In [7]:
# # with open('de_words_to_ids.pickle', 'wb') as output:
# #     pickle.dump(de_words_to_ids, output)
    
# # with open('all_tokenized_de_words.pickle', 'wb') as output:
# #     pickle.dump(all_tokenized_de_words, output)
    
# # with open('en_words_to_ids.pickle', 'wb') as output:
# #     pickle.dump(en_words_to_ids, output)
    
    
# # with open('all_tokenized_en_words.pickle', 'wb') as output:
# #     pickle.dump(all_tokenized_en_words, output)

with open('de_words_to_ids.pickle', 'rb') as output:
    de_words_to_ids = pickle.load(output)
    
with open('all_tokenized_de_words.pickle', 'rb') as output:
    all_tokenized_de_words = pickle.load(output)
    
with open('en_words_to_ids.pickle', 'rb') as output:
    en_words_to_ids = pickle.load(output)
    
    
with open('all_tokenized_en_words.pickle', 'rb') as output:
    all_tokenized_en_words = pickle.load(output)
    
    



### Создадим датасеты и сохраним экземпляры

In [None]:
dataset_train = [item for item in wmt14['train']['translation']]
train_dataset = TranslationDataset(tokenize_pipeline, en_words_to_ids, de_words_to_ids,
                                   dataset_train) 

with open('train_dataset.pickle', 'wb') as output:
    pickle.dump(train_dataset, output)

validation_dataset = [item for item in wmt14['validation']['translation']]
valid_dataset = TranslationDataset(tokenize_pipeline, en_words_to_ids, de_words_to_ids,
                                   validation_dataset) 

with open('validation_dataset.pickle', 'wb') as output:
    pickle.dump(valid_dataset, output)
    
    
test_dataset = [item for item in wmt14['test']['translation']]
test_dataset = TranslationDataset(tokenize_pipeline, en_words_to_ids, de_words_to_ids,
                                   test_dataset) 

with open('test_dataset.pickle', 'wb') as output:
    pickle.dump(test_dataset, output)
    

In [8]:
with open('train_dataset.pickle', 'rb') as f:
    train_dataset = pickle.load(f)
    
with open('validation_dataset.pickle', 'rb') as f:
    valid_dataset = pickle.load(f)
    
with open('test_dataset.pickle', 'rb') as f:
    test_dataset = pickle.load(f)


### Collate_fn и Dataloaders

In [9]:
def collate_fn(batch):
    x = torch.LongTensor([i[0] for i in batch])
    y = torch.LongTensor([i[1] for i in batch])
    return x, y

In [10]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=3, collate_fn=collate_fn)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=3, collate_fn=collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=3, collate_fn=collate_fn)

# Model

Сделайте модель, которая может в перевод. Для этой модели потребуется сделать `Encoder` и `Decoder`. Первый будет брать текст на немецком и отдавать информацию про него. Decoder будет брать информацию про немецкий текст и превращать его в английский.

In [None]:
# Если вам нужны дополнительные модули, такие как Attention или Transformer layer, то можете добавить их сюда

In [11]:
# global params
vocab_en_size, vocab_de_size = 732992 + 16, 1582945 + 16
embadding__dim = 256
device = 'cuda'
num_heads = 4
batch_size = 4
# max_len = 30

Для слоев Encoder можете скопировать код из семинара:

In [12]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batch_size = query.shape[0]
        
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, value len, hid dim]
                
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim]
        #K = [batch size, key len, hid dim]
        #V = [batch size, value len, hid dim]
                
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention
    
    
class MLP(nn.Module):
    def __init__(self, hid_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, 4*hid_dim)
        self.fc_2 = nn.Linear(4*hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)
        
        return x

In [13]:
class EncoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, n_heads: int, dropout: float, device):
        super().__init__()
        
        self.layer_norm = torch.nn.LayerNorm(hidden_dim)
        self.attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout,device)
        self.mlp = MLP(hidden_dim, dropout)
        self.out_norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, inputs, mask):
        
        x, _ = self.attention(inputs, inputs, inputs) 
        
        norm = self.layer_norm(inputs + self.dropout(x))
        
        x = self.mlp(norm) 
        x = self.out_norm(norm + x)
        
        return x
    

In [14]:
class DecoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, n_heads: int, dropout: float, device):
        super().__init__()
        self.device = device
        self.layer_norm = torch.nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout, device)
        self.out_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout,device)
        self.mlp = MLP(hidden_dim, dropout)
        self.middl_norm = torch.nn.LayerNorm(hidden_dim)
        self.out_norm = torch.nn.LayerNorm(hidden_dim)
        
        
    def forward(self, inputs, encoder_layer_output, inputs_mask, enc_mask):
        
        x, _ = self.self_attention(inputs, inputs, inputs, inputs_mask)
        
        norm = self.layer_norm(inputs + x)
        
        x, attention = self.out_attention(norm, encoder_layer_output, encoder_layer_output, enc_mask)
        
        norm = self.middl_norm(norm + x)
      
        x = self.mlp(norm)
        
        out = self.out_norm(norm + x)
        
        
        return out


In [15]:
class Encoder(torch.nn.Module):
    def __init__(self, de_dictionary_size: int, hidden_dim: int, n_layer: int, 
                 n_heads: int, dropout: float, max_length: int, device):
        super().__init__()
        self.device = device
        self.word_embedding = torch.nn.Embedding(de_dictionary_size, hidden_dim)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.transformer = torch.nn.ModuleList([EncoderTransformerLayer(hidden_dim, n_heads, dropout, device) 
                                                           for _ in range(n_layer)])
        
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)
        
    def forward(self, inputs, mask):
        
        batch_size = inputs.shape[0]
        inputs_len = inputs.shape[1]
        
        pos = torch.arange(0, inputs_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        x = (self.word_embedding(inputs)*self.scale)
        x = x + self.pos_embedding(pos)

        for layer in self.transformer:
            x = layer(x, mask)
        return x
        

In [16]:
class Decoder(torch.nn.Module):
    def __init__(self, en_dictionary_size: int, hidden_dim: int, max_length: int, n_layers: int, 
                 n_heads: int, dropout: float, device):
        super().__init__()
        self.device = device
        self.word_embedding = torch.nn.Embedding(en_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_length, hidden_dim)
        self.transformer = nn.ModuleList([DecoderTransformerLayer(hidden_dim, 
                                                  n_heads,  
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)
        
        self.lm_head = torch.nn.Linear(hidden_dim, en_dictionary_size)
        
        
        
    def forward(self, inputs, encoder_output, inputs_mask, enc_mask):
        
        batch_size = inputs.shape[0]
        input_len = inputs.shape[1]
        
        pos = torch.arange(0, input_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        x = self.dropout(self.word_embedding(inputs)*self.scale) + self.pos_embedding(pos)
        for layer in self.transformer:
            x = layer(x, encoder_output, inputs_mask, enc_mask)
        
        x = self.lm_head(x)
        return x


In [17]:
class TranslationModel(torch.nn.Module):
    def __init__(self, de_dictionary_size: int, en_dictionary_size: int, hidden_dim: int, device):
        super().__init__()
        self.device = device
        n_layer = 2
        n_heads = 4
        dropout = 0.12
        max_length = max_len
        self.encoder = Encoder(de_dictionary_size, hidden_dim, n_layer, n_heads, dropout, max_length, device)
        self.decoder = Decoder(en_dictionary_size, hidden_dim, max_length, n_layer, n_heads, dropout, device)
        self.idx = 2
        
    def make_src_mask(self, src, src_pad_idx):

        src_mask = (src != src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask
    
    def make_trg_mask(self, trg, trg_pad_idx):

        trg_pad_mask = (trg != trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()

        trg_mask = trg_pad_mask & trg_sub_mask

        return trg_mask
    
    
    def forward(self, inputs):
        
        original_ids, translation_ids = inputs
        
        original_mask = self.make_src_mask(original_ids, 2)
        translation_mask = self.make_trg_mask(translation_ids, 2)
        encoder_output = self.encoder(original_ids, original_mask)
       
        decoder_output = self.decoder(translation_ids, encoder_output, translation_mask, original_mask)
        return decoder_output

Сделайте модель, оптимиизатор и лосс функцию. В нашем случае лосс функция будет проверять предсказанию токенов на каждой позиции -- по сути классификатор на каждую позицию.

In [18]:
device = 'cuda'
lr = 0.001
model = TranslationModel(vocab_de_size, vocab_en_size, 32, device).to(device)
# model = torch.load('model_64_last.pth', torch.device('cpu'))
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=2)

In [19]:
sum(p.numel() for p in model.parameters())

98361264

In [20]:
# add Train Loop
epochs = 10

for epoch in range(epochs):
    train_loss = 0
    valid_loss = 0
    
    # Train
    model.train()
    pbar = tqdm.tqdm(train_dataloader)
    for original_ids, translation_ids in pbar:
        optimizer.zero_grad()
        
        original_ids = original_ids.to(device)
        translation_ids = translation_ids.to(device)
        
        output = model((original_ids, translation_ids))
        
        
        loss = criterion(output.view(-1, output.shape[-1]), translation_ids.view(-1))
        
        loss.backward()
        optimizer.step()
        pbar.set_postfix({'loss': loss})
        train_loss += loss.item()
        
        
    # Validation
    model.eval()
    with torch.no_grad():
        for i, (original_ids, translation_ids) in enumerate(valid_dataloader):
            original_ids = original_ids.to(device)
            translation_ids = translation_ids.to(device)

            output = model((original_ids, translation_ids))

            loss = criterion(output.view(-1, output.shape[-1]), translation_ids.view(-1))

            valid_loss += loss.item()

    # Print statistics
    print(f"Epoch {epoch + 1} | Train Loss: {train_loss / len(train_dataloader):.4f} | Validation Loss: {valid_loss / len(valid_dataloader):.4f}")


  0%|          | 6999/1502929 [08:16<29:28:39, 14.10it/s, loss=tensor(1.1248, device='cuda:0', grad_fn=<NllLossBackward0>)]


In [13]:
torch.save(model.state_dict(), 'model_dict.pth')

In [22]:
def model_to_device(model, device):
    model = model.to(device)
    model.device = device
    model.decoder.device = device
    model.decoder.scale = model.decoder.scale.to(device)
    
    model.encoder.device = device
    model.encoder.scale = model.decoder.scale.to(device)
    model = model.to(device)
#     for param in model.parameters():
#         param.data = param.data.to(device)
#         if param._grad is not None:
#             param._grad = param._grad.to(device)
            
    for layer in model.encoder.transformer:
        layer.attention.scale = layer.attention.scale.to(device)
    
    for layer in model.decoder.transformer:
        layer.self_attention.scale = layer.self_attention.scale.to(device)
        layer.out_attention.scale = layer.out_attention.scale.to(device)
    
    return model

    
    

Чтобы получить перевод, надо сделать функцию для декодинга. Она будет брать предсказания токена на последней позиции и отдавать нужный токен.

In [48]:
id_to_world = {val:key for key, val in en_words_to_ids.items()}
id_to_world[1] = '[BOS]'
id_to_world[2] = '[EOS]'

def get_last_token_prediction(prefix, original, model, device='cpu', return_seq = False):
        
        if len(prefix) == len(original) + 1:
            return '[EOS]'
        
        prefix = {"tokens": tokenize_pipeline(prefix)}
        prefix = list(en_words_to_ids[token] for token in prefix['tokens'])
        id_future = len(prefix)
        
        original = {"tokens": tokenize_pipeline(original)}
     
        original = list(de_words_to_ids[token] for token in original['tokens'])
       
        orig_len = len(original)
        if len(prefix) < 25:
            prefix = [1] + prefix + [2 for _ in range(max_len - len(prefix))]
            
        if len(original) < 25:
            original = [1] + original + [2 for _ in range(max_len - len(original))]
            
            
        original, prefix = torch.LongTensor(original[:max_len]), torch.LongTensor(prefix[:max_len])
        original, prefix = original.unsqueeze(0), prefix.unsqueeze(0)
        
        
        with torch.no_grad():
            output = model((original.to(device), prefix.to(device)))
          
        if return_seq:
            index = torch.argmax(output, dim=2).tolist()[0][1:orig_len+1]
            return [id_to_world[ind] for ind in index]
        
        preds = torch.argmax(output, dim=2)
        return id_to_world[int(preds[0, id_future+1])]
        
        

In [24]:
device = 'cpu'
model = model_to_device(model, device)

In [35]:
original = "Guten Morgen!"
prefix = ""
# device = 'cuda'
get_last_token_prediction(prefix, original, model, device, True)

'finally Every'

In [47]:
last_token_eos = False
original = "Guten Morgen!"
prefix = ""

while not last_token_eos:
    token = get_last_token_prediction(prefix, original, model)
    prefix += token + ' '
    last_token_eos = token == "[EOS]"

prefix = prefix[:-6]   
print(prefix)

President her 


# Result

В качестве результата вы должны предоставить bleu вашей модели на тестовой выборке wmt14 и перевод 10 предложений с немецкого на английский.

In [66]:
test_data = wmt14["test"]["translation"]
preds = [" ".join(get_last_token_prediction("", x['de'], model, return_seq=True)[:len(x['de'])]) for x in test_data]



 23%|██▎       | 680/3003 [00:30<01:06, 35.12it/s][A[A

In [67]:
reference_corpus = [" ".join(tokenize_pipeline(x['en'])) for x in test_data]
test_bleu = bleu.compute(references=reference_corpus, predictions=preds) 

In [69]:
de_sentences = [
    "Gutach: Noch mehr Sicherheit für Fußgänger",
    "Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?",
    "Dies bestätigt auch Peter Arnold vom Landratsamt Offenburg.",
    "Daher sei der Bau einer weiteren Ampel mehr als notwendig: \"Sicherheit geht hier einfach vor\", so Arnold.",
    "Pro Fahrtrichtung gibt es drei Lichtanlagen.",
    "Drückt der Fußgänger den Ampelknopf, testet der obere Radarsensor die Verkehrslage.",
    "Ein weiteres Radarsensor prüft, ob die Grünphase für den Fußgänger beendet werden kann.",
    "Josef Winkler schreibt sich seit mehr als 30 Jahren die Nöte seiner Kindheit und Jugend von der Seele.",
    "Dabei scheint Regisseur Fresacher dem Text wenig zu vertrauen.",
    "Sie werden hart angefasst, mit dem Kopf unter Wasser getaucht, mit ihren Abendroben an die Wand getackert.",
]
en_sentences = [" ".join(get_last_token_prediction("", x, model, return_seq=True)[:len(x)]) 
                for x in de_sentences] # get translation


In [71]:
import yaml


submission = {
    "tasks": [
        {"task1": {"answer": test_bleu}},
        {"task2": {"answer": en_sentences}}
    ]
}

yaml.safe_dump(submission, open("submission.yaml", "w"))