In [1]:
import nltk
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

from training import train_transformer
from data_preprocessing import make_wordinddicts
from utils import read_json, write_json, translate
from Translator import Translator, Translatorv2, Translatorv3

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# at the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Release cached allocator blocks left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

# Each match of this pattern is a run of word characters, optionally followed by
# an apostrophe and more word characters (so "don't" is matched as one unit);
# other punctuation is not matched by the pattern.
tokenizer = RegexpTokenizer(r"\b\w+(?:'\w+)?\b")


# Batch size shared by the training and validation DataLoaders built below.
BATCH_SIZE = 75

In [2]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    For every non-ignored position the target distribution puts
    ``1 - smoothing`` on the gold class and spreads ``smoothing`` uniformly
    over the remaining ``n_classes - 1`` classes. Positions whose target
    equals ``ignore_index`` contribute nothing to the loss.

    Args:
        smoothing: Probability mass moved from the gold class to the others.
        ignore_index: Target value marking positions to skip (e.g. padding).
            It does not have to be a valid class index.
    """

    def __init__(self, smoothing=0.1, ignore_index=-1):
        super().__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        """Return the mean smoothed loss over the non-ignored positions.

        Args:
            pred: Unnormalised scores with the class axis last, shape
                ``(..., n_classes)``.
            target: Integer class indices with shape ``pred.shape[:-1]``.

        Returns:
            A scalar tensor. When every position is ignored the result is a
            zero that is still attached to ``pred``'s graph (instead of NaN),
            so ``backward()`` remains callable.
        """
        n_classes = pred.size(-1)

        # Collapse all leading axes so (N, C) and (B, T, C) inputs are both accepted.
        flat_pred = pred.reshape(-1, n_classes)
        flat_target = target.reshape(-1)

        keep = flat_target != self.ignore_index
        if not keep.any():
            # The mean of an empty selection would be NaN; return a graph-connected zero.
            return (flat_pred * 0.0).sum()

        # Drop ignored rows *before* scatter_ so an out-of-range ignore_index
        # (such as the default -1) never reaches it as an index.
        log_prob = nn.functional.log_softmax(flat_pred[keep], dim=-1)
        kept_target = flat_target[keep]

        # Build the smoothed target distribution; max() guards the
        # degenerate single-class case against division by zero.
        off_value = self.smoothing / max(n_classes - 1, 1)
        true_dist = torch.full_like(log_prob, off_value)
        true_dist.scatter_(1, kept_target.unsqueeze(1), 1.0 - self.smoothing)

        # Per-position cross-entropy, averaged over the valid positions only.
        return (-true_dist * log_prob).sum(dim=-1).mean()

In [3]:
# Load the raw training and validation examples through the project helper.
data = read_json('train')
val_data = read_json('val')


# Build the vocabularies, the reported maximum lengths and the indexed training set.
source_word2ind, source_ind2word, target_word2ind, target_ind2word, source_max_len, target_max_len, dataset = make_wordinddicts(data, tokenizer)
# NOTE(review): the validation set is indexed by a *separate* make_wordinddicts
# call whose dictionaries are discarded. If that helper builds a fresh vocabulary
# on every call, eval_dataset indices will not line up with the training
# vocabulary the model is built from, which could explain a validation loss far
# above the training loss — TODO confirm against data_preprocessing.
_, _, _, _, _, _, eval_dataset = make_wordinddicts(val_data, tokenizer)


# Evaluation order does not need to be randomised; keep it fixed so runs are
# comparable. Only the training loader is shuffled.
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [4]:
# Show the two max-length values returned by make_wordinddicts for the training
# data (presumably the longest source / target sequence lengths — TODO confirm).
print(source_max_len, target_max_len)

321 66


In [5]:
# Indices of the '<PAD>' entry in the source and target vocabularies.
sp = source_word2ind['<PAD>']
tp = target_word2ind['<PAD>']

# Single-layer encoder/decoder model, moved to the selected device.
model = Translatorv3(
    len(source_word2ind),
    len(target_word2ind),
    sp,
    tp,
    num_encoder_layers=1,
    num_decoder_layers=1,
    hidden_dim=256,
    dropout=0.4,
    n_heads=8,
).to(device)

# Label-smoothed loss that skips target padding positions.
criterion = LabelSmoothingLoss(ignore_index=tp)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-5,
)

# Halve the learning rate once the monitored value stops decreasing for more
# than one step of the scheduler.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# against the installed version before upgrading.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
    verbose=True,
)

In [6]:
# Run the project training loop. The two loss histories are only bound if the
# call returns, so interrupting it leaves `train_losses` / `val_losses` undefined.
train_losses, val_losses = train_transformer(
    model,
    criterion,
    optimizer,
    scheduler,
    dataloader,
    eval_dataloader,
    10,  # presumably the number of epochs — TODO confirm against training.train_transformer
)

Epoch: 1


100%|██████████| 4000/4000 [11:57<00:00,  5.57it/s]


Average train loss: 6.0987


  return torch._transformer_encoder_layer_fwd(
  return torch._native_multi_head_attention(
100%|██████████| 7/7 [00:00<00:00, 12.36it/s]


Average val loss: 12.6588
Model saved.
Epoch: 2


100%|██████████| 4000/4000 [12:27<00:00,  5.35it/s]


Average train loss: 5.5471


100%|██████████| 7/7 [00:00<00:00,  9.76it/s]


Average val loss: 12.1548
Model saved.
Epoch: 3


100%|██████████| 4000/4000 [13:00<00:00,  5.13it/s]  


Average train loss: 5.3531


100%|██████████| 7/7 [00:00<00:00, 11.59it/s]


Average val loss: 11.9804
Model saved.
Epoch: 4


100%|██████████| 4000/4000 [13:08<00:00,  5.07it/s]


Average train loss: 5.2330


100%|██████████| 7/7 [00:00<00:00, 11.86it/s]


Average val loss: 12.1417
Epoch: 5


 50%|█████     | 2011/4000 [07:35<07:30,  4.41it/s] 


KeyboardInterrupt: 

In [7]:
# Plot both loss histories returned by train_transformer. Each curve gets its
# own label (the training curve was previously mislabelled 'val') and the
# legend is drawn so the labels are actually visible.
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='val')
plt.ylabel('loss')
plt.legend()
plt.show()

NameError: name 'train_losses' is not defined

In [8]:
# Restore the weights stored in 'best_model.pth' (presumably written by
# train_transformer when it reports "Model saved." — TODO confirm).
# map_location places the tensors on the active device, so a checkpoint saved
# from a GPU run can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('best_model.pth', map_location=device))

<All keys matched successfully>

In [9]:
# Translate every test example with the restored model and collect the
# {'src', 'dst'} records. `read_json`, `translate` and `tqdm` come from the
# import cell at the top of the notebook, so they are not re-imported here.
# The test set gets its own name so the training `data` is not overwritten.
# NOTE(review): the source string is split into single characters and the
# output is joined without separators, while the vocabularies were built with
# a word-level tokenizer passed to make_wordinddicts — confirm this is what
# `translate` expects.
test_data = read_json('test_no_reference')
result = []
for line in tqdm(test_data):
    sentence = list(line['src'])
    translated_sentence = translate(model, sentence, source_word2ind, target_word2ind)
    result.append({'src': line['src'],
                   'dst': ''.join(translated_sentence)})


  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
100%|██████████| 1000/1000 [09:25<00:00,  1.77it/s]


In [10]:
# Hand the collected {'src', 'dst'} records to the project helper write_json;
# the output location is decided inside that helper and is not visible here.
write_json(result)

In [None]:
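# NOTE: archived earlier version of this experiment, kept commented out for
# reference only. It is stale relative to the cells above: it unpacks six values
# from make_wordinddicts (seven are unpacked above) and calls train_transformer
# without a scheduler.
# import nltk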
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader

# import matplotlib.pyplot as plt
# from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
# from nltk.tokenize import RegexpTokenizer
# from tqdm import tqdm

# from training import train_transformer
# from data_preprocessing import make_wordinddicts
# from utils import read_json, write_json, translate
# from Translator import Translator, Translatorv2, Translatorv3

# device = torch.device('cuda')
# torch.cuda.empty_cache()

# tokenizer = RegexpTokenizer(r"\b\w+(?:'\w+)?\b")


# BATCH_SIZE = 500
# data = read_json('train')
# val_data = read_json('val')


# source_word2ind, source_ind2word, target_word2ind, target_ind2word, max_len, dataset = make_wordinddicts(data, tokenizer)
# _, _, _, _, _, eval_dataset = make_wordinddicts(val_data, tokenizer)


# eval_dataloader = DataLoader(eval_dataset, batch_size = BATCH_SIZE, shuffle = True)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle = True)
# sp = source_word2ind['<PAD>']
# tp = target_word2ind['<PAD>']

# model = Translatorv3(len(source_word2ind), len(target_word2ind), sp, tp, num_encoder_layers=1, num_decoder_layers=1, hidden_dim = 256, dropout=0.3).to(device)
# criterion = nn.CrossEntropyLoss(ignore_index=target_word2ind['<PAD>'])
# optimizer = torch.optim.Adam(model.parameters(), weight_decay = 0.000001)
# train_losses, val_losses = train_transformer(model, criterion, optimizer, dataloader, eval_dataloader, 10)
# from utils import translate
# from tqdm import tqdm
# data = read_json('test_no_reference')
# result = []
# for line in tqdm(data):
#     sentence = [char for char in line['src']]
#     translated_sentence = translate(model, sentence, source_word2ind, target_word2ind)
#     # print(translated_sentence)
#     result.append({'src': line['src'],
#                    'dst': ''.join(translated_sentence)})
