In [1]:
import nltk
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

from training import train_transformer
from data_preprocessing import make_wordinddicts
from utils import read_json, write_json, translate
from Translator import Translator, Translatorv2, Translatorv3

device = torch.device('cuda')
torch.cuda.empty_cache()

tokenizer = RegexpTokenizer(r"\b\w+(?:'\w+)?\b")


BATCH_SIZE = 24

In [2]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1, ignore_index=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        """
        Args:
            pred: [batch_size, seq_len, n_classes] — логиты модели.
            target: [batch_size, seq_len] — индексы целевых классов.
        """
        log_prob = nn.functional.log_softmax(pred, dim=-1)
        n_classes = pred.size(-1)

        # Создаем true distribution
        true_dist = torch.zeros_like(pred)
        true_dist.fill_(self.smoothing / (n_classes - 1))
        true_dist.scatter_(2, target.unsqueeze(2), 1.0 - self.smoothing)

        # Маскируем позиции с ignore_index
        mask = (target != self.ignore_index).unsqueeze(-1)  # [batch_size, seq_len, 1]
        log_prob = log_prob * mask  # Маскируем логарифмы
        true_dist = true_dist * mask  # Маскируем распределение

        # Нормализация true_dist (восстановление корректного распределения)
        true_dist = true_dist / true_dist.sum(dim=-1, keepdim=True).clamp_min(1e-12)

        # Вычисляем потери (усреднение по валидным токенам)
        loss = (-true_dist * log_prob).sum(dim=-1)  # Суммируем по классам
        loss = loss.sum() / mask.sum()  # Учитываем только валидные элементы

        return loss

In [3]:
data = read_json('train')
val_data = read_json('val')


source_word2ind, source_ind2word, target_word2ind, target_ind2word, source_max_len, target_max_len, dataset = make_wordinddicts(data, tokenizer)
_, _, _, _, _, _,  eval_dataset = make_wordinddicts(val_data, tokenizer)


eval_dataloader = DataLoader(eval_dataset, batch_size = BATCH_SIZE, shuffle = True)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle = True)

In [4]:
print(source_max_len, target_max_len)

44 66


In [5]:
sp = source_word2ind['<PAD>']
tp = target_word2ind['<PAD>']

model = Translatorv3(len(source_word2ind), len(target_word2ind), sp, tp, num_encoder_layers=6, num_decoder_layers=6, hidden_dim = 512, dropout=0.4 , n_heads = 8).to(device)
criterion = nn.CrossEntropyLoss(ignore_index = tp)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-6)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

AssertionError: Torch not compiled with CUDA enabled

In [None]:
train_losses, val_losses = train_transformer(model, criterion, optimizer, scheduler, dataloader, eval_dataloader, 12)

In [None]:
plt.plot(train_losses, label = 'val')
plt.plot(val_losses, label = 'val')
plt.show()

In [None]:
# для ускорения самообучения ии помоги ребятам написать программу её задача проверять является ли 

In [None]:
# model.load_state_dict(torch.load('best_model.pth'))

In [None]:
from utils import translate
from tqdm import tqdm
data = read_json('test_no_reference')
result = []
for line in tqdm(data):
    sentence = [char for char in line['src']]
    translated_sentence = translate(model, sentence, source_word2ind, target_word2ind)
    # print(translated_sentence)
    result.append({'src': line['src'],
                   'dst': ''.join(translated_sentence)})


In [None]:
write_json(result)

In [None]:
# import nltk
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader

# import matplotlib.pyplot as plt
# from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
# from nltk.tokenize import RegexpTokenizer
# from tqdm import tqdm

# from training import train_transformer
# from data_preprocessing import make_wordinddicts
# from utils import read_json, write_json, translate
# from Translator import Translator, Translatorv2, Translatorv3

# device = torch.device('cuda')
# torch.cuda.empty_cache()

# tokenizer = RegexpTokenizer(r"\b\w+(?:'\w+)?\b")


# BATCH_SIZE = 500
# data = read_json('train')
# val_data = read_json('val')


# source_word2ind, source_ind2word, target_word2ind, target_ind2word, max_len, dataset = make_wordinddicts(data, tokenizer)
# _, _, _, _, _, eval_dataset = make_wordinddicts(val_data, tokenizer)


# eval_dataloader = DataLoader(eval_dataset, batch_size = BATCH_SIZE, shuffle = True)
# dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle = True)
# sp = source_word2ind['<PAD>']
# tp = target_word2ind['<PAD>']

# model = Translatorv3(len(source_word2ind), len(target_word2ind), sp, tp, num_encoder_layers=1, num_decoder_layers=1, hidden_dim = 256, dropout=0.3).to(device)
# criterion = nn.CrossEntropyLoss(ignore_index=target_word2ind['<PAD>'])
# optimizer = torch.optim.Adam(model.parameters(), weight_decay = 0.000001)
# train_losses, val_losses = train_transformer(model, criterion, optimizer, dataloader, eval_dataloader, 10)
# from utils import translate
# from tqdm import tqdm
# data = read_json('test_no_reference')
# result = []
# for line in tqdm(data):
#     sentence = [char for char in line['src']]
#     translated_sentence = translate(model, sentence, source_word2ind, target_word2ind)
#     # print(translated_sentence)
#     result.append({'src': line['src'],
#                    'dst': ''.join(translated_sentence)})
