In [None]:

!pip install torch torchtext


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from utils import read_json, write_json, translate

# Example data: a list of parallel sentence pairs, each a dict with a 'src' and a 'dst' string.
# NOTE(review): the original assignment had no right-hand side (a SyntaxError that aborted
# the whole cell). The pairs below are placeholders that only make the cell runnable;
# replace them with the real corpus in the same {'src': ..., 'dst': ...} schema.
data = [
    {'src': 'good morning', 'dst': 'guten morgen'},
    {'src': 'good night', 'dst': 'gute nacht'},
    {'src': 'thank you very much', 'dst': 'vielen dank'},
    {'src': 'see you tomorrow', 'dst': 'bis morgen'},
]

# Tokenizers: source and target text both go through torchtext's 'basic_english' tokenizer.
src_tokenizer, dst_tokenizer = get_tokenizer('basic_english'), get_tokenizer('basic_english')

# Helper for building token iterators
def yield_tokens(data_iter, language):
    """Yield the token list of one side of every sample.

    Args:
        data_iter: iterable of mappings keyed by language ('src' / 'dst').
        language: which key of each sample to tokenize.

    Yields:
        The list of tokens for ``data_sample[language]``.
    """
    # Use the tokenizer that belongs to the requested side. Previously the source
    # tokenizer was applied to both sides; any key other than 'dst' still uses it.
    tokenizer = dst_tokenizer if language == 'dst' else src_tokenizer
    for data_sample in data_iter:
        yield tokenizer(data_sample[language])

# Build the vocabularies (one per side), each with the same four special tokens.
src_vocab = build_vocab_from_iterator(yield_tokens(data, 'src'), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
dst_vocab = build_vocab_from_iterator(yield_tokens(data, 'dst'), specials=["<unk>", "<pad>", "<bos>", "<eos>"])

# Map out-of-vocabulary tokens to <unk>. Without a default index, looking up a
# token that was not seen while building the vocabulary raises a RuntimeError.
src_vocab.set_default_index(src_vocab["<unk>"])
dst_vocab.set_default_index(dst_vocab["<unk>"])

# Convert the raw text pairs into lists of vocabulary indices
def data_process(data):
    """Encode every sample as index sequences wrapped in <bos> ... <eos>.

    Args:
        data: iterable of samples exposing 'src' and 'dst' text entries.

    Returns:
        A pair ``(src_tensor, dst_tensor)`` of parallel lists; each element is a
        plain Python list of vocabulary indices (not yet a torch tensor).
    """
    src_tensor, dst_tensor = [], []
    for pair in data:
        # Source side: <bos>, token ids, <eos>
        src_ids = [src_vocab["<bos>"]]
        src_ids.extend(src_vocab[tok] for tok in src_tokenizer(pair['src']))
        src_ids.append(src_vocab["<eos>"])
        src_tensor.append(src_ids)

        # Target side: same framing with the target vocabulary/tokenizer
        dst_ids = [dst_vocab["<bos>"]]
        dst_ids.extend(dst_vocab[tok] for tok in dst_tokenizer(pair['dst']))
        dst_ids.append(dst_vocab["<eos>"])
        dst_tensor.append(dst_ids)
    return src_tensor, dst_tensor

src_tensor, dst_tensor = data_process(data)

# Padding / collation
def generate_batch(data_batch):
    """Collate (src, dst) index lists into two padded LongTensors.

    Args:
        data_batch: iterable of ``(src_item, dst_item)`` pairs of index lists.

    Returns:
        ``(src_batch, dst_batch)``; each is the result of ``pad_sequence`` with its
        default layout (batch_first=False), padded with the respective <pad> index.
    """
    tensor_pairs = [
        (torch.tensor(src_ids, dtype=torch.long), torch.tensor(dst_ids, dtype=torch.long))
        for src_ids, dst_ids in data_batch
    ]
    src_seqs = [pair[0] for pair in tensor_pairs]
    dst_seqs = [pair[1] for pair in tensor_pairs]
    src_padded = nn.utils.rnn.pad_sequence(src_seqs, padding_value=src_vocab["<pad>"])
    dst_padded = nn.utils.rnn.pad_sequence(dst_seqs, padding_value=dst_vocab["<pad>"])
    return src_padded, dst_padded

class TranslationDataset(Dataset):
    """Map-style dataset that pairs each source sequence with its target sequence."""

    def __init__(self, src_tensor, dst_tensor):
        """Store the two parallel collections; they are indexed with the same position."""
        self.src_tensor, self.dst_tensor = src_tensor, dst_tensor

    def __len__(self):
        """Number of samples, taken from the source collection."""
        n_pairs = len(self.src_tensor)
        return n_pairs

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position ``idx``."""
        src_item = self.src_tensor[idx]
        dst_item = self.dst_tensor[idx]
        return src_item, dst_item

# Wrap the encoded pairs and batch them two at a time; generate_batch pads each batch.
dataset = TranslationDataset(src_tensor=src_tensor, dst_tensor=dst_tensor)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=2,
    collate_fn=generate_batch,
)


ModuleNotFoundError: No module named 'torchtext'

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds a sequence of source indices and returns the final LSTM state."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        Args:
            input_dim: number of rows of the embedding table (source vocabulary size).
            emb_dim: embedding dimension.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used both on the embeddings and inside the LSTM.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(hidden, cell)``; the per-step outputs are discarded.

        The LSTM is built with the default batch_first=False, so ``src`` is expected
        in (sequence, batch) layout.
        """
        token_vectors = self.embedding(src)
        regularized = self.dropout(token_vectors)
        _, final_state = self.rnn(regularized)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to logits over the target vocabulary."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        Args:
            output_dim: target vocabulary size (embedding rows and number of output logits).
            emb_dim: embedding dimension.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used both on the embeddings and inside the LSTM.
        """
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: token indices for the current step, one per batch element.
            hidden, cell: LSTM state carried over from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one row of
            ``output_dim`` logits per batch element.
        """
        # Add a leading length-1 sequence axis so the LSTM sees a one-step sequence.
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))
        output, state = self.rnn(embedded, (hidden, cell))
        hidden, cell = state
        # Drop the length-1 sequence axis again before projecting to vocabulary logits.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """Keep references to the two sub-modules and the device used for the output buffer."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce per-step logits for every position of ``trg`` except the first.

        Args:
            src: source batch, passed unchanged to the encoder.
            trg: target batch indexed as (step, batch); row 0 seeds the decoder.
            teacher_forcing_ratio: probability, drawn once per step, of feeding the
                ground-truth token instead of the model's own argmax to the next step.

        Returns:
            Tensor of shape (trg_len, batch, decoder.output_dim); row 0 is left as zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        hidden, cell = self.encoder(src)
        step_input = trg[0, :]
        for t in range(1, trg_len):
            logits, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = logits
            # One draw per step decides the next input for the whole batch.
            use_teacher = random.random() < teacher_forcing_ratio
            step_input = trg[t] if use_teacher else logits.argmax(1)
        return outputs


In [None]:
import random

# Seed the RNGs before any weights are created so that initialisation, dropout
# and the teacher-forcing draws (Seq2Seq.forward uses random.random()) are
# reproducible across Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Model hyper-parameters
INPUT_DIM = len(src_vocab)    # source vocabulary size
OUTPUT_DIM = len(dst_vocab)   # target vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512                 # shared by encoder and decoder: the encoder state seeds the decoder
N_LAYERS = 2                  # likewise shared by both LSTMs
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(model.parameters())
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=dst_vocab["<pad>"])

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg)``; must return logits indexed (step, batch, vocab).
        iterator: sized iterable yielding ``(src, trg)`` batches.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking flattened logits and flattened target indices.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        logits = model(src, trg)

        # Skip step 0 (the start token / zero row) and flatten steps and batch together.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: called as ``model(src, trg, 0)``, i.e. with a teacher-forcing ratio of 0.
        iterator: sized iterable yielding ``(src, trg)`` batches.
        criterion: loss taking flattened logits and flattened target indices.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            logits = model(src, trg, 0)

            # Skip step 0 and flatten steps and batch together, as in training.
            vocab_size = logits.shape[-1]
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()
    return running_loss / len(iterator)

N_EPOCHS = 10  # number of full passes over the dataloader
CLIP = 1       # gradient-norm ceiling handed to train()

for epoch in range(N_EPOCHS):
    train_loss = train(
        model=model,
        iterator=dataloader,
        optimizer=optimizer,
        criterion=criterion,
        clip=CLIP,
    )
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}')


In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    """Greedily translate one sentence and return the predicted target tokens.

    The source is encoded once; the decoder is then advanced one token at a time,
    always feeding back its own most likely prediction, until <eos> is produced or
    ``max_len`` tokens have been generated.

    Args:
        sentence: raw source text.
        src_field: unused; kept for interface compatibility (the module-level
            ``src_vocab`` / ``src_tokenizer`` are used instead).
        trg_field: unused; kept for interface compatibility (``dst_vocab`` is used).
        model: seq2seq model exposing ``encoder`` and ``decoder`` sub-modules.
        device: device the index tensors are moved to.
        max_len: maximum number of target tokens to generate.

    Returns:
        List of target-side tokens, without <bos> and without the closing <eos>.
    """
    model.eval()

    # Unseen source tokens are mapped to <unk> explicitly, so the lookup also works
    # when the vocabulary has no default index configured.
    unk_idx = src_vocab["<unk>"]
    token_ids = [src_vocab[token] if token in src_vocab else unk_idx for token in src_tokenizer(sentence)]
    tokens = [src_vocab["<bos>"]] + token_ids + [src_vocab["<eos>"]]
    src_indexes = torch.LongTensor(tokens).unsqueeze(1).to(device)

    eos_idx = dst_vocab["<eos>"]
    outputs = [dst_vocab["<bos>"]]
    with torch.no_grad():
        # Encode once. Calling the full model with a length-1 target never ran the
        # decoder, so the first "prediction" was always index 0 (<unk>).
        hidden, cell = model.encoder(src_indexes)
        for _ in range(max_len):
            previous_token = torch.LongTensor([outputs[-1]]).to(device)
            prediction, hidden, cell = model.decoder(previous_token, hidden, cell)
            best_guess = prediction.argmax(1).item()
            outputs.append(best_guess)
            if best_guess == eos_idx:
                break

    # Strip <bos>; strip the last token only if it really is <eos>, so a real token
    # is not lost when generation stops at max_len.
    predicted = outputs[1:]
    if predicted and predicted[-1] == eos_idx:
        predicted = predicted[:-1]
    # torchtext's Vocab has no `.itos` attribute; lookup_tokens maps indices to tokens.
    return dst_vocab.lookup_tokens(predicted)

# Example usage
sentence = "◄▴◓◠▨ ◨▽◠▦◈◬◓▪▼◬▵"
translation = translate_sentence(
    sentence=sentence,
    src_field=src_vocab,
    trg_field=dst_vocab,
    model=model,
    device=device,
)
print(f"Translated sentence: {' '.join(translation)}")
