# Донка v1

Seq2Seq Transformer трениран на околу 500.000 реченици

Прочитајте повеќе на блогот:[Имплементациjа На Трансформер Архитектурата За Македонско-Англиски Превод На Реченици](https://najdovski-stefan.github.io/basics/2025/06/06/transformer-mk-en.html)

Преведува кратки и едноствни македонски реченици во англиски реченици

Доколку сакате локално да тестирате потребно ви е Python 3.10.x

In [None]:
# Локално
!pip install torch==2.2.2 torchtext==0.17.2
!pip install "numpy<2"
!pip install pandas sentencepiece matplotlib sacrebleu

Потребни библиотеки, на крајот враќа на што се извршува, функционира доволно брзо и на cpu и на cuda

In [None]:
import math
import torchtext
import torch
import torch.nn as nn
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from collections import Counter
from torchtext.vocab import Vocab
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
import io
import time
import pandas as pd
import numpy as np
import pickle
import sentencepiece as spm
torch.manual_seed(0)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(DEVICE)

In [None]:
def load_spm_vocab(filepath, max_size=None):
    vocab = []
    with open(filepath, encoding='utf-8') as f:
        for i, line in enumerate(f):
            token = line.strip().split('\t')[0]
            vocab.append(token)
            if max_size and len(vocab) >= max_size:
                break
    return vocab

mk_model_path = "spm.mk.full.model"
en_model_path = "spm.en.full.model"
mk_vocab = load_spm_vocab('spm.mk.full.vocab')
en_vocab = load_spm_vocab('spm.en.full.vocab')

en_tokenizer = spm.SentencePieceProcessor(model_file=en_model_path)
mk_tokenizer = spm.SentencePieceProcessor(model_file=mk_model_path)

class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

        
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout, maxlen: int = 10000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(1) 
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding +
                            self.pos_embedding[:token_embedding.size(0),:])

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
  src_seq_len = src.shape[0]
  tgt_seq_len = tgt.shape[0]

  tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
  src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

  src_padding_mask = (src == PAD_IDX).transpose(0, 1)
  tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
  return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


SRC_VOCAB_SIZE = len(mk_vocab)
TGT_VOCAB_SIZE = len(en_vocab)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 4
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 18

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)


model = transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)
model.to(DEVICE)

# ID
PAD_IDX = mk_vocab.index('<pad>')
BOS_IDX = mk_vocab.index('<s>')
EOS_IDX = mk_vocab.index('</s>')

model = Seq2SeqTransformer(
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
    EMB_SIZE,
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    FFN_HID_DIM
)
model.to(DEVICE)

checkpoint = torch.load('donka-v1_checkpoint_epoch18.pt', map_location=DEVICE)
transformer.load_state_dict(checkpoint['model_state_dict'])
transformer.eval()

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        memory_mask = torch.zeros(ys.shape[0], memory.shape[0]).to(DEVICE).type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
          break
    return ys

def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    model.eval()
    src = src.lower()
    tokens = [BOS_IDX] + src_tokenizer.encode(src) + [EOS_IDX]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    decoded_tokens = [tgt_vocab[tok] for tok in tgt_tokens if tok < len(tgt_vocab)]
    text = " ".join(decoded_tokens)
    text = text.replace("<s>", "").replace("</s>", "")
    text = text.replace("▁", " ")
    text = text.strip()
    return text

## Слободно промени некои реченици за да тестираш

Се што ви е потрбно е речениците да бидат сите со мали букви

Примери:

In [48]:
print(translate(transformer, "јас знам да читам македонски", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "тој е од македонија", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "тие се од Битола", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "ние сме студенти на фикт битола", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "капка по капка езеро", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "човекот учи додека е жив", mk_vocab, en_vocab, mk_tokenizer))
print(translate(transformer, "додека е жив тој учи", mk_vocab, en_vocab, mk_tokenizer))

i  know  how  to  read  macedonian
he  is  from  macedonia
they  are  from  bitola
we  are  students  of  fi c t  bitola
drop  by  drop  lake
the  man  study  as  alive
while  he  is  alive


## Функционира и со подолги реченици, проблемот е што е полош
## Data set-от има bias (поголемиот дел од речениците се околу 10тина зборови) и очекува реченици до 10 збора.

In [49]:
print(translate(transformer, "работам и со подолги текстови, ама многу полошо разбирам", mk_vocab, en_vocab, mk_tokenizer))

i  work  and  with  longer  text s ,  but  i  understand  very  worse


## Пробај твој реченици тука:

Стави реченица на македонски помеѓу наводниците

In [40]:
print(translate(transformer, "", mk_vocab, en_vocab, mk_tokenizer))


