In [None]:
import pathlib

# Absolute path of the current working directory; every data path below is
# built relative to it.
this_dir = pathlib.Path(".").resolve()

In [None]:
from torchtext.data import Dataset
from torchtext.data import Example
from torchtext.data import Field


class EnglishToGermanDataset(Dataset):
    """Parallel corpus wrapped as a torchtext ``Dataset``, one ``Example`` per sentence pair."""

    def __init__(self, corpus, processors):
        """Turn every sentence pair of ``corpus`` into an ``Example``.

        Args:
            corpus: iterable of sentence pairs. Each pair holds one entry per
                field, in the same order as ``processors``. An entry is either
                the raw value or a ``(field_name, value)`` tuple.
            processors: list of ``(field_name, Field)`` tuples; it is passed
                both to ``Example.fromlist`` and to ``Dataset.__init__``.
        """
        examples = []
        for bitext in corpus:
            # Example.fromlist forwards each entry to the matching field as-is.
            # A labelled ``(field_name, value)`` tuple is not a str, so it would
            # skip tokenization and reach the field's preprocessing callable
            # whole, where the label is joined into the sentence text. Strip
            # the label so only the value reaches the field.
            values = [
                entry[1]
                if isinstance(entry, tuple) and len(entry) == 2 and entry[0] == name
                else entry
                for (name, _field), entry in zip(processors, bitext)
            ]
            examples.append(Example.fromlist(values, processors))
        super().__init__(examples, processors)

In [None]:
# Read the parallel training files line by line. The encoding is pinned to
# UTF-8 so decoding of non-ASCII characters does not depend on the platform's
# default locale encoding.
# zip() stops at the shorter file, so surplus lines in the longer one are ignored.
with open(f"{this_dir}/data/train.src", encoding="utf-8") as src_file, \
        open(f"{this_dir}/data/train.tgt", encoding="utf-8") as tgt_file:
    # One entry per sentence pair, each side labelled with its field name.
    corpus = [
        (("src", src_seq), ("tgt", tgt_seq))
        for src_seq, tgt_seq in zip(src_file, tgt_file)
    ]

In [None]:
from sentencepiece import SentencePieceProcessor
from sentencepiece import SentencePieceTrainer

def _fit_tokenizer(corpus_path, prefix, model_path):
    """Train a 5000-piece SentencePiece model on ``corpus_path`` and load it."""
    SentencePieceTrainer.train(input=corpus_path, model_prefix=prefix,
                               vocab_size=5000)
    return SentencePieceProcessor(model_file=model_path)


# Source-side tokenizer.
src_tokenizer = _fit_tokenizer(f"{this_dir}/data/train.src", "spm_src",
                               "./spm_src.model")


def src_bpe_func(tokens):
    """Join ``tokens`` with spaces and re-segment them into source subword pieces."""
    return src_tokenizer.encode(" ".join(tokens), out_type=str)


# Target-side tokenizer.
tgt_tokenizer = _fit_tokenizer(f"{this_dir}/data/train.tgt", "spm_tgt",
                               "./spm_tgt.model")


def tgt_bpe_func(tokens):
    """Join ``tokens`` with spaces and re-segment them into target subword pieces."""
    return tgt_tokenizer.encode(" ".join(tokens), out_type=str)

In [None]:
# (field name, Field) pairs: the source side is BPE-segmented only, the target
# side additionally declares <bos>/<eos> markers.
processors = [
    (
        "src",
        Field(
            sequential=True,
            use_vocab=True,
            preprocessing=src_bpe_func,
            pad_token="<pad>",
            unk_token="<unk>",
            batch_first=True,
        ),
    ),
    (
        "tgt",
        Field(
            sequential=True,
            use_vocab=True,
            init_token="<bos>",
            eos_token="<eos>",
            preprocessing=tgt_bpe_func,
            pad_token="<pad>",
            unk_token="<unk>",
            batch_first=True,
        ),
    ),
]

dataset = EnglishToGermanDataset(corpus, processors)

# Each field builds its own vocabulary from the dataset.
for _name, field in processors:
    field.build_vocab(dataset)

In [None]:
import numpy as np
import tqdm

# Bound for the EM loop in ibm_model. The loop there starts its counter at 0
# and continues while `count <= MAX_ITER`, so MAX_ITER + 1 passes are executed.
MAX_ITER = 3


def ibm_model(examples, src_vocab, tgt_vocab):
    """Estimate a source/target lexical alignment table with EM (IBM Model 1 style).

    Args:
        examples: sequence of objects exposing ``src`` and ``tgt`` token
            sequences; it is iterated once per EM pass.
        src_vocab: source vocabulary; ``len()`` and ``stoi`` are used.
        tgt_vocab: target vocabulary; ``len()`` and ``stoi`` are used.

    Returns:
        ``numpy.ndarray`` of shape ``(len(src_vocab), len(tgt_vocab))``. After
        each pass, every target column that received any expected count sums
        to one over the source tokens; pairs that never co-occur, and columns
        of target tokens that never appear in ``examples``, are zero.
    """
    n_src, n_tgt = len(src_vocab), len(tgt_vocab)
    # Uniform start: every column sums to one over the source vocabulary.
    alignment = np.full((n_src, n_tgt), 1 / n_src, dtype=float)
    count = 0
    # Counter runs 0..MAX_ITER inclusive, i.e. MAX_ITER + 1 passes.
    while count <= MAX_ITER:
        print(f"{count}번째 iteration:")
        maximization = np.zeros((n_src, n_tgt), dtype=float)
        corp_total_like = np.zeros((n_tgt,), dtype=float)
        # Process one sentence pair at a time.
        for example in tqdm.tqdm(examples, desc="P(src -> tgt)의 기댓값을 추정 중입니다."):
            # Look every token up once per example instead of once per use.
            src_inds = [src_vocab.stoi[tok] for tok in example.src]
            tgt_inds = [tgt_vocab.stoi[tok] for tok in example.tgt]

            for src_tok_ind in src_inds:
                # Normaliser for this source token: the sum of its current
                # scores over the target tokens of the same sentence.
                total_like = 0.0
                for tgt_tok_ind in tgt_inds:
                    total_like += alignment[src_tok_ind, tgt_tok_ind]
                if total_like <= 0:
                    # Empty target side (or fully vanished scores): nothing
                    # to distribute, and dividing would only produce NaNs.
                    continue

                # Expectation: share of this source token attributed to each
                # target token, accumulated over the whole corpus.
                for tgt_tok_ind in tgt_inds:
                    expectation = alignment[src_tok_ind, tgt_tok_ind] / total_like
                    maximization[src_tok_ind, tgt_tok_ind] += expectation
                    corp_total_like[tgt_tok_ind] += expectation

        # Maximization: renormalise each target column by its total expected
        # count. The whole column is rewritten, so pairs without any count drop
        # to zero instead of keeping the uniform starting value, which could
        # otherwise outrank genuinely learned probabilities.
        for tgt_tok_ind in tqdm.tqdm(
            range(n_tgt),
            desc="코퍼스에서 (src -> tgt)를 모두 찾아서 그 빈도에 비례해 P(src -> tgt)의 기댓값을 최대화합니다.",
        ):
            column_total = corp_total_like[tgt_tok_ind]
            if column_total > 0:
                alignment[:, tgt_tok_ind] = maximization[:, tgt_tok_ind] / column_total
            else:
                alignment[:, tgt_tok_ind] = 0.0

        count += 1
    return alignment

In [None]:
def train_model(examples, src_vocab, tgt_vocab):
    """Fit the alignment table and store the best target token per source token.

    Args:
        examples: passed through to ``ibm_model``.
        src_vocab: source vocabulary; ``itos`` maps row indices to tokens.
        tgt_vocab: target vocabulary; ``itos`` maps column indices to tokens.

    Returns:
        None. The ``{source token: target token}`` dict is written with
        ``np.save("data/model", ...)``, which stores it as ``data/model.npy``.
    """
    # IBM Model 1 is used as the alignment model.
    ibm_model_1 = ibm_model(examples, src_vocab, tgt_vocab)

    # Row-wise argmax; on ties the first (lowest) column index is returned,
    # which is what a strict ">" scan from left to right selects as well.
    best_tgt_inds = np.argmax(ibm_model_1, axis=1)

    model = {
        src_vocab.itos[src_tok_ind]: tgt_vocab.itos[int(tgt_tok_ind)]
        for src_tok_ind, tgt_tok_ind in enumerate(best_tgt_inds)
    }
    np.save("data/model", model)

In [None]:
import numpy as np


def translate(src_seq) -> dict:
    """Look up the stored target token for every token of ``src_seq``.

    Args:
        src_seq: iterable of source tokens.

    Returns:
        dict mapping each distinct source token to its target token, in order
        of first appearance.

    Raises:
        KeyError: if a token is not a key of the stored model.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a
    # model file that was produced locally.
    model = np.load("data/model.npy", allow_pickle=True).item()
    return {src_tok: model[src_tok] for src_tok in src_seq}

In [None]:
# Fit and save the model, selecting each vocabulary by its field name.
train_model(
    dataset.examples,
    dict(processors)["src"].vocab,
    dict(processors)["tgt"].vocab,
)

In [None]:
# Segment a demo sentence into source subword pieces and translate it piece
# by piece.
src_seq = src_tokenizer.encode(
    "I hope that the fire we both made still burns a little in you", out_type=str
)
translation = translate(src_seq)

In [None]:
import pandas

# One row per source piece, labelled with the translation direction.
table = pandas.Series(translation).rename_axis("영어 -> 독일어")
print(table)