# Seq2Seq 모델을 활용한 Machine Translation(기계 번역) 실습

Encode-Decoder 구조의 기초가 되는 모델인 Seq2Seq를 직접 구현해본다.
구현한 Seq2Seq 모델을 활용하여 영어-독일어 기계 번역 실습을 진행해본다.

※ 실행 환경: colab

## 1. 데이터 전처리 및 Dataset & Dataloader 구성

### 1.1. torchdata 설치

In [1]:
!pip install folium==0.2.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting folium==0.2.1
  Downloading folium-0.2.1.tar.gz (69 kB)
[K     |████████████████████████████████| 69 kB 1.6 MB/s 
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25l[?25hdone
  Created wheel for folium: filename=folium-0.2.1-py3-none-any.whl size=79808 sha256=be0c297ba61d21e6f4fb978b4117aeb3db983118669290a1af5bd7564d821690
  Stored in directory: /root/.cache/pip/wheels/9a/f0/3a/3f79a6914ff5affaf50cabad60c9f4d565283283c97f0bdccf
Successfully built folium
Installing collected packages: folium
  Attempting uninstall: folium
    Found existing installation: folium 0.8.3
    Uninstalling folium-0.8.3:
      Successfully uninstalled folium-0.8.3
Successfully installed folium-0.2.1


In [2]:
!pip install torchdata==0.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata==0.4.0
  Downloading torchdata-0.4.0-cp37-cp37m-manylinux2014_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 30.1 MB/s 
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.5.1-py2.py3-none-any.whl (15 kB)
Collecting urllib3>=1.25
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 72.1 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 75.4 MB/s 
[?25hInstalling collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.5.1 torchdata-0.4.0 urllib3-1.25.11


### 1.2. Multi30k 불러오기

In [4]:
from torchtext.datasets import Multi30k

train, valid, test = Multi30k()

### 1.3. Tokenizer를 이용해 영어 Vocab, 독일어 Vocab 구성하기

In [5]:
# Tokenizer를 위한 데이터 다운로드
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 40.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 1.2 MB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [6]:
import spacy

en_tokenizer = spacy.load('en_core_web_sm')
de_tokenizer = spacy.load('de_core_news_sm')

In [7]:
def tokenize(tokenizer, text):
    return [ token.text for token in tokenizer.tokenizer(text)]

In [8]:
# tokenize 테스트
tokenize(en_tokenizer, 'I translate English into German.')
tokenize(de_tokenizer, 'Ich übersetze Englisch ins Deutsche.')

['Ich', 'übersetze', 'Englisch', 'ins', 'Deutsche', '.']

In [9]:
from functools import partial
from torchtext.vocab import build_vocab_from_iterator

In [10]:
en_vocab = build_vocab_from_iterator(map(partial(tokenize, en_tokenizer), [eng for _, eng in train]), min_freq=2, specials=['<pad>', '<unk>', '<sos>', '<eos>'])
de_vocab = build_vocab_from_iterator(map(partial(tokenize, de_tokenizer), [de for de , _ in train ]), min_freq=2, specials=['<pad>', '<unk>', '<sos>', '<eos>'])

In [11]:
len(en_vocab), len(de_vocab)

(6191, 8014)

In [12]:
en_vocab['<unk>'], de_vocab['<unk>']

(1, 1)

In [14]:
print(en_vocab.get_itos()[:10])
print(de_vocab.get_itos()[:10])

['<pad>', '<unk>', '<sos>', '<eos>', 'a', '.', 'A', 'in', 'the', 'on']
['<pad>', '<unk>', '<sos>', '<eos>', '.', 'Ein', 'einem', 'in', ',', 'und']


### 1.4. 데이터 전처리 클래스

In [15]:
class Language:
    pad_token_id = 0
    unk_token_id = 1
    sos_token_id = 2
    eos_token_id = 3

    def __init__(self, src_tokenizer, tgt_tokenizer, src_token2id, tgt_token2id, src_id2token, tgt_id2token):
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

        self.src_token2id = src_token2id
        self.tgt_token2id = tgt_token2id

        self.src_id2token = src_id2token
        self.tgt_id2token = tgt_id2token

    def src_encode(self, src_text):
        source_sentence = [ self.src_token2id.get(token.text, Language.unk_token_id) for token in self.src_tokenizer.tokenizer(src_text) ]
        return source_sentence
    
    def tgt_encode(self, tgt_text):
        target_sentence = [self.tgt_token2id['<sos>']] \
        + [ self.tgt_token2id.get(token.text, Language.unk_token_id) for token in self.tgt_tokenizer.tokenizer(tgt_text) ] \
        + [ self.tgt_token2id['<eos>'] ]
        return target_sentence
    
    def src_decode(self, ids):
        sentence = list(map(lambda x: self.src_id2token[x], ids))
        return " ".join(sentence)

    def tgt_decode(self, ids):
        sentence = list(map(lambda x: self.tgt_id2token[x], ids))[1:-1]
        return " ".join(sentence)

### 1.5. Dataset 구성하기

In [17]:
from torch.utils.data import Dataset, DataLoader

In [18]:
class MultiDataset(Dataset):
    def __init__(self, data, language):
        self.data = data
        self.language = language
        self.sentences = self.preprocess()

    def preprocess(self):
        # dataset 안에 길이가 0인 문장이 존재한다. 
        sentences = [ (self.language.src_encode(de), self.language.tgt_encode(eng)) 
                      for de, eng in self.data if len(eng) > 0 and len(de) > 0]

        return sentences

    def __getitem__(self, idx):
        return self.sentences[idx]

    def __len__(self):
        return len(self.sentences)

In [20]:
language = Language(de_tokenizer, en_tokenizer, de_vocab.get_stoi(), en_vocab.get_stoi(), de_vocab.get_itos(), en_vocab.get_itos())

In [21]:
train_dataset = MultiDataset(train, language)
valid_dataset = MultiDataset(valid, language)
test_dataset = MultiDataset(test, language)



In [22]:
# TEST
print(train_dataset[0])
print(valid_dataset[0])
print(test_dataset[0])

([21, 85, 257, 31, 87, 22, 94, 7, 16, 112, 7910, 3209, 4], [2, 19, 25, 15, 1169, 808, 17, 57, 84, 336, 1339, 5, 3])
([14, 38, 24, 243, 2744, 1, 11, 20, 892], [2, 6, 39, 13, 36, 17, 1667, 2541, 342, 4, 282, 3])
([5, 12, 10, 6, 178, 108, 8, 16, 78, 1, 4], [2, 6, 12, 7, 28, 91, 68, 2670, 20, 122, 5, 3])


### 1.6. DataLoader 만들기

#### collate_fn

Batch 안에 존재하는 sequence들의 길이를 맞춰준다.

In [23]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [24]:
def collate_fn(batch_samples):
    pad_token_id = Language.pad_token_id

    src_sentences = pad_sequence([torch.tensor(src).long() for src, _ in batch_samples], batch_first=True, padding_value=pad_token_id)
    tgt_sentences = pad_sequence([torch.tensor(tgt).long() for _, tgt in batch_samples], batch_first=True, padding_value=pad_token_id)

    return src_sentences, tgt_sentences

#### batch_sampler

비슷한 길이의 sequence가 batch로 묶일 수 있도록 index를 반환해준다.

In [25]:
import random

In [26]:
def batch_sampling(sequence_lengths, batch_size):
    '''
    sequence_length: (source 길이, target 길이)가 담긴 리스트이다.
    batch_size: batch 크기
    '''

    seq_lens = [(i, seq_len, tgt_len) for i,(seq_len, tgt_len) in enumerate(sequence_lengths)]
    seq_lens = sorted(seq_lens, key=lambda x: x[1])
    seq_lens = [sample[0] for sample in seq_lens]
    sample_indices = [ seq_lens[i:i+batch_size] for i in range(0,len(seq_lens), batch_size)]

    random.shuffle(sample_indices) # 모델이 길이에 편향되지 않도록 섞는다.

    return sample_indices

#### DataLoader

In [27]:
def make_dataloader(dataset, batch_size):
    sequence_lengths = list(map(lambda x: (len(x[0]), len(x[1])), dataset))
    batch_sampler = batch_sampling(sequence_lengths, batch_size)

    return DataLoader(dataset, collate_fn=collate_fn, batch_sampler=batch_sampler)

In [28]:
### TEST
train_loader = make_dataloader(train_dataset, 5)

for src, tgt in train_loader:
    print(src)
    print(tgt)
    break

tensor([[   5,  175,   33,   10,    6,  733,  658,    9, 1436,    7,   20,   51,
         1044,    4],
        [ 433,   76,   77,   11,   13,   36,    7,   16,  112,   45,  231,   80,
            1,    4],
        [   5,   12,   32,   49,    6,   99,   12,    8,   16,   11,   26,  181,
           30,    4],
        [   5,   12,    7,  238,   41,    9,  227,  108, 7820,   11,   20,   99,
           12,    4],
        [   5,   12,   32,   10,    1,  928,   11,    6,  272,    9,  185,  104,
          901,    4]])
tensor([[   2,    6,   65,   34,   21,    4,  740,   10,  677,   11, 1887,    9,
            4,   31, 1428,    5,    3,    0],
        [   2,  431,   22,  156,    9,    8,   40,   84,    4,  268,   62,  914,
            5,    3,    0,    0,    0,    0],
        [   2,    6,   12,   10,  471,   84,   86,   12,  116,   10,   37,    9,
            8,   89,    5,    3,    0,    0],
        [   2,    6,   12,   14,   31,   23,   11,   26,   68, 1471,   20,   86,
           12,    5,  

## 2. Seq2Seq 모델 구현하기

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

### 2.1. Encoder

* 하이퍼 파라미터
    * vocab_size: vocab 크기
    * emb_dim: embedding_dimension
    * hidden_dim: hidden-state vector dimension
    * num_layers: LSTM 은닉층의 수
    * dropout: dropout 비율

In [30]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=2, dropout=0.5):

        super(Encoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout)
    
    def forward(self, src, hidden, cell):
        src = self.embedding(src)
        src, (next_hidden, next_cell) = self.lstm(src, (hidden, cell))

        return next_hidden, next_cell

### 2.2. Decoder

* 하이퍼 파라미터
    * vocab_size: vocab 크기
    * emb_dim: embedding_dimension
    * hidden_dim: hidden-state vector dimension
    * num_layers: LSTM 은닉층의 수
    * dropout: dropout 비율

In [31]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=2, dropout=0.5):
        super(Decoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout)
        
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tgt, hidden, cell):
        tgt = self.embedding(tgt)
        tgt, (next_hidden, next_cell) = self.lstm(tgt, (hidden, cell))
        out = self.output(tgt)

        return out, next_hidden, next_cell

### 2.3. Seq2Seq 모델

* 하이퍼 파라미터
    * src_vocab_size: source vocab 크기
    * tgt_vocab_size: target vocab 크기
    * emb_dim: embedding_dimension
    * hidden_dim: hidden-state vector dimension
    * num_layers: RNN 은닉층의 수
    * dropout: dropout 비율

In [32]:
class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim, hidden_dim, device, num_layers=2, dropout=0.5):
        super(Seq2Seq, self).__init__()
        
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.device = device
        self.encoder = Encoder(src_vocab_size, emb_dim, hidden_dim, num_layers, dropout)
        self.decoder = Decoder(tgt_vocab_size, emb_dim, hidden_dim, num_layers, dropout)

    def forward(self, src, tgt):
        batch_size = src.size()[0]
        init_hidden, init_cell = torch.zeros((self.num_layers, batch_size, self.hidden_dim)), torch.zeros((self.num_layers, batch_size, self.hidden_dim))
        init_hidden = init_hidden.to(self.device)
        init_cell = init_cell.to(self.device)

        src_hidden, src_cell = self.encoder(src, init_hidden, init_cell)
        # src_hidden = src_hidden.to(self.device)
        # src_cell = src_cell.to(self.device)
        out, _, _ = self.decoder(tgt, src_hidden, src_cell) # (batch_size, seq_len, vocab_size)
        out = F.log_softmax(out, dim=-1)

        return out

## 3. 학습

In [33]:
from tqdm.notebook import tqdm

### 3.1. Train 

In [36]:
def train(model, optimizer, dataloader, pad_index, device):
    model.to(device)

    bar = tqdm(dataloader, desc='train')
    total_loss = 0.

    for i, (src, tgt) in enumerate(bar, start=1):
        src = src.to(device)
        tgt = tgt.to(device)
        out = model(src, tgt)
        loss = F.nll_loss(out[:,:-1,:].transpose(1,2), tgt[:,1:], ignore_index=pad_index)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        current_loss = total_loss / i
        bar.set_description(f"Train-loss:{current_loss:.4f}")

### 3.2. Evaluate

In [37]:
def evaluate(model, dataloader, pad_index, device, mode):
    model.to(device)

    bar = tqdm(dataloader, desc=mode)
    total_loss = 0.
    loss_avg = 0.

    for i, (src, tgt) in enumerate(bar, start=1):
        src = src.to(device)
        tgt = tgt.to(device)
        out = model(src, tgt)
        loss = F.nll_loss(out[:,:-1,:].transpose(1,2), tgt[:,1:], ignore_index=pad_index)

        total_loss += loss.item()
        current_loss = total_loss / i
        loss_avg = current_loss
        bar.set_description(f"{mode}-loss:{current_loss:.4f}")
    
    return loss_avg

In [38]:
emb_dim = 256
hidden_dim = 256
num_epoch = 20
batch_size = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(len(de_vocab), len(en_vocab), emb_dim, hidden_dim, device)
optimizer = optim.Adam(model.parameters())

train_loader = make_dataloader(train_dataset, batch_size)
valid_loader = make_dataloader(valid_dataset, batch_size)

for epoch in range(num_epoch):
    train(model, optimizer, train_loader,Language.pad_token_id, device)
    val_loss = evaluate(model, valid_loader, Language.pad_token_id, device, 'valid')
    print("="*60)
    print(f"END OF EPOCH:{epoch+1} | VALID LOSS: {val_loss:.4f}")
    print("="*60)

train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:1 | VALID LOSS: 4.2476


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:2 | VALID LOSS: 3.6064


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:3 | VALID LOSS: 3.2944


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:4 | VALID LOSS: 3.0878


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:5 | VALID LOSS: 2.9324


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:6 | VALID LOSS: 2.8134


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:7 | VALID LOSS: 2.7298


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:8 | VALID LOSS: 2.6548


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:9 | VALID LOSS: 2.6099


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:10 | VALID LOSS: 2.5642


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:11 | VALID LOSS: 2.5366


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:12 | VALID LOSS: 2.5226


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:13 | VALID LOSS: 2.5084


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:14 | VALID LOSS: 2.4924


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:15 | VALID LOSS: 2.4834


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:16 | VALID LOSS: 2.4768


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:17 | VALID LOSS: 2.4865


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:18 | VALID LOSS: 2.4848


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:19 | VALID LOSS: 2.5036


train:   0%|          | 0/227 [00:00<?, ?it/s]

valid:   0%|          | 0/8 [00:00<?, ?it/s]

END OF EPOCH:20 | VALID LOSS: 2.5259


In [39]:
test_loader = make_dataloader(test_dataset, batch_size)

print("="*60)
test_loss = evaluate(model, test_loader, Language.pad_token_id, device, 'test')
print(f"END OF TEST | TEST LOSS: {test_loss:.4f}")
print("="*60)



test:   0%|          | 0/8 [00:00<?, ?it/s]

END OF TEST | TEST LOSS: 2.5093


## 4. 번역 테스트

In [40]:
def translate(sentence, language, model, device, max_len=50):
    model.eval()
    model = model.to(device)

    src_tokens = torch.tensor(language.src_encode(sentence)).unsqueeze(0).long().to(device)

    init_hidden, init_cell = torch.zeros((model.num_layers, 1, model.hidden_dim)), torch.zeros((model.num_layers, 1, model.hidden_dim))
    init_hidden = init_hidden.to(model.device)
    init_cell = init_cell.to(model.device)

    with torch.no_grad():
        hidden, cell = model.encoder(src_tokens, init_hidden, init_cell)
    
    tgt_indices = [language.sos_token_id]

    for i in range(max_len):
        tgt_tensor = torch.tensor(tgt_indices[-1]).unsqueeze(0).unsqueeze(0).long().to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(tgt_tensor, hidden, cell)

        pred_token = output.argmax(-1).item()
        tgt_indices.append(pred_token)

        if pred_token == language.eos_token_id:
            break
    return language.tgt_decode(tgt_indices)

#### Test 세트

In [41]:
test_text = [text for text in test]

In [42]:
test_text[0]

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [43]:
translate("Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.", language, model, device)

'A man wearing a hat welding something .'

In [44]:
translate("Eine Mutter und ihre jungen Mädchen genießen einen Tag", language, model, device)

'A mother and her young children are enjoying a day .'

* 문장: Eine Mutter und ihre jungen Mädchen genießen einen Tag
* papago 번역: A mother and her young girls enjoy a day
* 예측 결과: A mother and her young children are enjoying a day .



#### 간단한 문장을 사용해 번역해보기

In [45]:
translate("Ein Mann, der ins Ausland reist", language, model, device)

'A man in a kayak .'

* 문장: Ein Mann, der ins Ausland reist
* papago 번역: a man traveling abroad
* 예측 결과: A man in a kayak . 

In [48]:
translate("Der Mann kocht und hört Musik.", language, model, device)

'The man is listening to music and singing .'

* 문장: Der Mann kocht und hört Musik.
* papago 번역: The man cooks and listens to music.
* 예측 결과: The man is listening to music and singing .