# Seq2Seq

*Sequence to Sequence*.  
*Encoder-Decoder Model* とも呼ばれる。

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import MeCab
import pandas as pd
from tqdm import tqdm
from typing import List
import random

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')


---

## Dataset

In [2]:
# Read the corpus spreadsheet with header=None (the first row is treated as
# data), then name the four columns by hand.
df = pd.read_excel(
    'data/JEC_basic_sentence_v1-3.xls',
    header=None,
)
df.columns = [
    'id',
    'japanese',
    'english',
    'chinese',
]
print('num of data:', len(df.index))
df.head()

num of data: 5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [3]:
# Shared MeCab tagger, created once with the '-Owakati' option; tokenize()
# splits the string returned by tagger.parse() on whitespace to get tokens.
tagger = MeCab.Tagger('-Owakati')
def tokenize(data: List[str], l='en') -> List[List[str]]:
    """Split every sentence in ``data`` into a list of tokens.

    Args:
        data: Iterable of raw sentences.
        l: Language code. ``'ja'`` passes each sentence through the
            module-level MeCab ``tagger`` and splits its output on whitespace;
            ``'en'`` lower-cases the sentence, detaches every ``'.'`` from the
            text before it, and splits on whitespace. Other punctuation
            (``,``, ``?``, ``!``) stays glued to the neighbouring word.

    Returns:
        One token list per input sentence, in input order.

    Raises:
        ValueError: If ``l`` is neither ``'ja'`` nor ``'en'``.
    """
    if l == 'ja':
        return [tagger.parse(sentence).split() for sentence in data]
    if l == 'en':
        return [sentence.replace('.', ' .').lower().split() for sentence in data]
    # An unknown code used to fall through and return None, which only
    # surfaced later as an unrelated-looking error when building the vocab.
    raise ValueError(f"unsupported language code {l!r}; expected 'ja' or 'en'")

In [4]:
# Tokenise both sides of the corpus; each result holds one token list per row.
text_ja = tokenize(df['japanese'], 'ja')
text_en = tokenize(df['english'], 'en')

In [5]:
# Special tokens shared by both vocabularies. They are passed as `specials`,
# which torchtext inserts at the front by default (special_first=True), so
# <pad>, <bos>, <eos>, <unk> get the same ids in vocab_ja and vocab_en.
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
specials = [pad, bos, eos, unk]
vocab_ja = build_vocab_from_iterator(text_ja, specials=specials)
vocab_en = build_vocab_from_iterator(text_en, specials=specials)

# Route out-of-vocabulary tokens to <unk>. Without a default index a torchtext
# Vocab raises on any unseen token, so the <unk> special declared above would
# never be used and translating a sentence with a new word would crash.
vocab_ja.set_default_index(vocab_ja[unk])
vocab_en.set_default_index(vocab_en[unk])

def to_text(token_ids, l='en'):
    """Turn a sequence of token ids back into a space-joined string.

    The first id is skipped unconditionally (it is assumed to be ``<bos>`` --
    TODO confirm against callers) and decoding stops at the first ``<eos>`` id.

    Args:
        token_ids: Sliceable sequence of integer ids (list or 1-D tensor).
        l: ``'ja'`` to decode with ``vocab_ja``, ``'en'`` for ``vocab_en``.

    Returns:
        The decoded tokens joined by single spaces.

    Raises:
        ValueError: If ``l`` is neither ``'ja'`` nor ``'en'``.
    """
    # Explicit mapping instead of eval(f'vocab_{l}'): eval would execute
    # whatever expression the caller managed to put into `l`.
    vocabs = {'ja': vocab_ja, 'en': vocab_en}
    if l not in vocabs:
        raise ValueError(f"unsupported language code {l!r}; expected 'ja' or 'en'")
    vocab = vocabs[l]

    # Hoisted out of the loop: get_stoi()/get_itos() rebuild the full table on
    # every call, which made the original loop cost O(vocab size) per token.
    eos_index = vocab[eos]
    itos = vocab.get_itos()

    tokens = []
    for i in token_ids[1:]:
        if i == eos_index:
            break
        tokens.append(itos[i])
    return ' '.join(tokens)

In [6]:
def _make_text_transform(vocab):
    """Build the token-list -> id-tensor pipeline for one vocabulary."""
    steps = [
        transforms.AddToken(bos, begin=True),    # prepend <bos>
        transforms.AddToken(eos, begin=False),   # append <eos>
        transforms.VocabTransform(vocab),        # tokens -> integer ids
        transforms.ToTensor(),                   # id list -> tensor
    ]
    return Compose(steps)


# The two languages share the same pipeline and differ only in the vocabulary.
transform_ja = _make_text_transform(vocab_ja)
transform_en = _make_text_transform(vocab_en)

# Vocabulary sizes, special tokens included.
n_vocab_ja, n_vocab_en = len(vocab_ja), len(vocab_en)

In [7]:
class TextDataset(Dataset):
    """Parallel-text dataset that yields teacher-forcing triples.

    Both sides are transformed once, eagerly, in the constructor, so
    ``__getitem__`` only slices already-prepared sequences.

    Args:
        in_text: Sized collection of source samples.
        out_text: Iterable of target samples, aligned with ``in_text``.
        in_transform: Callable applied to every source sample.
        out_transform: Callable applied to every target sample.

    Raises:
        ValueError: If the two sides do not contain the same number of samples.
    """

    def __init__(self, in_text, out_text, in_transform, out_transform):
        self.n_samples = len(in_text)
        self.in_text = [in_transform(text) for text in in_text]
        self.out_text = [out_transform(text) for text in out_text]
        # A length mismatch means the pairs are misaligned. Without this check
        # extra targets were ignored silently and missing ones only surfaced
        # as an IndexError in the middle of an epoch.
        if len(self.out_text) != self.n_samples:
            raise ValueError(
                f'in_text has {self.n_samples} samples but out_text has '
                f'{len(self.out_text)}; the two sides must be aligned'
            )

    def __getitem__(self, index):
        """Return ``(encoder input, decoder input, decoder target)`` for one pair.

        The decoder input is the target without its last element and the
        decoder target is the target without its first element, i.e. the same
        sequence shifted by one step.
        """
        in_text = self.in_text[index]
        out_text = self.out_text[index]
        return in_text, out_text[:-1], out_text[1:]

    def __len__(self):
        """Number of sentence pairs."""
        return self.n_samples

def to_padded_tensor(text_data: List[torch.Tensor], pad_value: int = 0) -> torch.Tensor:
    """Stack variable-length tensors into one batch tensor, padding the short ones.

    Args:
        text_data: Sequence of tensors (one per sample) that may differ in
            length along their first dimension.
        pad_value: Value written into the padded positions. The default ``0``
            matches the ``<pad>`` id only while ``<pad>`` is the first special
            token of the vocabulary.

    Returns:
        A tensor with the batch dimension first, as long as the longest sample.
    """
    return pad_sequence(text_data, batch_first=True, padding_value=pad_value)

def collate_fn(batch):
    """Turn a list of dataset triples into three padded batch tensors.

    Args:
        batch: List of ``(encoder input, decoder input, decoder target)``
            triples as produced by ``TextDataset.__getitem__``.

    Returns:
        ``(enc_in, dec_in, dec_out)``, each padded by ``to_padded_tensor``.
    """
    # Transpose [(a1, b1, c1), (a2, b2, c2), ...] into three per-field tuples.
    encoder_inputs, decoder_inputs, decoder_targets = zip(*batch)
    return (
        to_padded_tensor(encoder_inputs),
        to_padded_tensor(decoder_inputs),
        to_padded_tensor(decoder_targets),
    )

# Japanese sentences are the encoder side, English sentences the decoder side.
dataset = TextDataset(
    in_text=text_ja,
    out_text=text_en,
    in_transform=transform_ja,
    out_transform=transform_en,
)
# Shuffled mini-batches of 32 pairs; collate_fn pads every batch separately.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Pull a single batch to check the tensor shapes.
x_enc, x_dec, label = next(iter(dataloader))
print('x_enc:', x_enc.shape)
print('x_dec:', x_dec.shape)
print('label:', label.shape)

x_enc: torch.Size([32, 20])
x_dec: torch.Size([32, 18])
label: torch.Size([32, 18])



---

## モデル構築

In [9]:
# Id of <eos> in the source vocabulary; Encoder.forward uses it to locate the
# end of every sentence inside a padded batch.
eos_id = vocab_ja[eos]


class Encoder(nn.Module):
    """RNN encoder that condenses a source sentence into one hidden state.

    Args:
        n_vocab: Size of the source vocabulary.
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the RNN hidden state.
    """

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        pad_id = vocab_ja[pad]
        self.embedding = nn.Embedding(n_vocab, embed_size, padding_idx=pad_id)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # LSTM variant (disabled):
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        """Encode a batch of padded id sequences.

        The RNN output at each <eos> position is taken as the sentence
        summary, so padding that follows <eos> does not influence it. Boolean
        masking returns one row per <eos> found; the result has batch_size
        rows only when every sequence contains exactly one <eos>.

        Returns:
            Tensor of shape (1, batch_size, hidden_size), usable as the
            decoder RNN's initial hidden state.
        """
        is_eos = x.eq(eos_id)
        outputs, _ = self.rnn(self.embedding(x))
        summary = outputs[is_eos]  # (batch_size, hidden_size)
        # LSTM variant (disabled):
        # _, h = self.lstm(x)
        # h = self.fc(h[0][0])
        return self.fc(summary).unsqueeze(0)  # (1, batch_size, hidden_size)

In [10]:
class Decoder(nn.Module):
    """RNN decoder that maps target-side ids to per-step vocabulary logits.

    Args:
        n_vocab: Size of the target vocabulary (also the logit dimension).
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the RNN hidden state.
    """

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        # LSTM variant (disabled):
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=n_vocab)

    def forward(self, x, h):
        """Run the decoder over ``x`` starting from hidden state ``h``.

        Args:
            x: Batch-first tensor of target-side token ids.
            h: Initial hidden state for the RNN.

        Returns:
            ``(logits, h_next)``: the linear projection of every RNN output
            onto the vocabulary, and the RNN's final hidden state.
        """
        embedded = self.embedding(x)
        outputs, h_next = self.rnn(embedded, h)
        # LSTM variant (disabled):
        # y, h = self.lstm(x, h)
        logits = self.fc(outputs)
        return logits, h_next

In [11]:
class Seq2Seq(nn.Module):
    """Encoder-decoder model: the encoder output seeds the decoder's hidden state.

    Args:
        n_in_vocab: Size of the source vocabulary.
        n_out_vocab: Size of the target vocabulary.
        embed_size: Embedding dimension used on both sides.
        hidden_size: RNN hidden dimension used on both sides.
    """

    def __init__(self, n_in_vocab, n_out_vocab, embed_size, hidden_size):
        super().__init__()
        self.encoder = Encoder(
            n_vocab=n_in_vocab, embed_size=embed_size, hidden_size=hidden_size
        )
        self.decoder = Decoder(
            n_vocab=n_out_vocab, embed_size=embed_size, hidden_size=hidden_size
        )

    def forward(self, x_enc, x_dec):
        """Return the decoder logits for ``x_dec`` conditioned on ``x_enc``.

        The decoder's final hidden state is discarded; only the per-step
        logits are returned.
        """
        context = self.encoder(x_enc)
        logits, _final_state = self.decoder(x_dec, context)
        # LSTM variant (disabled):
        # h0 = h.unsqueeze(0)
        # c0 = torch.zeros_like(h0)
        # y, _ = self.decoder(x_dec, (h0, c0))
        return logits


---

## 学習

In [44]:
# Padded label positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=vocab_en[pad])
def train(model, optimizer, n_epochs):
    """Train ``model`` on the module-level ``dataloader`` for ``n_epochs`` epochs.

    Each batch is moved to ``device``, scored with the module-level
    ``criterion`` and used for one optimizer step. The mean batch loss is
    printed after every epoch.

    Args:
        model: Callable as ``model(x_enc, x_dec)`` and returning logits whose
            last dimension is the vocabulary.
        optimizer: Optimizer over ``model``'s parameters.
        n_epochs: Number of passes over ``dataloader``.

    Returns:
        A list with one ``(predictions, labels)`` tuple per epoch, taken from
        the last batch of that epoch. The predictions are detached from the
        autograd graph.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    eval_tokens = []
    model.train()
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        # Sentinel so an empty dataloader is reported clearly instead of
        # failing on an unbound y_pred (or a division by zero) below.
        y_pred = None
        for x_enc, x_dec, label in tqdm(dataloader, desc=f'{epoch}/{n_epochs}', disable=True):
            optimizer.zero_grad()
            x_enc = x_enc.to(device)
            x_dec = x_dec.to(device)
            label = label.to(device)
            y_pred = model(x_enc, x_dec)
            # Flatten (batch, seq, vocab) logits and (batch, seq) labels so
            # every time step is scored as an independent classification.
            loss = criterion(y_pred.reshape(-1, y_pred.shape[-1]), label.ravel())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        if y_pred is None:
            raise ValueError('dataloader yielded no batches; nothing to train on')
        # detach(): the stored predictions are only inspected later, so they
        # must not keep a reference to this step's autograd graph.
        eval_tokens.append((y_pred.detach(), label))
        print(f'{epoch}/{n_epochs} loss: {epoch_loss/len(dataloader)}', flush=True)
    return eval_tokens

In [47]:
# Embedding size and hidden size are both 1024, on the encoder and the decoder.
model = Seq2Seq(
    n_in_vocab=len(vocab_ja),
    n_out_vocab=len(vocab_en),
    embed_size=1024,
    hidden_size=1024,
)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

`eval_tokens`(各エポックの最後のバッチにおける予測とラベルの組)については後ほど説明する。

In [48]:
# train() returns one (predictions, labels) pair per epoch, taken from that
# epoch's last batch.
n_epochs = 60
eval_tokens = train(model=model, optimizer=optimizer, n_epochs=n_epochs)

1/60 loss: 5.835270801222468
2/60 loss: 4.892273575426584
3/60 loss: 4.576794963285147
4/60 loss: 4.324355901005757
5/60 loss: 4.098599929407419
6/60 loss: 3.891183881874544
7/60 loss: 3.69426362773022
8/60 loss: 3.5012600637343994
9/60 loss: 3.3184977022998305
10/60 loss: 3.140416044786752
11/60 loss: 2.9700641531542122
12/60 loss: 2.799751356423619
13/60 loss: 2.6396425689559386
14/60 loss: 2.4855928866260024
15/60 loss: 2.338073099952146
16/60 loss: 2.1948005643235633
17/60 loss: 2.065133432307875
18/60 loss: 1.9391252556479122
19/60 loss: 1.8168115723563965
20/60 loss: 1.700798583317952
21/60 loss: 1.5883989190480796
22/60 loss: 1.4854144647897007
23/60 loss: 1.385881856263402
24/60 loss: 1.2935746671205544
25/60 loss: 1.2044040517634655
26/60 loss: 1.1196834628122398
27/60 loss: 1.0408244897802192
28/60 loss: 0.9677115822412882
29/60 loss: 0.8982452667621245
30/60 loss: 0.8315635398927942
31/60 loss: 0.7655266817793789
32/60 loss: 0.7054116743874838
33/60 loss: 0.6493757654385395



---

## 翻訳

In [49]:
@torch.no_grad()
def translate(model, text, max_len=100):
    """Greedily translate one Japanese sentence into English.

    The sentence is tokenised, encoded, and then decoded one token at a time,
    feeding the most probable token back in until ``<eos>`` is produced or
    ``max_len`` tokens have been emitted. The model is switched to eval mode
    and left there.

    Args:
        model: Trained model exposing ``encoder`` and ``decoder``.
        text: Raw Japanese sentence.
        max_len: Upper bound on the number of generated tokens.

    Returns:
        The generated English tokens joined by single spaces (without
        ``<bos>``/``<eos>``).
    """
    model.eval()

    src_tokens = tokenize([text], 'ja')[0]
    # Words that never occurred in the training corpus are not in vocab_ja, and
    # looking them up raises unless the vocab has a default index. Map them to
    # <unk> explicitly so arbitrary input cannot crash the lookup. The <unk>
    # embedding is never seen during training, so quality will suffer.
    src_tokens = [tok if tok in vocab_ja else unk for tok in src_tokens]
    src_ids = transform_ja(src_tokens).unsqueeze(0).to(device)  # (1, src_len)

    h = model.encoder(src_ids)
    # LSTM variant (disabled):
    # h0 = h.unsqueeze(0)
    # c0 = torch.zeros_like(h0)
    # h = (h0, c0)

    # Looked up once instead of on every decoding step.
    stop_id = vocab_en[eos]
    itos = vocab_en.get_itos()

    next_token = vocab_en[bos]
    out_ids = []
    for _ in range(max_len):
        step_input = torch.tensor(next_token).reshape(1, 1).to(device)
        y, h = model.decoder(step_input, h)
        y = F.softmax(y.ravel(), dim=0)
        # Stochastic alternative:
        # next_token = random.choices(range(len(y)), weights=y)[0]
        next_token = y.argmax().item()  # deterministic (greedy) choice
        if next_token == stop_id:
            break
        out_ids.append(next_token)
    return ' '.join([itos[t] for t in out_ids])

In [50]:
# Sanity check on a sentence that is itself in the training data (row #0001 of
# the preview above), so a correct output does not show generalisation.
translate(model, 'Xではないかとつくづく疑問に思う')

'i often wonder if it might be x .'

In [51]:
# Another training-set sentence (row #0005 of the preview above); this too
# checks that the model reproduces data it was trained on.
translate(model, '山田はみんなに好かれるタイプの人だと思う')

'i think that yamada is the type everybody likes .'