# seq2seq

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import MeCab
import pandas as pd
from tqdm import tqdm
from typing import List
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')


---

## Dataset

In [2]:
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data:', len(df))
df.head()

num of data: 5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [3]:
tagger = MeCab.Tagger('-Owakati')
def tokenize(data: List[str], l='en') -> List[List[str]]:
    if l == 'ja':
        return [tagger.parse(sentence).split() for sentence in data]
    elif l == 'en':
        return [sentence.replace('.', ' .').lower().split() for sentence in data]

In [4]:
text_ja = tokenize(df['japanese'], l='ja')
text_en = tokenize(df['english'], l='en')

In [5]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
max_len = 30
specials = [pad, bos, eos, unk]
vocab_ja = build_vocab_from_iterator(text_ja, specials=specials)
vocab_en = build_vocab_from_iterator(text_en, specials=specials)

transform_ja = Compose([
    transforms.Truncate(max_len-2),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_ja),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab_ja[pad])
])

transform_en = Compose([
    transforms.Truncate(max_len-2),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_en),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab_en[pad])
])

n_vocab_ja = len(vocab_ja)
n_vocab_en = len(vocab_en)

In [6]:
class TextDataset(Dataset):
    def __init__(self, in_text, out_text, in_transform, out_transform):
        self.in_text = in_text
        self.out_text = out_text
        self.in_transform = in_transform
        self.out_transform = out_transform
        self.n_samples = len(in_text)

    def __getitem__(self, index):
        in_text = self.in_text[index]
        out_text = self.out_text[index]
        in_text = self.in_transform(in_text)
        out_text = self.out_transform(out_text)
        return in_text, out_text[:-1], out_text[1:]

    def __len__(self):
        return self.n_samples

dataset = TextDataset(text_ja, text_en, transform_ja, transform_en)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

x_enc, x_dec, label = next(iter(dataloader))
print('x_enc:', x_enc.shape)
print('x_dec:', x_dec.shape)
print('label:', label.shape)

x_enc: torch.Size([32, 30])
x_dec: torch.Size([32, 29])
label: torch.Size([32, 29])



---

## モデル構築

最初LSTMで作ったが一旦RNNにする。

In [7]:
class Encoder(nn.Module):
    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.rnn(x)
        h = self.fc(h)
        # _, h = self.lstm(x)
        # h = self.fc(h[0][0])
        return h

In [8]:
class Decoder(nn.Module):
    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_vocab)

    def forward(self, x, h):
        x = self.embedding(x)
        y, h = self.rnn(x, h)
        # y, h = self.lstm(x, h)
        y = self.fc(y)
        return y, h

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, n_in_vocab, n_out_vocab, embed_size, hidden_size):
        super().__init__()
        self.encoder = Encoder(n_in_vocab, embed_size, hidden_size)
        self.decoder = Decoder(n_out_vocab, embed_size, hidden_size)

    def forward(self, x_enc, x_dec):
        h = self.encoder(x_enc)
        y, _ = self.decoder(x_dec, h)
        # h0 = h.unsqueeze(0)
        # c0 = torch.zeros_like(h0)
        # y, _ = self.decoder(x_dec, (h0, c0))
        return y


---

## 学習

In [10]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab_en[pad])
def train(model, optimizer, n_epochs):
    eval_tokens = []
    model.train()
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        for x_enc, x_dec, label in tqdm(dataloader, desc=f'{epoch}/{n_epochs}', disable=True):
            optimizer.zero_grad()
            x_enc = x_enc.to(device)
            x_dec = x_dec.to(device)
            label = label.to(device)
            y_pred = model(x_enc, x_dec)
            loss = criterion(y_pred.reshape(-1, y_pred.shape[-1]), label.ravel())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        eval_tokens.append((y_pred, label))
        print(f'{epoch}/{n_epochs} loss: {epoch_loss/len(dataloader)}', flush=True)
    return eval_tokens

In [11]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

eval_tokensについては後程

In [12]:
eval_tokens = train(model, optimizer, 20)

1/20 loss: 5.436596850314772
2/20 loss: 4.307769970721509
3/20 loss: 3.52607438219599
4/20 loss: 2.8977347698556373
5/20 loss: 2.405354056013636
6/20 loss: 2.037142706922738
7/20 loss: 1.759437583297132
8/20 loss: 1.5826276310955185
9/20 loss: 1.4604639554598244
10/20 loss: 1.395873271557222
11/20 loss: 1.351504603064204
12/20 loss: 1.3370569161621921
13/20 loss: 1.3281421690102082
14/20 loss: 1.322664046143911
15/20 loss: 1.3227734738085644
16/20 loss: 1.3196085526282528
17/20 loss: 1.3183850365948964
18/20 loss: 1.3180801940251545
19/20 loss: 1.3127760104386204
20/20 loss: 1.3138846130256194



---

## 翻訳

In [13]:
@torch.no_grad()
def translate(model, text, max_len=100):
    model.eval()
    tokens = tokenize([text], 'ja')[0]
    tokens = transform_ja(tokens).unsqueeze(0).to(device)
    h = model.encoder(tokens)
    # h0 = h.unsqueeze(0)
    # c0 = torch.zeros_like(h0)
    # h = (h0, c0)
    next_token = vocab_en[bos]

    tokens = []
    for _ in range(max_len):
        next_token = torch.tensor(next_token).reshape(1, 1).to(device)
        y, h = model.decoder(next_token, h)
        y = F.softmax(y.ravel(), dim=0)
        # next_token = random.choices(range(len(y)), weights=y)[0] # 確率的な出力
        next_token = y.argmax().item() # 決定的な出力
        if next_token == vocab_en[eos]:
            break
        tokens.append(next_token)
    return ' '.join([vocab_en.get_itos()[t] for t in tokens])

In [14]:
translate(model, 'Xではないかとつくづく疑問に思う')

'he will access a site with unexpurgated videos of cute girls .'

In [15]:
translate(model, '山田はみんなに好かれるタイプの人だと思う')

'he will access a site with unexpurgated videos of cute girls .'

lossはいい感じに減っているが、翻訳は上手くいっていない。


---

## メモ

なぜ翻訳が上手くいかないか。

### 考察

RNN系は学習時と推論時で挙動が異なる。具体的には（decoderへの）入力が異なり、学習時は正解データを入力するが、推論時は推論結果（前の時間でモデルが出力した単語）を入力する。これが原因で、lossは低いが推論は上手くいかないのではないか。

とりあえず、調べてみる。学習時にどの様な文章が生成されているのかを調べる。

In [16]:
n_epochs = 20
n_data = 10
y, t = eval_tokens[n_epochs - 1]
y = y.argmax(dim=-1)[:n_data]
t = t[:n_data]

for yi, ti in zip(y, t):
    print('生成:', ' '.join([vocab_en.get_itos()[i] for i in yi]))
    print('正解:', ' '.join([vocab_en.get_itos()[i] for i in ti]))
    print()

生成: he will remembered his time in worked as a salesperson . <eos> decision-making . . . . . . . weeks weeks weeks reissued "boob "boob "boob purchase, purchase,
正解: he suddenly recalled the time he worked as a salesperson . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he hall of energetic live performances . <eos> 〜 . . . . . . . weeks weeks weeks counter, "boob "boob "boob purchase, purchase, purchase, purchase, wondering wondering
正解: the two give energetic live performances . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he will well of several styles . <eos> decision-making . . . . . . . weeks weeks weeks counter, "boob "boob "boob purchase, purchase, purchase, purchase, wondering wondering
正解: he performs music of several styles . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

生成: he will access up the shop . <eos> rights . . . . . . . weeks weeks counter, reissued "boob "boob "boob purchase, purchase, purchase, purchase, wondering wondering
正解: he will step into the shop . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he provide with environmental regulations and other rules . <eos> . . . . . . . . weeks weeks weeks counter, "boob "boob purchase, purchase, purchase, purchase, wondering
正解: we comply with laws, regulations and other rules . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he can definitely a just by writing . <eos> and . . . . . . . weeks weeks weeks reissued "boob "boob "boob purchase, purchase, purchase, purchase, wondering
正解: you will get rewards just by writing . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

20epoch時点での最後のバッチの生成結果。  
これを見ると、毎回'he'から始まっていることが分かる。ただ、そこからいくつか単語が進むと正解と同じ文章が生成されるようになる。これはモデルが正解の単語を入力として受け取っているからである。推論時は間違った'he'という単語をそのまま受け取るので'he'に続く尤もらしい単語列を生成する。

現状、どんな隠れ状態を受け取っても'he'が出力される。これは学習をやり直しても変わらない。なぜ必ず'he'が出力されるかというと、データセット内の初めの単語に'he'が多いから。多分。’\<bos>'の次の単語は'he'である確率が最も高く、上記の結果は`argmax()`によって確定的に単語を出力しているため、全て'he'となる。

対策として、隠れ状態によって初めの単語が変化するようにしなければならない。どうすればいいかな

### 勾配消失

encoderが適切な隠れ状態を出力できていない場合、decoderがencoderへの入力を考慮しないモデルになってしまう気がする。ただの言語モデルと変わらなくなる。だから'\<bos>'の次に'he'がくる。データセット内ではその確率が最も高いから。

encoderがしっかり学習できているかを確かめるため、重みを見てみる。

In [23]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [24]:
model.encoder.embedding.weight

Parameter containing:
tensor([[-0.2442, -0.5861, -0.4899,  ..., -0.8273,  0.1949,  0.5169],
        [ 0.1649, -0.4432, -0.6046,  ..., -0.3246,  0.7523, -0.5198],
        [ 0.0740,  1.0601,  0.4548,  ..., -0.6372, -1.9226,  1.2581],
        ...,
        [ 0.0293,  0.7944,  1.9904,  ...,  0.7912,  0.4629, -0.9529],
        [-0.6784, -0.2369, -0.9609,  ..., -0.6254, -0.5470, -0.2561],
        [ 0.7684,  0.1238, -0.8585,  ...,  2.5816, -1.8934, -0.9014]],
       device='cuda:0', requires_grad=True)

In [25]:
train(model, optimizer, 5);

1/5 loss: 5.402651743716504
2/5 loss: 4.281170398355966
3/5 loss: 3.484377849532897
4/5 loss: 2.851475468601089
5/5 loss: 2.370304086122168


In [26]:
model.encoder.embedding.weight

Parameter containing:
tensor([[-0.2354, -0.5864, -0.4949,  ..., -0.8267,  0.1834,  0.5068],
        [ 0.1649, -0.4432, -0.6046,  ..., -0.3246,  0.7523, -0.5199],
        [ 0.1061,  1.0941,  0.4907,  ..., -0.6067, -1.9590,  1.2263],
        ...,
        [ 0.0293,  0.7944,  1.9904,  ...,  0.7912,  0.4629, -0.9529],
        [-0.6784, -0.2369, -0.9609,  ..., -0.6254, -0.5470, -0.2561],
        [ 0.7684,  0.1238, -0.8585,  ...,  2.5816, -1.8934, -0.9014]],
       device='cuda:0', requires_grad=True)

5epoch学習させたが、ほとんど変わっていない。全く変わっていないわけではないので、勾配は行き届いている。ただ勾配が小さすぎるだけ。勾配消失というやつ。

勾配を見てみると、こんな感じ。

In [27]:
model.encoder.embedding.weight.grad

tensor([[-1.7154e-09, -5.8014e-09,  1.7624e-09,  ..., -4.5326e-09,
          1.3269e-09,  2.0343e-09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.2370e-32, -5.0238e-32, -1.0095e-31,  ..., -1.6434e-31,
          1.2991e-31,  6.2421e-32],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')

ほぼ0