# seq2seq

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import MeCab
# from janome.tokenizer import Tokenizer
import pandas as pd
from tqdm import tqdm
from typing import List
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('mps')
device

device(type='cuda')


---

## Dataset

In [2]:
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data:', len(df))
df.head()

num of data: 5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [3]:
tagger = MeCab.Tagger('-Owakati')
# tokenizer = Tokenizer()
def tokenize(data: List[str], l='en') -> List[List[str]]:
    if l == 'ja':
        return [tagger.parse(sentence).split() for sentence in data]
        # return [[t for t in tokenizer.tokenize(sent, wakati=True)] for sent in data]
    elif l == 'en':
        return [sent.replace('.', ' .').lower().split() for sent in data]

In [4]:
text_ja = tokenize(df['japanese'], l='ja')
text_en = tokenize(df['english'], l='en')

In [16]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
specials = [pad, bos, eos, unk]
vocab_ja = build_vocab_from_iterator(text_ja, specials=specials)
vocab_en = build_vocab_from_iterator(text_en, specials=specials)

transform_ja = Compose([
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_ja),
    transforms.ToTensor(),
])

transform_en = Compose([
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_en),
    transforms.ToTensor(),
])

n_vocab_ja = len(vocab_ja)
n_vocab_en = len(vocab_en)

In [17]:
class TextDataset(Dataset):
    def __init__(self, in_text, out_text, in_transform, out_transform):
        self.n_samples = len(in_text)
        self.in_text = [in_transform(text) for text in in_text]
        self.out_text = [out_transform(text) for text in out_text]

    def __getitem__(self, index):
        in_text = self.in_text[index]
        out_text = self.out_text[index]
        return in_text, out_text[:-1], out_text[1:]

    def __len__(self):
        return self.n_samples

def to_padded_tensor(text_data: List[int], pad_value: int = 0) -> torch.Tensor:
    data = pad_sequence(text_data, batch_first=True, padding_value=pad_value)
    return data

def collate_fn(batch):
    enc_in_text, dec_in_text, dec_out_text = zip(*batch)
    enc_in_text = to_padded_tensor(enc_in_text)
    dec_in_text = to_padded_tensor(dec_in_text)
    dec_out_text = to_padded_tensor(dec_out_text)
    return enc_in_text, dec_in_text, dec_out_text

dataset = TextDataset(text_ja, text_en, transform_ja, transform_en)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

x_enc, x_dec, label = next(iter(dataloader))
print('x_enc:', x_enc.shape)
print('x_dec:', x_dec.shape)
print('label:', label.shape)

x_enc: torch.Size([32, 17])
x_dec: torch.Size([32, 18])
label: torch.Size([32, 18])



---

## モデル構築

In [8]:
class Encoder(nn.Module):
    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            n_vocab,
            embed_size,
            padding_idx=vocab_ja.get_stoi()[pad]
        )
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.rnn(x)
        h = self.fc(h) # (1, batch_size, hidden_size)
        # _, h = self.lstm(x)
        # h = self.fc(h[0][0])
        return h

In [9]:
class Decoder(nn.Module):
    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_vocab)

    def forward(self, x, h):
        x = self.embedding(x)
        y, h = self.rnn(x, h)
        # y, h = self.lstm(x, h)
        y = self.fc(y)
        return y, h

In [10]:
class Seq2Seq(nn.Module):
    def __init__(self, n_in_vocab, n_out_vocab, embed_size, hidden_size):
        super().__init__()
        self.encoder = Encoder(n_in_vocab, embed_size, hidden_size)
        self.decoder = Decoder(n_out_vocab, embed_size, hidden_size)

    def forward(self, x_enc, x_dec):
        h = self.encoder(x_enc)
        y, _ = self.decoder(x_dec, h)
        # h0 = h.unsqueeze(0)
        # c0 = torch.zeros_like(h0)
        # y, _ = self.decoder(x_dec, (h0, c0))
        return y


---

## 学習

In [11]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab_en[pad])
def train(model, optimizer, n_epochs):
    eval_tokens = []
    model.train()
    for epoch in range(1, n_epochs + 1):
        epoch_loss = 0
        for x_enc, x_dec, label in tqdm(dataloader, desc=f'{epoch}/{n_epochs}'):
            optimizer.zero_grad()
            x_enc = x_enc.to(device)
            x_dec = x_dec.to(device)
            label = label.to(device)
            y_pred = model(x_enc, x_dec)
            loss = criterion(y_pred.reshape(-1, y_pred.shape[-1]), label.ravel())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        eval_tokens.append((y_pred, label))
        print(f'{epoch}/{n_epochs} loss: {epoch_loss/len(dataloader)}', flush=True)
    return eval_tokens

In [12]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

eval_tokensについては後程

In [13]:
eval_tokens = train(model, optimizer, 10)

1/10: 100%|██████████| 166/166 [00:02<00:00, 72.64it/s]

1/10 loss: 5.425530226833849



2/10: 100%|██████████| 166/166 [00:02<00:00, 79.95it/s]

2/10 loss: 4.235839473195822



3/10: 100%|██████████| 166/166 [00:02<00:00, 79.35it/s]

3/10 loss: 3.466804277466004



4/10: 100%|██████████| 166/166 [00:02<00:00, 78.41it/s]

4/10 loss: 2.8385165915431747



5/10: 100%|██████████| 166/166 [00:02<00:00, 78.06it/s]

5/10 loss: 2.3676548779728903



6/10: 100%|██████████| 166/166 [00:02<00:00, 79.08it/s]

6/10 loss: 2.0015225862882224



7/10: 100%|██████████| 166/166 [00:02<00:00, 79.81it/s]

7/10 loss: 1.7395591577851628



8/10: 100%|██████████| 166/166 [00:02<00:00, 79.12it/s]

8/10 loss: 1.5688009305172657



9/10: 100%|██████████| 166/166 [00:02<00:00, 79.02it/s]

9/10 loss: 1.4572286483753158



10/10: 100%|██████████| 166/166 [00:02<00:00, 79.07it/s]

10/10 loss: 1.3897688001035207






---

## 翻訳

In [18]:
@torch.no_grad()
def translate(model, text, max_len=100):
    model.eval()
    tokens = tokenize([text], 'ja')[0]
    tokens = transform_ja(tokens).unsqueeze(0).to(device)
    h = model.encoder(tokens)
    # h0 = h.unsqueeze(0)
    # c0 = torch.zeros_like(h0)
    # h = (h0, c0)
    next_token = vocab_en[bos]

    tokens = []
    for _ in range(max_len):
        next_token = torch.tensor(next_token).reshape(1, 1).to(device)
        y, h = model.decoder(next_token, h)
        y = F.softmax(y.ravel(), dim=0)
        # next_token = random.choices(range(len(y)), weights=y)[0] # 確率的な出力
        next_token = y.argmax().item() # 決定的な出力
        if next_token == vocab_en[eos]:
            break
        tokens.append(next_token)
    return ' '.join([vocab_en.get_itos()[t] for t in tokens])

In [19]:
translate(model, 'Xではないかとつくづく疑問に思う')

'the extras dvd contains an over two hour-long animation .'

In [20]:
translate(model, '山田はみんなに好かれるタイプの人だと思う')

'the extras dvd contains an over two hour-long animation .'

lossはいい感じに減っているが、翻訳は上手くいっていない。

---

In [21]:
translate(model, 'Xではないかとつくづく疑問に思う')

'the extras dvd contains an over two hour-long animation .'

In [22]:
translate(model, '山田はみんなに好かれるタイプの人だと思う')

'the extras dvd contains an over two hour-long animation .'

In [23]:
n_epochs = 10
n_data = 10
y, t = eval_tokens[n_epochs - 1]
y = y.argmax(dim=-1)[:n_data]
t = t[:n_data]

for yi, ti in zip(y, t):
    print('生成:', ' '.join([vocab_en.get_itos()[i] for i in yi]))
    print('正解:', ' '.join([vocab_en.get_itos()[i] for i in ti]))
    print()

生成: the it undergoes fusion, it goes through several processes . <eos> . . . . . .
正解: before it undergoes fusion, it goes through several processes . <eos> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he have send greater dreams a reality . <eos> . . . . . . . .
正解: we will make your thoughts a reality . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he city said education designates it as an important cultural property . <eos> . . . .
正解: the minister of education designates it as an important cultural property . <eos> <pad> <pad> <pad> <pad>

生成: he will me more human . <eos> . . . . . . . . . .
正解: it makes us more human . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he city price includes the 5% consumption tax . <eos> . . . . . . .
正解: the list price includes the 5% consumption tax . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he is a into the shopping cart . <eos> . . . . . . . .
正解: he put items into the shopping cart . <eos> <pad> <pa

In [139]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)

In [140]:
x, y, z = next(iter(dataloader))
x.shape

torch.Size([32, 10])

In [141]:
h = model.encoder(x.to(device))
h.shape

torch.Size([1, 32, 1024])

In [142]:
h

tensor([[[-4.9013e-01,  6.0347e-01,  2.1682e-01,  ...,  3.6929e-01,
           2.6787e-04, -2.6888e-01],
         [-5.3014e-01,  5.6925e-01,  1.4656e-01,  ...,  4.0848e-01,
           2.7478e-02, -2.5881e-01],
         [-4.3444e-01,  2.7794e-01,  2.2995e-01,  ...,  2.1237e-01,
           6.0023e-03, -2.4894e-01],
         ...,
         [-1.2835e-01,  1.2345e-01, -3.6252e-01,  ...,  1.2249e-01,
          -1.4214e-01, -5.1636e-03],
         [-3.8142e-01,  5.4180e-01,  1.1163e-01,  ...,  5.1948e-01,
           1.1526e-01, -1.8388e-01],
         [-4.6349e-01,  4.7524e-01,  3.2895e-01,  ...,  3.0707e-01,
          -7.8735e-02, -2.8814e-01]]], grad_fn=<ViewBackward0>)


---

## メモ

なぜ翻訳が上手くいかないか。

### 考察

RNN系は学習時と推論時で挙動が異なる。具体的には（decoderへの）入力が異なり、学習時は正解データを入力するが、推論時は推論結果（前の時間でモデルが出力した単語）を入力する。これが原因で、lossは低いが推論は上手くいかないのではないか。

とりあえず、調べてみる。学習時にどの様な文章が生成されているのかを調べる。

In [16]:
n_epochs = 20
n_data = 10
y, t = eval_tokens[n_epochs - 1]
y = y.argmax(dim=-1)[:n_data]
t = t[:n_data]

for yi, ti in zip(y, t):
    print('生成:', ' '.join([vocab_en.get_itos()[i] for i in yi]))
    print('正解:', ' '.join([vocab_en.get_itos()[i] for i in ti]))
    print()

生成: he minutes listened earnestly to the reporter . <eos> . . . . . . . . . . . . . . . . . . . .
正解: fifty people listened earnestly to the reporter . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he companies from mainland china are listed on the hong kong stock market . <eos> . . . . . . . . . . . . . .
正解: 231 companies from mainland china are listed on the hong kong stock market . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he will wonder thirsty . <eos> . . . . . . . to . . . . . . . to . . . . . . .
正解: i often get thirsty . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he the 25th, he makes the service worse . <eos> . . . . . . . . . . . . . . . . . . .
正解: on the contrary, it makes the service worse . <eos> <pad> <pad> <pad> <pad> <pad> <pad>

正解: saline springs make the flower of civilization bloom upon the earth . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he that point, something felt something i had never felt before . <eos> . . . . . . . . . . . . . . . .
正解: at that moment, i felt something i had never felt before . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he explanation using data convenyance is explained below . <eos> . . . . . . . . . . . . . . . . . . .
正解: an example using data convenyance is explained below . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

生成: he has me like this . <eos> . . . . . . . . . . . . . . . . . . . . . .
正解: someone calls me like this . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>



20epoch時点での最後のバッチの生成結果。  
これを見ると、毎回'he'から始まっていることが分かる。ただ、そこからいくつか単語が進むと正解と同じ文章が生成されるようになる。これはモデルが正解の単語を入力として受け取っているからである。推論時は間違った'he'という単語をそのまま受け取るので'he'に続く尤もらしい単語列を生成する。

現状、どんな隠れ状態を受け取っても'he'が出力される。これは学習をやり直しても変わらない。なぜ必ず'he'が出力されるかというと、データセット内の初めの単語に'he'が多いから。多分。’\<bos>'の次の単語は'he'である確率が最も高く、上記の結果は`argmax()`によって確定的に単語を出力しているため、全て'he'となる。

対策として、隠れ状態によって初めの単語が変化するようにしなければならない。どうすればいいかな

### 勾配消失

encoderが適切な隠れ状態を出力できていない場合、decoderがencoderへの入力を考慮しないモデルになってしまう気がする。ただの言語モデルと変わらなくなる。だから'\<bos>'の次に'he'がくる。データセット内ではその確率が最も高いから。

encoderがしっかり学習できているかを確かめるため、重みを見てみる。

In [17]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [18]:
model.encoder.embedding.weight

Parameter containing:
tensor([[-1.5888e-01, -6.5523e-01,  5.1745e-01,  ...,  5.7562e-01,
          1.9208e-01,  1.2221e+00],
        [ 1.5327e+00,  7.8946e-02, -2.2005e+00,  ...,  6.6192e-01,
         -3.9639e-01, -1.1072e+00],
        [-2.0369e-02,  2.2732e-01,  1.1714e+00,  ..., -5.1474e-02,
          7.2456e-01,  4.2381e-01],
        ...,
        [-2.1024e+00, -1.2815e+00,  1.6506e+00,  ...,  9.3447e-01,
          1.3707e-01, -1.3592e+00],
        [-2.3145e-01, -6.8438e-01, -1.6614e+00,  ...,  3.5744e-01,
          2.6786e-01,  8.6400e-01],
        [-2.6322e+00, -9.9322e-01,  1.3165e+00,  ...,  6.4995e-01,
         -6.0848e-01, -9.7617e-04]], device='cuda:0', requires_grad=True)

In [19]:
train(model, optimizer, 5);

1/5 loss: 5.260552417801087
2/5 loss: 4.101400969976402
3/5 loss: 3.1145439880440033
4/5 loss: 2.226846011288195
5/5 loss: 1.6628079163022789


In [20]:
model.encoder.embedding.weight

Parameter containing:
tensor([[-1.6667e-01, -6.6668e-01,  5.2077e-01,  ...,  5.8563e-01,
          2.0319e-01,  1.2334e+00],
        [ 1.5328e+00,  7.8925e-02, -2.2005e+00,  ...,  6.6193e-01,
         -3.9640e-01, -1.1073e+00],
        [-1.8841e-02,  2.2387e-01,  1.1681e+00,  ..., -4.8255e-02,
          7.2018e-01,  4.2445e-01],
        ...,
        [-2.1024e+00, -1.2815e+00,  1.6506e+00,  ...,  9.3447e-01,
          1.3707e-01, -1.3592e+00],
        [-2.3145e-01, -6.8438e-01, -1.6614e+00,  ...,  3.5744e-01,
          2.6786e-01,  8.6400e-01],
        [-2.6322e+00, -9.9322e-01,  1.3165e+00,  ...,  6.4995e-01,
         -6.0848e-01, -9.7620e-04]], device='cuda:0', requires_grad=True)

5epoch学習させたが、ほとんど変わっていない。全く変わっていないわけではないので、勾配は行き届いている。ただ勾配が小さすぎるだけ。勾配消失というやつ。

勾配を見てみると、こんな感じ。

In [21]:
model.encoder.embedding.weight.grad

tensor([[-1.4703e-10, -8.5906e-11,  1.6617e-10,  ..., -6.1598e-11,
          4.8468e-11,  1.8586e-10],
        [ 1.6159e-16,  1.3984e-16, -1.0948e-16,  ..., -1.3816e-16,
         -1.2788e-16, -1.4849e-16],
        [-1.6270e-17,  5.1073e-17, -6.4859e-17,  ..., -7.4436e-17,
          5.5735e-17, -1.3135e-16],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]], device='cuda:0')

ほぼ0


### 考察2

勾配消失じゃなくね？

LSTM使っても解決しないし、そもそもRNNの場合時間ごとに勾配が足されるのでこんな露骨に0になることもない気がする。勾配消失は終盤に勾配が届かなくなることだよね。ということは序盤は勾配が行き届くのでその分の勾配はあるはず。

ということは、encoderが出力した隠れ状態の勾配がそもそも小さすぎる説が有力？。

In [101]:
model = Seq2Seq(len(vocab_ja), len(vocab_en), 1024, 1024).to(device)

In [102]:
x, y, z = next(iter(dataloader))
x.shape

torch.Size([32, 30])

In [103]:
h = model.encoder(x.to(device))
h.shape

torch.Size([1, 32, 1024])

In [104]:
h

tensor([[[ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735],
         [ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735],
         [ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735],
         ...,
         [ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735],
         [ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735],
         [ 0.0888, -0.0887,  0.3468,  ..., -0.3370,  0.1387, -0.3735]]],
       grad_fn=<ViewBackward0>)

なんだこれ、バッチに沿って全部同じやんけ


In [107]:
h[0, :2].data.cpu().numpy()

array([[ 0.08879188, -0.08866556,  0.34679464, ..., -0.33701196,
         0.13869706, -0.37350526],
       [ 0.0887944 , -0.08866922,  0.34679207, ..., -0.33701053,
         0.13869974, -0.37350455]], dtype=float32)

って思ったけど微妙に違うな。プログラムのミスではないということかな。  
最後の方に<pad>が並んでいるせいで似た値になりやすい？