## Attention

In [126]:
import sentencepiece as spm
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from dlprog import train_progress

In [127]:
# Progress reporter used by the training loop.
prog = train_progress()

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')


---

## Attention機構

バッチサイズ1、デコーダの1時刻分だけを処理する場合

In [128]:
class Attention(nn.Module):
    """Dot-product attention for a single query vector (no batch dimension)."""

    def __init__(self):
        super().__init__()

    def forward(self, x, hs):
        """Return the attention-weighted sum of the rows of ``hs``.

        x: (hidden_size,) query vector
        hs: (seq_len, hidden_size) vectors to attend over
        """
        # Similarity of the query with every row of hs.
        scores = torch.matmul(hs, x)  # (seq_len,)
        # Normalize the scores into weights that sum to 1.
        probs = torch.softmax(scores, dim=0)  # (seq_len,)
        # Weighted sum of the rows of hs.
        context = torch.matmul(probs, hs)  # (hidden_size,)
        return context

ミニバッチ、デコーダの全時刻をまとめて処理する場合

In [129]:
class AttentionLayer(nn.Module):
    """Batched dot-product attention computed for all decoder steps at once."""

    def __init__(self):
        super().__init__()

    def forward(self, x, hs):
        """Return one context vector per decoder step.

        x: (batch_size, seq_len(dec), hidden_size) queries
        hs: (batch_size, seq_len(enc), hidden_size) vectors to attend over
        returns: (batch_size, seq_len(dec), hidden_size)
        """
        # Dot product of every decoder step with every encoder step.
        attention_score = torch.matmul(x, hs.transpose(1, 2))
        # (batch_size, seq_len(dec), seq_len(enc))
        weights = F.softmax(attention_score, dim=-1)
        # A single batched matmul forms the weighted sums for all decoder
        # steps; it replaces a Python loop over steps and also handles a
        # zero-length decoder sequence, where stacking an empty list fails.
        y = torch.matmul(weights, hs)  # (batch_size, seq_len(dec), hidden_size)
        return y


---

## 言語モデル

### 学習データ

In [145]:
textfile_ja = 'data/kyoto_ja_10000.txt'
textfile_en = 'data/kyoto_en_10000.txt'

# Read both text files as lists of lines (each line keeps its trailing newline).
# The encoding is passed explicitly because open()'s default depends on the
# platform locale, which can fail or mis-decode the Japanese file.
# NOTE(review): assumes both files are UTF-8 -- confirm.
with open(textfile_en, encoding='utf-8') as f:
    data_en = f.readlines()

with open(textfile_ja, encoding='utf-8') as f:
    data_ja = f.readlines()

n_data = len(data_en)
print('num of data:', n_data)

num of data: 10000


In [146]:
# Load one SentencePiece model per language from disk.
tokenizer_prefix_ja = 'models/tokenizer_kyoto_ja_10000'
tokenizer_prefix_en = 'models/tokenizer_kyoto_en_10000'
sp_ja = spm.SentencePieceProcessor(model_file=tokenizer_prefix_ja + '.model')
sp_en = spm.SentencePieceProcessor(model_file=tokenizer_prefix_en + '.model')

# Vocabulary sizes; they size the embedding and output layers of the models.
n_vocab_ja, n_vocab_en = len(sp_ja), len(sp_en)
print('num of vocabrary (ja):', n_vocab_ja)
print('num of vocabrary (en):', n_vocab_en)

num of vocabrary (ja): 8000
num of vocabrary (en): 8000


In [147]:
# Tokenize every line with the tokenizer of its own language
# (one list of token ids per line).
data_ids_ja, data_ids_en = sp_ja.encode(data_ja), sp_en.encode(data_en)

In [148]:
# Special-token ids. The English sequences take BOS/EOS from the English
# tokenizer rather than borrowing the Japanese tokenizer's ids, so the cell
# stays correct even if the two models number their special tokens differently.
bos_id = sp_en.bos_id()
eos_id = sp_en.eos_id()
eos_id_ja = sp_ja.eos_id()

# The id lists are modified in place, so this cell must be run only once.
# Each list is walked separately so no sequence is skipped if the two
# corpora ever differ in length (zip would stop silently at the shorter one).
for ids_ja in data_ids_ja:
    ids_ja.append(eos_id_ja)
for ids_en in data_ids_en:
    ids_en.insert(0, bos_id)
    ids_en.append(eos_id)

In [149]:
class TextDataset(Dataset):
    """Paired ja/en id sequences served as (encoder input, decoder input, decoder target)."""

    def __init__(self, data_ids_ja, data_ids_en):
        # Convert every id list to a tensor once, up front.
        self.data_ja = list(map(torch.tensor, data_ids_ja))
        self.data_en = list(map(torch.tensor, data_ids_en))
        self.n_data = len(self.data_ja)

    def __len__(self):
        return self.n_data

    def __getitem__(self, idx):
        source, target = self.data_ja[idx], self.data_en[idx]
        # The decoder input and its target are the same English sequence
        # shifted by one position: input drops the last id, target drops the first.
        return source, target[:-1], target[1:]

# Build the dataset and a shuffling loader, then peek at one batch.
# NOTE(review): batch_size is 1 presumably because the sequences have
# different lengths and no padding collate_fn is supplied -- confirm.
dataset = TextDataset(data_ids_ja, data_ids_en)
batch_size = 1
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)
x_enc, x_dec, y_dec = next(iter(dataloader))
x_enc, x_dec, y_dec # example

(tensor([[1412,   10,   16,   15,   53, 1307, 3859,    8,  608, 1428, 7949,  119,
            11,  653, 7297,   86,    5,    2]]),
 tensor([[   1,   21,  299,   20,    4,  352,  998,   76,  952,   38,   46,    4,
           272,  213,  835,  258,   45,  334,   11,    4,  605,  213,  835,  258,
            35,  410,  760, 1078,    6]]),
 tensor([[  21,  299,   20,    4,  352,  998,   76,  952,   38,   46,    4,  272,
           213,  835,  258,   45,  334,   11,    4,  605,  213,  835,  258,   35,
           410,  760, 1078,    6,    2]]))

### モデル

In [135]:
class Encoder(nn.Module):
    """Embed token ids, run them through an RNN and project every hidden state."""

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=hidden_size)

    def forward(self, x):
        """Encode a batch of id sequences.

        x: (batch_size, seq_len)

        Returns ``(hs, h)``:
            hs: (batch_size, seq_len, hidden_size) per-step states after the linear layer
            h: (1, batch_size, hidden_size) final RNN state (not passed through the linear layer)
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_size)
        outputs, h = self.rnn(embedded)
        return self.fc(outputs), h

In [141]:
class Decoder(nn.Module):
    """RNN decoder whose output logits are computed from attention over encoder states."""

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.attention = AttentionLayer()
        self.fc = nn.Linear(in_features=hidden_size, out_features=n_vocab)

    def forward(self, x, h, hs):
        """Decode a batch of id sequences.

        x: (batch_size, seq_len) decoder input ids
        h: (1, batch_size, hidden_size) initial RNN state
        hs: (batch_size, seq_len(enc), hidden_size) encoder states to attend over

        Returns ``(y, h)``:
            y: (batch_size, seq_len, n_vocab) logits
            h: (1, batch_size, hidden_size) final RNN state
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_size)
        hs_dec, h = self.rnn(embedded, h)  # hs_dec: (batch_size, seq_len, hidden_size)
        # The decoder states act only as attention queries; the logits are
        # computed from the resulting context vectors.
        context = self.attention(hs_dec, hs)  # (batch_size, seq_len, hidden_size)
        logits = self.fc(context)
        return logits, h

In [142]:
class Seq2Seq(nn.Module):
    """Chains an encoder and a decoder into one module."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, x_enc, x_dec):
        """Encode ``x_enc``, then decode ``x_dec`` from the encoder's outputs.

        Returns only the decoder's first output; its second output is dropped.
        """
        enc_states, enc_last = self.encoder(x_enc)
        # The encoder's second output seeds the decoder; its first output
        # is passed along as the states to attend over.
        out, _last = self.decoder(x_dec, enc_last, enc_states)
        return out

In [138]:
def train(model, optimizer, criterion, n_epochs, prog_unit=1):
    """Train ``model`` on the module-level ``dataloader`` for ``n_epochs`` epochs.

    model: called as ``model(x_enc, x_dec)``; returns logits whose last
        dimension is the output vocabulary size
    optimizer: optimizer over the model's parameters
    criterion: loss taking (N, n_classes) logits and (N,) target ids
    n_epochs: number of passes over ``dataloader``
    prog_unit: forwarded to ``prog.start`` as ``unit``

    Uses the module-level ``prog`` (progress display) and ``device``.
    """
    model.train()
    prog.start(n_iter=len(dataloader), n_epochs=n_epochs, unit=prog_unit)
    for _ in range(n_epochs):
        for x_enc, x_dec, y_dec in dataloader:
            optimizer.zero_grad()
            x_enc = x_enc.to(device)
            x_dec = x_dec.to(device)
            y_dec = y_dec.to(device)

            y_pred = model(x_enc, x_dec)
            # Flatten batch and time using the model's own output width.
            # The logits are over the decoder (English) vocabulary, so
            # hard-coding the Japanese vocabulary size only worked because
            # the two sizes happen to be equal.
            logits = y_pred.reshape(-1, y_pred.shape[-1])
            loss = criterion(logits, y_dec.ravel())
            loss.backward()
            optimizer.step()
            prog.update(loss.item())

In [150]:
# Layer sizes shared by the encoder and the decoder.
hidden_size = 1024
embed_size = 1024

# The encoder reads Japanese ids; the decoder produces English ids.
encoder = Encoder(n_vocab=n_vocab_ja, embed_size=embed_size, hidden_size=hidden_size)
decoder = Decoder(n_vocab=n_vocab_en, embed_size=embed_size, hidden_size=hidden_size)
model = Seq2Seq(encoder=encoder, decoder=decoder).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [151]:
# Run one training epoch over the module-level dataloader.
train(model, optimizer, criterion, n_epochs=1)

1/1: ######################################## 100% [00:02:53.66] loss: 6.21559 
