## Attention

In [1]:
import sentencepiece as spm
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from dlprog import train_progress

In [2]:
# Progress-reporting helper provided by dlprog.
prog = train_progress()
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
textfile_ja = 'data/kyoto_ja.txt'
textfile_en = 'data/kyoto_en.txt'

# Read both sides of the corpus, one sentence per line (the trailing newline
# is kept by readlines()). The encoding is pinned to UTF-8 so that decoding
# the Japanese text does not depend on the platform's locale default.
# NOTE(review): assumes both files are stored as UTF-8 — confirm.
with open(textfile_en, encoding='utf-8') as f:
    data_en = f.readlines()

with open(textfile_ja, encoding='utf-8') as f:
    data_ja = f.readlines()

# The corpus size is taken from the English side.
n_data = len(data_en)
print('num of data:', n_data)

num of data: 443596


In [4]:
# Load one SentencePiece model per language from '<prefix>.model'.
tokenizer_prefix_ja = 'models/tokenizer_kyoto_ja'
tokenizer_prefix_en = 'models/tokenizer_kyoto_en'
sp_ja = spm.SentencePieceProcessor(tokenizer_prefix_ja + '.model')
sp_en = spm.SentencePieceProcessor(tokenizer_prefix_en + '.model')

# Vocabulary sizes, reported per language.
n_vocab_ja, n_vocab_en = len(sp_ja), len(sp_en)
print('num of vocabrary (ja):', n_vocab_ja)
print('num of vocabrary (en):', n_vocab_en)

num of vocabrary (ja): 8000
num of vocabrary (en): 8000


In [5]:
# Tokenize every line of each corpus with the tokenizer of its own language.
data_ids_en = sp_en.encode(data_en)
data_ids_ja = sp_ja.encode(data_ja)

In [6]:
# Special-token ids are read from the Japanese tokenizer and applied to both
# languages.
# NOTE(review): assumes the English tokenizer uses the same BOS/EOS ids — confirm.
bos_id, eos_id = sp_ja.bos_id(), sp_ja.eos_id()

# The id lists are modified in place, so running this cell twice adds the
# special tokens twice.
for ids_ja, ids_en in zip(data_ids_ja, data_ids_en):
    ids_ja.append(eos_id)   # source:  tokens ... EOS
    ids_en[:0] = [bos_id]   # target:  BOS tokens ...
    ids_en.append(eos_id)   # target:  BOS tokens ... EOS

In [7]:
class TextDataset(Dataset):
    """Index-aligned pairs of token-id sequences for seq2seq training.

    Each item yields the source sequence plus the target sequence split into
    a decoder input (last token dropped) and a decoder target (first token
    dropped), i.e. the target shifted by one position.

    Args:
        data_ids_ja: Iterable of source-side token-id lists, one per sentence.
        data_ids_en: Iterable of target-side token-id lists, one per sentence,
            paired with ``data_ids_ja`` by position.

    Raises:
        ValueError: If the two corpora do not contain the same number of
            sentences.
    """

    def __init__(self, data_ids_ja, data_ids_en):
        # dtype is explicit: torch.tensor([]) would otherwise be inferred as
        # float32, which cannot be used to index an nn.Embedding.
        self.data_ja = [torch.tensor(ids, dtype=torch.long) for ids in data_ids_ja]
        self.data_en = [torch.tensor(ids, dtype=torch.long) for ids in data_ids_en]
        # Fail early instead of silently dropping pairs or raising an
        # IndexError in the middle of training.
        if len(self.data_ja) != len(self.data_en):
            raise ValueError(
                'source and target corpora differ in length: '
                f'{len(self.data_ja)} != {len(self.data_en)}'
            )
        self.n_data = len(self.data_ja)

    def __getitem__(self, idx):
        """Return ``(x_enc, x_dec, y_dec)`` for the sentence pair at ``idx``."""
        ja = self.data_ja[idx]
        en = self.data_en[idx]
        x_enc = ja  # input to the encoder
        x_dec = en[:-1]  # input to the decoder (final token dropped)
        y_dec = en[1:]  # expected decoder output (first token dropped)
        return x_enc, x_dec, y_dec

    def __len__(self):
        """Return the number of sentence pairs."""
        return self.n_data

# One sentence pair per batch. No collate_fn is supplied, so presumably this
# avoids having to pad sentences of different lengths — TODO confirm.
batch_size = 1
dataset = TextDataset(data_ids_ja, data_ids_en)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Pull a single (shuffled) batch to inspect what the loader produces.
x_enc, x_dec, y_dec = next(iter(dataloader))
x_enc, x_dec, y_dec # example

(tensor([[1023, 1600,  233, 1843,  386,    9,  911,  355, 1014, 1067,  248,  391,
            90,    5, 2206,   11,  554,   28,  248,  391,   90,   34,   27, 1711,
           148,  317,   24, 2911,  294,   44,    4, 3483,  563, 3657, 1785,   27,
           670,  104, 1512,  248,  391,   90,   15, 3936,    5, 1785,   27, 7355,
            72, 3349,  417,    4,  911,  355, 1014, 1067,  294,  220,  341, 1314,
          1847, 1993,  450,    7,    2]]),
 tensor([[   1,  268,   48,  355,  117,   13, 3086,   90, 7082,    8,    9, 1855,
          2691,    8,   21, 1119,    4, 1270,    8, 1142,  276,  170, 1628,    6,
            13, 3086,   90,   20,   50,  703,    6, 3329,   14,  254,  260,  163,
           111,    5,    9,  295,  110,   48, 2446,    9, 1265,    8,   11, 3564,
          3359, 1973,    9, 5646,   21,   48,  742,   12,  273,    6,   13, 3086,
            90,    5,  180, 3253,   36,  379, 1008,   20,   50,  703,    6, 3329,
            14,  254,  260,  163,  111,    7]]),
 tenso

In [None]:
class Encoder(nn.Module):
    """Encode a source token sequence into a single hidden state.

    Pipeline: embedding -> single-layer RNN -> linear projection of the
    RNN's final hidden state.

    Args:
        n_vocab: Size of the source vocabulary.
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the RNN hidden state (and of the output).
    """

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab, embedding_dim=embed_size
        )
        self.rnn = nn.RNN(
            input_size=embed_size, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=hidden_size)

    def forward(self, x):
        """Return the projected final hidden state.

        Args:
            x: Token ids of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (1, batch_size, hidden_size).
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_size)
        # Only the final hidden state is kept; per-step outputs are discarded.
        _, last_hidden = self.rnn(embedded)  # (1, batch_size, hidden_size)
        return self.fc(last_hidden)  # (1, batch_size, hidden_size)

In [None]:
class Decoder(nn.Module):
    """Produce vocabulary scores for each target position.

    Pipeline: embedding -> single-layer RNN (started from a given hidden
    state) -> linear layer mapping every RNN output to vocabulary scores.

    Args:
        n_vocab: Size of the target vocabulary.
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the RNN hidden state.
    """

    def __init__(self, n_vocab, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab, embedding_dim=embed_size
        )
        self.rnn = nn.RNN(
            input_size=embed_size, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=n_vocab)

    def forward(self, x, h):
        """Run the decoder over ``x`` starting from hidden state ``h``.

        Shapes below assume batched input, as the RNN is built with
        batch_first=True — TODO confirm no caller passes unbatched tensors.

        Args:
            x: Token ids, (batch_size, seq_len).
            h: Initial hidden state, (1, batch_size, hidden_size).

        Returns:
            Tuple ``(scores, hidden)``: scores of shape
            (batch_size, seq_len, n_vocab) and the final hidden state of
            shape (1, batch_size, hidden_size).
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_size)
        outputs, hidden = self.rnn(embedded, h)  # outputs: (batch_size, seq_len, hidden_size)
        scores = self.fc(outputs)  # (batch_size, seq_len, n_vocab)
        return scores, hidden

In [None]:
class Seq2Seq(nn.Module):
    """Chain an encoder and a decoder into one module.

    Args:
        encoder: Callable taking the encoder input and returning the state
            handed to the decoder.
        decoder: Callable taking ``(x_dec, state)`` and returning a pair
            whose first element is the output.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x_enc, x_dec):
        """Encode ``x_enc``, then decode ``x_dec`` from the encoder's state.

        Returns:
            The first element of the decoder's result; the second element
            (the decoder's final state) is discarded.
        """
        enc_state = self.encoder(x_enc)
        dec_out, _final_state = self.decoder(x_dec, enc_state)
        return dec_out