<a href="https://colab.research.google.com/github/tanakakeitaro/r_d/blob/master/pytorch_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The later cells read their pickle files from a folder under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/My\ Drive/marusen575

/content/drive/.shortcut-targets-by-id/103/marusen575


In [0]:
import numpy as np
import pickle

def pickle_load(path):
    """Read a pickle file and return the deserialized object.

    Args:
        path: Location of the pickle file to open in binary mode.

    Returns:
        Whatever object was stored in the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)
# Load the two parallel datasets from pickle files in the current directory.
# Later cells iterate over both as lists of integer-id lists
# (kami5 is the encoder input, naka7 the decoder target).
kami5_data = pickle_load('marusen_kami5_list.pickle')
naka7_data = pickle_load('marusen_naka7_list.pickle')

# Vocabulary lookup tables: id_to_word is indexed by token id, word_to_id by token string.
id_to_word = pickle_load('id_to_word_marusen575.pickle')
word_to_id = pickle_load('word_to_id_marusen575.pickle')

In [0]:
# Right-pad every kami5 sequence in place with the padding id (2) so that each
# one has at least KAMI5_LEN entries. Sequences that are already that long (or
# longer) are left untouched. The lists are mutated in place, so no write-back
# into kami5_data and no manual index counter are needed.
KAMI5_LEN = 5
PAD_ID = 2
for seq in kami5_data:
    # A non-positive repeat count yields an empty list, i.e. no padding is added.
    seq.extend([PAD_ID] * (KAMI5_LEN - len(seq)))

In [0]:
# Prepare the decoder sequences in place:
#   1. prepend <bos> (id 0) and append <eos> (id 1);
#   2. right-pad with the padding id (2) up to NAKA7_LEN entries.
# A plain loop replaces the side-effect-only list comprehensions (which built
# throwaway lists of None) and the redundant write-back into naka7_data.
# NOTE: this cell is not idempotent - running it twice wraps every sequence in
# a second <bos>/<eos> pair, so reload naka7_data before re-running.
BOS_ID = 0
EOS_ID = 1
PAD_ID = 2
NAKA7_LEN = 9
for seq in naka7_data:
    seq.insert(0, BOS_ID)
    seq.append(EOS_ID)
    # A non-positive repeat count yields an empty list, i.e. no padding is added.
    seq.extend([PAD_ID] * (NAKA7_LEN - len(seq)))

In [0]:
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle

# Split the paired data into train and test sets at an 8 : 2 ratio.
# Both lists are passed together so the kami5/naka7 pairs stay aligned;
# random_state=0 fixes the split across runs.
train_5, test_5, train_7, test_7 = train_test_split(kami5_data, naka7_data, test_size= 0.2, random_state=0)

# Helper that turns the data into mini-batches
def train2batch(input_data, output_data, batch_size=100):
    """Shuffle two aligned sequences together and cut them into mini-batches.

    NOTE: a function with the same name is defined again in a later cell of
    this notebook; once that cell runs, it replaces this definition.

    Args:
        input_data: Encoder-side samples.
        output_data: Decoder-side samples, aligned with ``input_data``.
        batch_size: Maximum number of samples per batch.

    Returns:
        A pair ``(input_batch, output_batch)`` of lists of slices. The final
        slice may hold fewer than ``batch_size`` samples.
    """
    shuffled_in, shuffled_out = shuffle(input_data, output_data)
    starts = range(0, len(input_data), batch_size)
    input_batch = [shuffled_in[s:s + batch_size] for s in starts]
    output_batch = [shuffled_out[s:s + batch_size] for s in starts]
    return input_batch, output_batch

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Model hyperparameters shared by the encoder and the decoder.
embedding_dim = 50   # size of each token embedding vector
hidden_dim = 128     # size of the GRU hidden state
vocab_size = len(word_to_id)  # number of entries in the vocabulary table
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
class Encoder(nn.Module):
    """GRU encoder that maps a batch of token-id sequences to a final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the GRU hidden state.
        batch_size: Kept for backward compatibility and stored on the
            instance; ``forward`` no longer depends on it.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size=100):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, indices):
        """Encode ``indices`` and return the GRU's final hidden state.

        Args:
            indices: Integer tensor of token ids, either ``(batch, seq)`` or
                ``(batch,)``; the latter is treated as sequences of length 1.

        Returns:
            Tensor of shape ``(1, batch, hidden_dim)``.
        """
        embedding = self.word_embeddings(indices)
        if embedding.dim() == 2:
            # (batch, emb) -> (batch, 1, emb): add the sequence axis the GRU expects.
            embedding = torch.unsqueeze(embedding, 1)
        # With no initial state the GRU starts from zeros sized to the actual
        # batch and on the input's device, so any batch size works (not only
        # self.batch_size) and no global `device` is needed.
        _, state = self.gru(embedding)

        return state

In [0]:
class Decoder(nn.Module):
    """GRU decoder that turns token ids plus a hidden state into vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and size of the output layer.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the GRU hidden state.
        batch_size: Stored on the instance; not read by ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size=100):
        super().__init__()
        self.hidden_dim, self.batch_size = hidden_dim, batch_size

        # Layers are created in the same order as before so seeded initialisation is unchanged.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, index, state):
        """Run the decoder on ``index`` starting from ``state``.

        Args:
            index: Integer tensor of token ids, ``(batch,)`` or ``(batch, seq)``.
            state: GRU hidden state of shape ``(1, batch, hidden_dim)``.

        Returns:
            Tuple ``(logits, new_state)`` where ``logits`` has shape
            ``(batch, seq, vocab_size)`` (``seq`` is 1 for 1-D input).
        """
        embedded = self.word_embeddings(index)
        if embedded.dim() == 2:
            # (batch, emb) -> (batch, 1, emb): one decoding step per sample.
            embedded = embedded.unsqueeze(1)
        rnn_out, next_state = self.gru(embedded, state)
        return self.output(rnn_out), next_state

In [0]:
# Build the encoder/decoder and move them to the selected device (GPU when available).
encoder = Encoder(vocab_size, embedding_dim, hidden_dim).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim).to(device)

# Loss function. It is created with default arguments (no ignore_index),
# so every target position, padding included, contributes to the loss.
criterion = nn.CrossEntropyLoss()

# Optimizers: one Adam instance per module, both with learning rate 0.001.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)

In [0]:
from datetime import datetime
from sklearn.utils import shuffle

# Module-level mini-batch size. Note that the training loop in this notebook calls
# train2batch without arguments and therefore uses that function's own default instead.
batch_size = 100
def train2batch(data, target, batch_size=100):
    """Shuffle two aligned sequences together and cut them into full mini-batches.

    A trailing batch with fewer than ``batch_size`` samples is dropped from
    BOTH returned lists, so the two lists always have the same length and
    every batch holds exactly ``batch_size`` samples. A final batch that is
    already full is kept.

    Args:
        data: Encoder-side samples.
        target: Decoder-side samples, aligned with ``data``.
        batch_size: Number of samples per batch.

    Returns:
        A pair ``(input_batch, output_batch)`` of equally long lists of
        slices; both are empty when there are fewer than ``batch_size`` samples.
    """
    if len(data) == 0:
        # Nothing to batch; avoids popping from an empty list.
        return [], []

    input_batch = []
    output_batch = []
    input_shuffle, output_shuffle = shuffle(data, target)
    for i in range(0, len(data), batch_size):
        input_batch.append(input_shuffle[i:i+batch_size])
        output_batch.append(output_shuffle[i:i+batch_size])

    # Drop the last batch only when it is incomplete, and drop it from both
    # lists so inputs and targets stay paired index-by-index.
    if input_batch and len(input_batch[-1]) < batch_size:
        input_batch.pop()
        output_batch.pop()
    return input_batch, output_batch

def get_current_time():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    now = datetime.now()
    return f"{now:%Y-%m-%d %H:%M:%S}"

In [0]:
# Train the encoder/decoder with teacher forcing. Every 10th epoch the loss of
# the LAST processed batch (summed over decoding steps) is printed and a
# checkpoint holding both state dicts is written.
print("Training…")
n_epoch = 100
for epoch in range(1, n_epoch+1):
    # Reshuffle and re-batch the training pairs at the start of every epoch.
    input_batch, output_batch = train2batch(train_5, train_7)
    # zip pairs each input batch with its target batch and stops at the shorter list.
    for batch_in, batch_out in zip(input_batch, output_batch):
        # Reset gradients
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        # Convert the batch to tensors
        inputs = torch.tensor(batch_in, device=device)
        outputs = torch.tensor(batch_out, device=device)
        # Encoder forward pass
        encoder_hidden = encoder(inputs)
        # The decoder is fed the target sequence shifted by one position.
        # Decoder input: everything except the last token
        source = outputs[:, :-1]
        # Decoder target: everything except the leading "<bos>"
        target = outputs[:, 1:]
        decoder_hidden = encoder_hidden

        # Forward batch of sequences through decoder one time step at a time
        loss = 0
        # `step` is a dedicated name so the time-step index does not shadow
        # the batch loop's variables.
        for step in range(source.size(1)):
            decoder_output, decoder_hidden = decoder(source[:, step], decoder_hidden)
            # Remove only the length-1 sequence axis: (batch, 1, vocab) -> (batch, vocab).
            # A bare squeeze() would also collapse the batch axis for a batch of one.
            decoder_output = decoder_output.squeeze(1)
            loss += criterion(decoder_output, target[:, step])

        # Backpropagation
        loss.backward()

        # Parameter update: both encoder and decoder are trained
        encoder_optimizer.step()
        decoder_optimizer.step()

    if epoch % 10 == 0:
        print(get_current_time(), "Epoch %d: %.2f" % (epoch, loss.item()))

        model_name = "seq2seq_calculator_v{}.pt".format(epoch)
        torch.save({
            'encoder_model': encoder.state_dict(),
            'decoder_model': decoder.state_dict(),
        }, model_name)
        print("Saving the checkpoint...")

Training…
2020-05-18 14:24:04 Epoch 10: 17.67
Saving the checkpoint...
2020-05-18 14:36:34 Epoch 20: 12.52
Saving the checkpoint...
2020-05-18 14:49:04 Epoch 30: 7.30
Saving the checkpoint...
2020-05-18 15:01:29 Epoch 40: 4.73
Saving the checkpoint...
2020-05-18 15:13:58 Epoch 50: 3.18
Saving the checkpoint...
2020-05-18 15:26:25 Epoch 60: 2.32
Saving the checkpoint...
2020-05-18 15:38:51 Epoch 70: 1.69
Saving the checkpoint...
2020-05-18 15:51:12 Epoch 80: 1.31
Saving the checkpoint...
2020-05-18 16:03:35 Epoch 90: 1.14
Saving the checkpoint...
2020-05-18 16:15:54 Epoch 100: 1.26
Saving the checkpoint...


In [0]:
import numpy as np

# Evaluate every saved checkpoint with greedy decoding on the first test sample(s)
# and print an exact-match accuracy per checkpoint.
encoder = Encoder(vocab_size, embedding_dim, hidden_dim, batch_size=1).to(device)
decoder = Decoder(vocab_size, embedding_dim, hidden_dim, batch_size=1).to(device)
# Inference only: put both modules in evaluation mode.
encoder.eval()
decoder.eval()

for epoch in range(10, 101, 10):
    model_name = "seq2seq_calculator_v{}.pt".format(epoch)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
    checkpoint = torch.load(model_name, map_location=device)
    encoder.load_state_dict(checkpoint["encoder_model"])
    decoder.load_state_dict(checkpoint["decoder_model"])

    print("Checkpoint {:>3d}".format(epoch))
    print("-"*30)
    accuracy = 0
    # Number of test pairs to evaluate (taken from the front of the test split).
    n_samples = 1
    input_5 = test_5[:n_samples]
    answer_7 = test_7[:n_samples]
    with torch.no_grad():
        for x_5, y_7 in zip(input_5, answer_7):
            # Show the input words without padding (id 2).
            pri5 = [id_to_word[tok] for tok in x_5 if tok != 2]
            print(pri5)
            # Convert to a tensor with a batch axis of 1
            input_tensor = torch.tensor([x_5], device=device)
            # The encoder returns its final hidden state
            state = encoder(input_tensor)
            # `token` is not strictly needed, but keeps the loop readable
            token = '<bos>'
            predict_7 = [word_to_id[token]]
            # Greedy decoding.
            # NOTE(review): this always runs 9 steps and never stops at '<eos>';
            # any non-padding token produced after '<eos>' counts as a mismatch.
            for _ in range(9):
                index = word_to_id[token]
                input_tensor = torch.tensor([index], device=device)
                output, state = decoder(input_tensor, state)
                # Explicit dim: softmax over the vocabulary axis (implicit dim is deprecated).
                prob = F.softmax(torch.squeeze(output), dim=-1)
                # Debug output: vocabulary ids ranked by probability, then the chosen id.
                ind = torch.argsort(prob.cpu().detach(), descending=True)
                print(ind)
                index = torch.argmax(prob.cpu().detach()).item()
                print(index)
                # The most likely word becomes the next decoder input
                token = id_to_word[index]
                predict_7.append(index)
            # Compare reference and prediction with padding (id 2) removed
            answer = [id_to_word[tok] for tok in y_7 if tok != 2]
            predict = [id_to_word[tok] for tok in predict_7 if tok != 2]
            print(answer, predict)
            if answer == predict:
                accuracy += 1
    print("Accuracy: {:.5f}".format(accuracy / len(input_5)))
    print("-"*30)

Checkpoint  10
------------------------------
['５年', '振り']
tensor([   85,   344,   180,  ..., 16930, 18894,  2463])
85
tensor([   43,     4,    63,  ...,  5244,  1121, 16930])
43
tensor([  590,    88,     4,  ..., 17522,  7303, 16930])
590
tensor([    4,   180,    43,  ..., 14369, 14378, 16930])
4
tensor([  396,    68,    13,  ..., 17568, 16930,  5365])
396
tensor([    1,  1008,    68,  ..., 16193, 14378, 16930])
1
tensor([    2,     1,    13,  ..., 10037, 19440, 11336])
2
tensor([    2,     1,    13,  ..., 10037, 19440, 11336])
2
tensor([    2,     1,    13,  ..., 10037, 19440, 11336])
2
['<bos>', '家出', 'の', '兄', 'が', '<eos>'] ['<bos>', 'し', 'て', 'い', 'た', 'のに', '<eos>']
Accuracy: 0.00000
------------------------------
Checkpoint  20
------------------------------
['５年', '振り']
tensor([   85,   956,   143,  ..., 17588,  2463, 18319])
85
tensor([   43,     4,   685,  ..., 18864,  1121, 16930])
43
tensor([  590,    88,   180,  ...,  7303, 16930, 18027])
590
tensor([    4,    43,   180,  



tensor([   43,   180,   586,  ..., 19151, 18894, 17559])
43
tensor([    1,    36,    68,  ..., 17559,  5886,  6056])
1
tensor([    2,     1,    13,  ..., 13474, 19440, 16694])
2
tensor([    2,     1,    36,  ..., 11238, 19440, 16694])
2
tensor([    2,     1,    36,  ..., 11238, 19440, 16694])
2
tensor([    2,     1,    36,  ..., 11238, 19440, 16694])
2
['<bos>', '家出', 'の', '兄', 'が', '<eos>'] ['<bos>', 'する', '暇', 'なく', 'て', '<eos>']
Accuracy: 0.00000
------------------------------
Checkpoint  40
------------------------------
['５年', '振り']
tensor([  143,   242,     4,  ..., 14060, 12160, 15707])
143
tensor([  110,    56,  1942,  ...,  2194, 17249,  2219])
110
tensor([   27,   535,   590,  ...,  5528, 18414,  6139])
27
tensor([    1,    36,    68,  ...,  8077,  6056, 14378])
1
tensor([    2,     1,    68,  ..., 17468,  8151, 16694])
2
tensor([    2,    36,     1,  ..., 17468,  7932, 16694])
2
tensor([    2,    36,     1,  ..., 17468,  7932, 16694])
2
tensor([    2,     1,    36,  ...,   3

In [0]:
# The model seems to be learning on the training data; next step: trim the trailing padding tokens (id 2).