In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import Dataset,DataLoader
import os
import sys

In [2]:
# Make the project root (the parent of the current working directory)
# importable, so that the `utils` package can be resolved further down.
# NOTE: this relies on the kernel's working directory being the folder that
# contains this notebook; os.getcwd() is the only source of the path.
notebook_path = os.getcwd()
parent_dir = os.path.dirname(notebook_path)
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
import importnb

# importnb's Notebook context manager lets a regular `import` statement load
# a .ipynb file as a module for the duration of the `with` block.
# The module is already bound by the import above, so it is used directly
# instead of going through a second, dynamic __import__ call.
with importnb.Notebook():
    # MyDataset is not referenced by name in the cells visible here;
    # presumably it must be importable so that the saved datasets can be
    # unpickled by torch.load -- TODO confirm.
    from utils.TransformerDataset import MyDataset

## 各パラメータの指定

In [16]:
import json

# Settings that depend on the prepared dataset are read from the JSON file
# stored next to the data.
with open('../data/config.json', 'r') as config_file:
    config = json.load(config_file)

max_len, src_vocab_size, tgt_vocab_size = (
    config[key] for key in ("max_len", "src_vocab_size", "tgt_vocab_size")
)

# Hand-picked hyperparameters. The names follow the usual Transformer
# notation (heads, model width, feed-forward width, number of layers), but
# how they are consumed is not visible in this part of the notebook.
num_head = 8
d_model = 64
d_ff = 1024
N = 6
dropout_rate = 0.1
layer_norm_eps = 1e-5

# Batching / padding settings.
batch_size = 16
pad_idx = 0  # '<pad>' sits at index 0 in both vocabularies (see the dictionary check below)

## Datasetの読み込み、DataLoaderへの変換

In [5]:
# Load the serialized train / test datasets from disk.
# NOTE(review): torch.load unpickles arbitrary Python objects, so only use it
# on files produced by this project.
train_data, test_data = (
    torch.load(dataset_path)
    for dataset_path in ('../data/train_data.pth', '../data/test_data.pth')
)

In [6]:
# Wrap both datasets in DataLoaders. The batch size comes from `batch_size`
# in the parameter cell instead of a repeated literal, so changing that one
# setting actually takes effect here.
# drop_last=True discards a final incomplete batch, so every batch holds
# exactly `batch_size` samples.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
# NOTE(review): shuffle=True / drop_last=True on the test loader means a
# trailing partial batch of test samples is never evaluated -- kept as in the
# original; confirm that this is intended.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

## 辞書の読み込み

In [7]:
# torchtext is not referenced by name in the visible cells; presumably the
# saved vocabularies are torchtext objects and the package has to be
# installed for them to be unpickled -- TODO confirm.
import torchtext

# English and Japanese vocabularies, loaded in that order.
vocab_en, vocab_ja = map(
    torch.load,
    ('../data/vocab_en.pth', '../data/vocab_ja.pth'),
)

### 一応データセットと辞書の確認

In [13]:
def _describe_batch(name, token_ids, vocab, preview_len=30):
    """Print a quick sanity-check summary for one side of a batch.

    Args:
        name: Label shown in the shape line (e.g. "en_text").
        token_ids: Batch of encoded sentences. It is indexed as
            ``token_ids[0]`` and must expose ``.shape``; in the recorded run
            it is a tensor of shape (16, 159).
        vocab: Vocabulary object providing ``lookup_tokens``.
        preview_len: How many leading ids of the first sentence, and how many
            leading vocabulary entries, to print.
    """
    print("文章の量は{}".format(len(token_ids)))
    print("{}のshapeは{}".format(name, token_ids.shape))
    print("最初のencoding文は{}".format(token_ids[0][0:preview_len]))
    print("辞書の最初の{}文字は{}".format(preview_len, vocab.lookup_tokens(range(preview_len))))
    # Length of the first (padded) row of the batch.
    print("文の最大長さは{}".format(len(token_ids[0])))
    print("------------------------------------------------")


# Only the first batch is inspected, so pull it with next() rather than
# starting a loop that breaks immediately. The iterator keeps its original
# name in case later cells reference it.
tmp = iter(train_loader)
first_batch = next(tmp, None)
if first_batch is None:
    # With drop_last=True a dataset smaller than one batch yields nothing;
    # fail here with a clear message instead of leaving placeholder values
    # that break the following cells.
    raise RuntimeError("train_loader produced no batches; check the dataset size and batch_size")

# Each batch unpacks as (Japanese ids, English ids).
ja_text, en_text = first_batch
_describe_batch("en_text", en_text, vocab_en)
_describe_batch("ja_text", ja_text, vocab_ja)

文章の量は16
en_textのshapeはtorch.Size([16, 159])
最初のencoding文はtensor([   3,  858,  127,  188,   22, 1338, 1433,   12,  174, 3847,  650,   16,
          70,  737,    6,  120,  323,    8, 2535,   42,  610,    7,    2,    0,
           0,    0,    0,    0,    0,    0])
辞書の最初の30文字は['<pad>', '<unk>', '<eos>', '<bos>', 'the', ',', 'of', '.', 'and', 'in', '(', ')', 'to', 'was', 'a', '"', 'is', 'as', "'s", 'that', 'by', 'kyoto', 'for', 'it', 'his', 'university', 'with', 'he', 'emperor', '-']
文の最大長さは159
------------------------------------------------
文章の量は16
ja_textのshapeはtorch.Size([16, 159])
最初のencoding文はtensor([3069, 4257,  623,   28,  145,   45,  380,   48,    5,  311,  190,    6,
          38,   23,   12,    8,  683,   13,   21,   10,   43,   17,   22,   10,
           7,    0,    0,    0,    0,    0])
辞書の最初の30文字は['<pad>', '<unk>', '<eos>', '<bos>', 'の', '、', 'に', '。', 'は', 'を', 'る', 'た', 'て', 'と', 'し', '（', '）', 'が', 'い', '年', 'で', 'な', 'あ', 'っ', 'れ', '・', 'さ', 'り', '-', '京都']
文の最大長さは159
----

In [15]:
def _tokens_without_padding(vocab, token_ids, pad_token='<pad>'):
    """Map ids back to tokens, skipping padding.

    Args:
        vocab: Vocabulary object providing ``lookup_token(index)``.
        token_ids: Iterable of integer token ids.
        pad_token: Token string to leave out of the result.

    Returns:
        List of token strings in their original order, without ``pad_token``.
    """
    # Look each id up exactly once, then filter on the resulting token.
    tokens = (vocab.lookup_token(index) for index in token_ids)
    return [token for token in tokens if token != pad_token]


# Decode the first sentence pair of the inspected batch back into text.
words = _tokens_without_padding(vocab_en, en_text[0].tolist())
sentence = ' '.join(words)
print(sentence)
print('-------------------------------------------')
words = _tokens_without_padding(vocab_ja, ja_text[0].tolist())
sentence = ' '.join(words)
print(sentence)

<bos> annual buddhist service for dedicating gratitude to priest shinran-october 18th is when lectures of some faculties and departments are off . <eos>
-------------------------------------------
報恩 講 法要 - 10 月 18 日 、 学部 学科 に よ っ て は 休講 と な る こと が あ る 。
