In [1]:
import torch

from collections import OrderedDict
from ptb import PTB
from torch.utils.data import DataLoader
from multiprocessing import cpu_count
from autoencoder import *

In [2]:
data_dir = 'data/'
splits = ['train', 'valid', 'test']

# Settings shared by every split; only the `split` argument differs per dataset.
ptb_kwargs = dict(
    data_dir=data_dir,
    create_data=False,
    max_sequence_length=100,
    min_occ=1,
)

# One PTB dataset per split, keyed by split name in the order listed above.
# `split` remains bound to the last entry ('test') once the loop finishes.
datasets = OrderedDict()
for split in splits:
    datasets[split] = PTB(split=split, **ptb_kwargs)

In [3]:
# Vocabulary size followed by the ids of the special tokens, one value per line.
train_set = datasets['train']
for attr_name in ('vocab_size', 'pad_idx', 'unk_idx', 'sos_idx', 'eos_idx'):
    print(getattr(train_set, attr_name))

9877
0
1
2
3


In [4]:
# Name the split explicitly instead of reusing the loop variable left over from
# the dataset-building cell (its final value is 'test'). This keeps the cell
# correct when cells are re-run out of order.
split = 'test'

data_loader = DataLoader(
    dataset=datasets[split],
    batch_size=32,
    shuffle=(split == 'train'),  # shuffling is enabled for the training split only
    num_workers=cpu_count(),     # one loader worker per CPU core
    pin_memory=torch.cuda.is_available()  # pin host memory only when a GPU is available
)

In [5]:
# Grab the first batch from the loader, printing the shape and contents of
# each field, and keep it around as `test_batch` for the cells below.
test_batch = {}

for iteration, batch in enumerate(data_loader):
    for field in ('input', 'target', 'length'):
        print(batch[field].shape)
        print(batch[field])
    test_batch = batch
    break  # only the first batch is needed

torch.Size([32, 100])
tensor([[   2,  122,   57,  ...,    0,    0,    0],
        [   2,  294,  683,  ...,    0,    0,    0],
        [   2,  286,  928,  ...,    0,    0,    0],
        ...,
        [   2,   10,  370,  ...,    0,    0,    0],
        [   2, 5492,  600,  ...,    0,    0,    0],
        [   2,  160,  385,  ...,    0,    0,    0]], dtype=torch.int32)
torch.Size([32, 100])
tensor([[ 122,   57,   33,  ...,    0,    0,    0],
        [ 294,  683,   10,  ...,    0,    0,    0],
        [ 286,  928, 4361,  ...,    0,    0,    0],
        ...,
        [  10,  370,    1,  ...,    0,    0,    0],
        [5492,  600,  230,  ...,    0,    0,    0],
        [ 160,  385,  533,  ...,    0,    0,    0]], dtype=torch.int32)
torch.Size([32])
tensor([ 7, 38, 27, 33, 25, 20, 26,  6,  6, 31, 23, 15, 16, 27, 39, 14, 12,  8,
        24, 30,  9, 34, 25, 22, 21, 13, 28, 22, 11,  4, 19, 41])


In [6]:
# Field names carried by the batch dict captured above.
test_batch.keys()

dict_keys(['input', 'target', 'length'])

#### Model Input

In [7]:
# Hyperparameters for the embedding layer, encoder, decoder and output layer.
d_model = 200
# Read the vocabulary size from the dataset (9877 for the current vocabulary)
# rather than hard-coding it, so the model stays in sync if the vocabulary is
# rebuilt, e.g. with a different `min_occ`.
vocab_size = datasets['train'].vocab_size
dropout = 0.3
N = 3  # second positional argument of Encoder/Decoder; presumably the layer count (TODO confirm)
head_num = 5
d_ff = 200

In [8]:
# Encoder and decoder are configured with the same positional hyperparameters.
stack_args = (d_model, N, head_num, d_ff)

# Instantiate the four model components in pipeline order.
embed = EmbeddingLayer(d_model, vocab_size, dropout=dropout)
encoder = Encoder(*stack_args, dropout=dropout)
decoder = Decoder(*stack_args, dropout=dropout)
linear_softmax = LinearSoftmax(d_model, vocab_size)

In [9]:
# Padding id taken from the vocabulary (0 for the current dataset) instead of a literal.
pad_idx = datasets['train'].pad_idx

# Token ids for the encoder, cast from int32 to int64.
en_input = test_batch['input'].long()  # [32, 100]

# Padding mask: 1 for real tokens, 0 for padding positions. The extra axis at
# position 1 gives shape [32, 1, 100]; presumably so the mask broadcasts inside
# the encoder's attention (TODO confirm against Encoder).
en_mask = (en_input != pad_idx).long().unsqueeze(1)

In [10]:
# token & position embedding
en_embeddings = embed(en_input)

# encoding & decoding
en_output = encoder(en_embeddings, en_mask)
de_output = decoder(en_output)

- encoder: torch.Size([32, 100, 200])
- encoder: torch.Size([32, 50, 200])
- encoder: torch.Size([32, 25, 200])
- encoder: torch.Size([32, 13, 200])
- decoder: torch.Size([32, 13, 200])
- decoder: torch.Size([32, 26, 200])
- decoder: torch.Size([32, 52, 200])
- decoder: torch.Size([32, 104, 200])


In [11]:
# Shape of the encoder output (batch, compressed sequence length, model width).
en_output.shape

torch.Size([32, 13, 200])

In [12]:
# Shape of the decoder output, to compare its sequence length with the input's.
de_output.shape

torch.Size([32, 104, 200])

In [13]:
# Shape of the encoder input, for reference against the decoder output above.
en_input.shape

torch.Size([32, 100])

In [24]:
# Small hand-written batch for checking the padding mask: two sequences of
# token ids, right-padded with 0 to a common length of 8.
toy_rows = [
    [3, 4, 5, 6, 3, 0, 0, 0],
    [3, 4, 5, 6, 1, 4, 3, 0],
]
inputs = torch.tensor(toy_rows)

In [25]:
# Padding mask for the toy batch: 1 where `inputs` holds a non-zero id, 0 where
# it holds the padding id 0, in the same dtype as `inputs`.
# NOTE: this rebinds `en_mask`, replacing the batch mask built earlier for the encoder.
en_mask = (inputs != 0).to(inputs.dtype)
en_mask

tensor([[1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0]])

In [17]:
# Check the dtype of the encoder input after the cast to a long tensor.
en_input.dtype

torch.int64