In [1]:
import os
import torch
import pickle

from torch import nn
from torch import optim
from torchtext import transforms
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from utils import data_download

In [2]:
def save_pkl(data, fname):
    with open(fname, "wb") as f:
        pickle.dump(data, f)


def load_pkl(fname):
    with open(fname, "rb") as f:
        data = pickle.load(f)
    return data

In [3]:
class Multi30k:
    UNK, UNK_IDX = "<unk>", 0
    PAD, PAD_IDX = "<pad>", 1
    SOS, SOS_IDX = "<sos>", 2
    EOS, EOS_IDX = "<eos>", 3
    SPECIALS = {UNK : UNK_IDX, PAD : PAD_IDX, SOS : SOS_IDX, EOS : EOS_IDX}

    def __init__(self, data_dir, target_language, max_seq_len, min_freq):
        self.data_dir = f"{data_dir}/Multi30k"
        self.cache_dir = f"{self.data_dir}/caches"

        if not os.path.isdir(self.data_dir):
            data_download(self.data_dir)

        self.target_language = target_language
        self.max_seq_len = max_seq_len
        self.min_freq = min_freq

        self.target_tokenizer = self.build_tokenizer(self.target_language)

        self.train, self.valid, self.test = None, None, None
        self.build_dataset()

        self.vocab = None
        self.build_vocab()

        self.vocab_transform = None
        self.build_transform()


    def build_tokenizer(self, language):
        spacy_lang_dict = {'en': "en_core_web_sm", 'de': "de_core_news_sm"}
        assert language in spacy_lang_dict.keys()

        return get_tokenizer("spacy", spacy_lang_dict[language])


    def build_dataset(self):
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)

        train_pkl = f"{self.cache_dir}/train.pkl"
        if os.path.exists(train_pkl):
            self.train = load_pkl(train_pkl)

        else:
            with open(f"{self.data_dir}/train.en") as f:
                self.train = [text.rstrip() for text in f]
        
            save_pkl(self.train, train_pkl)

        val_pkl = f"{self.cache_dir}/val.pkl"
        if os.path.exists(val_pkl):
            self.val = load_pkl(val_pkl)
            
        else:
            with open(f"{self.data_dir}/val.en") as f:
                self.val = [text.rstrip() for text in f]
        
            save_pkl(self.val, val_pkl)

        test_pkl = f"{self.cache_dir}/test.pkl"
        if os.path.exists(test_pkl):
            self.test = load_pkl(test_pkl)
            
        else:
            with open(f"{self.data_dir}/test.en") as f:
                self.test = [text.rstrip() for text in f]
        
            save_pkl(self.test, test_pkl)


    def build_vocab(self):
        assert self.train is not None

        def yield_tokens():
            for text in self.train:
                yield [str(token) for token in self.target_tokenizer(text)]

        vocab_file = f"{self.cache_dir}/vocab_{self.target_language}.pkl"
        if os.path.exists(vocab_file):
            vocab = load_pkl(vocab_file)
        else:
            vocab = build_vocab_from_iterator(yield_tokens(), min_freq=self.min_freq, specials=self.SPECIALS.keys())
            vocab.set_default_index(self.UNK_IDX)
            save_pkl(vocab, vocab_file)

        self.vocab = vocab


    def build_transform(self):
        def get_transform(self, vocab):
            return transforms.Sequential(transforms.VocabTransform(vocab),
                                         transforms.Truncate(self.max_seq_len-2),
                                         transforms.AddToken(token=self.SOS_IDX, begin=True),
                                         transforms.AddToken(token=self.EOS_IDX, begin=False),
                                         transforms.ToTensor(padding_value=self.PAD_IDX))

        self.vocab_transform = get_transform(self, self.vocab)


    def collate_fn(self, batch):
        trg = [self.target_tokenizer(data) for data in batch]
        batch_trg = self.vocab_transform(trg)

        return batch_trg, batch


    def get_iter(self, **kwargs):
        if self.vocab_transform is None:
            self.build_transform()

        train_iter = DataLoader(self.train, collate_fn=self.collate_fn, **kwargs)
        valid_iter = DataLoader(self.valid, collate_fn=self.collate_fn, **kwargs)
        test_iter = DataLoader(self.test, collate_fn=self.collate_fn, **kwargs)

        return train_iter, valid_iter, test_iter

In [4]:
EPOCHS = 1
BATCH_SIZE = 4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = min([os.cpu_count(), BATCH_SIZE if BATCH_SIZE > 1 else 0, 8])

DATASET = Multi30k("/home/pervinco/Datasets/test", "en", 256, 2)
train_iter, valid_iter, test_iter = DATASET.get_iter(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

INPUT_DIM = len(DATASET.vocab)
OUTPUT_DIM = len(DATASET.vocab)
print(INPUT_DIM, OUTPUT_DIM)

EMBED_DIM = 256
HIDDEN_DIM = 512

for batch_data, sentence_data in train_iter:
    print(batch_data)
    print(sentence_data)

    break

6191 6191
tensor([[   2,   19,   25,   15, 1169,  808,   17,   57,   84,  336, 1339,    5,
            3,    1,    1,    1,    1],
        [   2,  165,   36,    7,  335,  287,   17, 1224,    4,  758, 4496, 2957,
            5,    3,    1,    1,    1],
        [   2,    6,   61,   33,  232,   71,    4,  253, 4460,    5,    3,    1,
            1,    1,    1,    1,    1],
        [   2,    6,   12,    7,    4,   30,   23,   10,   37,    9,    4,  589,
          586,    4,  242,    5,    3]])
['Two young, White males are outside near many bushes.', 'Several men in hard hats are operating a giant pulley system.', 'A little girl climbing into a wooden playhouse.', 'A man in a blue shirt is standing on a ladder cleaning a window.']


In [5]:
class RNNLanguageModel(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        self.rnn = nn.RNN(emb_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, src):
        # src shape: [src_len, batch_size]
        embedded = self.embedding(src)
        # embedded shape: [src_len, batch_size, emb_dim]
        output, _ = self.rnn(embedded)
        # output shape: [src_len, batch_size, hidden_dim]
        return self.fc(output)


model = RNNLanguageModel(INPUT_DIM, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM, DATASET.PAD_IDX)
model = model.to(DEVICE)
print(model)

RNNLanguageModel(
  (embedding): Embedding(6191, 256, padding_idx=1)
  (rnn): RNN(256, 512)
  (fc): Linear(in_features=512, out_features=6191, bias=True)
)


In [6]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=DATASET.PAD_IDX)

In [7]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for idx, (X, Y) in enumerate(iterator):
        X = X.to(DEVICE)

        optimizer.zero_grad()
        output = model(X)

        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        Y = Y.view(-1)

        loss = criterion(output, Y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_iter)


for epoch in range(EPOCHS):
    train_loss = train(model, train_iter, optimizer, criterion)

AttributeError: 'list' object has no attribute 'view'