In [1]:
import torch
import spacy
import datasets

from tqdm import tqdm
from collections import Counter

from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

2024-05-27 21:53:30.797832: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-27 21:53:30.864384: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-27 21:53:31.124538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-05-27 21:53:31.124571: W tensorflow/compiler/xl

## 1. Load Dataset

In [2]:
# Fetch the Multi30k dataset and pull out its three splits by name.
multi30k = datasets.load_dataset("bentrevett/multi30k")
train_dataset, valid_dataset, test_dataset = (
    multi30k[split_name] for split_name in ("train", "validation", "test")
)

In [3]:
# Number of examples in the train / validation / test splits, in that order.
print(*map(len, (train_dataset, valid_dataset, test_dataset)))

29000 1014 1000


In [4]:
# Inspect the first training example to see what one record looks like.
print(train_dataset[0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}


In [5]:
# Inspect the first validation example.
print(valid_dataset[0])

{'en': 'A group of men are loading cotton onto a truck', 'de': 'Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen'}


In [6]:
# Inspect the first test example.
print(test_dataset[0])

{'en': 'A man in an orange hat starring at something.', 'de': 'Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.'}


## 2. Tokenization, Build Vocab

In [7]:
# Load the small English and German spaCy pipelines. Despite the variable
# names these are full pipeline objects; the tokenize helpers in this notebook
# only use their `.tokenizer` attribute.
en_tokenizer = spacy.load('en_core_web_sm')
de_tokenizer = spacy.load('de_core_news_sm')

In [8]:
def tokenize_de(text):
    """Split German `text` into a list of token strings.

    Only the tokenizer component of the German spaCy pipeline is invoked.
    """
    german_doc = de_tokenizer.tokenizer(text)
    return [token.text for token in german_doc]

def tokenize_en(text):
    """Split English `text` into a list of token strings.

    Only the tokenizer component of the English spaCy pipeline is invoked.
    """
    english_doc = en_tokenizer.tokenizer(text)
    return [token.text for token in english_doc]

In [9]:
de_counter, en_counter = Counter(), Counter()

# Count token frequencies over the training split only. Each sentence is
# lowercased before it is tokenized, so the counts are case-insensitive.
for data in tqdm(train_dataset):
    german_tokens = tokenize_de(data['de'].lower())
    english_tokens = tokenize_en(data['en'].lower())
    de_counter.update(german_tokens)
    en_counter.update(english_tokens)

100%|██████████| 29000/29000 [00:01<00:00, 21758.18it/s]


In [10]:
# Special tokens shared by both vocabularies. The end-of-sentence marker must
# be spelled "<eos>" (with angle brackets): that exact string is what gets
# looked up when sequences are numericalized. Registering it as "eos" would
# make every '<eos>' lookup silently fall back to the default (<unk>) index.
SPECIAL_TOKENS = ["<unk>", "<pad>", "<sos>", "<eos>"]

# min_freq=2 keeps only tokens counted at least twice in the training data.
de_vocabs = vocab(de_counter, min_freq=2, specials=SPECIAL_TOKENS)
en_vocabs = vocab(en_counter, min_freq=2, specials=SPECIAL_TOKENS)

# Any token that is not in the vocabulary maps to <unk> instead of raising.
de_vocabs.set_default_index(de_vocabs['<unk>'])
en_vocabs.set_default_index(en_vocabs['<unk>'])

print(f'Size of German Vocab : {len(de_vocabs)}')
print(f'Size of English Vocab : {len(en_vocabs)}')

Size of German Vocab : 7853
Size of English Vocab : 5893


In [11]:
def text_transform_en(x):
    """Convert an English sentence into a list of vocabulary indices.

    The text is lowercased *before* tokenization, mirroring how the token
    counts behind the vocabulary were collected, and the result is wrapped in
    the '<sos>' / '<eos>' indices. Tokens missing from `en_vocabs` resolve to
    whatever default index the vocabulary has been configured with.
    """
    token_ids = [en_vocabs[token] for token in tokenize_en(x.lower())]
    return [en_vocabs['<sos>']] + token_ids + [en_vocabs['<eos>']]


def text_transform_de(x):
    """Convert a German sentence into a list of vocabulary indices.

    The text is lowercased *before* tokenization, mirroring how the token
    counts behind the vocabulary were collected, and the result is wrapped in
    the '<sos>' / '<eos>' indices. Tokens missing from `de_vocabs` resolve to
    whatever default index the vocabulary has been configured with.
    """
    token_ids = [de_vocabs[token] for token in tokenize_de(x.lower())]
    return [de_vocabs['<sos>']] + token_ids + [de_vocabs['<eos>']]

In [12]:
def collate_batch(batch):
    """Collate translation pairs into padded, batch-first index tensors.

    Args:
        batch: list of examples, each a mapping with a 'de' (source) string
            and an 'en' (target) string.

    Returns:
        dict with two integer tensors of shape (batch, longest sequence):
            "src": German index sequences padded with de_vocabs['<pad>'].
            "trg": English index sequences padded with en_vocabs['<pad>'].
    """
    src_list, tgt_list = [], []
    for data in batch:
        # Explicit dtype keeps the tensors integer-typed in every case.
        src_list.append(torch.tensor(text_transform_de(data['de']), dtype=torch.long))
        tgt_list.append(torch.tensor(text_transform_en(data['en']), dtype=torch.long))

    # batch_first=True produces (batch, seq) directly, so the result is a
    # contiguous tensor rather than a transposed view of a (seq, batch) one.
    src_padded = pad_sequence(src_list, batch_first=True, padding_value=de_vocabs['<pad>'])
    tgt_padded = pad_sequence(tgt_list, batch_first=True, padding_value=en_vocabs['<pad>'])

    return {
        "src": src_padded,
        "trg": tgt_padded,
    }

In [13]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_batch,
)

# Smoke test: pull a single batch and show the shapes of its two tensors.
for batch in train_dataloader:
    src = batch['src']
    trg = batch['trg']
    print(src.shape, trg.shape)
    break  # one batch is enough for the check

torch.Size([1, 11]) torch.Size([1, 12])


In [15]:
# Index assigned to the padding token in the English vocabulary.
en_vocabs.get_stoi()['<pad>']

1