In [1]:
import torch

from torchtext.data.utils import get_tokenizer
from torchtext.datasets import multi30k, Multi30k
from torchtext.vocab import build_vocab_from_iterator

multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

def yield_tokens(data_iter, tokenizer):
    for data_sample in data_iter:
        yield tokenizer(data_sample[0])
        yield tokenizer(data_sample[1])


def build_vocab(special_tokens=["<unk>", "<pad>", "<bos>", "<eos>"]):
    tokenize_de = get_tokenizer('spacy', language='de_core_news_sm')
    tokenize_en = get_tokenizer('spacy', language='en_core_web_sm')

    train_iter = Multi30k(split='train', language_pair=('de', 'en'))
    vocab_de = build_vocab_from_iterator(yield_tokens(train_iter, tokenize_de), specials=special_tokens)
    vocab_en = build_vocab_from_iterator(yield_tokens(train_iter, tokenize_en), specials=special_tokens)
    
    vocab_de.set_default_index(vocab_de["<unk>"])
    vocab_en.set_default_index(vocab_en["<unk>"])

    return vocab_de, vocab_en, tokenize_de, tokenize_en


def collate_fn(batch, vocab_de, vocab_en, tokenizer_de, tokenizer_en):
    src_batch, trg_batch = [], []
    for (src_item, trg_item) in batch:
        src_tensor = torch.tensor([vocab_de[token] for token in tokenizer_de(src_item)], dtype=torch.long)
        trg_tensor = torch.tensor([vocab_en[token] for token in tokenizer_en(trg_item)], dtype=torch.long)
        
        src_batch.append(torch.cat([torch.tensor([vocab_de['<bos>']]), src_tensor, torch.tensor([vocab_de['<eos>']])], dim=0))
        trg_batch.append(torch.cat([torch.tensor([vocab_en['<bos>']]), trg_tensor, torch.tensor([vocab_en['<eos>']])], dim=0))

    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=vocab_de['<pad>'])
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=vocab_en['<pad>'])

    return src_batch, trg_batch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch.utils.data import DataLoader

SRC_LANGUAGE = 'de'
TRG_LANGUAGE = 'en'

vocab_de, vocab_en, tokenize_de, tokenize_en = build_vocab()
dataset = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TRG_LANGUAGE))
dataloader = DataLoader(dataset, batch_size=32, collate_fn=lambda batch: collate_fn(batch, vocab_de, vocab_en, tokenize_de, tokenize_en))

for src, tgt in dataloader:
    print(src.shape, tgt.shape)
    break

2024-05-25 18:09:39.903986: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-25 18:09:39.993337: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 18:09:40.298871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-05-25 18:09:40.298936: W tensorflow/compiler/xl

torch.Size([21, 32]) torch.Size([24, 32])


In [3]:
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TRG_LANGUAGE))
sample = next(iter(train_iter))
print("Source (German): ", sample[0])
print("Target (English): ", sample[1])

# 데이터 하나를 텐서로 변환하기
src_tensor = torch.tensor([vocab_de[token] for token in tokenize_de(sample[0])], dtype=torch.long)
trg_tensor = torch.tensor([vocab_en[token] for token in tokenize_en(sample[1])], dtype=torch.long)

print("Source Tensor: ", src_tensor)
print("Target Tensor: ", trg_tensor)

print(vocab_en.lookup_tokens(trg_tensor.numpy()))
print(vocab_en.lookup_indices(['White']))

Source (German):  Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
Target (English):  Two young, White males are outside near many bushes.
Source Tensor:  tensor([   33,   170,   542,    56,   176,    36,   186,     6,    26,   220,
        14393,  6193,     4])
Target Tensor:  tensor([  36,   48,   10, 2267, 1582,   31,  112,  158,  634, 2613,    4])
['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
[2267]
