In [1]:
import torch
import spacy

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchtext.datasets import Multi30k, multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

2023-12-03 13:46:41.245322: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-03 13:46:41.264642: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-03 13:46:42.017384: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-03 13:46:42.01

In [2]:
# Vocabulary indices reserved for the special tokens: unknown, padding,
# beginning-of-sequence and end-of-sequence, in that order.
UNK_IDX = 0
PAD_IDX = 1
BOS_IDX = 2
EOS_IDX = 3

In [3]:
# Override the dataset URLs: point the Multi30k train/validation downloads at
# GitHub-hosted copies of the archives. The "test" entry is not touched here.
multi30k.URL.update(
    {
        "train": "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz",
        "valid": "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz",
    }
)

In [4]:
# Tokenizer setup: load the spaCy pipelines for German and English.
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")


def _tokenize_de(text):
    """Return the token strings the German spaCy tokenizer produces for ``text``."""
    return [tok.text for tok in spacy_de.tokenizer(text)]


def _tokenize_en(text):
    """Return the token strings the English spaCy tokenizer produces for ``text``."""
    return [tok.text for tok in spacy_en.tokenizer(text)]


# Language code -> callable that turns a raw sentence into a list of token strings.
token_transform = {"de": _tokenize_de, "en": _tokenize_en}

# Vocabulary construction
def yield_tokens(data_iter, language):
    """Yield the tokenized text for one side of every sample in ``data_iter``.

    Args:
        data_iter: Iterable of indexable samples; element 0 is read when
            ``language`` is ``"de"``, element 1 for any other value.
        language: Key into ``token_transform`` selecting the tokenizer.

    Yields:
        Whatever ``token_transform[language]`` returns for each selected text.
    """
    # Position of the requested language inside each sample.
    column = 0 if language == "de" else 1
    # The tokenizer is looked up per sample, so nothing is resolved for an
    # empty iterable.
    yield from (token_transform[language](sample[column]) for sample in data_iter)

# Special tokens, listed in the same order as UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX (0-3).
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]

# One vocabulary per language, built from the tokenized training split.
vocab_transform = {}
for ln in ("de", "en"):
    # A fresh training iterator is created on every pass, one per language.
    train_iter = Multi30k(split="train", language_pair=("de", "en"))
    token_stream = yield_tokens(train_iter, ln)
    vocab_transform[ln] = build_vocab_from_iterator(token_stream, specials=special_symbols)
    # Lookups of tokens that are not in the vocabulary return UNK_IDX.
    vocab_transform[ln].set_default_index(UNK_IDX)

In [7]:
# Tensor conversion
def tensor_transform(token_ids):
    """Wrap a sequence of token indices with BOS/EOS and return it as a 1-D tensor.

    Args:
        token_ids: Sequence of integer vocabulary indices; may be empty.

    Returns:
        ``torch.long`` tensor holding ``BOS_IDX``, then ``token_ids``, then
        ``EOS_IDX``.
    """
    # The dtype is explicit because torch.tensor([]) defaults to float32; without
    # it an empty sentence would not come out as an integer index tensor.
    return torch.cat(
        (
            torch.tensor([BOS_IDX], dtype=torch.long),
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor([EOS_IDX], dtype=torch.long),
        )
    )

# Text transform: raw sentence -> token strings -> vocabulary indices -> BOS/EOS-wrapped tensor.
def _make_text_transform(language):
    """Build the text-to-tensor pipeline for one language.

    ``language`` is bound once per call, so every returned function keeps its
    own language. A lambda created directly in the loop would read the loop
    variable at call time (Python closures bind late) and every entry would
    end up using the last language.
    """

    def transform(text):
        tokens = token_transform[language](text)
        return tensor_transform(vocab_transform[language](tokens))

    return transform


text_transform = {}
for ln in ["de", "en"]:
    text_transform[ln] = _make_text_transform(ln)

# Collation function handed to the data loaders.
def collate_fn(batch):
    """Turn raw (source, target) sentence pairs into two padded tensors.

    Each source string goes through ``text_transform["de"]`` and each target
    string through ``text_transform["en"]`` after trailing newlines are removed.

    Args:
        batch: Sequence of ``(src_sample, tgt_sample)`` string pairs.

    Returns:
        ``(src_batch, tgt_batch)``: the per-sample tensors combined by
        ``pad_sequence`` with its default layout, shorter sequences being
        filled with ``PAD_IDX``.
    """
    # Encode both sides of a pair together so the calls happen in sample order.
    encoded = [
        (text_transform["de"](src.rstrip("\n")), text_transform["en"](tgt.rstrip("\n")))
        for src, tgt in batch
    ]
    src_seqs = [src_ids for src_ids, _ in encoded]
    tgt_seqs = [tgt_ids for _, tgt_ids in encoded]
    return (
        pad_sequence(src_seqs, padding_value=PAD_IDX),
        pad_sequence(tgt_seqs, padding_value=PAD_IDX),
    )

# Create the data loaders.
BATCH_SIZE = 32  # sentence pairs per batch, shared by both loaders

# All three splits are unpacked; only train and valid get a loader here.
train_iter, valid_iter, test_iter = Multi30k(
    split=("train", "valid", "test"),
    language_pair=("de", "en"),
)
loader_options = {"batch_size": BATCH_SIZE, "collate_fn": collate_fn}
train_dataloader = DataLoader(train_iter, **loader_options)
valid_dataloader = DataLoader(valid_iter, **loader_options)

# Data loader usage example: print the shapes of the first training batch only.
for src, tgt in train_dataloader:
    print(src.shape, tgt.shape, sep="\n")
    break

torch.Size([21, 32])
torch.Size([24, 32])
