In [1]:
import os
import sys
sys.path.append("/home/pervinco/DL-workspace/NLP/Seq2Seq_Translation")

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

import torch
import torch.nn as nn
import pandas as pd

from glob import glob
from konlpy.tag import Mecab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split


from data.utils import get_total_data, tokenize, build_vocab, tokens_to_indices

In [2]:
# Dataset root directory. It can be overridden with the KORENG_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset, the original absolute path is used.
data_dir = os.environ.get("KORENG_DATA_DIR", "/home/pervinco/Datasets/KORENG")
# Number of sentence pairs per mini-batch handed to the DataLoaders.
batch_size = 32

In [3]:
# Load the combined dataset found under `data_dir` (it is shuffled with `.sample` later on).
df = get_total_data(data_dir)

# One tokenizer per language: torchtext's "basic_english" tokenizer for English
# and a Mecab instance (from konlpy) for Korean.
en_tokenizer = get_tokenizer(tokenizer="basic_english")
ko_tokenizer = Mecab()

total_data.csv exist.


In [4]:
# Shuffle every row into a new frame `df_shuffled`. The seed is fixed so that the
# 10,000-row subset below (and therefore the train/valid split) is identical on
# every Restart & Run All.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
# NOTE(review): `df` is rebound to the subset here, so re-running this cell alone
# reshuffles the subset rather than the full data; re-run the loading cell first.
df = df_shuffled[:10000]
# Rows are already shuffled above, hence shuffle=False: the split is a plain
# head/tail cut (first 80% train, last 20% valid).
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)

print('train size: ', len(train_df))
print('valid size: ', len(valid_df))

train size:  8000
valid size:  2000


In [5]:
# Tokenize both splits with the same tokenizer pair; each `tokenize` call yields
# the Korean token lists followed by the English token lists for that split.
(train_ko_tokens, train_en_tokens), (valid_ko_tokens, valid_en_tokens) = (
    tokenize(split_df, ko_tokenizer, en_tokenizer)
    for split_df in (train_df, valid_df)
)

In [6]:
# Sanity check: show the first tokenized Korean/English pair, one per line.
print(train_ko_tokens[0], train_en_tokens[0], sep="\n")

['<sos>', '전', '세계', '에서', '무', '비자', '입국', '이', '가능', '한', '특수', '한', '치안', '환경', ',', '불법', '체류', '자', '증가', '등', '으로', '제주', '에서', '외국인', '강력범', '죄', '가', '늘', '고', '있', '다', '.', '<eos>']
['<sos>', 'violent', 'crimes', 'against', 'foreigners', 'are', 'on', 'the', 'rise', 'in', 'jeju', 'due', 'to', 'the', 'special', 'security', 'environment', 'that', 'allows', 'visa-free', 'entry', 'around', 'the', 'world', 'and', 'the', 'increase', 'of', 'illegal', 'residents', '.', '<eos>']


In [7]:
# Build one vocabulary per language from the training tokens.
train_ko_vocab, train_en_vocab = map(build_vocab, (train_ko_tokens, train_en_tokens))

# NOTE(review): the validation vocabularies are built independently of the training
# ones, so the same token may map to a different id in each — confirm this is intended.
valid_ko_vocab, valid_en_vocab = map(build_vocab, (valid_ko_tokens, valid_en_tokens))

In [8]:
# Vocabulary sizes, printed as "<korean> <english>": training first, then validation.
print(*map(len, (train_ko_vocab, train_en_vocab)))
print(*map(len, (valid_ko_vocab, valid_en_vocab)))

18587 16055
8631 7740


In [9]:
def _encode_all(token_lists, vocab):
    """Convert every token list in `token_lists` to an index list using `vocab`."""
    return [tokens_to_indices(tokens, vocab) for tokens in token_lists]


# Training sentences are encoded with the training vocabularies.
train_ko_indices = _encode_all(train_ko_tokens, train_ko_vocab)
train_en_indices = _encode_all(train_en_tokens, train_en_vocab)

# NOTE(review): validation sentences are encoded with the *validation* vocabularies,
# so their ids presumably do not line up with the training ids — verify before
# evaluating a model trained on the training ids.
valid_ko_indices = _encode_all(valid_ko_tokens, valid_ko_vocab)
valid_en_indices = _encode_all(valid_en_tokens, valid_en_vocab)

In [10]:
# Round-trip check on the first training pair: print the index sequences, then map
# them back to tokens through the vocabularies and print those as well.
sample_ko, sample_en = train_ko_indices[0], train_en_indices[0]
print(sample_ko, sample_en, sep="\n")

# From here on `sample_ko` / `sample_en` hold tokens rather than indices.
sample_ko, sample_en = (
    train_ko_vocab.lookup_tokens(sample_ko),
    train_en_vocab.lookup_tokens(sample_en),
)
print(sample_ko, sample_en, sep="\n")

[1, 82, 246, 19, 1342, 5548, 3970, 4, 122, 17, 1607, 17, 2770, 322, 15, 1286, 4848, 96, 446, 30, 18, 1315, 19, 682, 6474, 2265, 11, 727, 14, 16, 9, 3, 2]
[1, 3440, 1944, 255, 1411, 24, 18, 3, 1096, 9, 812, 100, 8, 3, 215, 316, 433, 12, 1923, 15577, 1823, 248, 3, 182, 7, 3, 336, 6, 1043, 200, 5, 2]
['<sos>', '전', '세계', '에서', '무', '비자', '입국', '이', '가능', '한', '특수', '한', '치안', '환경', ',', '불법', '체류', '자', '증가', '등', '으로', '제주', '에서', '외국인', '강력범', '죄', '가', '늘', '고', '있', '다', '.', '<eos>']
['<sos>', 'violent', 'crimes', 'against', 'foreigners', 'are', 'on', 'the', 'rise', 'in', 'jeju', 'due', 'to', 'the', 'special', 'security', 'environment', 'that', 'allows', 'visa-free', 'entry', 'around', 'the', 'world', 'and', 'the', 'increase', 'of', 'illegal', 'residents', '.', '<eos>']


In [11]:
class TranslationDataset(Dataset):
    """Parallel corpus of index-encoded sentence pairs.

    Item ``i`` pairs ``src_indices[i]`` with ``trg_indices[i]``; both are returned
    as 1-D ``torch.long`` tensors. Sequences keep their own lengths here, so
    padding has to happen at batching time.

    Args:
        src_indices: Sequence of source sentences, each a sequence of int token ids.
        trg_indices: Sequence of target sentences, aligned one-to-one with
            ``src_indices``.

    Raises:
        ValueError: If the two sequences do not contain the same number of sentences.
    """

    def __init__(self, src_indices, trg_indices):
        # Fail fast on misaligned corpora instead of hitting an IndexError (or
        # silently ignoring surplus targets) in the middle of an epoch.
        if len(src_indices) != len(trg_indices):
            raise ValueError(
                "src_indices and trg_indices must have the same length, "
                f"got {len(src_indices)} and {len(trg_indices)}"
            )
        self.src_indices = src_indices
        self.trg_indices = trg_indices

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_indices)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair at ``idx`` as ``torch.long`` tensors."""
        src_sample = torch.tensor(self.src_indices[idx], dtype=torch.long)
        trg_sample = torch.tensor(self.trg_indices[idx], dtype=torch.long)
        return src_sample, trg_sample
    
def collate_fn(batch, padding_value=0):
    """Pad a list of ``(source, target)`` tensor pairs into two batch tensors.

    Args:
        batch: Iterable of ``(src_tensor, trg_tensor)`` pairs of 1-D tensors.
        padding_value: Index used to fill the padded positions. Defaults to 0,
            the padding index used by this notebook. Use ``functools.partial``
            to pass a different value through a ``DataLoader``.

    Returns:
        Tuple ``(src_batch_padded, trg_batch_padded)``. Each tensor is batch-first
        and is padded to the longest sequence on its own side of the batch.
    """
    src_batch, trg_batch = zip(*batch)
    # Source and target are padded independently, so their widths may differ.
    src_batch_padded = pad_sequence(src_batch, batch_first=True, padding_value=padding_value)
    trg_batch_padded = pad_sequence(trg_batch, batch_first=True, padding_value=padding_value)
    return src_batch_padded, trg_batch_padded

In [12]:
# Wrap the encoded Korean (source) and English (target) sentences of each split.
train_dataset = TranslationDataset(
    src_indices=train_ko_indices,
    trg_indices=train_en_indices,
)
valid_dataset = TranslationDataset(
    src_indices=valid_ko_indices,
    trg_indices=valid_en_indices,
)

In [13]:
# Training batches are reshuffled every epoch; validation batches keep dataset order.
# Both loaders pad each batch through `collate_fn`.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [14]:
# Peek at a single padded batch from the training loader (nothing is printed
# if the loader yields no batches).
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    ko, en = first_batch
    print(ko)
    print(en)

tensor([[   1,    4, 1050,  ...,    0,    0,    0],
        [   1,   44,    5,  ...,    9,    3,    2],
        [   1,  489, 1088,  ...,    0,    0,    0],
        ...,
        [   1,  786,   40,  ...,    0,    0,    0],
        [   1, 2940,    5,  ...,    0,    0,    0],
        [   1, 2244,  373,  ...,    0,    0,    0]])
tensor([[   1, 2908,   24,  ...,    0,    0,    0],
        [   1,   49,   36,  ..., 8506,    5,    2],
        [   1,  156,   28,  ...,    0,    0,    0],
        ...,
        [   1,  696,    4,  ...,    0,    0,    0],
        [   1,  218,    5,  ...,    0,    0,    0],
        [   1,    3,  917,  ...,    0,    0,    0]])
