In [1]:
import os
import sys
sys.path.append("/home/pervinco/DL-workspace/NLP/Seq2Seq_Translation")

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl")

import time
import pandas as pd

import torch
import torch.nn as nn

from glob import glob
from konlpy.tag import Mecab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split

from models.model import Encoder, Decoder, Seq2Seq
from data.utils import get_total_data, tokenize, build_vocab, tokens_to_indices

In [2]:
# Dataset root. Defaults to the original local path; set the KORENG_DATA_DIR
# environment variable to run the notebook on another machine without editing it.
data_dir = os.environ.get("KORENG_DATA_DIR", "/home/pervinco/Datasets/KORENG")
batch_size = 32

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the full corpus through the project helper.
# NOTE(review): presumably a pandas DataFrame (it is sampled with .sample() in the next cell) — confirm in data/utils.py.
df = get_total_data(data_dir)

# One tokenizer per language: torchtext's "basic_english" tokenizer for English
# and KoNLPy's Mecab tagger object for Korean.
en_tokenizer, ko_tokenizer = get_tokenizer("basic_english"), Mecab()

total_data.csv exist.


In [4]:
# Shuffle every row with a fixed seed so the 10,000-row subset, and therefore the
# train/validation split below, is identical on every Restart & Run All.
SEED = 42
df_shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# NOTE: `df` is overwritten with the subset; re-running this cell without re-running
# the loading cell reshuffles the already-truncated frame.
df = df_shuffled[:10000]

# Rows are already shuffled above, so split them in order (shuffle=False): the first
# 80% become the training set and the last 20% the validation set.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=SEED, shuffle=False)

print('train size: ', len(train_df))
print('valid size: ', len(valid_df))

train size:  8000
valid size:  2000


In [5]:
# Tokenize the training split first, then the validation split; each call yields a
# (Korean tokens, English tokens) pair.
(train_ko_tokens, train_en_tokens), (valid_ko_tokens, valid_en_tokens) = (
    tokenize(split_df, ko_tokenizer, en_tokenizer) for split_df in (train_df, valid_df)
)

In [6]:
# Show the first tokenized training pair (Korean on the first line, English on the second).
print(train_ko_tokens[0], train_en_tokens[0], sep="\n")

['<sos>', '이후', '의', '주문', '은', '빠른', '진행', '이', '가능', '합니다', '.', '<eos>']
['<sos>', 'further', 'orders', 'will', 'proceed', 'quickly', '.', '<eos>']


In [7]:
# One vocabulary per language, built from the training tokens.
train_ko_vocab, train_en_vocab = (
    build_vocab(tokens) for tokens in (train_ko_tokens, train_en_tokens)
)

# Separate vocabularies built from the validation tokens.
# NOTE(review): the model is sized from the training vocabularies, so ids taken from
# these validation vocabularies will not refer to the same tokens — verify that
# validation data is encoded with the training vocabularies.
valid_ko_vocab, valid_en_vocab = (
    build_vocab(tokens) for tokens in (valid_ko_tokens, valid_en_tokens)
)

In [8]:
# Vocabulary sizes as "<korean> <english>": training on the first line, validation on the second.
print(f"{len(train_ko_vocab)} {len(train_en_vocab)}")
print(f"{len(valid_ko_vocab)} {len(valid_en_vocab)}")

18571 16036
8460 7614


In [9]:
# Convert token lists to id lists. Both splits are encoded with the TRAINING
# vocabularies: the model is built with len(train_ko_vocab) / len(train_en_vocab),
# so an id only means the same token for the model if it comes from those vocabularies.
# Encoding validation data with its own vocabulary would silently map ids to the
# wrong tokens and make the validation loss meaningless.
# NOTE(review): validation tokens that never occur in the training data rely on
# build_vocab / tokens_to_indices mapping them to an unknown-token id — confirm in
# data/utils.py (add an <unk> default there if lookups of unseen tokens raise).
train_ko_indices = [tokens_to_indices(tokens, train_ko_vocab) for tokens in train_ko_tokens]
train_en_indices = [tokens_to_indices(tokens, train_en_vocab) for tokens in train_en_tokens]

valid_ko_indices = [tokens_to_indices(tokens, train_ko_vocab) for tokens in valid_ko_tokens]
valid_en_indices = [tokens_to_indices(tokens, train_en_vocab) for tokens in valid_en_tokens]

In [10]:
# Round-trip check on the first training pair: print the ids, then map them back
# to tokens through the training vocabularies and print those.
sample_ko, sample_en = train_ko_indices[0], train_en_indices[0]
print(sample_ko, sample_en, sep="\n")

sample_ko, sample_en = (
    train_ko_vocab.lookup_tokens(sample_ko),
    train_en_vocab.lookup_tokens(sample_en),
)
print(sample_ko, sample_en, sep="\n")

[1, 239, 10, 515, 13, 1891, 174, 4, 117, 123, 3, 2]
[1, 814, 1568, 26, 1681, 863, 5, 2]
['<sos>', '이후', '의', '주문', '은', '빠른', '진행', '이', '가능', '합니다', '.', '<eos>']
['<sos>', 'further', 'orders', 'will', 'proceed', 'quickly', '.', '<eos>']


In [11]:
class TranslationDataset(Dataset):
    """Parallel dataset serving (source, target) id sequences as long tensors."""

    def __init__(self, src_indices, trg_indices):
        # Keep the sequences as given; tensors are created on access.
        self.src_indices, self.trg_indices = src_indices, trg_indices

    def __len__(self):
        # The number of source sequences defines the dataset length.
        return len(self.src_indices)

    def __getitem__(self, idx):
        # Build a fresh int64 tensor for each side of the pair at position idx.
        return tuple(
            torch.tensor(sequences[idx], dtype=torch.long)
            for sequences in (self.src_indices, self.trg_indices)
        )
    
def collate_fn(batch):
    """Pad a list of (src, trg) tensor pairs into two batch-first tensors.

    Each side is padded independently to its own longest sequence, using 0 as
    the padding index. Returns ``(src_padded, trg_padded)``, each shaped
    ``[batch, longest_len]``.
    """
    sources, targets = zip(*batch)
    padded = [
        pad_sequence(sequences, batch_first=True, padding_value=0)  # 0 is the padding index
        for sequences in (sources, targets)
    ]
    return padded[0], padded[1]

In [12]:
# Wrap the encoded Korean (source) / English (target) sequences of each split.
train_dataset, valid_dataset = (
    TranslationDataset(train_ko_indices, train_en_indices),
    TranslationDataset(valid_ko_indices, valid_en_indices),
)

In [13]:
# Training batches are reshuffled every epoch; validation batches keep dataset order.
# Both loaders pad each batch through collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [14]:
# Sanity check: print the padded (source, target) shapes of the first three training
# batches. Each batch is padded to its own longest sequence, so the widths differ.
for idx, (ko, en) in enumerate(train_loader):
    if idx == 3:
        break
    print(ko.shape, en.shape)

torch.Size([32, 55]) torch.Size([32, 58])
torch.Size([32, 64]) torch.Size([32, 62])
torch.Size([32, 51]) torch.Size([32, 56])


In [15]:
def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, trg)``; the first element of its
            return value is used as the logits. The logits are assumed to be
            batch-first, ``[batch, trg_len, output_dim]``, matching the
            batch-first batches fed in — TODO confirm against models/model.py.
        iterator: Sized iterable of ``(src, trg)`` id tensors shaped
            ``[batch, seq_len]`` (batch-first, as produced by
            ``pad_sequence(..., batch_first=True)``).
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss called as ``criterion(logits [N, output_dim], targets [N])``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        output, _ = model(src, trg)

        # Drop position 0 (<sos>) along the TIME axis (dim 1). Slicing dim 0 on a
        # batch-first tensor would discard the first sample of the batch and keep
        # <sos> as a prediction target.
        # reshape (not view) because the time-sliced tensors are non-contiguous.
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [16]:
def validate(model, iterator, criterion, device):
    """Evaluate the model for one pass over ``iterator`` and return the mean per-batch loss.

    Runs in eval mode with gradient tracking disabled; no parameters are updated.

    Args:
        model: Module called as ``model(src, trg)``; the first element of its
            return value is used as the logits. The logits are assumed to be
            batch-first, ``[batch, trg_len, output_dim]``, matching the
            batch-first batches fed in — TODO confirm against models/model.py.
        iterator: Sized iterable of ``(src, trg)`` id tensors shaped
            ``[batch, seq_len]`` (batch-first, as produced by
            ``pad_sequence(..., batch_first=True)``).
        criterion: Loss called as ``criterion(logits [N, output_dim], targets [N])``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        float: Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)

            output, _ = model(src, trg)

            # Drop position 0 (<sos>) along the TIME axis (dim 1). Slicing dim 0 on
            # a batch-first tensor would discard the first sample of the batch and
            # keep <sos> as a prediction target.
            # reshape (not view) because the time-sliced tensors are non-contiguous.
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [17]:
# Model dimensions: the encoder input size and decoder output size are the sizes of
# the training vocabularies (Korean source, English target).
input_dim = len(train_ko_vocab)
hidden_dim = 1024
output_dim = len(train_en_vocab)
pad_idx = 0  # same value collate_fn pads with, so padded target positions are ignored by the loss

encoder = Encoder(input_dim, hidden_dim).to(device)
decoder = Decoder(hidden_dim, output_dim).to(device)

model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Training and validation loop
# NOTE(review): the recorded run of this cell stopped with
#   "RuntimeError: input.size(-1) must be equal to input_size. Expected 1024, got 1835008".
# 1835008 = 32 * 56 * 1024 (batch_size * a padded sequence length * hidden_dim), which
# suggests the model flattens a whole batch into a single feature vector — presumably
# models/model.py expects a different input layout than the batch-first batches fed
# here; verify before trusting any loss printed below.
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    valid_loss = validate(model, valid_loader, criterion, device)

    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Val. Loss: {valid_loss:.3f}')


RuntimeError: input.size(-1) must be equal to input_size. Expected 1024, got 1835008