In [1]:
import sys
sys.path.append("/home/pervinco/DL-workspace/NLP/Seq2Seq_Translation")

In [2]:
import time
import math
import torch
import spacy

from torch import nn
from konlpy.tag import Mecab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator

from data.utils import get_total_data, split_data
from models.model import Seq2Seq, Encoder, Decoder

2023-11-25 10:29:38.256844: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-25 10:29:38.277285: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-25 10:29:39.042934: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-25 10:29:39.04

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_dir = "/home/pervinco/Datasets/KORENG"

In [4]:
MAX_SEQ_LEN = 50
BATCH_SIZE = 32
EPOCHS = 10
CLIP = 1

In [5]:
ko_tokenizer = Mecab()
en_tokenizer = spacy.load('en_core_web_sm')

In [6]:
def tokenize_ko(text):
    return [tok for tok in ko_tokenizer.morphs(text)]

def tokenize_en(text):
    return [tok.text for tok in en_tokenizer.tokenizer(text)]

In [7]:
def build_vocab(data_iter, tokenizer):
    vocab = build_vocab_from_iterator(map(tokenizer, data_iter), specials=["<pad>", "<sos>", "<eos>", "<unk>"], min_freq=2)
    vocab.set_default_index(vocab['<unk>'])

    return vocab

In [8]:
dataset = get_total_data(data_dir)

src_sentences = dataset[0]
trg_sentences = dataset[1]
print(len(src_sentences), len(trg_sentences))

total_data.csv exist.


1402407 1402407


In [9]:
src_vocabs = build_vocab(src_sentences, tokenize_ko)
trg_vocabs = build_vocab(trg_sentences, tokenize_en)

print(len(src_vocabs), len(trg_vocabs))

114218 102292


In [10]:
def tokens_to_indices(tokens, vocab):
    return [vocab[token] for token in tokens]

In [11]:
src_indices = [tokens_to_indices(tokens, src_vocabs) for tokens in src_sentences]
trg_indices = [tokens_to_indices(tokens, trg_vocabs) for tokens in trg_sentences]

In [12]:
train_data, valid_data, test_data = split_data(src_indices, trg_indices, train_frac=0.8, valid_frac=0.1)
print(len(train_data[0]), len(train_data[1]))
print(len(valid_data[0]), len(valid_data[1]))
print(len(test_data[0]), len(test_data[1]))

1121925 1121925
140240 140240
140242 140242


In [13]:
class TranslationDataset(Dataset):
    def __init__(self, dataset):
        self.src_indices = dataset[0]
        self.trg_indices = dataset[1]

    def __len__(self):
        return len(self.src_indices)

    def __getitem__(self, idx):
        src = torch.tensor(self.src_indices[idx], dtype=torch.long) ## seq_len, batch_size
        trg = torch.tensor(self.trg_indices[idx], dtype=torch.long) ## seq_len, batch_size
        
        return src, trg
    
    def collate_fn(self, batch):
        src_batch, trg_batch = zip(*batch)
        src_batch_padded = pad_sequence(src_batch, batch_first=True, padding_value=0) 
        trg_batch_padded = pad_sequence(trg_batch, batch_first=True, padding_value=0)

        src_batch_padded = src_batch_padded.view(src_batch_padded.shape[1], src_batch_padded.shape[0])
        trg_batch_padded = trg_batch_padded.view(trg_batch_padded.shape[1], trg_batch_padded.shape[0])
        
        return src_batch_padded, trg_batch_padded

In [14]:
train_dataset = TranslationDataset(train_data)
valid_dataset = TranslationDataset(valid_data)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=valid_dataset.collate_fn)

In [16]:
INPUT_DIM = len(src_vocabs)
OUTPUT_DIM = len(trg_vocabs)
EMBEDD_DIM = 1024
HIDDEN_DIM = 2048
NUM_LAYERS = 4
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5

encoder = Encoder(input_dim=INPUT_DIM, embedd_dim=EMBEDD_DIM, hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS, dropout=ENCODER_DROPOUT).to(device)
decoder = Decoder(output_dim=OUTPUT_DIM, embedd_dim=EMBEDD_DIM, hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS, dropout=DECODER_DROPOUT).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

In [17]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(114218, 1024)
    (rnn): LSTM(1024, 2048, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(102292, 1024)
    (rnn): LSTM(1024, 2048, num_layers=4, dropout=0.5)
    (fc_out): Linear(in_features=2048, out_features=102292, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [18]:
optimizer = torch.optim.Adam(model.parameters())

trg_pad_idx = trg_vocabs.lookup_indices(["<pad>"])
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx[0])

In [19]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, trg = batch[0].to(device), batch[1].to(device)
        
        optimizer.zero_grad()
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [20]:
def evaluate(model, iterator, criterion):
    model.eval()
    
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = batch[0].to(device), batch[1].to(device)

            output = model(src, trg, 0)
            
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [21]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    
    return elapsed_mins, elapsed_secs

In [22]:
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

torch.Size([123, 32]) torch.Size([360, 32])


torch.Size([123, 32]) torch.Size([360, 32]) torch.Size([32, 360, 102292])
torch.Size([11160, 102292]) torch.Size([11488])


ValueError: Expected input batch_size (11160) to match target batch_size (11488).