In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [1]:
!apt-get install -y mecab mecab-ipadic-utf8 libmecab-dev
!pip install konlpy

zsh:1: command not found: apt-get
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (5.0 kB)
Collecting lxml>=4.1.0 (from konlpy)
  Using cached lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (6.6 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m10.8 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hDownloading jpype1-1.6.0-cp313-cp313-macosx_10_13_universal2.whl (582 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m582.4/582.4 kB[0m [31m9.2 MB/s[0m  [33m0:00:00[0m
[?25hUsing cached lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl (8.4 MB)
Installing collected packages: lxml, JPype1, konlpy
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [konlpy]2m1/3[0m [JPype1]
[1A[2KSuccessfull

In [1]:
import os
print(os.getcwd())

/Users/yena/yyena/study/codeit/sprint/11


In [2]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from konlpy.tag import Okt
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


[nltk_data] Downloading package punkt to /Users/yena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/yena/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:

# JSON 파일 경로
train_json_file_path = "koen_train_set.json"
valid_json_file_path = "koen_valid_set.json"


# JSON 파일 불러오기
def load_json(file_path, max_samples=1000):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["data"][:max_samples]

# 훈련 및 검증 데이터 로드
data_train = load_json(train_json_file_path, max_samples=50000)
data_valid = load_json(valid_json_file_path, max_samples=1000)

# ko와 mt 데이터 추출
ko_sentences_train = [item["ko"] for item in data_train]
mt_sentences_train = [item["mt"] for item in data_train]
ko_sentences_valid = [item["ko"] for item in data_valid]
mt_sentences_valid = [item["mt"] for item in data_valid]

# 한국어 및 영어 토크나이저
tokenizer_ko = Okt().morphs
tokenizer_en = word_tokenize

## 문장 길이 분석
ko_lengths = [len(tokenizer_ko(sent)) for sent in ko_sentences_train]
en_lengths = [len(tokenizer_en(sent)) for sent in mt_sentences_train]
all_lengths = ko_lengths + en_lengths

# 한국어와 영어 중 가장 긴 문장의 길이 기준으로 MAX_LENGTH 설정
MAX_LENGTH = max(max(ko_lengths), max(en_lengths)) + 1  # SOS, EOS 포함 고려
print(f"Max sequence length: {MAX_LENGTH}")

# 특수 토큰 정의
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3

class Lang:
    def __init__(self, name):
        self.name = name
        # 초기에는 PAD, SOS, EOS, UNK 토큰을 미리 등록
        self.word2index = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "<unk>"}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "<unk>"}
        self.word2count = {}
        self.n_words = 4  # PAD, SOS, EOS, UNK 포함

    def addSentence(self, sentence, tokenizer):
        for word in tokenizer(sentence):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1

# 데이터 준비
def prepareData(lang1, lang2, tokenizer1, tokenizer2):
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    pairs = list(zip(ko_sentences_train, mt_sentences_train))
    print("Read %s sentence pairs" % len(pairs))
    for pair in pairs:
        input_lang.addSentence(pair[0], tokenizer1)
        output_lang.addSentence(pair[1], tokenizer2)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData("ko", "en", tokenizer_ko, tokenizer_en)

# 텐서 변환 및 데이터 로더 생성
def tensorFromSentence(lang, sentence, tokenizer):
    indexes = [SOS_token]
    indexes += [lang.word2index.get(word, UNK_token) for word in tokenizer(sentence)[:MAX_LENGTH - 2]]
    indexes.append(EOS_token)
    # 길이 MAX_LENGTH에 맞춰 PAD 추가
    while len(indexes) < MAX_LENGTH:
        indexes.append(PAD_token)
    return torch.tensor(indexes[:MAX_LENGTH], dtype=torch.long, device=device)

def get_dataloader(batch_size):
    input_tensors = [tensorFromSentence(input_lang, inp, tokenizer_ko) for inp, _ in pairs]
    target_tensors = [tensorFromSentence(output_lang, tgt, tokenizer_en) for _, tgt in pairs]

    input_tensors = torch.stack(input_tensors, dim=0)  # [num_samples, MAX_LENGTH]
    target_tensors = torch.stack(target_tensors, dim=0)  # [num_samples, MAX_LENGTH]

    dataset = TensorDataset(input_tensors, target_tensors)
    train_sampler = RandomSampler(dataset)
    train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)

    print(f"input_tensors.shape: {input_tensors.shape}, target_tensors.shape: {target_tensors.shape}")
    return train_dataloader

train_dataloader = get_dataloader(batch_size=32)

Max sequence length: 96
Read 50000 sentence pairs
input_tensors.shape: torch.Size([50000, 96]), target_tensors.shape: torch.Size([50000, 96])


In [4]:
# 샘플 문장
print(ko_sentences_train[0])
print(mt_sentences_train[0])

원하시는 색상을 회신해 주시면 바로 제작 들어가겠습니다.
If you reply to the color you want, we will start making it right away.


In [5]:
# 검증 데이터를 위한 pair 생성
pairs_valid = list(zip(ko_sentences_valid, mt_sentences_valid))
print("Read %s validation sentence pairs" % len(pairs_valid))

Read 1000 validation sentence pairs


# Seq2Seq

In [6]:
import torch.nn.functional as F

# Encoder 정의
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

# Decoder 정의
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        # 초기 디코더 입력: SOS 토큰 ([batch_size, 1])
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=encoder_outputs.device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: target_tensor의 i번째 토큰 사용 ([batch_size, 1])
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # 모델 예측: topk 결과에서 가장 높은 확률을 가진 토큰 선택
                _, topi = decoder_output.topk(1)
                # topi의 shape: [batch_size, 1, 1] → squeeze 마지막 차원만 제거하여 [batch_size, 1]
                decoder_input = topi.squeeze(2).detach()

        # [batch_size, MAX_LENGTH, output_size]
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        # input: [batch_size, 1]
        output = self.embedding(input)          # → [batch_size, 1, hidden_size]
        output = F.relu(output)
        output, hidden = self.gru(output, hidden) # output: [batch_size, 1, hidden_size]
        output = self.out(output)                 # → [batch_size, 1, output_size]
        return output, hidden


In [7]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
                decoder_optimizer, criterion):
    encoder.train()
    decoder.train()

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        # 반드시 정수(LongTensor)로 변환
        input_tensor = input_tensor.long().to(device)
        target_tensor = target_tensor.long().to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


def train_seq2seq(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, print_every=100):
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

        if epoch % print_every == 0:
            print(f"Epoch {epoch}/{n_epochs}, Loss: {loss:.4f}")

In [None]:
# 모델 초기화 및 학습 실행
hidden_size = 128
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

train_seq2seq(train_dataloader, encoder, decoder, n_epochs=30, print_every=1)

Epoch 1/30, Loss: 0.7764
Epoch 2/30, Loss: 0.5696
Epoch 3/30, Loss: 0.5214


In [None]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        # 단일 문장이므로 배치 차원 추가 (shape: [1, MAX_LENGTH])
        input_tensor = tensorFromSentence(input_lang, sentence, tokenizer_ko).unsqueeze(0)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])

        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)

        # <SOS>, <EOS>, SOS, EOS 등을 제거
        tokens_to_remove = ['<SOS>', 'SOS', '<EOS>', 'EOS']
        output_words = [w for w in output_words if w not in tokens_to_remove]

        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
evaluateRandomly(encoder, decoder)

> 이 메일이 귀하께 잘 도착했기를 바랍니다.
= I hope this mail has arrived well for you.
< I hope you will receive the product as soon as possible .

> 이 건 만큼은 제가 계속 체크하여 직접 연락드리도록 하겠습니다.
= I'll keep checking this and contact you directly.
< We will contact you again after arranging a few days .

> 제가 한번 오피스텔을 보러 가도 될까요?
= Can I go see the officetel?
< Can I get a chance to see ?

> 저희는 오랜 연구 끝에 피토알렉신을 함유한 콩 추출물이 피부 보호 능력에 뛰어남을 발견했습니다.
= After a long study, we found that soybean extracts containing phyto-Alexin are excellent in skin protection.
< It contains adenosine and niacinamide to brighten the dark and soft texture to the taste of the cold .

> 커버는 탈부착과 세척이 가능하여 청소하기 쉽습니다.
= The cover is removable and washable, making it easy to clean.
< It is easy to carry in the form of three-piece water .

> 그리고 우유를 조금 넣어야지 안 그러면 퍽퍽해서 살리지가 않아서 한 이 정도만 넣고 설탕 두 숟갈 정도.
= And if you don't add a little milk, it's dry and doesn't save it, so add about this much and about two spoons of sugar.
< And if you have a small

# Seq2Seq with Attention

In [None]:
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [None]:
# 모델 초기화 및 학습 실행
hidden_size = 128
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

# 모델 학습 실행
train_seq2seq(train_dataloader, encoder, decoder, n_epochs=30, print_every=1)

Epoch 1/30, Loss: 0.7510
Epoch 2/30, Loss: 0.5734
Epoch 3/30, Loss: 0.5260
Epoch 4/30, Loss: 0.4903
Epoch 5/30, Loss: 0.4612
Epoch 6/30, Loss: 0.4367
Epoch 7/30, Loss: 0.4154
Epoch 8/30, Loss: 0.3966
Epoch 9/30, Loss: 0.3800
Epoch 10/30, Loss: 0.3649
Epoch 11/30, Loss: 0.3511
Epoch 12/30, Loss: 0.3384
Epoch 13/30, Loss: 0.3269
Epoch 14/30, Loss: 0.3164
Epoch 15/30, Loss: 0.3064
Epoch 16/30, Loss: 0.2972
Epoch 17/30, Loss: 0.2885
Epoch 18/30, Loss: 0.2807
Epoch 19/30, Loss: 0.2730
Epoch 20/30, Loss: 0.2659
Epoch 21/30, Loss: 0.2593
Epoch 22/30, Loss: 0.2529
Epoch 23/30, Loss: 0.2470
Epoch 24/30, Loss: 0.2413
Epoch 25/30, Loss: 0.2360
Epoch 26/30, Loss: 0.2307
Epoch 27/30, Loss: 0.2261
Epoch 28/30, Loss: 0.2212
Epoch 29/30, Loss: 0.2168
Epoch 30/30, Loss: 0.2127


In [None]:
evaluateRandomly(encoder, decoder)

> >이건 줘.
= Give me this.
< > You 're doing this .

> 좋아요, 그리고 만약 미국 밖에서 생산된 차량이 요구되는 기준을 충족하지 못하면 어떻게 될까요?
= Okay, and what if a vehicle manufactured outside the United States doesn't meet the required standards?
< OK , if you have any other challenges , you can see that the vehicle is possible , but can you buy a standards ?

> 알고 계실 수도 있지만, 저희 회사가 쇠퇴기에 접어들면서 많은 문제점들에 당면했습니다.
= As you may be aware, as our company entered a period of decline, we faced many challenges.
< As you may know , but we faced many problems with our company .

> 정확한 레플리카를 원하시면 사진을 보내주세요.
= If you are looking for an exact replica, please send us pictures as well.
< If you want , please read carefully and photos .

> 내가 말을 많이 하면 안 돼.
= I can't talk too much.
< I ca n't do it .

> >나오나.
= It's coming out.
< There 's a left .

> 이 상품은 온라인으로 가입 가능하신 상품으로 저희 은행 인터넷 뱅킹으로 신청하시면 됩니다.
= This product is available online and you can apply through our bank's Internet banking.
< This product can be compatible with the online 