# 주식 가격 하락으로 마음 아픈 나를 위로해 주는 챗봇 구현하기
 - 참조: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

## 1. Import packages

In [None]:
import random
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
# Seed PyTorch's RNG. Note that Python's `random` module (used by train() to
# sample pairs) is not seeded here.
torch.manual_seed(0)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 2. Preparation of Datasets
 - 단어 표현을 위해 단어 사전 만들기: 단어 --> index, index --> 단어


In [None]:
#!pip install Korpora
#from Korpora import Korpora
#Korpora.corpus_list()

In [None]:
# Download the corpus repository; ./Chatbot_data/ChatbotData.csv is read from it.
!git clone https://github.com/songys/Chatbot_data.git

Cloning into 'Chatbot_data'...
remote: Enumerating objects: 57, done.
remote: Counting objects: 100% (39/39), done.
remote: Compressing objects: 100% (36/36), done.
remote: Total 57 (delta 21), reused 6 (delta 3), pack-reused 18
Unpacking objects: 100% (57/57), done.


In [None]:
# Load the chatbot corpus into a DataFrame; its "Q" and "A" columns are used.
raw = pd.read_csv("./Chatbot_data/ChatbotData.csv")

In [None]:
# Keep only the question/answer columns and turn them into a plain Python
# list of [question, answer] rows.
qa_list = raw[["Q", "A"]].values.tolist()

In [None]:
# Reference: the "Deep Learning for Everyone, Season 2" course material.
# Indices reserved for the special tokens; every Vocab starts with these.
SOS_token = 0
EOS_token = 1
UNK_token = 2


class Vocab:
    """Word <-> index mapping with per-word occurrence counts."""

    def __init__(self):
        # word -> index, pre-seeded with the three special tokens
        self.vocab2index = {"<SOS>": SOS_token, "<EOS>": EOS_token, "<UNK>": UNK_token}
        # index -> word, the inverse of vocab2index
        self.index2vocab = {SOS_token: "<SOS>", EOS_token: "<EOS>", UNK_token: "<UNK>"}
        # word -> number of times add_vocab has seen it
        self.vocab_count = {}
        # number of registered entries; doubles as the next free index
        self.n_vocab = len(self.vocab2index)

    def add_vocab(self, sentence):
        """Register and count every word of ``sentence``.

        The sentence is split on single spaces. A word seen for the first
        time receives the next free index; every occurrence increments its
        count in ``vocab_count``.

        Args:
            sentence: string of space-separated words.
        """
        for word in sentence.split(" "):
            if word not in self.vocab2index:
                self.vocab2index[word] = self.n_vocab
                self.index2vocab[self.n_vocab] = word
                self.n_vocab += 1
            # The special tokens are pre-registered in vocab2index but have no
            # vocab_count entry, so a sentence containing e.g. a literal
            # "<UNK>" used to raise KeyError on `+= 1`. Counting through
            # .get() handles first and repeated occurrences uniformly.
            self.vocab_count[word] = self.vocab_count.get(word, 0) + 1


  - 데이터 전처리: 학습데이터로 만들기

In [None]:
# read and preprocess the corpus data
def preprocess_df(corpus, source_max_length, target_max_length):
    """Filter sentence pairs by length and build both vocabularies.

    Args:
        corpus: iterable of pairs where ``pair[0]`` is the source sentence
            and ``pair[1]`` the target sentence, both space-separated strings.
        source_max_length: a pair is kept only if its source sentence has
            fewer than this many space-separated words.
        target_max_length: same limit for the target sentence.

    Returns:
        A tuple ``(pairs, source_vocab, target_vocab)``: the retained pairs
        and one ``Vocab`` built from each side of those pairs.
    """
    print("reading corpus...")
    all_pairs = list(corpus)

    # Apply the length limits, which were previously accepted but ignored.
    # The comparison is strict so that a kept sentence plus the <EOS> token
    # appended during tensorization still fits within the limit.
    pairs = [
        pair
        for pair in all_pairs
        if len(pair[0].split(" ")) < source_max_length
        and len(pair[1].split(" ")) < target_max_length
    ]
    dropped = len(all_pairs) - len(pairs)
    if dropped:
        print("dropped", dropped, "pairs exceeding the length limits")

    source_vocab = Vocab()
    target_vocab = Vocab()

    print("Counting words...")
    for pair in pairs:
        source_vocab.add_vocab(pair[0])
        target_vocab.add_vocab(pair[1])
    print("source vocab size =", source_vocab.n_vocab)
    print("target vocab size =", target_vocab.n_vocab)

    return pairs, source_vocab, target_vocab

## 3. Model Classes

### 3-1. Encoder

In [None]:
# declare simple encoder
class Encoder(nn.Module):
    """Single-layer GRU encoder that consumes one token per forward call.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of the embedding vectors and of the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the GRU input width, so the embedded
        # token can be fed to the GRU without a projection.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        """Embed ``x``, reshape it to (1, 1, -1) and run one GRU step.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        embedded = self.embedding(x)
        step_input = embedded.view(1, 1, -1)
        output, next_hidden = self.gru(step_input, hidden)
        return output, next_hidden

### 3-2. Decoder

In [None]:
# declare simple decoder
class Decoder(nn.Module):
    """Single-layer GRU decoder that emits log-probabilities for one step.

    Args:
        hidden_size: width of the embedding vectors and of the GRU state.
        output_size: number of rows in the embedding table and number of
            scores produced by the output layer.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output onto one score per output index.
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        """Run one decoding step.

        ``x`` is embedded and reshaped to (1, 1, -1), passed through the GRU,
        and the first element of the GRU output is projected and normalized
        with log-softmax.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has one row of
            ``output_size`` log-probabilities.
        """
        step_input = self.embedding(x).view(1, 1, -1)
        gru_output, next_hidden = self.gru(step_input, hidden)
        scores = self.out(gru_output[0])
        log_probs = self.softmax(scores)
        return log_probs, next_hidden

## 4. Train function

In [None]:
# convert sentence to the index tensor
def tensorize(vocab, sentence):
    """Convert a sentence into a column tensor of vocabulary indices.

    The sentence is split on single spaces. Words missing from
    ``vocab.vocab2index`` map to the module-level ``UNK_token``, and the
    vocabulary's ``<EOS>`` index is appended at the end.

    Args:
        vocab: object exposing a ``vocab2index`` dict containing "<EOS>".
        sentence: string of space-separated words.

    Returns:
        An int64 tensor of shape (number of words + 1, 1) on the module-level
        ``device``.
    """
    word2index = vocab.vocab2index
    indexes = [word2index.get(word, UNK_token) for word in sentence.split(" ")]
    indexes.append(word2index["<EOS>"])
    # Build the integer tensor directly: torch.Tensor(...) creates float32
    # first, which cannot represent every index above 2**24 exactly.
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [None]:
# training seq2seq
def train(pairs, source_vocab, target_vocab, encoder, decoder, encoder_optimizer, decoder_optimizer, n_iter, print_every=1000, learning_rate=0.01):
    """Train the encoder/decoder on randomly sampled sentence pairs.

    Each iteration encodes one source sentence token by token, decodes the
    target with teacher forcing (the reference token is always fed as the
    next decoder input), and takes one step with each optimizer.

    Args:
        pairs: sequence of pairs; ``pair[0]`` is the source sentence and
            ``pair[1]`` the target sentence.
        source_vocab: vocabulary used to tensorize source sentences.
        target_vocab: vocabulary used to tensorize target sentences.
        encoder: encoder module; must expose ``hidden_size``.
        decoder: decoder module returning ``(log_probs, hidden)``.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        n_iter: number of samples drawn (with replacement) and hence the
            number of training iterations.
        print_every: print the average per-token loss every this many
            iterations.
        learning_rate: not read by this function; the optimizers passed in
            carry their own learning rate.
    """
    encoder.train()
    decoder.train()

    # Draw all samples and tensorize them before the loop starts.
    sampled = [random.choice(pairs) for _ in range(n_iter)]
    source_tensors = [tensorize(source_vocab, sample[0]) for sample in sampled]
    target_tensors = [tensorize(target_vocab, sample[1]) for sample in sampled]

    nll = nn.NLLLoss()
    running_loss = 0

    for step, (source_tensor, target_tensor) in enumerate(zip(source_tensors, target_tensors), start=1):
        hidden = torch.zeros([1, 1, encoder.hidden_size]).to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encoder pass: only the final hidden state is kept.
        for source_token in source_tensor:
            _, hidden = encoder(source_token, hidden)

        # The decoder starts from <SOS> and the encoder's final hidden state.
        decoder_input = torch.Tensor([[SOS_token]]).long().to(device)
        sentence_loss = 0
        for reference_token in target_tensor:
            log_probs, hidden = decoder(decoder_input, hidden)
            sentence_loss += nll(log_probs, reference_token)
            decoder_input = reference_token  # teacher forcing

        sentence_loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        # Accumulate the loss normalized by the target length.
        running_loss += sentence_loss.item() / target_tensor.size(0)

        if step % print_every == 0:
            loss_avg = running_loss / print_every
            running_loss = 0
            print("[{} - {}%] loss = {:05.4f}".format(step, step / n_iter * 100, loss_avg))

In [None]:
# insert given sentence to check the training
def evaluate(pairs, source_vocab, target_vocab, encoder, decoder, target_max_length, print_every=5):
    """Greedily decode sentence pairs and print source, reference and prediction.

    Only pairs whose position is a multiple of ``print_every`` are decoded
    and printed; the others produce no output and are skipped entirely.

    Args:
        pairs: sequence of pairs; ``pair[0]`` is the source sentence and
            ``pair[1]`` the reference answer.
        source_vocab: vocabulary used to tensorize the source sentence.
        target_vocab: vocabulary whose ``index2vocab`` maps predicted
            indices back to words.
        encoder: encoder module; must expose ``hidden_size``.
        decoder: decoder module returning ``(log_probs, hidden)``.
        target_max_length: maximum number of decoding steps per sentence.
        print_every: print every this many pairs (by index).
    """
    encoder.eval()
    decoder.eval()

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for idx, pair in enumerate(pairs):
            # Nothing is printed for this pair, so decoding it would be
            # wasted work.
            if idx % print_every != 0:
                continue

            source_tensor = tensorize(source_vocab, pair[0])
            encoder_hidden = torch.zeros([1, 1, encoder.hidden_size]).to(device)
            for ei in range(source_tensor.size(0)):
                _, encoder_hidden = encoder(source_tensor[ei], encoder_hidden)

            decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device)
            decoder_hidden = encoder_hidden
            decoded_words = []

            for _ in range(target_max_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                # Greedy choice: take the single most likely index.
                _, top_index = decoder_output.topk(1)
                token_id = top_index.item()
                if token_id == EOS_token:
                    decoded_words.append("<EOS>")
                    break
                decoded_words.append(target_vocab.index2vocab[token_id])
                # Feed the prediction back in as the next decoder input.
                decoder_input = top_index.squeeze().detach()

            predict_sentence = " ".join(decoded_words)
            print(">", pair[0])
            print("=", pair[1])
            print("<", predict_sentence)
            print("")

## 5. Main function

In [None]:
# Length limits passed to preprocess_df; TARGET_MAX_LENGTH is also passed to
# evaluate as the maximum number of decoding steps.
SOURCE_MAX_LENGTH = 50
TARGET_MAX_LENGTH = 50

In [None]:
# Build the source (question) and target (answer) vocabularies from the Q/A pairs.
load_pairs, load_source_vocab, load_target_vocab = preprocess_df(qa_list, source_max_length=SOURCE_MAX_LENGTH, target_max_length=TARGET_MAX_LENGTH)

reading corpus...
Counting words...
source vocab size = 14287
target vocab size = 10008


In [None]:
# Peek at the first three [question, answer] rows.
qa_list[:3]

[['12시 땡!', '하루가 또 가네요.'],
 ['1지망 학교 떨어졌어', '위로해 드립니다.'],
 ['3박4일 놀러가고 싶다', '여행은 언제나 좋죠.']]

In [None]:
# Sanity check: the index assigned to a known word, and the token stored at index 2.
load_source_vocab.vocab2index["안녕"], load_source_vocab.index2vocab[2]

(4006, '<UNK>')

In [None]:
# Vocabulary sizes; each count includes the three special tokens.
load_source_vocab.n_vocab, load_target_vocab.n_vocab

(14287, 10008)

In [None]:
enc_hidden_size = 100
# The encoder's final hidden state is handed to the decoder as its initial
# hidden state, so both sizes must match.
dec_hidden_size = enc_hidden_size
# Embedding tables are sized by the respective vocabulary.
encoder = Encoder(input_size=load_source_vocab.n_vocab, hidden_size=enc_hidden_size).to(device)
decoder = Decoder(hidden_size=dec_hidden_size, output_size=load_target_vocab.n_vocab).to(device)

learning_rate = 0.01
# One plain SGD optimizer per model, both using the same learning rate.
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

In [None]:
# Train for 5 * len(load_pairs) randomly sampled iterations, printing the
# average loss every 500 iterations.
train(load_pairs, load_source_vocab, load_target_vocab, encoder, decoder, encoder_optimizer, decoder_optimizer, n_iter=len(load_pairs)*5, print_every=500)

[500 - 0.8458090163241141%] loss = 4.3966
[1000 - 1.6916180326482282%] loss = 4.3783
[1500 - 2.537427048972342%] loss = 4.4091
[2000 - 3.3832360652964564%] loss = 4.2440
[2500 - 4.22904508162057%] loss = 4.3350
[3000 - 5.074854097944684%] loss = 4.4052
[3500 - 5.920663114268798%] loss = 4.3977
[4000 - 6.766472130592913%] loss = 4.4182
[4500 - 7.6122811469170255%] loss = 4.2964
[5000 - 8.45809016324114%] loss = 4.2817
[5500 - 9.303899179565255%] loss = 4.3114
[6000 - 10.149708195889367%] loss = 4.3040
[6500 - 10.995517212213482%] loss = 4.2924
[7000 - 11.841326228537596%] loss = 4.3173
[7500 - 12.687135244861711%] loss = 4.1709
[8000 - 13.532944261185825%] loss = 4.1732
[8500 - 14.378753277509936%] loss = 4.2448
[9000 - 15.224562293834051%] loss = 4.1931
[9500 - 16.070371310158166%] loss = 4.1457
[10000 - 16.91618032648228%] loss = 4.1534
[10500 - 17.761989342806395%] loss = 4.1864
[11000 - 18.60779835913051%] loss = 4.1653
[11500 - 19.453607375454624%] loss = 4.0724
[12000 - 20.2994163

In [None]:
# Check the model on the training pairs, printing every 5000th one
# (source, reference answer, prediction).
evaluate(load_pairs, load_source_vocab, load_target_vocab, encoder, decoder, TARGET_MAX_LENGTH, print_every=5000)

> 12시 땡!
= 하루가 또 가네요.
< 저는 위기 조차 없네요. <EOS>

> 학원폭력 짜증나
= 학교 폭력은 범죄에요.
< 좋은 사람이라면 고마워할 거예요. <EOS>

> 사랑한다고 말해주면 뭐가 덧나나
= 사랑한다고 표현해달라고 말해보세요.
< 사랑은 유지하는 게 중요한데 대단하네요. <EOS>



In [None]:
# One hand-written pair in the same [question, answer] layout as qa_list;
# the '?' appears to be a placeholder for the reference answer.
a_test_sample = [['무슨 개소리를 이렇게 장황하게해', '?']]

In [None]:
# Run the model on the hand-written sample (print_every keeps its default of 5,
# so the pair at index 0 is printed).
evaluate(a_test_sample, load_source_vocab, load_target_vocab, encoder, decoder, TARGET_MAX_LENGTH)

> 무슨 개소리를 이렇게 장황하게해
= ?
< 잘 견뎌내고 있네요. <EOS>



## 실습0: seq2seq 모델의 모델 부분을 구현해 보세요!

## 실습1: Unknown Token을 처리하시오!!
  - 한 번도 나온 적 없는 데이터를 입력으로 받을 경우 챗봇이 오류를 내고 있습니다. 이 문제를 해결하는 한 가지 방법은 한 번도 못 본 단어를 "\<UNK\>" 라는 단어로 치환하는 것입니다. 어떻게 구현할 수 있을까요?

## 실습2: 해당 코드의 데이터를 변경해서 기계번역기, 남자/여자 언어 번역기를 개발해보세요!