## Seq2Seq : Sequence to Sequence 모델
#### Encoder-Decoder 모델이라고도 한다

### Encoder 클래스

In [1]:
# Encoder class
class Encoder:
    """Encode an input sequence into a single hidden-state vector.

    The encoder chains a TimeEmbedding layer and a TimeLSTM layer and exposes
    only the hidden state of the last time step to the caller.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and collect their parameters and gradients.

        Args:
            vocab_size: number of distinct symbols (rows of the embedding).
            wordvec_size: embedding dimension.
            hidden_size: LSTM hidden-state dimension.
        """
        gate_width = 4 * hidden_size  # second axis of the LSTM weight matrices
        randn = np.random.randn

        # Weight initialisation. The draw order (embedding, input weights,
        # recurrent weights) is kept fixed so seeded runs stay reproducible.
        embedding_weights = (randn(vocab_size, wordvec_size) / 100).astype('f')
        input_weights = (
            randn(wordvec_size, gate_width) / np.sqrt(wordvec_size)
        ).astype('f')
        recurrent_weights = (
            randn(hidden_size, gate_width) / np.sqrt(hidden_size)
        ).astype('f')
        bias = np.zeros(gate_width).astype('f')

        # Layer construction.
        self.embed = TimeEmbedding(embedding_weights)
        self.lstm = TimeLSTM(input_weights, recurrent_weights, bias,
                             stateful=False)

        # Concatenate every layer's parameters and gradients.
        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads

    def forward(self, xs):
        """Run the sequence through the layers and return the last hidden state.

        The full LSTM output (3-D, [N, T, H]) is cached on ``self.hs`` for the
        backward pass; the returned slice is 2-D, [N, H].
        """
        embedded = self.embed.forward(xs)
        self.hs = self.lstm.forward(embedded)
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Propagate the gradient of the last hidden state back to the input.

        Only the last time step receives ``dh``; every other time step gets a
        zero gradient because the forward pass exposed only the last step.
        """
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        return self.embed.backward(self.lstm.backward(dhs))

### Decoder 클래스

In [2]:
# Decoder class : TimeSoftmaxWithLoss는 계층에 생성하지 않는다
class Decoder:
    """Decode a target sequence conditioned on the encoder's hidden state.

    The decoder chains TimeEmbedding -> TimeLSTM -> TimeAffine and returns raw
    scores; the softmax/loss layer is deliberately not created here.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and collect their parameters and gradients.

        Args:
            vocab_size: number of distinct symbols (embedding rows and number
                of output scores per time step).
            wordvec_size: embedding dimension.
            hidden_size: LSTM hidden-state dimension.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # Weight initialisation (draw order kept fixed for seeded runs).
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer construction.
        self.embed = TimeEmbedding(embed_W)
        # The LSTM must be stateful: forward() and generate() inject the
        # encoder output through set_state() and then call lstm.forward(), and
        # generate() additionally relies on the hidden state carrying over
        # between its one-character forward calls. A non-stateful layer would
        # discard that state and the decoder would never see the encoder
        # context.
        # NOTE(review): TimeLSTM is defined outside this file - confirm that
        # stateful=False resets the state at the start of forward().
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Gather every layer's parameters and gradients into flat lists.
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, h):
        """Return the scores for ``xs`` given the encoder output ``h``.

        Args:
            xs: decoder input ids.
            h: encoder output, (N, H), used as the initial LSTM state.

        Returns:
            The affine layer output, not passed through softmax.
        """
        self.lstm.set_state(h)
        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient for the encoder.

        The returned value is ``self.lstm.dh``, which the LSTM layer is
        expected to have stored during its own backward pass.
        """
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)

        dh = self.lstm.dh
        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``.

        Args:
            h: encoder output used as the initial LSTM state.
            start_id: id fed to the decoder at the first step.
            sample_size: number of ids to produce.

        Returns:
            A list of ``sample_size`` Python ints.
        """
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        for _ in range(sample_size):
            # One character per step, shaped as a (1, 1) batch.
            x = np.array(sample_id).reshape(1, 1)
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)

            # Deterministic choice: take the id with the highest score.
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled

### Seq2seq 클래스

In [3]:
class Seq2seq:
    """Encoder-decoder model wired to a softmax-with-loss output layer."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, the decoder and the loss layer.

        All three size arguments are forwarded unchanged to both the encoder
        and the decoder. ``params`` / ``grads`` are the encoder's entries
        followed by the decoder's.
        """
        sizes = (vocab_size, wordvec_size, hidden_size)

        # The encoder is constructed first, then the decoder.
        self.encoder = Encoder(*sizes)
        self.decoder = Decoder(*sizes)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for input batch ``xs`` and target batch ``ts``.

        The decoder is fed ``ts`` without its last column, and the loss layer
        is given ``ts`` without its first column.
        """
        context = self.encoder.forward(xs)
        score = self.decoder.forward(ts[:, :-1], context)
        return self.softmax.forward(score, ts[:, 1:])

    def backward(self, dout=1):
        """Back-propagate through loss layer, decoder and encoder, in order."""
        dscore = self.softmax.backward(dout)
        dh = self.decoder.backward(dscore)
        return self.encoder.backward(dh)

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and let the decoder produce ``sample_size`` ids."""
        return self.decoder.generate(self.encoder.forward(xs),
                                     start_id, sample_size)

## 학습 데이터 준비
### Toy Data Set : 'addition.txt'

In [4]:
from dataset import sequence

# 'addition.txt' is described as holding 50,000 addition examples in total.
#
# Per the original notes, load_data builds a corpus from the expressions as if
# they were ordinary sentences, shuffles it with a fixed seed and splits it
# 90:10 into training and validation data.
# x is the addition expression, t is the result of the addition.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()

print(char_to_id)
print(id_to_char)
# 13 distinct characters: '0'-'9', '+', ' ' and '_'.

# The shape summary is emitted twice (train, test, train, test):
# expected (45000, 7) (45000, 5) for train and (5000, 7) (5000, 5) for test.
for inputs, targets in ((x_train, t_train), (x_test, t_test)) * 2:
    print(inputs.shape, targets.shape)

# Raw id arrays of the first training pair.
print(x_train[0])
print(t_train[0])

# The first two training pairs decoded back to text,
# e.g. "71+118 " and "_189 " for the first pair.
for row in (x_train[0], t_train[0], x_train[1], t_train[1]):
    print(''.join(id_to_char[c] for c in row))

{'1': 0, '6': 1, '+': 2, '7': 3, '5': 4, ' ': 5, '_': 6, '9': 7, '2': 8, '0': 9, '3': 10, '8': 11, '4': 12}
{0: '1', 1: '6', 2: '+', 3: '7', 4: '5', 5: ' ', 6: '_', 7: '9', 8: '2', 9: '0', 10: '3', 11: '8', 12: '4'}
(45000, 7) (45000, 5)
(5000, 7) (5000, 5)
(45000, 7) (45000, 5)
(5000, 7) (5000, 5)
[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]
71+118 
_189 
510+223
_733 


In [5]:
import os

def eval_seq2seq(model, question, correct, id_to_char,
                 verbos=False, is_reverse=False):
    """Score one generated answer against the expected one.

    Args:
        model: object exposing ``generate(question, start_id, sample_size)``.
        question: array of input ids, passed to ``generate`` unchanged.
        correct: array of target ids; after flattening, its first id is used
            as the start id and the remainder is the expected answer.
        id_to_char: mapping from id to character.
        verbos: when true, print the question, the expected answer and the
            marked guess.
        is_reverse: when true, the question text is reversed before being
            printed (display only).

    Returns:
        1 if the generated text equals the expected text, otherwise 0.
    """
    target_ids = correct.flatten()
    # The leading id acts as the start symbol for generation.
    start_id, answer_ids = target_ids[0], target_ids[1:]
    predicted_ids = model.generate(question, start_id, len(answer_ids))

    def decode(ids):
        # Convert a sequence of ids to its text form.
        return ''.join([id_to_char[int(i)] for i in ids])

    question_text = decode(question.flatten())
    answer_text = decode(answer_ids)
    guess_text = decode(predicted_ids)
    matched = guess_text == answer_text

    if verbos:
        print('Q', question_text[::-1] if is_reverse else question_text)
        print('T', answer_text)

        if os.name == 'nt':
            # On Windows plain letters are used instead of ANSI colours.
            mark = 'O' if matched else 'X'
        else:
            # ANSI escape codes: green for a hit, red for a miss, then reset.
            tint, glyph = ('\033[92m', '☑') if matched else ('\033[91m', '☒')
            mark = tint + glyph + '\033[0m'

        print(mark + ' ' + guess_text)
        print('---')

    return 1 if matched else 0


## 학습 코드 구현

In [None]:
from nn_layers import Adam,Trainer,TimeEmbedding,TimeLSTM,TimeAffine,TimeSoftmaxWithLoss
import matplotlib.pyplot as plt
from dataset import sequence
import numpy as np

# Load the addition dataset and its vocabulary.
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt', seed=1984)
char_to_id, id_to_char = sequence.get_vocab()

# Hyperparameters.
vocab_size = len(char_to_id)  # 13 symbols for this dataset
wordvec_size = 16
hidden_size = 128
batch_size = 128

max_epoch = 25
max_grad = 5.0

# The inputs are fed in their original order. eval_seq2seq only uses this
# flag to reverse the question text for display; it has to be defined here
# because it is passed to eval_seq2seq below.
is_reverse = False

model = Seq2seq(vocab_size, wordvec_size, hidden_size)

optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []  # validation accuracy after each epoch

for epoch in range(max_epoch):
    # Train one epoch at a time so accuracy can be measured in between.
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0

    for i in range(len(x_test)):
        # Index with a list so each sample keeps its leading batch axis.
        question, correct = x_test[[i]], t_test[[i]]
        verbose = i < 10  # only print hit/miss details for the first 10 samples
        correct_num += eval_seq2seq(model, question, correct, id_to_char,
                                    verbos=verbose, is_reverse=is_reverse)

    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('검증 정확도 %.3f%%' % (acc * 100))

# Takes roughly 6-7 minutes.

| 에폭 1 |  반복 1 / 351 | 시간 0[s] | 손실 2.56
| 에폭 1 |  반복 21 / 351 | 시간 2[s] | 손실 2.53
| 에폭 1 |  반복 41 / 351 | 시간 3[s] | 손실 2.30
| 에폭 1 |  반복 61 / 351 | 시간 5[s] | 손실 2.14
