# Seq2Seq
Simple example for Seq2Seq (Machine Translation) by Encoder RNN and Decoder RNN.

### Seq2Seq by Encoder RNN and Decoder RNN
- Creating the **data pipeline** with `tf.data`
- Preprocessing word sequences (variable input sequence length) by padding them with user-defined functions (`pad_seq_enc`, `pad_seq_dec`)
- Using `tf.nn.embedding_lookup` for getting vector of tokens (eg. word, character)
- Training **many to many classification** with `tf.contrib.seq2seq.sequence_loss`
- Masking invalid (padding) tokens with `tf.sequence_mask`
- Using `tf.contrib.seq2seq.dynamic_decode`
- Training with `tf.contrib.seq2seq.TrainingHelper`
- Translating with `tf.contrib.seq2seq.GreedyEmbeddingHelper`
- Creating the model as **Class**
- Reference
    - https://github.com/golbin/TensorFlow-Tutorials/blob/master/10%20-%20RNN/03%20-%20Seq2Seq.py
    - https://github.com/HiJiGOO/tf_nmt_tutorial
    - https://github.com/hccho2/RNN-Tutorial
    - https://github.com/j-min/tf_tutorial_plus/tree/master/RNN_seq2seq/contrib_seq2seq
    - https://gist.github.com/ilblackdragon/c92066d9d38b236a21d5a7b729a10f12

### Setup

In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import string
from pprint import pprint
%matplotlib inline

# Short alias for tf.contrib.slim; it is not referenced in the cells visible here.
slim = tf.contrib.slim
# Show which TensorFlow version the notebook is running on.
print(tf.__version__)

Instructions for updating:
Use the retry module or similar alternatives.
1.7.0


### Prepare example data 

In [2]:
# Toy parallel corpus: four English sentences and their Korean translations.
# Both lists are already split into word tokens and are aligned by position
# (sources[i] translates to targets[i]).
sources = [
    ['I', 'feel', 'hungry'],
    ['tensorflow', 'is', 'very', 'difficult'],
    ['tensorflow', 'is', 'a', 'framework', 'for', 'deep', 'learning'],
    ['tensorflow', 'is', 'very', 'fast', 'changing'],
]
targets = [
    ['나는', '배가', '고프다'],
    ['텐서플로우는', '매우', '어렵다'],
    ['텐서플로우는', '딥러닝을', '위한', '프레임워크이다'],
    ['텐서플로우는', '매우', '빠르게', '변화한다'],
]

In [3]:
# Source-side vocabulary.
# Collect the distinct words of all source sentences, sort them so the id
# assignment is reproducible, and put the '<pad>' symbol in front (index 0).
source_words = sorted({word for sentence in sources for word in sentence})
source_words = ['<pad>'] + source_words

# word -> id lookup table
source_dic = dict(zip(source_words, range(len(source_words))))
print(source_dic)
print(len(source_dic))

{'framework': 9, '<pad>': 0, 'for': 8, 'changing': 3, 'learning': 12, 'feel': 7, 'I': 1, 'fast': 6, 'is': 11, 'deep': 4, 'very': 14, 'difficult': 5, 'a': 2, 'hungry': 10, 'tensorflow': 13}
15


In [4]:
# id -> word lookup table (the inverse of source_dic)
source_idx_dic = {idx: word for word, idx in source_dic.items()}
source_idx_dic

{0: '<pad>',
 1: 'I',
 2: 'a',
 3: 'changing',
 4: 'deep',
 5: 'difficult',
 6: 'fast',
 7: 'feel',
 8: 'for',
 9: 'framework',
 10: 'hungry',
 11: 'is',
 12: 'learning',
 13: 'tensorflow',
 14: 'very'}

In [5]:
# Target-side vocabulary.
# Distinct words of all target sentences, sorted for a reproducible id order.
target_words = sorted({word for sentence in targets for word in sentence})
# Special symbols come first: padding, then the markers for the start and
# the end of a translation.
target_words = ['<pad>', '<start>', '<end>'] + target_words

# word -> id lookup table
target_dic = dict(zip(target_words, range(len(target_words))))
print(target_dic)
print(len(target_dic))

{'배가': 7, '<pad>': 0, '매우': 6, '고프다': 3, '나는': 4, '어렵다': 10, '변화한다': 8, '위한': 11, '프레임워크이다': 13, '텐서플로우는': 12, '딥러닝을': 5, '빠르게': 9, '<start>': 1, '<end>': 2}
14


In [6]:
# Size of the target vocabulary, special symbols included.
len(target_dic)

14

In [7]:
# id -> word lookup table (the inverse of target_dic)
target_idx_dic = {idx: word for word, idx in target_dic.items()}
target_idx_dic

{0: '<pad>',
 1: '<start>',
 2: '<end>',
 3: '고프다',
 4: '나는',
 5: '딥러닝을',
 6: '매우',
 7: '배가',
 8: '변화한다',
 9: '빠르게',
 10: '어렵다',
 11: '위한',
 12: '텐서플로우는',
 13: '프레임워크이다'}

### Create pad_seq function for sentences

In [8]:
# Pads (and, when necessary, truncates) the sentences fed to the encoder.
def pad_seq_enc(sequences, max_len, dic):
    """Convert tokenised sentences into fixed-width rows of vocabulary ids.

    Args:
        sequences: iterable of sentences, each a list of tokens.
        max_len: width of every returned row (expected to be >= 0).
        dic: token -> id mapping that contains '<pad>'. Lookups go through
            ``dic.get``, so a token missing from the mapping becomes None.

    Returns:
        A ``(seq_len, seq_indices)`` tuple. ``seq_len[i]`` is the number of
        real (non-padding) tokens kept for sentence i and ``seq_indices[i]``
        is a list of exactly ``max_len`` ids.
    """
    pad_idx = dic.get('<pad>')
    seq_len = []
    seq_indices = []
    for seq in sequences:
        # A sentence longer than max_len is cut off so that every row has the
        # same width; without this the batch would be ragged.
        seq_idx = [dic.get(word) for word in seq[:max_len]]
        # Record the length after truncation so it never exceeds the row width.
        seq_len.append(len(seq_idx))
        # Fill the remaining positions with the '<pad>' id.
        seq_idx += (max_len - len(seq_idx)) * [pad_idx]
        seq_indices.append(seq_idx)
    return seq_len, seq_indices

In [9]:
# Pads (and, when necessary, truncates) the sentences fed to the decoder.
def pad_seq_dec(sequences, max_len, dic):
    """Build fixed-width decoder input and decoder target rows.

    For every sentence the decoder input is ``<start> + tokens`` and the
    decoder target is ``tokens + <end>``; both are padded with '<pad>' up to
    ``max_len``.

    Args:
        sequences: iterable of sentences, each a list of tokens.
        max_len: width of every returned row (expected to be >= 1, because
            one position is always taken by '<start>' / '<end>').
        dic: token -> id mapping that contains '<pad>', '<start>' and
            '<end>'. Lookups go through ``dic.get``, so a token missing from
            the mapping becomes None.

    Returns:
        A ``(seq_input_len, seq_input_indices, seq_target_indices)`` tuple.
        ``seq_input_len[i]`` counts the non-padding positions of row i
        (the kept tokens plus one special symbol); it is the same for the
        input row and the target row.
    """
    start_idx = dic.get('<start>')
    end_idx = dic.get('<end>')
    pad_idx = dic.get('<pad>')
    # One position is reserved for the special symbol, so at most
    # max_len - 1 real tokens fit. Longer sentences are cut off; otherwise
    # the rows would be wider than max_len and the batch ragged.
    n_keep = max(max_len - 1, 0)

    seq_input_len = []
    seq_input_indices = []
    seq_target_indices = []
    for seq in sequences:
        token_idx = [dic.get(token) for token in seq[:n_keep]]
        # Decoder input starts with <start>; decoder target ends with <end>.
        seq_input_idx = [start_idx] + token_idx
        seq_target_idx = token_idx + [end_idx]
        seq_input_len.append(len(seq_input_idx))
        # Input and target have the same length, hence the same padding.
        padding = (max_len - len(seq_input_idx)) * [pad_idx]
        seq_input_indices.append(seq_input_idx + padding)
        seq_target_indices.append(seq_target_idx + padding)

    return seq_input_len, seq_input_indices, seq_target_indices

### Pre-process example data

In [10]:
# Encoder side: turn every source sentence into a row of source_max_len ids
# and remember how many of them are real words.
source_max_len = 10
X_length, X_indices = pad_seq_enc(sources, source_max_len, source_dic)
print(X_length, np.shape(X_indices))
pprint(X_indices)

[3, 4, 7, 5] (4, 10)
[[1, 7, 10, 0, 0, 0, 0, 0, 0, 0],
 [13, 11, 14, 5, 0, 0, 0, 0, 0, 0],
 [13, 11, 2, 9, 8, 4, 12, 0, 0, 0],
 [13, 11, 14, 6, 3, 0, 0, 0, 0, 0]]


In [11]:
# Decoder side: build the <start>-prefixed input rows and the <end>-suffixed
# target rows, each padded to target_max_len ids.
target_max_len = 12
y_length, y_input_indices, y_target_indices = pad_seq_dec(
    targets, target_max_len, target_dic)
pprint(y_length)
pprint(y_input_indices)
pprint(y_target_indices)

[4, 4, 5, 5]
[[1, 4, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 12, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 12, 5, 11, 13, 0, 0, 0, 0, 0, 0, 0],
 [1, 12, 6, 9, 8, 0, 0, 0, 0, 0, 0, 0]]
[[4, 7, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [12, 6, 10, 2, 0, 0, 0, 0, 0, 0, 0, 0],
 [12, 5, 11, 13, 2, 0, 0, 0, 0, 0, 0, 0],
 [12, 6, 9, 8, 2, 0, 0, 0, 0, 0, 0, 0]]


In [12]:
# Id of the '<end>' symbol in the target vocabulary.
target_dic['<end>']

2

### Define SimpleNMT
Encoder RNN, Decoder RNN

In [13]:
class SimpleNMT:
    """Minimal encoder/decoder RNN translation model (TensorFlow 1.x contrib API).

    The constructor builds the whole graph:

    * one-hot (identity-matrix, non-trainable) embeddings for both languages,
    * a BasicRNNCell encoder whose final state initialises the decoder,
    * a BasicRNNCell decoder with an output projection to the target
      vocabulary, used twice with shared weights: once with a TrainingHelper
      (ground-truth inputs) and once with a GreedyEmbeddingHelper (inference),
    * a masked sequence loss over the training decoder's logits.

    Public attributes created here: ``enc_state``, ``final_state`` and
    ``seq2seq_loss``.
    """

    # NOTE: the defaults for t_max_len, s_dic, t_dic and n_of_classes are
    # taken from the notebook-level variables target_max_len, source_dic and
    # target_dic when the class is defined.
    def __init__(self, s_len, s_indices, t_len, t_input_indices, t_output_indices,
                 t_max_len = target_max_len, s_dic = source_dic, t_dic = target_dic,
                 n_of_classes = len(target_dic), hidden_dim = 16):
        """Build the training and translation graphs.

        Args:
            s_len: int32 tensor with the true length of every source row.
            s_indices: int32 tensor of padded source word ids.
            t_len: int32 tensor with the true length of every target row.
            t_input_indices: int32 tensor of padded decoder input ids
                ('<start>' first).
            t_output_indices: int32 tensor of padded decoder target ids
                ('<end>' last).
            t_max_len: padded width of the target rows; also the upper bound
                on the number of greedy decoding steps.
            s_dic: source word -> id mapping.
            t_dic: target word -> id mapping; must contain '<start>' and
                '<end>'.
            n_of_classes: size of the decoder's output projection.
            hidden_dim: number of units of the encoder and decoder cells.
        """
        with tf.variable_scope('input_layer'):
            # s : source, t : target
            self._s_len = s_len
            self._s_indices = s_indices
            self._t_len = t_len
            self._t_input_indices = t_input_indices
            self._t_output_indices = t_output_indices
            self._s_dic = s_dic
            self._t_dic = t_dic
            # Use the constructor argument rather than the notebook global so
            # that a caller-supplied t_max_len is honoured everywhere.
            self._t_max_len = t_max_len
            # Fixed identity matrix used as an embedding table: looking up a
            # word id yields that word's one-hot vector.
            s_embeddings = tf.get_variable(
                name='s_embedding',
                initializer=tf.eye(len(self._s_dic), dtype=tf.float32),
                trainable=False)
            s_batch = tf.nn.embedding_lookup(params=s_embeddings, ids=self._s_indices)

        with tf.variable_scope('encoder'):
            enc_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hidden_dim, activation=tf.nn.tanh)
            # Only the final state is kept; it has one hidden vector per
            # sentence ([batch_size, hidden_dim]), not one per time step.
            _, self.enc_state = tf.nn.dynamic_rnn(cell=enc_cell, inputs=s_batch,
                                                  sequence_length=self._s_len,
                                                  dtype=tf.float32)

        with tf.variable_scope('pipe'):
            # One-hot embedding table for the target language.
            t_embeddings = tf.get_variable(
                name='t_embedding',
                initializer=tf.eye(len(self._t_dic), dtype=tf.float32),
                trainable=False)
            t_batch = tf.nn.embedding_lookup(params=t_embeddings, ids=self._t_input_indices)
            # Both helper vectors below have one int32 entry per sentence.
            batch_shape = tf.shape(self._s_len)
            # Every training sequence is decoded for the full padded width so
            # the logits line up with the padded targets and the loss mask.
            tr_tokens = tf.fill(batch_shape, self._t_max_len)
            # Greedy decoding starts every sentence from the '<start>' id,
            # looked up in the target dictionary instead of assuming it is 1.
            trans_tokens = tf.fill(batch_shape, self._t_dic['<start>'])

        with tf.variable_scope('decoder'):
            dec_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=hidden_dim, activation=tf.nn.tanh)
            # Project the cell output to one score per target word.
            score_cell = tf.contrib.rnn.OutputProjectionWrapper(cell=dec_cell,
                                                                output_size=n_of_classes)

            with tf.variable_scope('training'):
                # TrainingHelper feeds the ground-truth token at every step.
                tr_helper = tf.contrib.seq2seq.TrainingHelper(inputs=t_batch,
                                                              sequence_length=tr_tokens)
                tr_decoder = tf.contrib.seq2seq.BasicDecoder(cell=score_cell,
                                                             helper=tr_helper,
                                                             initial_state=self.enc_state)
                self._tr_outputs, self.final_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=tr_decoder)

            with tf.variable_scope('translation'):
                # GreedyEmbeddingHelper is the inference helper: the argmax of
                # the previous step's output becomes the next step's input.
                # start_tokens: int32 vector shaped [batch_size].
                # end_token: int32 scalar that marks the end of decoding.
                trans_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=t_embeddings,
                    start_tokens=trans_tokens,
                    end_token=self._t_dic['<end>'])
                trans_decoder = tf.contrib.seq2seq.BasicDecoder(cell=score_cell,
                                                                helper=trans_helper,
                                                                initial_state=self.enc_state)
                # Cap the number of steps: without a limit decoding would not
                # stop for a sentence that never produces '<end>'.
                self._trans_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=trans_decoder,
                    maximum_iterations=self._t_max_len)

        with tf.variable_scope('seq2seq_loss'):
            # Weight 1 for real target positions, 0 for padding.
            masking = tf.sequence_mask(lengths=self._t_len, maxlen=self._t_max_len,
                                       dtype=tf.float32)
            self.seq2seq_loss = tf.contrib.seq2seq.sequence_loss(
                logits=self._tr_outputs.rnn_output,
                targets=self._t_output_indices,
                weights=masking)

    def translate(self, sess, s_len, s_indices):
        """Greedy-decode the given source sentences.

        Args:
            sess: session in which the model's variables are initialised.
            s_len: true lengths of the source rows, fed in place of the
                tensor passed to the constructor.
            s_indices: padded source word ids, fed likewise.

        Returns:
            The ``sample_id`` output of the greedy decoder as returned by
            ``sess.run``: one row of predicted target ids per sentence.
        """
        feed_translation = {self._s_len : s_len, self._s_indices : s_indices}
        return sess.run(self._trans_outputs.sample_id, feed_dict = feed_translation)

### Create a model of SimpleNMT

In [14]:
# Hyper-parameters
lr = .003
epochs = 200
batch_size = 2
# Number of full mini-batches per epoch (integer division, so a trailing
# partial batch would not be counted).
total_step = np.shape(X_indices)[0] // batch_size
print(total_step)

2


In [15]:
## Input pipeline built with tf.data
# Slice the five parallel arrays into per-sentence examples, shuffle them,
# and group them into mini-batches of batch_size.
tr_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_length, X_indices, y_length,
                         y_input_indices, y_target_indices))
    .shuffle(buffer_size=20)
    .batch(batch_size=batch_size)
)
# Initializable iterator: it has to be (re)initialised before every pass.
tr_iterator = tr_dataset.make_initializable_iterator()
print(tr_dataset)

<BatchDataset shapes: ((?,), (?, 10), (?,), (?, 12), (?, 12)), types: (tf.int32, tf.int32, tf.int32, tf.int32, tf.int32)>


In [16]:
# Symbolic tensors for one mini-batch, in the same order as the tuple the
# dataset was built from: source lengths, source ids, target lengths,
# decoder input ids, decoder target ids.
X_length_mb, X_indices_mb, y_length_mb, y_input_indices_mb, y_target_indices_mb = tr_iterator.get_next()

In [17]:
# Build the model graph on top of the iterator's mini-batch tensors; the
# remaining constructor arguments keep their defaults.
sim_nmt = SimpleNMT(s_len = X_length_mb, s_indices  = X_indices_mb,
                    t_len = y_length_mb, t_input_indices = y_input_indices_mb,
                    t_output_indices = y_target_indices_mb)

In [18]:
# The decoder-target tensor stored on the model (the iterator output passed in above).
sim_nmt._t_output_indices

<tf.Tensor 'IteratorGetNext:4' shape=(?, 12) dtype=int32>

In [19]:
# Output structure of the greedy (translation) decoder: logits and sampled ids.
sim_nmt._trans_outputs

BasicDecoderOutput(rnn_output=<tf.Tensor 'decoder/translation/decoder/transpose:0' shape=(?, ?, 14) dtype=float32>, sample_id=<tf.Tensor 'decoder/translation/decoder/transpose_1:0' shape=(?, ?) dtype=int32>)

In [20]:
# Check the Python type of the translation decoder's output.
type(sim_nmt._trans_outputs)

tensorflow.contrib.seq2seq.python.ops.basic_decoder.BasicDecoderOutput

In [21]:
# Logits of the translation decoder: [batch_size, decoded_steps, n_of_classes = 14].
# The time axis is not fixed in the static shape; it is reported as '?'.
sim_nmt._trans_outputs.rnn_output

<tf.Tensor 'decoder/translation/decoder/transpose:0' shape=(?, ?, 14) dtype=float32>

In [22]:
# Final encoder state, used as the decoder's initial state:
# [batch_size, hidden_size = 16]
sim_nmt.enc_state

<tf.Tensor 'encoder/rnn/while/Exit_3:0' shape=(?, 16) dtype=float32>

In [21]:
# Predicted target ids of the translation decoder: [batch_size, decoded_steps].
sim_nmt._trans_outputs.sample_id

<tf.Tensor 'decoder/translation/decoder/transpose_1:0' shape=(?, ?) dtype=int32>

In [23]:
# Final state of the training decoder: [batch_size, hidden_size]
sim_nmt.final_state

<tf.Tensor 'decoder/training/decoder/while/Exit_3:0' shape=(?, 16) dtype=float32>

### Create training op and train model

In [22]:
## create training op
# Adam with the learning rate lr set in the hyper-parameter cell; each run
# of training_op performs one update that minimises the masked sequence loss.
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = sim_nmt.seq2seq_loss)

In [23]:
# The session is left open on purpose: the translation cell further down
# reuses it.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Mean training loss of every epoch, in order.
tr_loss_hist = []

for epoch in range(epochs):
    avg_tr_loss = 0
    tr_step = 0
    
    # Re-initialise the dataset iterator at the start of every epoch.
    sess.run(tr_iterator.initializer)
    try:
        while True:
            # One optimisation step on the next mini-batch; the loop ends when
            # the iterator is exhausted and raises OutOfRangeError.
            _, loss = sess.run([training_op, sim_nmt.seq2seq_loss])
            avg_tr_loss += loss
            tr_step += 1
            
    except tf.errors.OutOfRangeError:
        # End of the epoch's data.
        pass
    
    # Turn the summed batch losses into a mean (assumes at least one batch ran).
    avg_tr_loss /= tr_step
    tr_loss_hist.append(avg_tr_loss)
    # Report progress every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print('epoch : {:3}, tr_loss : {:.3f}'.format(epoch + 1, avg_tr_loss))

epoch :  10, tr_loss : 2.112
epoch :  20, tr_loss : 1.558
epoch :  30, tr_loss : 1.089
epoch :  40, tr_loss : 0.763
epoch :  50, tr_loss : 0.546
epoch :  60, tr_loss : 0.397
epoch :  70, tr_loss : 0.294
epoch :  80, tr_loss : 0.223
epoch :  90, tr_loss : 0.173
epoch : 100, tr_loss : 0.138
epoch : 110, tr_loss : 0.111
epoch : 120, tr_loss : 0.093
epoch : 130, tr_loss : 0.078
epoch : 140, tr_loss : 0.066
epoch : 150, tr_loss : 0.058
epoch : 160, tr_loss : 0.051
epoch : 170, tr_loss : 0.045
epoch : 180, tr_loss : 0.040
epoch : 190, tr_loss : 0.036
epoch : 200, tr_loss : 0.033


In [24]:
# Greedy-decode all source sentences in one call: the padded Python lists are
# fed in place of the model's source-length and source-id tensors.
yhat = sim_nmt.translate(sess = sess, s_len = X_length, s_indices = X_indices)
yhat

array([[ 4,  7,  3,  2,  2],
       [12,  6, 10,  2,  2],
       [12,  5, 11, 13,  2],
       [12,  6,  9,  8,  2]])

In [25]:
# Reference sentences: map every id of every row in y_target_indices back to
# its word through the id -> word table.
originals = [[target_idx_dic.get(idx) for idx in row] for row in y_target_indices]
for original in originals:
    print(original)

['나는', '배가', '고프다', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['텐서플로우는', '매우', '어렵다', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['텐서플로우는', '딥러닝을', '위한', '프레임워크이다', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['텐서플로우는', '매우', '빠르게', '변화한다', '<end>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [26]:
# Model translations: map the predicted ids in yhat back to Korean words
# through the id -> word table.
translations = [[target_idx_dic.get(idx) for idx in row] for row in yhat]
for translation in translations:
    print(translation)

['나는', '배가', '고프다', '<end>', '<end>']
['텐서플로우는', '매우', '어렵다', '<end>', '<end>']
['텐서플로우는', '딥러닝을', '위한', '프레임워크이다', '<end>']
['텐서플로우는', '매우', '빠르게', '변화한다', '<end>']
