In [10]:
import os
# Expose only the CUDA device with index 1 to this process. This runs in the
# cell before the one that imports TensorFlow, so the setting is already in
# place when TensorFlow enumerates GPUs.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [11]:
import numpy as np
import tensorflow as tf

In [12]:
import json

# Load the parallel corpus and the vocabulary tables.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the files.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)
    
with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [13]:
# Pull the four corpus splits out of the loaded JSON object.
# X is later encoded with dictionary_from and Y with dictionary_to.
train_X, train_Y = dataset['train_X'], dataset['train_Y']
test_X, test_Y = dataset['test_X'], dataset['test_Y']

In [14]:
# Show the top-level sections of the vocabulary file.
dictionary.keys()

dict_keys(['from', 'to'])

In [15]:
# Each side of the vocabulary file carries a 'dictionary' table and a
# 'rev_dictionary' table; unpack both for the 'from' side ...
dictionary_from, rev_dictionary_from = (
    dictionary['from']['dictionary'],
    dictionary['from']['rev_dictionary'],
)

# ... and for the 'to' side.
dictionary_to, rev_dictionary_to = (
    dictionary['to']['dictionary'],
    dictionary['to']['rev_dictionary'],
)

In [16]:
# Ids of the special tokens, read from the 'from' vocabulary only.
# NOTE(review): the same ids are used for the 'to' side as well (decoder GO/EOS,
# UNK when encoding targets) - presumably both vocabularies share them; confirm.
GO, PAD, EOS, UNK = (
    dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [17]:
# Terminate every training source sentence with the EOS marker.
# The endswith() guard keeps the cell idempotent: re-running it on an already
# processed corpus does not stack a second ' EOS' onto each sentence.
# NOTE(review): train_Y gets no EOS in the visible cells, while the beam-search
# decoder stops on EOS - confirm the target sentences already end with it.
for i in range(len(train_X)):
    if not train_X[i].endswith(' EOS'):
        train_X[i] += ' EOS'

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [18]:
# Terminate every test source sentence with the EOS marker.
# The endswith() guard keeps the cell idempotent: re-running it on an already
# processed corpus does not stack a second ' EOS' onto each sentence.
for i in range(len(test_X)):
    if not test_X[i].endswith(' EOS'):
        test_X[i] += ' EOS'

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [28]:
def pad_second_dim(x, desired_size):
    """Zero-pad `x` along axis 1 until that axis has length `desired_size`.

    `x` is indexed on three axes. A block of 0.0 values with shape
    (shape[0], desired_size - shape[1], shape[2]) is built with tf.tile and
    concatenated after `x` on axis 1.

    NOTE(review): the padding is a float constant, so `x` is presumably a
    float tensor whose axis 1 is no longer than `desired_size` - confirm.
    """
    dims = tf.shape(x)
    pad_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], 0)
    zeros = tf.tile([[[0.0]]], pad_shape)
    return tf.concat([x, zeros], 1)

class Translator:
    """Sequence-to-sequence graph: stacked LSTM encoder, Bahdanau-attention decoder.

    Constructing an instance adds to the current default TensorFlow graph:
      * an encoder (`tf.nn.dynamic_rnn` over a MultiRNNCell of LSTM cells),
      * a training decoder driven by ScheduledEmbeddingTrainingHelper,
      * a beam-search decoder that reuses the training decoder's variables
        through `tf.variable_scope('decode', reuse=True)`,
      * a masked sequence loss, an Adam train op and a masked token accuracy.

    The module-level `GO` and `EOS` ids are read as the decoder start and end
    tokens. Sequence lengths are derived with `tf.count_nonzero`, so id 0 is
    treated as padding (assumes PAD == 0 and that 0 never occurs inside a
    sentence - TODO confirm against the vocabulary).

    Args:
        size_layer: units of each LSTM cell; also used as the attention
            `num_units` and `attention_layer_size`.
        num_layers: number of stacked LSTM cells in encoder and decoder.
        embedded_size: width of the encoder and decoder embedding tables.
        from_dict_size: number of rows of the encoder embedding table.
        to_dict_size: number of rows of the decoder embedding table and width
            of the output projection.
        learning_rate: learning rate handed to `tf.train.AdamOptimizer`.
        batch_size: accepted for interface compatibility but not used; the
            batch size is taken dynamically from the first dimension of `X`.
        force_teaching_ratio: the scheduled-sampling probability is
            `1 - force_teaching_ratio`.
        beam_width: beam width of the inference decoder.

    Attributes:
        X, Y: int32 placeholders declared with shape [None, None].
        X_seq_len, Y_seq_len: per-row count of non-zero ids.
        training_logits: `rnn_output` of the training decoder.
        predicting_ids: beam-search `predicted_ids[:, :, 0]`, i.e. beam index 0.
        cost, optimizer, prediction, accuracy: training loss, Adam train op,
            masked argmax predictions and masked token accuracy.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size,
                 force_teaching_ratio = 0.5, beam_width = 10):
        
        def cells(reuse=False):
            # One LSTM layer with orthogonally initialised weights.
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        def attention(encoder_out, seq_len, reuse=False):
            # Stacked LSTM decoder cell wrapped with Bahdanau attention over
            # the encoder outputs, limited to the given source lengths.
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units = size_layer, 
                                                                    memory = encoder_out,
                                                                    memory_sequence_length = seq_len)
            return tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells(reuse) for _ in range(num_layers)]), 
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # The constructor argument is deliberately replaced by the runtime
        # batch size so the graph accepts batches of any size.
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        
        encoder_out, encoder_state = tf.nn.dynamic_rnn(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
            inputs = tf.nn.embedding_lookup(encoder_embedding, self.X),
            sequence_length = self.X_seq_len,
            dtype = tf.float32)
        
        # Training decoder. All decoder variables are created inside 'decode'
        # so the inference decoder below can reuse them.
        with tf.variable_scope('decode'):
            decoder_cell = attention(encoder_out, self.X_seq_len, reuse=False)
            # Decoder input = GO followed by Y without its last column.
            main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                embedding = decoder_embedding,
                sampling_probability = 1 - force_teaching_ratio,
                time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = tf.layers.Dense(to_dict_size))
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            self.training_logits = training_decoder_output.rnn_output
            
        # Inference decoder: same variables, beam search over encoder tensors
        # tiled beam_width times.
        with tf.variable_scope('decode', reuse=True):
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(encoder_out, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            decoder_cell = attention(encoder_out_tiled, X_seq_len_tiled, reuse=True)
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(
                    cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = tf.layers.Dense(to_dict_size, _reuse=True),
                length_penalty_weight = 0.0)
            # Decoding is capped at the longest source length in the batch.
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = tf.reduce_max(self.X_seq_len))
            self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        
        # Loss and accuracy only count positions inside each target's length.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [29]:
# Hyper-parameters passed to Translator and used by the training loop.
size_layer = 512        # units per LSTM cell (also the attention size)
num_layers = 2          # stacked LSTM cells in encoder and decoder
embedded_size = 256     # width of the embedding tables
learning_rate = 1e-3    # Adam learning rate
batch_size = 96         # sentences per minibatch in the train/test loops
epoch = 20              # number of passes over the training set

In [30]:
# Clear the default graph, open an interactive session, build the model in
# the fresh graph and initialise all of its variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
    batch_size = batch_size,
)
sess.run(tf.global_variables_initializer())

W0904 01:45:20.307913 140190358116160 deprecation.py:323] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py:985: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [31]:
def str_idx(corpus, dic, unk=None):
    """Convert whitespace-tokenised sentences into lists of vocabulary ids.

    Args:
        corpus: iterable of sentence strings; each is split with str.split().
        dic: mapping from token string to integer id.
        unk: id used for tokens missing from `dic`. When omitted, the id that
            `dic` itself assigns to 'UNK' is used, so each vocabulary supplies
            its own unknown id; if `dic` has no 'UNK' entry the module-level
            `UNK` is used.

    Returns:
        A list with one list of ids per sentence, in corpus order.
    """
    if unk is None:
        # Resolve from the vocabulary being encoded with, not from whichever
        # vocabulary the global UNK happened to be read from.
        unk = dic['UNK'] if 'UNK' in dic else UNK
    return [[dic.get(token, unk) for token in sentence.split()]
            for sentence in corpus]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence in a batch to the length of the longest one.

    Args:
        sentence_batch: sequence of id sequences (one per sentence).
        pad_int: id appended to sentences shorter than the longest.

    Returns:
        (padded_seqs, seq_lens): new padded lists, and the original length of
        each sentence. An empty batch yields ([], []).
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [32]:
# Replace the sentence strings with id lists: sources use the 'from'
# vocabulary, targets the 'to' vocabulary. The names are rebound, so this cell
# can only run once per kernel (the id lists have no .split()).
train_X, test_X = str_idx(train_X, dictionary_from), str_idx(test_X, dictionary_from)
train_Y, test_Y = str_idx(train_Y, dictionary_to), str_idx(test_Y, dictionary_to)

In [33]:
import tqdm

# One epoch = an optimisation pass over the training set followed by an
# evaluation pass over the test set. Per-batch metrics are averaged and printed.
for e in range(epoch):
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # Training pass: running model.optimizer updates the weights.
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        # Slicing clamps at the end of the list, so the last batch may be short.
        # The model derives sequence lengths itself; only the padded ids are fed.
        batch_x = pad_sentence_batch(train_X[i : i + batch_size], PAD)[0]
        batch_y = pad_sentence_batch(train_Y[i : i + batch_size], PAD)[0]
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    # Evaluation pass: same fetches without the train op, so no weight update.
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        batch_x = pad_sentence_batch(test_X[i : i + batch_size], PAD)[0]
        batch_y = pad_sentence_batch(test_Y[i : i + batch_size], PAD)[0]
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1389/1389 [17:15<00:00,  1.34it/s, accuracy=0.147, cost=5.73]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.80it/s, accuracy=0.172, cost=5.49]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 1, training avg loss 5.818387, training avg acc 0.124712
epoch 1, testing avg loss 5.176770, testing avg acc 0.177965


minibatch loop: 100%|██████████| 1389/1389 [17:28<00:00,  1.32it/s, accuracy=0.204, cost=4.93]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.80it/s, accuracy=0.218, cost=4.86]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 2, training avg loss 4.665158, training avg acc 0.228336
epoch 2, testing avg loss 4.497286, testing avg acc 0.239476


minibatch loop: 100%|██████████| 1389/1389 [17:33<00:00,  1.32it/s, accuracy=0.22, cost=4.39] 
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.84it/s, accuracy=0.213, cost=4.65]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 3, training avg loss 4.082400, training avg acc 0.276386
epoch 3, testing avg loss 4.227514, testing avg acc 0.260246


minibatch loop: 100%|██████████| 1389/1389 [17:31<00:00,  1.32it/s, accuracy=0.26, cost=3.85] 
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.78it/s, accuracy=0.249, cost=4.53]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 4, training avg loss 3.725036, training avg acc 0.306360
epoch 4, testing avg loss 4.130528, testing avg acc 0.272358


minibatch loop: 100%|██████████| 1389/1389 [18:00<00:00,  1.29it/s, accuracy=0.304, cost=3.48]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s, accuracy=0.265, cost=4.41]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 5, training avg loss 3.471519, training avg acc 0.329067
epoch 5, testing avg loss 4.165153, testing avg acc 0.266080


minibatch loop: 100%|██████████| 1389/1389 [17:57<00:00,  1.29it/s, accuracy=0.332, cost=3.18]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s, accuracy=0.263, cost=4.39]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 6, training avg loss 3.281470, training avg acc 0.348113
epoch 6, testing avg loss 4.112911, testing avg acc 0.275720


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.373, cost=2.94]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s, accuracy=0.264, cost=4.45]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 7, training avg loss 3.124878, training avg acc 0.364740
epoch 7, testing avg loss 4.160022, testing avg acc 0.270752


minibatch loop: 100%|██████████| 1389/1389 [17:55<00:00,  1.29it/s, accuracy=0.391, cost=2.69]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s, accuracy=0.242, cost=4.51]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 8, training avg loss 2.992456, training avg acc 0.379310
epoch 8, testing avg loss 4.242097, testing avg acc 0.260348


minibatch loop: 100%|██████████| 1389/1389 [17:57<00:00,  1.29it/s, accuracy=0.428, cost=2.62]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s, accuracy=0.253, cost=4.53]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 9, training avg loss 2.882214, training avg acc 0.392409
epoch 9, testing avg loss 4.163579, testing avg acc 0.274066


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.449, cost=2.51]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.75it/s, accuracy=0.252, cost=4.6] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 10, training avg loss 2.785393, training avg acc 0.404471
epoch 10, testing avg loss 4.179316, testing avg acc 0.280236


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.466, cost=2.39]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s, accuracy=0.23, cost=4.74] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 11, training avg loss 2.683310, training avg acc 0.417981
epoch 11, testing avg loss 4.225964, testing avg acc 0.275221


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.486, cost=2.19]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s, accuracy=0.239, cost=4.76]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 12, training avg loss 2.585214, training avg acc 0.431436
epoch 12, testing avg loss 4.314824, testing avg acc 0.268703


minibatch loop: 100%|██████████| 1389/1389 [17:55<00:00,  1.29it/s, accuracy=0.512, cost=2.09]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s, accuracy=0.228, cost=4.88]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 13, training avg loss 2.501054, training avg acc 0.443642
epoch 13, testing avg loss 4.580098, testing avg acc 0.242824


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.58, cost=1.77] 
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s, accuracy=0.25, cost=4.81] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 14, training avg loss 2.431865, training avg acc 0.453425
epoch 14, testing avg loss 4.655383, testing avg acc 0.237716


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.539, cost=1.97]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s, accuracy=0.254, cost=4.83]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 15, training avg loss 2.357019, training avg acc 0.465342
epoch 15, testing avg loss 4.492661, testing avg acc 0.261601


minibatch loop: 100%|██████████| 1389/1389 [17:55<00:00,  1.29it/s, accuracy=0.572, cost=1.79]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.73it/s, accuracy=0.246, cost=5.12]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 16, training avg loss 2.285464, training avg acc 0.476634
epoch 16, testing avg loss 4.474544, testing avg acc 0.267526


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.587, cost=1.7] 
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s, accuracy=0.248, cost=5.02]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 17, training avg loss 2.212481, training avg acc 0.488842
epoch 17, testing avg loss 4.636344, testing avg acc 0.254853


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.602, cost=1.67]
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.76it/s, accuracy=0.242, cost=5.11]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 18, training avg loss 2.150382, training avg acc 0.499100
epoch 18, testing avg loss 4.862413, testing avg acc 0.237610


minibatch loop: 100%|██████████| 1389/1389 [17:56<00:00,  1.29it/s, accuracy=0.657, cost=1.43]
minibatch loop: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s, accuracy=0.247, cost=5.05]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 19, training avg loss 2.092161, training avg acc 0.508905
epoch 19, testing avg loss 4.976951, testing avg acc 0.232480


minibatch loop: 100%|██████████| 1389/1389 [17:57<00:00,  1.29it/s, accuracy=0.66, cost=1.43] 
minibatch loop: 100%|██████████| 30/30 [00:10<00:00,  2.75it/s, accuracy=0.233, cost=5.25]

epoch 20, training avg loss 2.041992, training avg acc 0.517594
epoch 20, testing avg loss 4.822202, testing avg acc 0.244880





In [34]:
# The table was loaded from JSON, where object keys are always strings;
# cast them to int so predicted ids can be used directly as lookup keys.
rev_dictionary_to = dict(
    (int(token_id), token) for token_id, token in rev_dictionary_to.items()
)

In [35]:
# Number of test sentences to translate for the qualitative check below.
test_size = 20

# Pad the first `test_size` source and reference sentences; batch_y is only
# used later for printing the references, it is not fed to the model.
batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
# Only X is fed: the beam-search branch of the graph does not read model.Y.
feed = {model.X: batch_x}
# Despite the name, `logits` holds predicted token ids (model.predicting_ids),
# one row per input sentence.
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 99)

In [36]:
# Special tokens that are dropped before printing.
rejected = ['PAD', 'EOS', 'UNK', 'GO']

# For every translated sentence, map ids back to tokens, filter out the
# special tokens and print the prediction next to the reference.
for i in range(test_size):
    predict = [word
               for word in (rev_dictionary_to[token_id] for token_id in logits[i])
               if word not in rejected]
    actual = [word
              for word in (rev_dictionary_to[token_id] for token_id in batch_y[i])
              if word not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: TED Talk cô có thể nói trong <NUM> phút về sự liên kết giữa phụ nữ trong <NUM> thế hệ , về cách mạnh đáng kinh ngạc của những sự gắn kết này đã đem vào trong một cô cô gái <NUM> tuổi cô cô gái cô gái gái cô gái của cô , và bà với trong <NUM> ngày trong một đêm nọ , với cô và trong <NUM> ngày ở ngày trước , kết nối cô ấy đã trải qua một cô gái nhỏ và không giờ giờ đây , cô cô gái bây giờ , cô
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạn hôm nay ?

1 predict: Đây không phải là câu chuyện trọn vẹn . . . . . . . . . . . chuy