In [1]:
import os
# Expose only the CUDA device with index 3 to this process. This is done in the first
# cell, before TensorFlow is imported in the following cell.
# NOTE(review): the device index is hard-coded; presumably it matches the authoring
# machine only — adjust for the host this notebook runs on.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# Load the pre-split parallel corpus and the vocabularies from the working directory.
# The encoding is passed explicitly so that the non-ASCII text stored in these files is
# decoded the same way regardless of the platform's default locale encoding.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)

with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [4]:
# Pull the four splits out of the loaded dataset into separate notebook-level names.
train_X, train_Y, test_X, test_Y = (
    dataset[split_name]
    for split_name in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Display the top-level sections of the loaded vocabulary file.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Source-side ('from') vocabulary: `dictionary_from` is indexed by token string and
# yields the token id; `rev_dictionary_from` is the corresponding reverse mapping.
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

# Target-side ('to') vocabulary, same layout as the source side.
dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [7]:
# Ids of the special tokens, looked up in the source-side vocabulary.
# NOTE(review): the same ids are later used on the target side as well (decoder start
# and end tokens, padding of target batches, unknown-token fallback) — presumably both
# vocabularies assign identical ids to the special tokens; confirm against
# dictionary.json.
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [8]:
# Append the end-of-sequence marker to every training source sentence.
# The suffix check keeps this cell idempotent: re-running it on an already-processed
# list does not add a second marker.
for i, sentence in enumerate(train_X):
    if not sentence.endswith(' EOS'):
        train_X[i] = sentence + ' EOS'

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Append the end-of-sequence marker to every test source sentence.
# The suffix check keeps this cell idempotent: re-running it on an already-processed
# list does not add a second marker.
for i, sentence in enumerate(test_X):
    if not sentence.endswith(' EOS'):
        test_X[i] = sentence + ' EOS'

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Extend the rank-3 tensor ``x`` along axis 1 with zeros up to ``desired_size``.

    A float zero block of shape
    ``(shape(x)[0], desired_size - shape(x)[1], shape(x)[2])`` is produced by tiling a
    single ``0.0`` and is concatenated after ``x`` on axis 1.
    """
    dynamic_shape = tf.shape(x)
    missing_steps = desired_size - dynamic_shape[1]
    zero_block_shape = tf.stack([dynamic_shape[0], missing_steps, dynamic_shape[2]], 0)
    zero_block = tf.tile([[[0.0]]], zero_block_shape)
    return tf.concat([x, zero_block], 1)

class Translator:
    """Sequence-to-sequence translation graph built with the TF1 graph API.

    The encoder is a stack of bidirectional LSTM layers; the decoder is a multi-layer
    LSTM (no attention mechanism is attached) followed by a dense projection onto the
    target vocabulary. The notebook-level special-token ids ``GO`` and ``EOS`` are read
    while the graph is built.

    Attributes created by the constructor:
        X, Y: int32 placeholders of shape [None, None] for source / target id batches.
        X_seq_len, Y_seq_len: per-row count of non-zero ids in ``X`` / ``Y``.
        training_logits: decoder outputs under teacher forcing (``rnn_output``).
        predicting_ids: ids produced by greedy decoding (``sample_id``).
        cost: masked sequence loss between ``training_logits`` and ``Y``.
        optimizer: Adam minimisation op for ``cost``.
        prediction: arg-max ids of ``training_logits`` at the non-padded positions.
        accuracy: mean token accuracy over the non-padded positions.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        """Build the whole graph (placeholders, encoder, decoders, loss, metrics).

        Args:
            size_layer: number of units of each decoder LSTM cell. Every encoder
                direction uses ``size_layer // 2`` units and both directions are
                concatenated, so ``size_layer`` is expected to be even.
            num_layers: number of stacked bidirectional encoder layers and of decoder
                LSTM cells.
            embedded_size: width of the source and target embedding tables.
            from_dict_size: number of rows of the source embedding table.
            to_dict_size: number of rows of the target embedding table and width of
                the output projection.
            learning_rate: learning rate handed to the Adam optimiser.
            batch_size: not used — it is immediately replaced by the dynamic batch
                size of ``X``. Kept so existing callers keep working.
        """

        def cells(size_layer = size_layer, reuse=False):
            # Fresh LSTM cell whose weights use an orthogonal initializer.
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Sequence lengths are the number of non-zero ids per row.
        # NOTE(review): this assumes the padding id is 0 — confirm PAD == 0.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Overrides the constructor argument with the batch size of the fed data.
        batch_size = tf.shape(self.X)[0]

        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))

        encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, self.X)

        # Stacked bidirectional encoder: the concatenated forward/backward outputs of
        # one layer are the inputs of the next one.
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = encoder_embedded,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        # Only the final state of the LAST encoder layer is kept (state_fw / state_bw
        # hold the values of the last loop iteration); it is replicated so that every
        # decoder layer starts from the same state.
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        encoder_state = tuple([bi_lstm_state] * num_layers)

        # Decoder input for teacher forcing: Y without its last column, prefixed by GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        # The projection layer and the decoder cells are shared by the training and
        # the greedy decoder below.
        dense = tf.layers.Dense(to_dict_size)
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])

        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = encoder_state,
                output_layer = dense)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output

        # Greedy decoding: starts every row with GO, stops a row at EOS and never runs
        # longer than twice the longest source sequence in the batch.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = encoder_state,
                output_layer = dense)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.predicting_ids = predicting_decoder_output.sample_id

        # 1.0 for the first Y_seq_len positions of every row, 0.0 afterwards; used both
        # as loss weights and to select the positions that count for the accuracy.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # Cast once and reuse the result for the mean.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(correct_index)

In [11]:
# Hyper-parameters of the model and of the training run.
size_layer = 512      # units per decoder LSTM cell; each encoder direction uses half
num_layers = 2        # stacked encoder layers and decoder cells
embedded_size = 256   # width of the source and target embedding tables
learning_rate = 1e-3  # learning rate handed to the Adam optimiser
batch_size = 128      # sentences per minibatch in the training / evaluation loops
epoch = 20            # number of passes over the training set

In [12]:
# Clear the default graph, open an interactive session, build the model in the fresh
# graph and initialise all of its variables.
# NOTE(review): a session left over from an earlier run of this cell is not closed here.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# The embedding table sizes are the sizes of the source and target vocabularies.
model = Translator(size_layer, num_layers, embedded_size, len(dictionary_from), 
                len(dictionary_to), learning_rate,batch_size)
sess.run(tf.global_variables_initializer())

W0902 18:09:45.672424 140511355651904 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0902 18:09:45.709759 140511355651904 deprecation.py:323] From <ipython-input-10-a462f9f5a02f>:10: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0902 18:09:45.711711 140511355651904 deprecation.py:323] From <ipython-input-10-a462f9f5a02f>:30: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell

In [13]:
def str_idx(corpus, dic):
    """Convert whitespace-tokenised sentences into lists of vocabulary ids.

    Every sentence of ``corpus`` is split on whitespace and each token is looked up in
    ``dic``; a token missing from ``dic`` is replaced by the notebook-level ``UNK`` id.
    Returns one list of ids per sentence, in input order.
    """
    return [
        [dic.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence of a batch to the length of the longest one.

    Args:
        sentence_batch: list of id lists (one list per sentence).
        pad_int: value appended to the sequences that are shorter than the longest.

    Returns:
        A ``(padded_seqs, seq_lens)`` tuple: new padded lists (the inputs are not
        modified) and the original length of every sequence. An empty batch yields
        ``([], [])``.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace every raw sentence string by its list of token ids: source sentences use the
# source-side vocabulary, target sentences the target-side vocabulary.
# NOTE(review): the names are rebound in place, so this cell can only run once per
# kernel session — a second run would pass lists of ids to str_idx, which calls
# .split() on each element.
train_X = str_idx(train_X, dictionary_from)
test_X = str_idx(test_X, dictionary_from)
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [15]:
import tqdm

# One iteration per epoch: a training pass over train_X / train_Y followed by an
# evaluation pass over test_X / test_Y, then the per-epoch averages are printed.
for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # Training pass: every minibatch runs the optimiser op once.
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        # Source and target batches are padded independently, each to its own
        # longest sequence.
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    # Evaluation pass: same metrics, but the optimiser op is not run. The metrics
    # come from the teacher-forced decoder (model.accuracy / model.cost), not from
    # greedy decoding.
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    # Unweighted means over the minibatches of this epoch.
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [10:15<00:00,  1.69it/s, accuracy=0.157, cost=5.55]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.18it/s, accuracy=0.175, cost=5.1] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 5.970781, training avg acc 0.111191
epoch 1, testing avg loss 5.046552, testing avg acc 0.192169


minibatch loop: 100%|██████████| 1042/1042 [10:20<00:00,  1.68it/s, accuracy=0.252, cost=4.6] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s, accuracy=0.271, cost=4.43]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 4.510838, training avg acc 0.259962
epoch 2, testing avg loss 4.260819, testing avg acc 0.289507


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.304, cost=4]   
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.46it/s, accuracy=0.294, cost=4.12]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 3.914677, training avg acc 0.324567
epoch 3, testing avg loss 3.931807, testing avg acc 0.323940


minibatch loop: 100%|██████████| 1042/1042 [10:13<00:00,  1.70it/s, accuracy=0.34, cost=3.48] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.45it/s, accuracy=0.322, cost=3.94]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 3.575185, training avg acc 0.357548
epoch 4, testing avg loss 3.754569, testing avg acc 0.343392


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.414, cost=3.06]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.41it/s, accuracy=0.35, cost=3.82] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 3.330248, training avg acc 0.381959
epoch 5, testing avg loss 3.653602, testing avg acc 0.354871


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.45, cost=2.67] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.45it/s, accuracy=0.367, cost=3.76]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 3.133144, training avg acc 0.403404
epoch 6, testing avg loss 3.595912, testing avg acc 0.363180


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.501, cost=2.34]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.44it/s, accuracy=0.345, cost=3.72]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 2.965709, training avg acc 0.423420
epoch 7, testing avg loss 3.571016, testing avg acc 0.363791


minibatch loop: 100%|██████████| 1042/1042 [10:13<00:00,  1.70it/s, accuracy=0.551, cost=2.06]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.46it/s, accuracy=0.362, cost=3.69]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 2.823528, training avg acc 0.441742
epoch 8, testing avg loss 3.574800, testing avg acc 0.365062


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.613, cost=1.82]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.45it/s, accuracy=0.373, cost=3.72]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 2.701266, training avg acc 0.458267
epoch 9, testing avg loss 3.596141, testing avg acc 0.363108


minibatch loop: 100%|██████████| 1042/1042 [10:15<00:00,  1.69it/s, accuracy=0.644, cost=1.61]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.44it/s, accuracy=0.333, cost=3.8] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 2.594976, training avg acc 0.473309
epoch 10, testing avg loss 3.624912, testing avg acc 0.361898


minibatch loop: 100%|██████████| 1042/1042 [10:31<00:00,  1.65it/s, accuracy=0.683, cost=1.43]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s, accuracy=0.333, cost=3.86]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 2.496064, training avg acc 0.488037
epoch 11, testing avg loss 3.653283, testing avg acc 0.360345


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.70it/s, accuracy=0.724, cost=1.28]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.52it/s, accuracy=0.339, cost=3.9] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 2.406763, training avg acc 0.501590
epoch 12, testing avg loss 3.683764, testing avg acc 0.358175


minibatch loop: 100%|██████████| 1042/1042 [10:14<00:00,  1.69it/s, accuracy=0.766, cost=1.14]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.48it/s, accuracy=0.328, cost=3.89]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 2.325106, training avg acc 0.514291
epoch 13, testing avg loss 3.724835, testing avg acc 0.357250


minibatch loop: 100%|██████████| 1042/1042 [10:16<00:00,  1.69it/s, accuracy=0.789, cost=1.04]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s, accuracy=0.356, cost=3.91]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 2.253128, training avg acc 0.525608
epoch 14, testing avg loss 3.753395, testing avg acc 0.359052


minibatch loop: 100%|██████████| 1042/1042 [10:07<00:00,  1.72it/s, accuracy=0.816, cost=0.945]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  4.67it/s, accuracy=0.339, cost=4]   
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 2.184553, training avg acc 0.536910
epoch 15, testing avg loss 3.796028, testing avg acc 0.356730


minibatch loop: 100%|██████████| 1042/1042 [09:57<00:00,  1.74it/s, accuracy=0.841, cost=0.843]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.57it/s, accuracy=0.316, cost=3.99]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 2.119450, training avg acc 0.547912
epoch 16, testing avg loss 3.852276, testing avg acc 0.350685


minibatch loop: 100%|██████████| 1042/1042 [09:51<00:00,  1.76it/s, accuracy=0.845, cost=0.78]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.56it/s, accuracy=0.333, cost=4.08]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 2.063565, training avg acc 0.557289
epoch 17, testing avg loss 3.908022, testing avg acc 0.349420


minibatch loop: 100%|██████████| 1042/1042 [09:48<00:00,  1.77it/s, accuracy=0.877, cost=0.693]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.55it/s, accuracy=0.311, cost=4.21]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 1.989629, training avg acc 0.571046
epoch 18, testing avg loss 3.979372, testing avg acc 0.344861


minibatch loop: 100%|██████████| 1042/1042 [09:31<00:00,  1.82it/s, accuracy=0.898, cost=0.624]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.52it/s, accuracy=0.339, cost=4.14]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 1.929209, training avg acc 0.581917
epoch 19, testing avg loss 4.030926, testing avg acc 0.344720


minibatch loop: 100%|██████████| 1042/1042 [09:32<00:00,  1.82it/s, accuracy=0.906, cost=0.563]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s, accuracy=0.35, cost=4.2]  

epoch 20, training avg loss 1.878075, training avg acc 0.590747
epoch 20, testing avg loss 4.078696, testing avg acc 0.342469





In [16]:
# The reverse target vocabulary is looked up below with integer token ids, so its keys
# are converted to int first.
rev_dictionary_to = dict(
    (int(token_id), token) for token_id, token in rev_dictionary_to.items()
)

In [17]:
test_size = 20  # number of test sentences to decode

# Pad the first `test_size` source and target sentences; the target batch is only used
# later as the reference to print next to the prediction.
batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
# Only the source batch is fed when fetching the greedy decoder output.
feed = {model.X: batch_x}
# Despite its name, `logits` holds the decoded token ids (model.predicting_ids),
# not logits.
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 198)

In [18]:
# Special tokens that are left out of the printed sentences.
rejected = ['PAD', 'EOS', 'UNK', 'GO']

# Print the decoded sentence next to its reference for every decoded test example.
for i in range(test_size):
    predict = []
    for token_id in logits[i]:
        word = rev_dictionary_to[token_id]
        if word not in rejected:
            predict.append(word)

    actual = []
    for token_id in batch_y[i]:
        word = rev_dictionary_to[token_id]
        if word not in rejected:
            actual.append(word)

    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Làm sao để nói về cách nói về cách thức <NUM> của thế hệ này , phụ nữ nhất trong cuộc sống của tôi đã trải qua một môi trường lớn nhất liên quan đến việc nâng cấp từ San Francisco , và cô ấy nói với tôi rằng : " Cô bé đã sống trong một thế giới nhỏ hơn , rồi lại không có gì để lại ở đó , kể cả khi chị đến từ Colombia , cô đã bao giờ nghe tiếng Anh ở khoảng cách giữa hai phút ở San Francisco và nhanh nhất ? Khi còn có thể đi qua được <NUM> phút ở Ấn Độ và bạn sẽ đi qua hơn một ngàn năm qua . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơ