In [1]:
import os
# Expose only GPU index 3 to this process; this is set before TensorFlow is imported below.
os.environ.update({'CUDA_VISIBLE_DEVICES': '3'})

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# JSON interchange text is UTF-8; pass the encoding explicitly so the non-ASCII
# (Vietnamese) corpus loads the same way regardless of the platform's locale default.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)

with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [4]:
# Pull the four corpus splits out of the loaded JSON payload in one unpacking.
train_X, train_Y, test_X, test_Y = (
    dataset[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Display the top-level sections of the loaded vocabulary file.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Each side of the vocabulary file carries a forward table ('dictionary') and a
# reverse table ('rev_dictionary'); unpack both for the source and target sides.
dictionary_from, rev_dictionary_from = (
    dictionary['from'][field] for field in ('dictionary', 'rev_dictionary')
)
dictionary_to, rev_dictionary_to = (
    dictionary['to'][field] for field in ('dictionary', 'rev_dictionary')
)

In [7]:
# Ids of the special tokens, looked up in the source-side vocabulary.
GO, PAD, EOS, UNK = (dictionary_from[tag] for tag in ('GO', 'PAD', 'EOS', 'UNK'))

In [8]:
# Derive the EOS-terminated sentences from the untouched JSON payload instead of
# mutating the list in place: re-running this cell then yields the same result
# rather than stacking a second ' EOS' onto every sentence.
train_X = [sentence + ' EOS' for sentence in dataset['train_X']]

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Derive the EOS-terminated sentences from the untouched JSON payload instead of
# mutating the list in place: re-running this cell then yields the same result
# rather than stacking a second ' EOS' onto every sentence.
test_X = [sentence + ' EOS' for sentence in dataset['test_X']]

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Append zeros to a rank-3 tensor along axis 1 until it is ``desired_size`` long.

    The filler block is built from a float ``0.0`` literal, so ``x`` is expected
    to be a float tensor. ``desired_size`` must be at least the current size of
    axis 1; axes 0 and 2 are left unchanged.
    """
    dims = tf.shape(x)
    # Shape of the zero block: same batch and depth as x, only the missing steps.
    filler_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], 0)
    filler = tf.tile([[[0.0]]], filler_shape)
    return tf.concat([x, filler], 1)

class Translator:
    """Stacked bidirectional-LSTM encoder with a Luong-attention LSTM decoder.

    Constructing an instance adds placeholders, variables, the loss, the Adam
    training op and metric ops to the current default TensorFlow graph and
    exposes them as attributes: ``X``, ``Y``, ``X_seq_len``, ``Y_seq_len``,
    ``logits``, ``training_logits``, ``cost``, ``optimizer``, ``prediction``
    and ``accuracy``.

    NOTE(review): the decoder is driven by the *source* ids ``X`` (shifted right
    behind a ``GO`` column), not by the target ``Y``; ``Y`` only enters through
    the loss and the metrics, so ``logits`` does not depend on ``Y``. This looks
    deliberate (no teacher forcing) -- confirm before changing it.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        """Build the whole training/inference graph.

        Args:
            size_layer: width of each decoder LSTM cell; every encoder direction
                uses ``size_layer // 2`` units, so an even value is assumed for
                the concatenated encoder state to match the decoder cells.
            num_layers: number of stacked encoder layers and decoder cells.
            embedded_size: width of the embedding vectors.
            from_dict_size: number of rows in the source embedding table.
            to_dict_size: number of output classes of the final projection.
            learning_rate: learning rate handed to the Adam optimizer.
            batch_size: accepted for interface reasons but overwritten below by
                the dynamic batch dimension of ``X``.
        """
        
        # Factory for one LSTM cell with orthogonally initialised weights.
        def cells(size_layer=size_layer, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        # Integer id matrices; axis 0 is the batch (see batch_size below), axis 1 the time steps.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        
        # Lengths are recovered by counting non-zero ids per row, which is only
        # correct if the padding id is 0 and no real token uses id 0 -- TODO confirm PAD == 0.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        # Shadows the constructor argument with the runtime batch dimension.
        batch_size = tf.shape(self.X)[0]
        
        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        # NOTE(review): decoder_embeddings is created but never referenced again in this class.
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        # Drop the last column of X and prepend a GO column: the decoder input
        # has the same width as X and is embedded with the encoder table.
        main = tf.strided_slice(self.X, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, decoder_input)
        # Stacked bidirectional layers: each layer consumes the concatenated
        # forward/backward outputs of the previous one, under its own scope.
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = encoder_embedded,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        # Only the final encoder layer's states/outputs survive the loop; its
        # merged (c, h) state is replicated to seed every decoder layer.
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        last_state = tuple([bi_lstm_state] * num_layers)
        last_output = tf.concat((out_fw,out_bw), -1)
        
        with tf.variable_scope("decoder"):
            
            # NOTE(review): no memory_sequence_length is passed, so attention is
            # not masked over padded encoder positions -- confirm this is intended.
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units = size_layer, 
                                                                    memory = last_output)
            rnn_cells = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            
            # Zero attention state, with the cell state replaced by the encoder's final state.
            initial_state = rnn_cells.zero_state(batch_size, tf.float32).clone(cell_state=last_state)
            # The decoder is unrolled over the source-derived input, bounded by the source lengths.
            outputs, _ = tf.nn.dynamic_rnn(rnn_cells, decoder_embedded, 
                                           sequence_length=self.X_seq_len,
                                           initial_state = initial_state,
                                           dtype = tf.float32)
            self.logits = tf.layers.dense(outputs,to_dict_size)
        
        # Align the time axis with the targets: cut to the longest target in the
        # batch, then zero-pad back up to that length when the source was shorter.
        self.training_logits = self.logits[:, :tf.reduce_max(self.Y_seq_len)]
        self.training_logits = pad_second_dim(self.training_logits, tf.reduce_max(self.Y_seq_len))
            
        # 1.0 on real target positions, 0.0 on padding; weights the loss and selects metric positions.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # Token-level accuracy over the masked (non-padding) target positions.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is never used after this assignment.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
# Model capacity.
size_layer, num_layers, embedded_size = 512, 2, 256
# Optimisation schedule.
learning_rate, batch_size, epoch = 1e-3, 128, 20

In [12]:
# Clear the default graph before building the model, then open an interactive
# session and initialise every variable the model created.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
    batch_size = batch_size,
)
sess.run(tf.global_variables_initializer())

W0903 13:44:19.027175 139699537577792 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0903 13:44:19.083567 139699537577792 deprecation.py:323] From <ipython-input-10-f2af6c9100be>:10: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0903 13:44:19.085638 139699537577792 deprecation.py:323] From <ipython-input-10-f2af6c9100be>:32: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell

In [13]:
def str_idx(corpus, dic):
    """Convert whitespace-tokenised sentences into lists of vocabulary ids.

    Each string in ``corpus`` is split on whitespace and every token is looked
    up in ``dic``; tokens missing from ``dic`` map to the module-level ``UNK``.
    Returns one list of ids per input sentence, in order.
    """
    return [
        [dic.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence in a batch to the length of the longest one.

    Args:
        sentence_batch: sequence of lists of token ids (may be empty).
        pad_int: id appended to the shorter sentences.

    Returns:
        ``(padded_seqs, seq_lens)`` where ``padded_seqs`` holds new lists of
        equal length and ``seq_lens`` holds the original, unpadded lengths.
        The input lists are not modified. An empty batch yields ``([], [])``
        instead of raising ``ValueError`` from ``max()``.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from blowing up in max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace the raw sentences with id sequences: source splits use the source
# vocabulary, target splits the target vocabulary.
train_X, test_X = (str_idx(split, dictionary_from) for split in (train_X, test_X))
train_Y, test_Y = (str_idx(split, dictionary_to) for split in (train_Y, test_Y))

In [15]:
import tqdm

# One pass over the training set followed by one pass over the test set per epoch.
# The reported averages are unweighted means of the per-batch metrics.
for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    # Training pass: every mini-batch also runs the optimizer op.
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        # Source and target are padded independently, each to its own batch maximum.
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)


    # Evaluation pass: same feeds, but the optimizer op is not run.
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [11:03<00:00,  1.57it/s, accuracy=0.115, cost=6.3]  
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.26it/s, accuracy=0.113, cost=6.03]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 6.491671, training avg acc 0.103634
epoch 1, testing avg loss 6.096631, testing avg acc 0.122817


minibatch loop: 100%|██████████| 1042/1042 [11:07<00:00,  1.56it/s, accuracy=0.123, cost=5.65] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.29it/s, accuracy=0.13, cost=5.61] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 5.718174, training avg acc 0.145087
epoch 2, testing avg loss 5.693337, testing avg acc 0.138284


minibatch loop: 100%|██████████| 1042/1042 [11:04<00:00,  1.57it/s, accuracy=0.157, cost=5.03]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.44it/s, accuracy=0.141, cost=5.38]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 5.259041, training avg acc 0.160991
epoch 3, testing avg loss 5.446589, testing avg acc 0.146515


minibatch loop: 100%|██████████| 1042/1042 [11:06<00:00,  1.56it/s, accuracy=0.203, cost=4.42]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.51it/s, accuracy=0.169, cost=5.32]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 4.912119, training avg acc 0.174539
epoch 4, testing avg loss 5.334584, testing avg acc 0.148922


minibatch loop: 100%|██████████| 1042/1042 [11:05<00:00,  1.57it/s, accuracy=0.243, cost=3.97]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.38it/s, accuracy=0.158, cost=5.35]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 4.638057, training avg acc 0.187447
epoch 5, testing avg loss 5.348259, testing avg acc 0.142715


minibatch loop: 100%|██████████| 1042/1042 [11:26<00:00,  1.52it/s, accuracy=0.276, cost=3.64]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.40it/s, accuracy=0.147, cost=5.34]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 4.427190, training avg acc 0.198155
epoch 6, testing avg loss 5.337920, testing avg acc 0.140441


minibatch loop: 100%|██████████| 1042/1042 [11:05<00:00,  1.57it/s, accuracy=0.292, cost=3.44]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.49it/s, accuracy=0.164, cost=5.34]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 4.257526, training avg acc 0.207791
epoch 7, testing avg loss 5.332291, testing avg acc 0.140719


minibatch loop: 100%|██████████| 1042/1042 [11:05<00:00,  1.57it/s, accuracy=0.336, cost=3.28]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.51it/s, accuracy=0.169, cost=5.2] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 4.085893, training avg acc 0.222238
epoch 8, testing avg loss 5.273825, testing avg acc 0.142628


minibatch loop: 100%|██████████| 1042/1042 [11:04<00:00,  1.57it/s, accuracy=0.318, cost=3.27]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.41it/s, accuracy=0.169, cost=5.35]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 3.943199, training avg acc 0.235710
epoch 9, testing avg loss 5.199664, testing avg acc 0.148709


minibatch loop: 100%|██████████| 1042/1042 [11:04<00:00,  1.57it/s, accuracy=0.362, cost=2.9] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.42it/s, accuracy=0.158, cost=5.31]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 3.835578, training avg acc 0.245894
epoch 10, testing avg loss 5.243429, testing avg acc 0.147073


minibatch loop: 100%|██████████| 1042/1042 [11:05<00:00,  1.57it/s, accuracy=0.41, cost=2.75] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.47it/s, accuracy=0.124, cost=5.55]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 3.732772, training avg acc 0.256800
epoch 11, testing avg loss 5.426691, testing avg acc 0.134863


minibatch loop: 100%|██████████| 1042/1042 [11:05<00:00,  1.56it/s, accuracy=0.437, cost=2.62]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.46it/s, accuracy=0.136, cost=5.44]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 3.627601, training avg acc 0.270226
epoch 12, testing avg loss 5.477323, testing avg acc 0.135621


minibatch loop: 100%|██████████| 1042/1042 [11:26<00:00,  1.52it/s, accuracy=0.459, cost=2.55]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.39it/s, accuracy=0.158, cost=5.41]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 3.534380, training avg acc 0.282720
epoch 13, testing avg loss 5.431548, testing avg acc 0.142570


minibatch loop: 100%|██████████| 1042/1042 [11:06<00:00,  1.56it/s, accuracy=0.47, cost=2.44] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.46it/s, accuracy=0.158, cost=5.65]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 3.463052, training avg acc 0.291679
epoch 14, testing avg loss 5.486361, testing avg acc 0.145089


minibatch loop: 100%|██████████| 1042/1042 [10:55<00:00,  1.59it/s, accuracy=0.502, cost=2.29]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  4.63it/s, accuracy=0.136, cost=5.71]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 3.397736, training avg acc 0.300643
epoch 15, testing avg loss 5.605308, testing avg acc 0.142028


minibatch loop: 100%|██████████| 1042/1042 [10:58<00:00,  1.58it/s, accuracy=0.473, cost=2.32]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.49it/s, accuracy=0.169, cost=5.74]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 3.323642, training avg acc 0.311952
epoch 16, testing avg loss 5.708579, testing avg acc 0.141987


minibatch loop: 100%|██████████| 1042/1042 [11:01<00:00,  1.58it/s, accuracy=0.534, cost=2.16]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.49it/s, accuracy=0.153, cost=5.71]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 3.256665, training avg acc 0.322354
epoch 17, testing avg loss 5.811002, testing avg acc 0.136633


minibatch loop: 100%|██████████| 1042/1042 [11:21<00:00,  1.53it/s, accuracy=0.527, cost=2.14]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.55it/s, accuracy=0.124, cost=5.85]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 3.202549, training avg acc 0.330643
epoch 18, testing avg loss 6.008280, testing avg acc 0.127782


minibatch loop: 100%|██████████| 1042/1042 [11:00<00:00,  1.58it/s, accuracy=0.55, cost=2.14] 
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.58it/s, accuracy=0.153, cost=5.75]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 3.153063, training avg acc 0.338845
epoch 19, testing avg loss 5.986279, testing avg acc 0.130827


minibatch loop: 100%|██████████| 1042/1042 [11:20<00:00,  1.53it/s, accuracy=0.559, cost=2.03]
minibatch loop: 100%|██████████| 23/23 [00:05<00:00,  4.39it/s, accuracy=0.158, cost=5.84]

epoch 20, training avg loss 3.098428, training avg acc 0.348163
epoch 20, testing avg loss 5.996457, testing avg acc 0.132617





In [16]:
# JSON object keys are always strings, so turn the id keys back into ints
# before indexing the table with predicted ids.
rev_dictionary_to = dict(
    (int(token_id), word) for token_id, word in rev_dictionary_to.items()
)

In [17]:
test_size = 20

# Run the first `test_size` test pairs through the model and keep, for every
# time step, the id with the highest logit.
batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {
    model.X: batch_x,
    model.Y: batch_y,
}
logits = sess.run(model.logits, feed_dict = feed).argmax(axis = 2)
logits.shape

(20, 99)

In [18]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

# Print each predicted sentence next to its reference, with the special tokens removed.
for i in range(test_size):
    predicted_words = (rev_dictionary_to[token_id] for token_id in logits[i])
    reference_words = (rev_dictionary_to[token_id] for token_id in batch_y[i])
    predict = [word for word in predicted_words if word not in rejected]
    actual = [word for word in reference_words if word not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Làm thế nào Có nói nói nói <NUM> <NUM> phút <NUM> vòng vòng phụ nữ vòng vòng nữ <NUM> <NUM> <NUM> , về về đã thế thế thế thế thế trong đã trong trong trong trong trong tuổi gái tuổi tuổi tuổi tuổi tuổi tuổi tuổi tuổi gái tuổi tuổi con đứa đứa con con nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ nhỏ năm năm năm năm năm năm năm gái cô gái mà năm mà mà năm cô gái cô gái cô gái cô gái gái gái cô cô cô cô cô
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạn hôm nay ?

1 predict: Đây không phải là một câu hoàn hoàn . . . . .