In [1]:
import os
# Restrict CUDA to the GPU with index 1. This is set here, before TensorFlow
# is imported in the following cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# Load the train/test splits and the two vocabularies. The encoding is pinned
# to UTF-8 (the JSON interchange encoding) so that the non-ASCII target text is
# decoded the same way regardless of the machine's locale default.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)

with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [4]:
# Pull the four corpus splits out of the loaded dataset in one unpacking step.
train_X, train_Y, test_X, test_Y = (
    dataset[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Inspect the top-level layout of the loaded vocabulary file.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Each side of the vocabulary file ('from' and 'to') carries a forward
# mapping ('dictionary') and its inverse ('rev_dictionary').
dictionary_from, rev_dictionary_from = (
    dictionary['from'][field] for field in ('dictionary', 'rev_dictionary')
)
dictionary_to, rev_dictionary_to = (
    dictionary['to'][field] for field in ('dictionary', 'rev_dictionary')
)

In [7]:
# Ids of the special tokens, looked up in the 'from' vocabulary.
# NOTE(review): these ids are also used on the target side (padding of Y),
# which presumably relies on both vocabularies sharing them - confirm.
GO, PAD, EOS, UNK = (
    dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [8]:
# Terminate every training source sentence with the EOS token. The suffix
# check keeps this cell idempotent: re-running it no longer stacks a second
# ' EOS' onto sentences that were already terminated.
for i, sentence in enumerate(train_X):
    if not sentence.endswith(' EOS'):
        train_X[i] = sentence + ' EOS'

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Terminate every test source sentence with the EOS token. The suffix check
# keeps this cell idempotent: re-running it no longer stacks a second ' EOS'
# onto sentences that were already terminated.
for i, sentence in enumerate(test_X):
    if not sentence.endswith(' EOS'):
        test_X[i] = sentence + ' EOS'

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Right-pad a rank-3 float tensor with zeros along axis 1.

    Args:
        x: tensor indexed as (dim0, dim1, dim2).
        desired_size: target length of axis 1; expected to be >= the current
            length, since the number of padded steps is `desired_size - dim1`.

    Returns:
        `x` concatenated along axis 1 with a block of zeros.
    """
    dims = tf.shape(x)
    missing_steps = desired_size - dims[1]
    zero_block = tf.tile([[[0.0]]], tf.stack([dims[0], missing_steps, dims[2]], 0))
    return tf.concat([x, zero_block], 1)

class Translator:
    """Bidirectional-GRU encoder with a stacked-GRU decoder (TF1 graph mode).

    Building an instance adds the whole model to the default graph and exposes:

    - X, Y: int32 placeholders of token ids, shaped (batch, time).
    - X_seq_len, Y_seq_len: per-row count of non-zero ids.
    - logits: decoder outputs projected to `to_dict_size`.
    - training_logits: `logits` cut / zero-padded on the time axis to the
      longest target in the batch.
    - cost, optimizer: masked sequence loss and the Adam training op.
    - prediction, accuracy: arg-max ids at unmasked positions and their mean
      agreement with `Y`.

    The decoder does not consume `Y`: its input is `X` shifted right by one
    step behind a GO token, so `logits` depends on `X` only and the same graph
    is used for training and for decoding.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        """Build the graph.

        Args:
            size_layer: decoder GRU width. Each encoder direction uses
                `size_layer // 2` units, so this should be even for the
                concatenated encoder state to match the decoder cells.
            num_layers: number of stacked encoder layers and decoder cells.
            embedded_size: embedding dimension.
            from_dict_size: number of rows in the embedding table.
            to_dict_size: size of the output projection.
            learning_rate: Adam learning rate.
            batch_size: accepted for interface compatibility but not used;
                the batch size is read from the shape of `X` at run time.
        """

        def cells(size_layer, reuse=False):
            return tf.nn.rnn_cell.GRUCell(size_layer, reuse=reuse)

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])

        # Sequence lengths are the number of non-zero ids in each row, i.e.
        # id 0 is treated as padding.
        # NOTE(review): this assumes the PAD id is 0 - confirm against the
        # vocabulary file.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        dynamic_batch = tf.shape(self.X)[0]

        # A single embedding table serves both the encoder input and the
        # decoder input, because both are made of ids taken from X.
        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)

        # Decoder input = GO followed by X without its last column.
        main = tf.strided_slice(self.X, [0, 0], [dynamic_batch, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([dynamic_batch, 1], GO), main], 1)
        decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, decoder_input)

        # Stack of bidirectional layers; each layer reads the concatenated
        # forward/backward outputs of the previous one.
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = encoder_embedded,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        # Only the final state of the last encoder layer is kept; it is
        # replicated so that every decoder cell starts from it.
        bi_state = tf.concat((state_fw,state_bw), -1)
        last_state = tuple([bi_state] * num_layers)

        with tf.variable_scope("decoder"):
            rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer) for _ in range(num_layers)])
            outputs, _ = tf.nn.dynamic_rnn(rnn_cells_dec, decoder_embedded,
                                           sequence_length=self.X_seq_len,
                                           initial_state = last_state,
                                           dtype = tf.float32)
            self.logits = tf.layers.dense(outputs,to_dict_size)

        # The decoder runs for as many steps as X is wide, while the loss needs
        # as many steps as the longest target: truncate, then zero-pad.
        max_target_len = tf.reduce_max(self.Y_seq_len)
        self.training_logits = self.logits[:, :max_target_len]
        self.training_logits = pad_second_dim(self.training_logits, max_target_len)

        masks = tf.sequence_mask(self.Y_seq_len, max_target_len, dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

        # Token-level accuracy over the unmasked target positions only.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
# Optimisation settings.
learning_rate = 1e-3
batch_size = 128
epoch = 20  # number of full passes over the training set

# Model capacity.
embedded_size = 256
size_layer = 512
num_layers = 2

In [12]:
# Start from an empty default graph so re-running this cell rebuilds the model
# instead of adding a second copy of it to the existing graph.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
    batch_size = batch_size,
)
sess.run(tf.global_variables_initializer())

W0902 17:52:58.847471 140499052144448 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0902 17:52:58.896279 140499052144448 deprecation.py:323] From <ipython-input-10-bad324d42cc3>:10: GRUCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
W0902 17:52:58.897986 140499052144448 deprecation.py:323] From <ipython-input-10-bad324d42cc3>:34: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))

Tensor("embedding_lookup/Identity:0", shape=(?, ?, 256), dtype=float32)


W0902 17:52:59.296139 140499052144448 deprecation.py:323] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:244: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0902 17:52:59.726474 140499052144448 deprecation.py:323] From <ipython-input-10-bad324d42cc3>:41: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
W0902 17:53:00.197296 140499052144448 deprecation.py:323] From <ipython-input-10-bad324d42cc3>:46: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0902 17:53:01.187

In [13]:
def str_idx(corpus, dic):
    """Turn whitespace-separated sentences into lists of vocabulary ids.

    Args:
        corpus: iterable of sentence strings.
        dic: mapping from token to id.

    Returns:
        One list of ids per sentence; tokens absent from `dic` are replaced
        by the module-level `UNK` id.
    """
    return [
        [dic.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence in a batch to the length of the longest one.

    Args:
        sentence_batch: list of lists of token ids.
        pad_int: id appended to the shorter sentences.

    Returns:
        (padded_seqs, seq_lens): the padded copies (the inputs are not
        modified) and the original length of each sentence. An empty batch
        yields ([], []) instead of raising ValueError from max().
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 covers the empty batch, where max() has nothing to compare.
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace the sentence strings by lists of token ids: X splits use the 'from'
# vocabulary, Y splits the 'to' vocabulary.
# NOTE: the names are rebound from strings to id lists, so this cell cannot be
# run twice without reloading the data first.
train_X, test_X = (str_idx(split, dictionary_from) for split in (train_X, test_X))
train_Y, test_Y = (str_idx(split, dictionary_to) for split in (train_Y, test_Y))

In [15]:
import tqdm

def _run_pass(inputs, targets, train):
    """Run one pass over `inputs`/`targets` in minibatches of `batch_size`.

    Args:
        inputs: list of source id sequences.
        targets: list of target id sequences, aligned with `inputs`.
        train: when true the optimizer op is run as well, so the weights are
            updated; otherwise only accuracy and cost are evaluated.

    Returns:
        (losses, accuracies): one value per minibatch.
    """
    fetches = [model.accuracy, model.cost]
    if train:
        fetches.append(model.optimizer)

    losses, accuracies = [], []
    pbar = tqdm.tqdm(
        range(0, len(inputs), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(inputs))
        # Each side is padded to its own longest sentence in the batch; the
        # lengths returned by the helper are not needed because the model
        # derives them from the padded ids.
        batch_x, _ = pad_sentence_batch(inputs[i : index], PAD)
        batch_y, _ = pad_sentence_batch(targets[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        results = sess.run(fetches, feed_dict = feed)
        accuracy, loss = results[0], results[1]
        losses.append(loss)
        accuracies.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    return losses, accuracies


for e in range(epoch):
    train_loss, train_acc = _run_pass(train_X, train_Y, train = True)
    test_loss, test_acc = _run_pass(test_X, test_Y, train = False)

    # Reported figures are unweighted means over minibatches.
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [08:43<00:00,  1.99it/s, accuracy=0.11, cost=6.57]  
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.54it/s, accuracy=0.113, cost=6.38] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 6.742509, training avg acc 0.082534
epoch 1, testing avg loss 6.359461, testing avg acc 0.107003


minibatch loop: 100%|██████████| 1042/1042 [09:23<00:00,  1.85it/s, accuracy=0.11, cost=5.97]  
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.26it/s, accuracy=0.119, cost=5.79]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 5.975235, training avg acc 0.129672
epoch 2, testing avg loss 5.830584, testing avg acc 0.130086


minibatch loop: 100%|██████████| 1042/1042 [09:29<00:00,  1.83it/s, accuracy=0.151, cost=5.13] 
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.38it/s, accuracy=0.136, cost=5.55]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 5.455325, training avg acc 0.149279
epoch 3, testing avg loss 5.597018, testing avg acc 0.136977


minibatch loop: 100%|██████████| 1042/1042 [09:35<00:00,  1.81it/s, accuracy=0.223, cost=4.33]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.45it/s, accuracy=0.158, cost=5.43]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 5.061944, training avg acc 0.164512
epoch 4, testing avg loss 5.487984, testing avg acc 0.140275


minibatch loop: 100%|██████████| 1042/1042 [09:27<00:00,  1.84it/s, accuracy=0.268, cost=3.76]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.46it/s, accuracy=0.13, cost=5.45] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 4.747907, training avg acc 0.180321
epoch 5, testing avg loss 5.461112, testing avg acc 0.135066


minibatch loop: 100%|██████████| 1042/1042 [09:27<00:00,  1.84it/s, accuracy=0.33, cost=3.43] 
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.51it/s, accuracy=0.153, cost=5.32]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 4.512797, training avg acc 0.192943
epoch 6, testing avg loss 5.478090, testing avg acc 0.135094


minibatch loop: 100%|██████████| 1042/1042 [09:27<00:00,  1.84it/s, accuracy=0.358, cost=3.13]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.55it/s, accuracy=0.141, cost=5.37]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 4.314353, training avg acc 0.205505
epoch 7, testing avg loss 5.436017, testing avg acc 0.137525


minibatch loop: 100%|██████████| 1042/1042 [09:28<00:00,  1.83it/s, accuracy=0.393, cost=2.86]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.36it/s, accuracy=0.153, cost=5.45]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 4.138429, training avg acc 0.219180
epoch 8, testing avg loss 5.480069, testing avg acc 0.135490


minibatch loop: 100%|██████████| 1042/1042 [09:26<00:00,  1.84it/s, accuracy=0.444, cost=2.71]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.44it/s, accuracy=0.136, cost=5.51]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 3.996075, training avg acc 0.232005
epoch 9, testing avg loss 5.558038, testing avg acc 0.132899


minibatch loop: 100%|██████████| 1042/1042 [09:28<00:00,  1.83it/s, accuracy=0.453, cost=2.61]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.31it/s, accuracy=0.169, cost=5.51]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 3.881571, training avg acc 0.243394
epoch 10, testing avg loss 5.620800, testing avg acc 0.134042


minibatch loop: 100%|██████████| 1042/1042 [09:27<00:00,  1.84it/s, accuracy=0.519, cost=2.4] 
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.39it/s, accuracy=0.169, cost=5.51]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 3.792803, training avg acc 0.251951
epoch 11, testing avg loss 5.691085, testing avg acc 0.132263


minibatch loop: 100%|██████████| 1042/1042 [09:26<00:00,  1.84it/s, accuracy=0.549, cost=2.28]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.38it/s, accuracy=0.141, cost=5.74]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 3.686904, training avg acc 0.264544
epoch 12, testing avg loss 5.798446, testing avg acc 0.129563


minibatch loop: 100%|██████████| 1042/1042 [09:28<00:00,  1.83it/s, accuracy=0.561, cost=2.23]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.34it/s, accuracy=0.13, cost=5.81] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 3.602702, training avg acc 0.274717
epoch 13, testing avg loss 5.866009, testing avg acc 0.125059


minibatch loop: 100%|██████████| 1042/1042 [09:44<00:00,  1.78it/s, accuracy=0.571, cost=2.1] 
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.41it/s, accuracy=0.147, cost=5.81]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 3.530540, training avg acc 0.283701
epoch 14, testing avg loss 5.920990, testing avg acc 0.125321


minibatch loop: 100%|██████████| 1042/1042 [09:26<00:00,  1.84it/s, accuracy=0.589, cost=2.05]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.32it/s, accuracy=0.107, cost=5.85]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 3.467239, training avg acc 0.291981
epoch 15, testing avg loss 5.949078, testing avg acc 0.123143


minibatch loop: 100%|██████████| 1042/1042 [09:27<00:00,  1.84it/s, accuracy=0.591, cost=2.03]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.37it/s, accuracy=0.141, cost=5.79]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 3.413331, training avg acc 0.299066
epoch 16, testing avg loss 5.980114, testing avg acc 0.124412


minibatch loop: 100%|██████████| 1042/1042 [09:28<00:00,  1.83it/s, accuracy=0.602, cost=1.99]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.48it/s, accuracy=0.136, cost=5.95]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 3.363699, training avg acc 0.305850
epoch 17, testing avg loss 6.035131, testing avg acc 0.121432


minibatch loop: 100%|██████████| 1042/1042 [09:21<00:00,  1.85it/s, accuracy=0.624, cost=1.87]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.45it/s, accuracy=0.153, cost=5.92]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 3.318743, training avg acc 0.312146
epoch 18, testing avg loss 6.090242, testing avg acc 0.123318


minibatch loop: 100%|██████████| 1042/1042 [09:07<00:00,  1.90it/s, accuracy=0.635, cost=1.87]
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.65it/s, accuracy=0.141, cost=5.99]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 3.278545, training avg acc 0.318217
epoch 19, testing avg loss 6.179376, testing avg acc 0.119470


minibatch loop: 100%|██████████| 1042/1042 [09:07<00:00,  1.90it/s, accuracy=0.655, cost=1.8] 
minibatch loop: 100%|██████████| 23/23 [00:04<00:00,  5.66it/s, accuracy=0.136, cost=6.03]

epoch 20, training avg loss 3.242404, training avg acc 0.323422
epoch 20, testing avg loss 6.227587, testing avg acc 0.119774





In [16]:
# JSON object keys are always strings, so convert the reverse vocabulary's keys
# to ints before it is indexed with predicted ids.
rev_dictionary_to = dict(zip(map(int, rev_dictionary_to), rev_dictionary_to.values()))

In [17]:
test_size = 20

# Decode the first `test_size` test sentences greedily.
batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x, model.Y: batch_y}
# After the arg-max over the vocabulary axis, `logits` holds predicted token
# ids with shape (sentence, step), not raw scores.
logits = sess.run(model.logits, feed_dict = feed).argmax(axis = 2)
logits.shape

(20, 99)

In [18]:
# Special tokens that are dropped from the printed sentences.
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    predicted_words = (rev_dictionary_to[token_id] for token_id in logits[i])
    reference_words = (rev_dictionary_to[token_id] for token_id in batch_y[i])
    predict = [word for word in predicted_words if word not in rejected]
    actual = [word for word in reference_words if word not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Làm cách nói kể nói nói thế nào nào về chuyện về những nữ dành trong thế vòng thế về về về về những những đáng đáng đáng đáng đẹp đời đời sống sống sống sống sống sống sống sống sống sống cạnh với sống đời đời đời cô cô bà cô trong cô San cô cô và và và và và trong trong Số Số Số Ấn tới tới cô ngắn ngắn khiến khiến khiến khiến khiến khiến khiến khiến khiến đời đời sống sống cô sống cô sống sống sống sống sống sống cô cô hôm hôm
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạn hôm nay ?

1 predict: Đây không phải là là thành 