In [1]:
import os
# Expose only CUDA device 0 to this process. This is set here, before
# TensorFlow is imported in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# The corpus contains non-ASCII (Vietnamese) text. Decode explicitly as UTF-8
# instead of relying on the platform's locale default, which is not UTF-8 on
# every system and would make the load fail or mis-decode there.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)

# Vocabulary file; it has a 'from' entry and a 'to' entry.
with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [4]:
# Pull the four parallel splits out of the loaded dataset in one unpacking.
train_X, train_Y, test_X, test_Y = (
    dataset[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Inspect the top-level layout of the vocabulary file.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Each side of the vocabulary file carries a forward mapping ('dictionary')
# and a reverse mapping ('rev_dictionary').
dictionary_from, rev_dictionary_from = (
    dictionary['from'][field] for field in ('dictionary', 'rev_dictionary')
)
dictionary_to, rev_dictionary_to = (
    dictionary['to'][field] for field in ('dictionary', 'rev_dictionary')
)

In [7]:
# Ids of the special tokens, read from the source-side vocabulary.
# NOTE(review): the model also uses GO/EOS on the decoder (target) side, so
# this presumably relies on both vocabularies sharing the same special-token
# ids - confirm against dictionary_to.
GO, PAD, EOS, UNK = (dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK'))

In [8]:
# Terminate every source training sentence with the EOS marker.
# The update is done in place (as before) but guarded, so re-running this cell
# does not append ' EOS' a second time.
# NOTE(review): only train_X/test_X get an explicit EOS in this notebook;
# confirm that train_Y/test_Y already end with EOS in train-test.json,
# otherwise the decoder is never trained to emit the end token.
for i in range(len(train_X)):
    if not train_X[i].endswith(' EOS'):
        train_X[i] += ' EOS'

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Terminate every source test sentence with the EOS marker.
# Guarded so that re-running this cell does not append ' EOS' a second time.
for i in range(len(test_X)):
    if not test_X[i].endswith(' EOS'):
        test_X[i] += ' EOS'

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Append zeros along axis 1 of `x` until that axis has `desired_size` entries.

    `x` must be rank 3, since its first three dynamic dimensions are read.
    The padding is built from a float `0.0`, so `x` is presumably a float
    tensor - TODO confirm against callers.

    Returns the concatenation of `x` and the zero block along axis 1.
    """
    dims = tf.shape(x)
    # Shape of the zero block: same batch and depth, only the missing steps.
    pad_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], 0)
    zeros = tf.tile([[[0.0]]], pad_shape)
    return tf.concat([x, zeros], 1)

class Translator:
    """LSTM encoder-decoder (seq2seq) translation model built as a TF1 graph.

    The constructor builds the whole graph and exposes these attributes:

    * ``X``, ``Y``: int32 placeholders of shape (batch, time) holding source
      and target token ids.
    * ``X_seq_len``, ``Y_seq_len``: per-row count of non-zero ids, used as the
      sequence lengths (id 0 is therefore treated as padding).
    * ``training_logits``: decoder logits produced with teacher forcing.
    * ``predicting_ids``: token ids produced by greedy decoding.
    * ``cost``: masked sequence loss between ``training_logits`` and ``Y``.
    * ``optimizer``: Adam training op minimising ``cost``.
    * ``prediction``: arg-max ids of ``training_logits`` at unmasked positions.
    * ``accuracy``: mean token accuracy over unmasked positions.

    The module-level ids ``GO`` and ``EOS`` are read while building the decoder.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        """Build the encoder, both decoders, the loss and the metrics.

        Args:
            size_layer: Number of units in every LSTM cell.
            num_layers: Number of stacked LSTM cells in the encoder and in the
                decoder.
            embedded_size: Width of the source and target embedding tables.
            from_dict_size: Number of rows of the source embedding table.
            to_dict_size: Number of rows of the target embedding table and
                width of the output projection.
            learning_rate: Learning rate passed to the Adam optimizer.
            batch_size: Kept for interface compatibility; it is not used. The
                batch size is taken from the first dimension of ``X`` at run
                time instead.
        """

        def cells(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size_layer,initializer=tf.orthogonal_initializer(),reuse=reuse)

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Sequence lengths are the number of non-zero ids in each row.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Overrides the constructor argument with the dynamic batch size.
        batch_size = tf.shape(self.X)[0]

        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))

        # Encoder: only the final state is kept; it seeds both decoders.
        _, encoder_state = tf.nn.dynamic_rnn(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]),
            inputs = tf.nn.embedding_lookup(encoder_embedding, self.X),
            sequence_length = self.X_seq_len,
            dtype = tf.float32)
        # Teacher-forcing input: Y without its last column, prefixed with GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        dense = tf.layers.Dense(to_dict_size)
        # The same cell stack and projection are passed to both decoders below.
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])

        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = encoder_state,
                output_layer = dense)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output

        # Inference: greedy decoding that starts from GO and uses EOS as the
        # end token, capped at twice the longest source sequence in the batch.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = encoder_state,
                output_layer = dense)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.predicting_ids = predicting_decoder_output.sample_id

        # Positions beyond each target length get weight 0 in the loss.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        # NOTE(review): `masks` is float32 here; this relies on boolean_mask
        # accepting a non-bool mask - confirm for the TensorFlow version in use.
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
# Hyper-parameters for the model and the training loop.
size_layer = 512        # units per LSTM cell
num_layers = 2          # stacked LSTM cells in the encoder and in the decoder
embedded_size = 256     # width of the embedding tables
learning_rate = 1e-3    # Adam learning rate
batch_size = 128        # sentences per minibatch in the training/eval loops
epoch = 20              # number of passes over the training set

In [12]:
# Start from an empty default graph, open a session, build the model in that
# graph, then initialise all of its variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
    batch_size = batch_size,
)
sess.run(tf.global_variables_initializer())

W0902 14:02:30.343306 140044124321600 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0902 14:02:30.374736 140044124321600 deprecation.py:323] From <ipython-input-10-c38288e22f44>:10: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0902 14:02:30.376574 140044124321600 deprecation.py:323] From <ipython-input-10-c38288e22f44>:22: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNN

In [13]:
def str_idx(corpus, dic, unk=None):
    """Convert whitespace-tokenised sentences into lists of vocabulary ids.

    Args:
        corpus: Iterable of sentence strings; each is split on whitespace.
        dic: Mapping from token string to integer id.
        unk: Id used for tokens missing from `dic`. When omitted, the
            module-level `UNK` is used, which is the previous behaviour.

    Returns:
        A list with one list of ids per sentence, in corpus order.
    """
    if unk is None:
        # Backward-compatible default: fall back to the notebook-level UNK id.
        unk = UNK
    return [[dic.get(token, unk) for token in sentence.split()] for sentence in corpus]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: List of lists of token ids.
        pad_int: Id appended to sequences shorter than the longest one.

    Returns:
        A tuple `(padded_seqs, seq_lens)`: the padded copies of the sequences
        and the original, unpadded length of each one. An empty batch yields
        `([], [])`.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace the raw sentences with lists of vocabulary ids: source splits use the
# 'from' vocabulary, target splits the 'to' vocabulary.
# Not re-runnable: after this cell the names hold id lists, not strings.
train_X, test_X = (str_idx(split, dictionary_from) for split in (train_X, test_X))
train_Y, test_Y = (str_idx(split, dictionary_to) for split in (train_Y, test_Y))

In [15]:
import tqdm

# One iteration per epoch: a training pass over train_X/train_Y that runs the
# optimizer, then an evaluation pass over test_X/test_Y that only reads the
# loss and accuracy. The printed averages are unweighted means of the
# per-minibatch values.
for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        # The last minibatch may be shorter than batch_size.
        index = min(i + batch_size, len(train_X))
        # Source and target are padded independently, each to its own longest
        # sequence in the minibatch.
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)


    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        # No optimizer op here, so the weights are not updated.
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [07:27<00:00,  2.33it/s, accuracy=0.141, cost=5.68]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.05it/s, accuracy=0.198, cost=5.26]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 6.081906, training avg acc 0.102993
epoch 1, testing avg loss 5.203203, testing avg acc 0.176136


minibatch loop: 100%|██████████| 1042/1042 [07:28<00:00,  2.32it/s, accuracy=0.251, cost=4.68]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.25it/s, accuracy=0.271, cost=4.48]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 4.654086, training avg acc 0.242739
epoch 2, testing avg loss 4.366420, testing avg acc 0.274940


minibatch loop: 100%|██████████| 1042/1042 [07:27<00:00,  2.33it/s, accuracy=0.299, cost=4.06]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.26it/s, accuracy=0.305, cost=4.14]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 4.028984, training avg acc 0.311464
epoch 3, testing avg loss 4.004305, testing avg acc 0.314859


minibatch loop: 100%|██████████| 1042/1042 [07:33<00:00,  2.30it/s, accuracy=0.336, cost=3.53]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.17it/s, accuracy=0.328, cost=3.94]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 3.661000, training avg acc 0.348234
epoch 4, testing avg loss 3.802187, testing avg acc 0.337951


minibatch loop: 100%|██████████| 1042/1042 [07:49<00:00,  2.22it/s, accuracy=0.388, cost=3.06]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.19it/s, accuracy=0.328, cost=3.82]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 3.392329, training avg acc 0.375111
epoch 5, testing avg loss 3.691531, testing avg acc 0.347925


minibatch loop: 100%|██████████| 1042/1042 [07:50<00:00,  2.22it/s, accuracy=0.461, cost=2.66]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.21it/s, accuracy=0.322, cost=3.74]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 3.177674, training avg acc 0.398258
epoch 6, testing avg loss 3.631871, testing avg acc 0.356049


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.29it/s, accuracy=0.497, cost=2.32]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.22it/s, accuracy=0.345, cost=3.69]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 2.999384, training avg acc 0.419626
epoch 7, testing avg loss 3.607840, testing avg acc 0.359905


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.30it/s, accuracy=0.567, cost=2.03]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.10it/s, accuracy=0.339, cost=3.67]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 2.847746, training avg acc 0.439140
epoch 8, testing avg loss 3.622833, testing avg acc 0.357671


minibatch loop: 100%|██████████| 1042/1042 [07:35<00:00,  2.29it/s, accuracy=0.621, cost=1.77]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.16it/s, accuracy=0.356, cost=3.68]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 2.715655, training avg acc 0.456956
epoch 9, testing avg loss 3.651503, testing avg acc 0.357053


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.29it/s, accuracy=0.674, cost=1.56]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.17it/s, accuracy=0.345, cost=3.67]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 2.598729, training avg acc 0.473453
epoch 10, testing avg loss 3.667660, testing avg acc 0.357855


minibatch loop: 100%|██████████| 1042/1042 [07:35<00:00,  2.29it/s, accuracy=0.697, cost=1.39]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.15it/s, accuracy=0.345, cost=3.69]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 2.493339, training avg acc 0.489153
epoch 11, testing avg loss 3.686450, testing avg acc 0.359175


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.29it/s, accuracy=0.737, cost=1.24]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.21it/s, accuracy=0.356, cost=3.68]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 2.396524, training avg acc 0.504411
epoch 12, testing avg loss 3.721671, testing avg acc 0.356459


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.30it/s, accuracy=0.772, cost=1.11]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.18it/s, accuracy=0.35, cost=3.75] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 2.307070, training avg acc 0.518457
epoch 13, testing avg loss 3.793877, testing avg acc 0.350472


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.29it/s, accuracy=0.796, cost=0.991]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.15it/s, accuracy=0.339, cost=3.81]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 2.224829, training avg acc 0.531906
epoch 14, testing avg loss 3.852001, testing avg acc 0.346448


minibatch loop: 100%|██████████| 1042/1042 [07:34<00:00,  2.29it/s, accuracy=0.825, cost=0.9] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.24it/s, accuracy=0.322, cost=3.88]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 2.149893, training avg acc 0.544324
epoch 15, testing avg loss 3.895475, testing avg acc 0.344280


minibatch loop: 100%|██████████| 1042/1042 [07:26<00:00,  2.33it/s, accuracy=0.84, cost=0.816]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.30it/s, accuracy=0.339, cost=3.93]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 2.079703, training avg acc 0.555977
epoch 16, testing avg loss 3.934112, testing avg acc 0.344452


minibatch loop: 100%|██████████| 1042/1042 [07:22<00:00,  2.35it/s, accuracy=0.864, cost=0.749]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.27it/s, accuracy=0.35, cost=3.9]  
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 2.011189, training avg acc 0.568161
epoch 17, testing avg loss 3.974632, testing avg acc 0.345256


minibatch loop: 100%|██████████| 1042/1042 [07:23<00:00,  2.35it/s, accuracy=0.887, cost=0.656]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.24it/s, accuracy=0.345, cost=3.93]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 1.944064, training avg acc 0.579898
epoch 18, testing avg loss 4.030296, testing avg acc 0.341777


minibatch loop: 100%|██████████| 1042/1042 [07:19<00:00,  2.37it/s, accuracy=0.907, cost=0.61]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.21it/s, accuracy=0.356, cost=3.96]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 1.881238, training avg acc 0.591598
epoch 19, testing avg loss 4.107987, testing avg acc 0.336847


minibatch loop: 100%|██████████| 1042/1042 [07:13<00:00,  2.40it/s, accuracy=0.913, cost=0.548]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.16it/s, accuracy=0.333, cost=4.12]

epoch 20, training avg loss 1.822689, training avg acc 0.602095
epoch 20, testing avg loss 4.191944, testing avg acc 0.330939





In [16]:
# The reverse vocabulary was loaded from JSON; convert its keys with int() so
# it can be indexed with integer token ids.
rev_dictionary_to = {int(k): v for k, v in rev_dictionary_to.items()}

In [17]:
# Greedy-decode the first `test_size` test sentences.
test_size = 20

batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
# batch_y is not fed to the model; it is the reference translation.
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x}
# Despite the name, `logits` holds token ids (model.predicting_ids), not scores.
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 198)

In [18]:
# Special tokens that are dropped before printing.
rejected = ['PAD', 'EOS', 'UNK', 'GO']

# Print each decoded sentence next to its reference. The inner variable is
# named `token_id` so that it no longer shadows the sentence index `i`.
for i in range(test_size):
    predict = [rev_dictionary_to[token_id] for token_id in logits[i]
               if rev_dictionary_to[token_id] not in rejected]
    actual = [rev_dictionary_to[token_id] for token_id in batch_y[i]
              if rev_dictionary_to[token_id] not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Tình trạng trước đây nói về phụ nữ khoảng cách giữa tháng <NUM> năm nay , sống trong một thế giới gồm <NUM> người phụ nữ trong tương lai , khoảng <NUM> tháng tuổi , người phụ nữ sống ở quê hương và sinh sản , cô ấy đang ở trong một khoảng thời gian riêng rẽ , và cô ấy đang ở trong một cuộc sống của cô ấy , và hỏi cô ấy , trong khi không có con trai nào , chị sống trên đời sống của cô ấy và cho cô ấy đi về làng của chị ? - - hôm nay chị chị chị sẽ làm cái này ? SS hôm nay ? " Xin chào đón chúng vào tháng nay ? " và cô ấy đang làm gì ? " và chị chị này ? " - - phòng này đã làm được điều này ? " Xin chào đời mình ? " - cô ấy đã làm vậy ? " Xin chào đời mình ? " - cô ấy sẽ làm điều này ? " và chị đã làm thế này ? " Xin chào ? " - - & gt
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ