In [1]:
import os
# Expose only the GPU with index 3 to this process. This is set before
# TensorFlow is imported in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '3'})

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# Read both JSON files explicitly as UTF-8. The corpus contains non-ASCII
# (Vietnamese) text, and relying on the platform's default encoding can fail
# or garble it on systems whose locale is not UTF-8.
with open('train-test.json', encoding = 'utf-8') as fopen:
    # Holds the 'train_X' / 'train_Y' / 'test_X' / 'test_Y' sentence lists.
    dataset = json.load(fopen)

with open('dictionary.json', encoding = 'utf-8') as fopen:
    # Holds the 'from' and 'to' vocabulary tables.
    dictionary = json.load(fopen)

In [4]:
# Pull the four parallel-corpus splits out of the loaded dataset in one go.
train_X, train_Y, test_X, test_Y = (
    dataset[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Show the top-level keys of the object loaded from dictionary.json.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Each side ('from' = source, 'to' = target) stores a token -> id table under
# 'dictionary' and the reverse id -> token table under 'rev_dictionary'.
dictionary_from, rev_dictionary_from = (
    dictionary['from'][table] for table in ('dictionary', 'rev_dictionary')
)
dictionary_to, rev_dictionary_to = (
    dictionary['to'][table] for table in ('dictionary', 'rev_dictionary')
)

In [7]:
# Ids of the four special tokens, taken from the source-side vocabulary.
# NOTE(review): Translator derives sequence lengths with count_nonzero, which
# presumably relies on PAD being id 0 -- confirm against dictionary.json.
GO, PAD, EOS, UNK = (
    dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [8]:
# Terminate every training source sentence with the EOS token. The slice
# assignment updates the existing list object in place.
# NOTE(review): only the source side receives EOS in the visible cells;
# train_Y is not touched. Confirm the targets already end with EOS, otherwise
# the decoder is never shown a stop token during training.
train_X[:] = [sentence + ' EOS' for sentence in train_X]

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Terminate every test source sentence with the EOS token, mirroring what is
# done for the training sources. The list object is updated in place.
test_X[:] = [sentence + ' EOS' for sentence in test_X]

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Zero-pad a rank-3 tensor along axis 1 until that axis has `desired_size` entries.

    Args:
        x: tensor indexed with three dimensions; the padding constant is a
            Python float, so `x` is presumably float32 -- TODO confirm.
        desired_size: target length of axis 1 (int or scalar tensor).

    Returns:
        `x` with a block of zeros concatenated after it on axis 1.
    """
    dims = tf.shape(x)
    missing = desired_size - dims[1]
    # Tile a single 0.0 into a [dim0, missing, dim2] block of zeros.
    zeros = tf.tile([[[0.0]]], tf.stack([dims[0], missing, dims[2]], 0))
    return tf.concat([x, zeros], 1)

class Translator:
    """Basic RNN encoder-decoder translation model built with TF1 graph-mode APIs.

    Constructing an instance creates the whole graph and exposes:

    * ``X`` / ``Y`` -- 2-D int32 placeholders for source / target id batches.
    * ``X_seq_len`` / ``Y_seq_len`` -- per-row counts of non-zero ids.
    * ``training_logits`` -- decoder logits when fed the shifted targets.
    * ``predicting_ids`` -- ids produced by greedy decoding.
    * ``cost`` / ``optimizer`` -- masked sequence loss and its Adam update op.
    * ``prediction`` / ``accuracy`` -- masked argmax ids and token accuracy.

    The constructor also reads the module-level ``GO`` and ``EOS`` ids.
    """
    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size):
        """Build the encoder, both decoders, the loss and the metrics.

        Args:
            size_layer: number of units of every BasicRNNCell.
            num_layers: number of stacked cells in encoder and in decoder.
            embedded_size: width of the source and target embedding tables.
            from_dict_size: number of rows of the source embedding table.
            to_dict_size: number of rows of the target embedding table and
                width of the output projection.
            learning_rate: learning rate handed to the Adam optimizer.
            batch_size: not used as passed -- it is overwritten below by the
                dynamic batch dimension of ``X``.
        """
        
        def cells(reuse=False):
            """Return a fresh BasicRNNCell with `size_layer` units."""
            return tf.nn.rnn_cell.BasicRNNCell(size_layer,reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Lengths are the number of non-zero ids per row.
        # NOTE(review): this equals the true sentence length only if the
        # padding id is 0 and no real token has id 0 -- confirm.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Shadows the constructor argument with the runtime batch dimension.
        batch_size = tf.shape(self.X)[0]
        
        # Separate embedding tables, initialised uniformly between -1 and 1.
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        
        # Encoder: only the final state is kept; per-step outputs are dropped.
        _, encoder_state = tf.nn.dynamic_rnn(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)]), 
            inputs = tf.nn.embedding_lookup(encoder_embedding, self.X),
            sequence_length = self.X_seq_len,
            dtype = tf.float32)
        # Decoder input = Y without its last column, with a GO column
        # prepended, so it has the same width as Y.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        # The projection layer and the decoder cell stack are created once
        # and the same objects are passed to both decoders below.
        dense = tf.layers.Dense(to_dict_size)
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        
        # Training path: the decoder is fed the embedded shifted targets.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(decoder_embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = encoder_state,
                output_layer = dense)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output
        
        # Inference path: greedy decoding that starts from GO for every row
        # and treats EOS as the end token.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = decoder_embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = encoder_state,
                output_layer = dense)
        # Decoding is capped at twice the longest source length in the batch.
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.predicting_ids = predicting_decoder_output.sample_id
        
        # Float mask derived from Y_seq_len; used as the loss weights.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # Token accuracy: argmax ids vs. targets, restricted to masked positions.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        # NOTE(review): `masks` is float32 here rather than bool; confirm
        # boolean_mask accepts that on the TF version in use.
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is never read in this class.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
# Model capacity: RNN units per layer, stacked layers, embedding width.
size_layer, num_layers, embedded_size = 512, 2, 256
# Optimisation settings: Adam learning rate, sentences per mini-batch, and
# number of passes over the training set.
learning_rate, batch_size, epoch = 1e-3, 128, 20

In [12]:
# Clear the default graph before building the model, then open an
# interactive session on it.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
    batch_size = batch_size,
)
sess.run(tf.global_variables_initializer())

W0902 10:37:50.396917 139850844325696 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0902 10:37:50.430243 139850844325696 deprecation.py:323] From <ipython-input-10-69bf39d9ea4f>:10: BasicRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.
W0902 10:37:50.431915 139850844325696 deprecation.py:323] From <ipython-input-10-69bf39d9ea4f>:22: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.S

In [13]:
def str_idx(corpus, dic):
    """Encode whitespace-tokenised sentences as lists of vocabulary ids.

    Args:
        corpus: iterable of sentence strings.
        dic: token -> id mapping.

    Returns:
        One list of ids per sentence; tokens missing from `dic` are replaced
        by the module-level `UNK` id.
    """
    return [
        [dic.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence in a batch to the length of the longest one.

    Args:
        sentence_batch: list of sentences, each a list of token ids.
        pad_int: id appended to the shorter sentences.

    Returns:
        (padded_seqs, seq_lens): the padded copies (the inputs are not
        modified) and the original length of every sentence, in batch order.
        An empty batch yields ``([], [])``.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_sentence_len = max(seq_lens, default = 0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace every corpus of sentence strings by its list-of-id-lists encoding:
# sources use the source vocabulary, targets use the target vocabulary.
train_X, test_X = (str_idx(corpus, dictionary_from) for corpus in (train_X, test_X))
train_Y, test_Y = (str_idx(corpus, dictionary_to) for corpus in (train_Y, test_Y))

In [15]:
import tqdm

def run_minibatches(data_X, data_Y, train):
    """Run one pass over (data_X, data_Y) in mini-batches of `batch_size`.

    Args:
        data_X: encoded source sentences (lists of ids).
        data_Y: encoded target sentences, parallel to `data_X`.
        train: when true, the optimizer op is run as well so the weights are
            updated; otherwise only accuracy and cost are evaluated.

    Returns:
        (losses, accuracies): the per-batch values, in batch order.
    """
    fetches = [model.accuracy, model.cost]
    if train:
        fetches.append(model.optimizer)
    losses, accuracies = [], []
    pbar = tqdm.tqdm(
        range(0, len(data_X), batch_size), desc = 'minibatch loop')
    for start in pbar:
        stop = min(start + batch_size, len(data_X))
        # Only the padded batches are fed; the model derives the sequence
        # lengths itself, so the lengths returned here are not needed.
        batch_x, _ = pad_sentence_batch(data_X[start : stop], PAD)
        batch_y, _ = pad_sentence_batch(data_Y[start : stop], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        # The first two results are always accuracy and cost; a third one
        # (the optimizer step) is run only for its side effect.
        accuracy, loss = sess.run(fetches, feed_dict = feed)[:2]
        losses.append(loss)
        accuracies.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    return losses, accuracies


for e in range(epoch):
    train_loss, train_acc = run_minibatches(train_X, train_Y, train = True)
    test_loss, test_acc = run_minibatches(test_X, test_Y, train = False)

    # The reported averages are plain means over batches (a smaller final
    # batch counts as much as a full one).
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.118, cost=5.76] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.70it/s, accuracy=0.175, cost=5.27]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 6.053971, training avg acc 0.082107
epoch 1, testing avg loss 5.324556, testing avg acc 0.152646


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.188, cost=5.05]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.87it/s, accuracy=0.215, cost=4.84]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 4.912388, training avg acc 0.196447
epoch 2, testing avg loss 4.737615, testing avg acc 0.215985


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.213, cost=4.67]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.92it/s, accuracy=0.198, cost=4.69]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 4.518934, training avg acc 0.230342
epoch 3, testing avg loss 4.562802, testing avg acc 0.225339


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.23, cost=4.39] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.92it/s, accuracy=0.226, cost=4.64]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 4.316509, training avg acc 0.245301
epoch 4, testing avg loss 4.477951, testing avg acc 0.237584


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.256, cost=4.11]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.90it/s, accuracy=0.203, cost=4.58]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 4.175539, training avg acc 0.256459
epoch 5, testing avg loss 4.429749, testing avg acc 0.243252


minibatch loop: 100%|██████████| 1042/1042 [05:35<00:00,  3.10it/s, accuracy=0.261, cost=3.87]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.92it/s, accuracy=0.215, cost=4.58]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 4.069971, training avg acc 0.265229
epoch 6, testing avg loss 4.413609, testing avg acc 0.244927


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.278, cost=3.7] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.95it/s, accuracy=0.22, cost=4.58] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 3.993729, training avg acc 0.270755
epoch 7, testing avg loss 4.422013, testing avg acc 0.242420


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.283, cost=3.62]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.92it/s, accuracy=0.215, cost=4.56]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 3.932503, training avg acc 0.276367
epoch 8, testing avg loss 4.417103, testing avg acc 0.246614


minibatch loop: 100%|██████████| 1042/1042 [05:36<00:00,  3.10it/s, accuracy=0.312, cost=3.48]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.90it/s, accuracy=0.215, cost=4.67]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 3.882398, training avg acc 0.281375
epoch 9, testing avg loss 4.423276, testing avg acc 0.247590


minibatch loop: 100%|██████████| 1042/1042 [05:35<00:00,  3.11it/s, accuracy=0.338, cost=3.29]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.95it/s, accuracy=0.198, cost=4.64]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 3.836034, training avg acc 0.285949
epoch 10, testing avg loss 4.419407, testing avg acc 0.248623


minibatch loop: 100%|██████████| 1042/1042 [05:35<00:00,  3.11it/s, accuracy=0.352, cost=3.17]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.89it/s, accuracy=0.203, cost=4.59]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 3.798077, training avg acc 0.289666
epoch 11, testing avg loss 4.413572, testing avg acc 0.250763


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.373, cost=3.1] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.85it/s, accuracy=0.215, cost=4.61]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 3.760315, training avg acc 0.293373
epoch 12, testing avg loss 4.430881, testing avg acc 0.249786


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.386, cost=3.01]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.79it/s, accuracy=0.226, cost=4.66]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 3.737885, training avg acc 0.295704
epoch 13, testing avg loss 4.438071, testing avg acc 0.251573


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.08it/s, accuracy=0.392, cost=2.95]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.85it/s, accuracy=0.232, cost=4.73]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 3.710586, training avg acc 0.298460
epoch 14, testing avg loss 4.453537, testing avg acc 0.251079


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.413, cost=2.91]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.91it/s, accuracy=0.209, cost=4.76]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 3.687195, training avg acc 0.300952
epoch 15, testing avg loss 4.473650, testing avg acc 0.249692


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.405, cost=2.84]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.90it/s, accuracy=0.215, cost=4.74]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 3.673392, training avg acc 0.302177
epoch 16, testing avg loss 4.486164, testing avg acc 0.246044


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.418, cost=2.8] 
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.91it/s, accuracy=0.237, cost=4.74]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 3.648655, training avg acc 0.305105
epoch 17, testing avg loss 4.488136, testing avg acc 0.252109


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.445, cost=2.71]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.86it/s, accuracy=0.232, cost=4.7] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 3.628495, training avg acc 0.307380
epoch 18, testing avg loss 4.493890, testing avg acc 0.251686


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.432, cost=2.71]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.93it/s, accuracy=0.249, cost=4.71]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 3.606906, training avg acc 0.309854
epoch 19, testing avg loss 4.501904, testing avg acc 0.253285


minibatch loop: 100%|██████████| 1042/1042 [05:37<00:00,  3.09it/s, accuracy=0.444, cost=2.66]
minibatch loop: 100%|██████████| 23/23 [00:03<00:00,  6.83it/s, accuracy=0.243, cost=4.78]

epoch 20, training avg loss 3.587826, training avg acc 0.312513
epoch 20, testing avg loss 4.513405, testing avg acc 0.252812





In [16]:
# The table was loaded from JSON, where object keys are always strings;
# re-key it by int so predicted ids can be looked up directly.
rev_dictionary_to = dict(
    (int(token_id), token) for token_id, token in rev_dictionary_to.items()
)

In [17]:
test_size = 20

# Pad the first `test_size` test pairs. Only the source batch is fed: the
# greedy decoding path of the model does not read the target placeholder.
sample_X, sample_Y = test_X[: test_size], test_Y[: test_size]
batch_x, seq_x = pad_sentence_batch(sample_X, PAD)
batch_y, seq_y = pad_sentence_batch(sample_Y, PAD)
feed = {model.X: batch_x}
# Despite its name, `logits` holds predicted token ids (model.predicting_ids).
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 198)

In [18]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

def _readable(token_ids):
    """Map ids to target-language tokens, leaving out the special tokens in `rejected`."""
    tokens = (rev_dictionary_to[token_id] for token_id in token_ids)
    return [token for token in tokens if token not in rejected]

# Print each greedy prediction next to its reference translation.
for i in range(test_size):
    predict = _readable(logits[i])
    actual = _readable(batch_y[i])
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Vậy tại sao lại như vậy ? " . " và " tôi đang cố gắng làm điều này . " Tôi đã nói , " Tôi sẽ không nói rằng , " Tôi không biết , tôi sẽ không nói về việc làm thế nào để tạo ra những rô bốt mới . " . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạ