In [1]:
import os
# Set CUDA_VISIBLE_DEVICES to '0' here, ahead of the TensorFlow import in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

# Read both JSON files as UTF-8 explicitly so that decoding does not depend on
# the platform's default locale encoding.
with open('train-test.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)

with open('dictionary.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)

In [4]:
# Pull the four splits out of the loaded dataset in one unpacking step.
train_X, train_Y, test_X, test_Y = (
    dataset[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Display the top-level keys of the loaded `dictionary` object.
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
# Forward and reverse lookup tables for the 'from' side ...
dictionary_from, rev_dictionary_from = (
    dictionary['from'][table] for table in ('dictionary', 'rev_dictionary')
)

# ... and for the 'to' side.
dictionary_to, rev_dictionary_to = (
    dictionary['to'][table] for table in ('dictionary', 'rev_dictionary')
)

In [7]:
# Ids of the special tokens, looked up in the 'from' dictionary.
GO, PAD, EOS, UNK = (
    dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [8]:
# Terminate every training source sentence with the ' EOS' marker, updating
# the list in place.
for i, sentence in enumerate(train_X):
    train_X[i] = sentence + ' EOS'

train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
# Terminate every test source sentence with the ' EOS' marker, updating the
# list in place.
for i, sentence in enumerate(test_X):
    test_X[i] = sentence + ' EOS'

test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    """Extend a rank-3 tensor with zeros along axis 1 up to `desired_size`.

    A [1, 1, 1] float zero constant is tiled to the shape
    [dim0, desired_size - dim1, dim2] (taken from the dynamic shape of `x`)
    and concatenated after `x` on axis 1.
    """
    dims = tf.shape(x)
    pad_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], axis=0)
    zeros = tf.tile([[[0.0]]], pad_shape)
    return tf.concat([x, zeros], axis=1)

class Translator:
    """Sequence-to-sequence translation graph for TensorFlow 1.x (tf.contrib).

    The graph consists of a stack of bidirectional LSTM encoder layers and an
    attention-wrapped multi-layer LSTM decoder. Two decoders are built over
    the same 'decode' variable scope: one driven by a scheduled-sampling
    training helper and one driven by beam search.

    Attributes created by the constructor:
        X, Y: int32 placeholders, both declared with shape [None, None].
        X_seq_len, Y_seq_len: per-row count of non-zero ids in X / Y.
        encoder_out: output of the last encoder layer.
        training_logits: rnn_output of the training decoder.
        predicting_ids: ids taken from the beam-search decoder output.
        cost, optimizer, accuracy, prediction: training loss, Adam update op
            and masked token-level metrics.

    NOTE(review): the constructor reads `GO` and `EOS` from module-level
    globals instead of taking them as arguments, so those names must exist
    before the class is instantiated.
    """
    def __init__(self, size_layer, num_layers, embedded_size, 
                 from_dict_size, to_dict_size, learning_rate,
                 beam_width=5, force_teaching_ratio=0.5):
        """Add all encoder, decoder, loss and metric ops to the default graph.

        Args:
            size_layer: number of units of each decoder LSTM layer and of the
                attention mechanism; each encoder direction uses
                size_layer // 2 units.
            num_layers: number of stacked encoder layers and decoder layers.
            embedded_size: width of both embedding matrices.
            from_dict_size: number of rows of the encoder embedding matrix.
            to_dict_size: number of rows of the decoder embedding matrix and
                size of the decoder output projection.
            learning_rate: learning rate handed to the Adam optimizer.
            beam_width: beam width of the beam-search decoder.
            force_teaching_ratio: the training helper is given
                sampling_probability = 1 - force_teaching_ratio.
        """
        
        def lstm_cell(size, reuse=False):
            """Return an LSTMCell of `size` units with an orthogonal initializer."""
            return tf.nn.rnn_cell.LSTMCell(size, initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Sequence lengths are the number of non-zero ids in each row. This
        # presumes id 0 is the padding id and never occurs inside a sentence
        # -- TODO confirm against the dictionaries.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        # Separate embedding matrices for the two sides, initialised uniformly in [-1, 1).
        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        self.encoder_out = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        # Stacked bidirectional encoder: each layer consumes the previous
        # layer's output and concatenates its forward/backward outputs on the
        # last axis. Every layer gets its own variable scope.
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = lstm_cell(size_layer // 2),
                cell_bw = lstm_cell(size_layer // 2),
                inputs = self.encoder_out,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            self.encoder_out = tf.concat((out_fw, out_bw), 2)
        # Only the states left by the last loop iteration are used. Forward and
        # backward c/h are concatenated, and the same state tuple is repeated
        # once per decoder layer.
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        encoder_state = tuple([bi_lstm_state] * num_layers)
        
        # Training decoder.
        with tf.variable_scope('decode'):
            attention_mechanism = tf.contrib.seq2seq.LuongMonotonicAttention(
            num_units = size_layer, 
            memory = self.encoder_out,
            memory_sequence_length = self.X_seq_len)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            # Decoder input = Y without its last column, prefixed by a column
            # of GO ids (GO is a module-level global).
            main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                embedding = decoder_embeddings,
                sampling_probability = 1 - force_teaching_ratio,
                time_major = False)
            # The attention wrapper starts from its zero state with the cell
            # state replaced by the encoder state built above.
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = tf.layers.Dense(to_dict_size))
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            self.training_logits = training_decoder_output.rnn_output
            
        # Inference decoder: the 'decode' scope is re-entered with reuse=True
        # and the cells / Dense layer are created with reuse flags.
        # NOTE(review): this presumably shares the variables created by the
        # training branch; that depends on both branches producing the same
        # variable names -- confirm before reordering anything here.
        with tf.variable_scope('decode', reuse=True):
            # Encoder outputs, state and lengths are tiled beam_width times
            # for the beam-search decoder.
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(self.encoder_out, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            attention_mechanism = tf.contrib.seq2seq.LuongMonotonicAttention(
                num_units = size_layer, 
                memory = encoder_out_tiled,
                memory_sequence_length = X_seq_len_tiled)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer, reuse=True) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            # Decoding starts from GO for every row and uses EOS as the end
            # token (both are module-level globals).
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = tf.layers.Dense(to_dict_size, _reuse=True),
                length_penalty_weight = 0.0)
            # Decoding is capped at twice the longest source length in the batch.
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
            # Keep index 0 of the last axis -- presumably the best-scoring
            # beam; confirm against the BeamSearchDecoder documentation.
            self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        
        # Loss: sequence loss weighted by a float mask built from Y_seq_len.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        # Accuracy: argmax predictions compared with the labels, restricted to
        # the positions selected by the same mask.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is not used anywhere else in this method.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
# Model dimensions.
size_layer, num_layers, embedded_size = 256, 2, 256
# Optimisation settings.
learning_rate = 0.001
batch_size, epoch = 128, 20

In [12]:
# Start from an empty graph, open an interactive session, build the model
# with the hyper-parameters defined above, then initialise its variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(
    size_layer = size_layer,
    num_layers = num_layers,
    embedded_size = embedded_size,
    from_dict_size = len(dictionary_from),
    to_dict_size = len(dictionary_to),
    learning_rate = learning_rate,
)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:
def str_idx(corpus, dic, unk=None):
    """Convert whitespace-tokenised sentences into lists of token ids.

    Args:
        corpus: iterable of sentence strings; each one is split on whitespace.
        dic: mapping from token string to integer id.
        unk: id used for tokens missing from `dic`. When omitted, the
            module-level `UNK` is used, which is the original behaviour.

    Returns:
        A list with one list of ids per sentence, in corpus order.
    """
    if unk is None:
        # Fall back to the notebook-level constant for backward compatibility.
        unk = UNK
    return [[dic.get(token, unk) for token in sentence.split()] for sentence in corpus]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: list of token-id lists; may be empty.
        pad_int: value appended to sequences shorter than the longest one.

    Returns:
        A tuple (padded_seqs, seq_lens): new lists padded to a common length,
        and the original length of each sequence. Both are empty lists when
        `sentence_batch` is empty.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [14]:
# Replace the sentence strings with lists of ids: the X splits go through the
# 'from' dictionary, the Y splits through the 'to' dictionary.
train_X, test_X = (str_idx(split, dictionary_from) for split in (train_X, test_X))
train_Y, test_Y = (str_idx(split, dictionary_to) for split in (train_Y, test_Y))

In [15]:
import tqdm

# One pass over the training data followed by one pass over the test data per
# epoch; the per-batch loss and accuracy are averaged and printed at the end of
# each epoch.
for e in range(epoch):
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # Training pass: every minibatch runs the optimizer op.
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        # Only the padded id matrices are fed; the model derives sequence
        # lengths from them inside the graph, so the lengths returned by
        # pad_sentence_batch are discarded.
        batch_x, _ = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, _ = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    # Evaluation pass: same metrics, but the optimizer op is not run.
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, _ = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, _ = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)

    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1042/1042 [18:03<00:00,  1.04s/it, accuracy=0.115, cost=6.03]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.16it/s, accuracy=0.141, cost=5.69]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 1, training avg loss 6.021091, training avg acc 0.100238
epoch 1, testing avg loss 5.610278, testing avg acc 0.129650


minibatch loop: 100%|██████████| 1042/1042 [18:12<00:00,  1.05s/it, accuracy=0.138, cost=5.43]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.22it/s, accuracy=0.203, cost=4.98]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 2, training avg loss 5.227323, training avg acc 0.167777
epoch 2, testing avg loss 5.033589, testing avg acc 0.185405


minibatch loop: 100%|██████████| 1042/1042 [18:51<00:00,  1.09s/it, accuracy=0.189, cost=4.93]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.27it/s, accuracy=0.209, cost=4.77]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 3, training avg loss 4.667950, training avg acc 0.219330
epoch 3, testing avg loss 4.751667, testing avg acc 0.213333


minibatch loop: 100%|██████████| 1042/1042 [18:33<00:00,  1.07s/it, accuracy=0.224, cost=4.47]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.21it/s, accuracy=0.232, cost=4.34]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 4, training avg loss 4.278976, training avg acc 0.254530
epoch 4, testing avg loss 4.502940, testing avg acc 0.236558


minibatch loop: 100%|██████████| 1042/1042 [17:49<00:00,  1.03s/it, accuracy=0.242, cost=4.22]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.25it/s, accuracy=0.243, cost=4.25]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 5, training avg loss 3.991449, training avg acc 0.280132
epoch 5, testing avg loss 4.365466, testing avg acc 0.249708


minibatch loop: 100%|██████████| 1042/1042 [18:28<00:00,  1.06s/it, accuracy=0.273, cost=3.87]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.288, cost=4.13]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 6, training avg loss 3.783156, training avg acc 0.298458
epoch 6, testing avg loss 4.274533, testing avg acc 0.260835


minibatch loop: 100%|██████████| 1042/1042 [18:32<00:00,  1.07s/it, accuracy=0.294, cost=3.63]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.19it/s, accuracy=0.35, cost=3.82] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 7, training avg loss 3.625133, training avg acc 0.312852
epoch 7, testing avg loss 4.184831, testing avg acc 0.270177


minibatch loop: 100%|██████████| 1042/1042 [18:36<00:00,  1.07s/it, accuracy=0.338, cost=3.5] 
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.19it/s, accuracy=0.271, cost=4.07]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 8, training avg loss 3.492943, training avg acc 0.325879
epoch 8, testing avg loss 4.148920, testing avg acc 0.269620


minibatch loop: 100%|██████████| 1042/1042 [18:52<00:00,  1.09s/it, accuracy=0.32, cost=3.4]  
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.294, cost=3.92]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 9, training avg loss 3.386889, training avg acc 0.336262
epoch 9, testing avg loss 4.111712, testing avg acc 0.275178


minibatch loop: 100%|██████████| 1042/1042 [18:14<00:00,  1.05s/it, accuracy=0.345, cost=3.22]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.333, cost=3.8] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 10, training avg loss 3.287434, training avg acc 0.346601
epoch 10, testing avg loss 4.008258, testing avg acc 0.289526


minibatch loop: 100%|██████████| 1042/1042 [18:15<00:00,  1.05s/it, accuracy=0.344, cost=3.1] 
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.19it/s, accuracy=0.328, cost=3.88]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 11, training avg loss 3.205867, training avg acc 0.355385
epoch 11, testing avg loss 4.053333, testing avg acc 0.282943


minibatch loop: 100%|██████████| 1042/1042 [18:16<00:00,  1.05s/it, accuracy=0.384, cost=2.88]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.282, cost=4.1] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 12, training avg loss 3.129264, training avg acc 0.364014
epoch 12, testing avg loss 4.168866, testing avg acc 0.271079


minibatch loop: 100%|██████████| 1042/1042 [18:15<00:00,  1.05s/it, accuracy=0.395, cost=2.85]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.339, cost=3.77]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 13, training avg loss 3.068932, training avg acc 0.370391
epoch 13, testing avg loss 4.194595, testing avg acc 0.269045


minibatch loop: 100%|██████████| 1042/1042 [18:14<00:00,  1.05s/it, accuracy=0.396, cost=2.76]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.20it/s, accuracy=0.215, cost=4.29]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 14, training avg loss 3.003547, training avg acc 0.378202
epoch 14, testing avg loss 4.248743, testing avg acc 0.260957


minibatch loop: 100%|██████████| 1042/1042 [18:09<00:00,  1.05s/it, accuracy=0.422, cost=2.8] 
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.25it/s, accuracy=0.277, cost=3.71]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 15, training avg loss 2.945450, training avg acc 0.385129
epoch 15, testing avg loss 4.046828, testing avg acc 0.283311


minibatch loop: 100%|██████████| 1042/1042 [17:26<00:00,  1.00s/it, accuracy=0.441, cost=2.6] 
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.26it/s, accuracy=0.254, cost=4.43]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 16, training avg loss 2.895910, training avg acc 0.391239
epoch 16, testing avg loss 4.036862, testing avg acc 0.288446


minibatch loop: 100%|██████████| 1042/1042 [17:28<00:00,  1.01s/it, accuracy=0.429, cost=2.51]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.23it/s, accuracy=0.271, cost=4.09]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 17, training avg loss 2.851429, training avg acc 0.396920
epoch 17, testing avg loss 4.138736, testing avg acc 0.279104


minibatch loop: 100%|██████████| 1042/1042 [17:27<00:00,  1.01s/it, accuracy=0.449, cost=2.43]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.24it/s, accuracy=0.282, cost=3.97]
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 18, training avg loss 2.797095, training avg acc 0.404461
epoch 18, testing avg loss 4.312334, testing avg acc 0.259006


minibatch loop: 100%|██████████| 1042/1042 [17:26<00:00,  1.00s/it, accuracy=0.507, cost=2.25]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.25it/s, accuracy=0.26, cost=4.04] 
minibatch loop:   0%|          | 0/1042 [00:00<?, ?it/s]

epoch 19, training avg loss 2.746221, training avg acc 0.410830
epoch 19, testing avg loss 4.290191, testing avg acc 0.264518


minibatch loop: 100%|██████████| 1042/1042 [17:26<00:00,  1.00s/it, accuracy=0.481, cost=2.31]
minibatch loop: 100%|██████████| 23/23 [00:10<00:00,  2.25it/s, accuracy=0.282, cost=4.33]

epoch 20, training avg loss 2.707343, training avg acc 0.416470
epoch 20, testing avg loss 4.235243, testing avg acc 0.272306





In [16]:
# Re-key the reverse 'to' table with int keys so that integer ids can be used
# for lookups.
rev_dictionary_to = dict(
    (int(key), word) for key, word in rev_dictionary_to.items()
)

In [17]:
# Run the beam-search branch on the first `test_size` test sentences; only the
# source ids are fed. The padded references are kept for the comparison below.
test_size = 20

batch_x, seq_x = pad_sentence_batch(sentence_batch = test_X[:test_size], pad_int = PAD)
batch_y, seq_y = pad_sentence_batch(sentence_batch = test_Y[:test_size], pad_int = PAD)
feed = {model.X: batch_x}
logits = sess.run(fetches = model.predicting_ids, feed_dict = feed)
logits.shape

(20, 198)

In [18]:
# Special tokens stripped from both the prediction and the reference before printing.
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    # Map ids back to words, then drop the special tokens.
    predict = [
        word
        for word in (rev_dictionary_to[token_id] for token_id in logits[i])
        if word not in rejected
    ]
    actual = [
        word
        for word in (rev_dictionary_to[token_id] for token_id in batch_y[i])
        if word not in rejected
    ]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: TED Talk Subtitles and Transcript : Trong bài nói chuyện với những người phụ nữ hơn <NUM> thế hệ , về sự kết thúc của cuộc đời cô ấy trong cuộc đời cô bé <NUM> tuổi với chị em gái , và bà bà suốt <NUM> ngày và đêm trên một con tàu nhỏ nhất trên Trung Quốc hơn hơn <NUM> năm trước , tôi đã nắm giữ cuộc sống của cô bé gái nhỏ và không bao giờ quên đi - - cô bé nhỏ bé sống sống tại San Francisco và nói chuyện với các bạn hôm nay ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , n