In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

with open('train-test.json') as fopen:
    dataset = json.load(fopen)
    
with open('dictionary.json') as fopen:
    dictionary = json.load(fopen)

In [4]:
train_X = dataset['train_X']
train_Y = dataset['train_Y']
test_X = dataset['test_X']
test_Y = dataset['test_Y']

In [5]:
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [7]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [8]:
for i in range(len(train_X)):
    train_X[i] += ' EOS'
    
train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
for i in range(len(test_X)):
    test_X[i] += ' EOS'
    
test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[[0.0]]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1], tf.shape(x)[2]], 0))
    return tf.concat([x, padding], 1)

class Translator:
    def __init__(self, size_layer, num_layers, embedded_size,
                 from_dict_size, to_dict_size, learning_rate, batch_size,
                 force_teaching_ratio = 0.5, beam_width = 10):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.GRUCell(size, reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size, embedded_size], -1, 1))
        decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size, embedded_size], -1, 1))
        self.encoder_out = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layer // 2),
                cell_bw = cells(size_layer // 2),
                inputs = self.encoder_out,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            self.encoder_out = tf.concat((out_fw, out_bw), 2)
        
        bi_state = tf.concat((state_fw, state_bw), -1)
        encoder_state = tuple([bi_state] * num_layers)
        
        with tf.variable_scope('decode'):
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units = size_layer, 
            memory = self.encoder_out,
            memory_sequence_length = self.X_seq_len)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                embedding = decoder_embeddings,
                sampling_probability = 1 - force_teaching_ratio,
                time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = tf.layers.Dense(to_dict_size))
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            self.training_logits = training_decoder_output.rnn_output
            
        with tf.variable_scope('decode', reuse=True):
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(self.encoder_out, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units = size_layer, 
                memory = encoder_out_tiled,
                memory_sequence_length = X_seq_len_tiled)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([cells(size_layer, reuse=True) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(
                    cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = tf.layers.Dense(to_dict_size, _reuse=True),
                length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = tf.reduce_max(self.X_seq_len))
            self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [11]:
size_layer = 512
num_layers = 2
embedded_size = 256
learning_rate = 1e-3
batch_size = 96
epoch = 20

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(size_layer, num_layers, embedded_size, len(dictionary_from), 
                len(dictionary_to), learning_rate,batch_size)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [14]:
train_X = str_idx(train_X, dictionary_from)
test_X = str_idx(test_X, dictionary_from)
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [15]:
import tqdm

for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        maxlen = max([len(s) for s in train_X[i : index] + train_Y[i : index]])
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1389/1389 [20:00<00:00,  1.16it/s, accuracy=0.132, cost=5.72] 
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s, accuracy=0.155, cost=5.47]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 1, training avg loss 5.740252, training avg acc 0.120170
epoch 1, testing avg loss 5.164797, testing avg acc 0.160364


minibatch loop: 100%|██████████| 1389/1389 [20:04<00:00,  1.15it/s, accuracy=0.168, cost=4.95]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s, accuracy=0.171, cost=5.13]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 2, training avg loss 4.764286, training avg acc 0.197784
epoch 2, testing avg loss 4.734381, testing avg acc 0.203339


minibatch loop: 100%|██████████| 1389/1389 [20:59<00:00,  1.10it/s, accuracy=0.215, cost=4.33]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s, accuracy=0.184, cost=5]   
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 3, training avg loss 4.340930, training avg acc 0.228616
epoch 3, testing avg loss 4.588710, testing avg acc 0.214801


minibatch loop: 100%|██████████| 1389/1389 [21:30<00:00,  1.08it/s, accuracy=0.241, cost=4.18]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s, accuracy=0.227, cost=4.79]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 4, training avg loss 4.102818, training avg acc 0.246886
epoch 4, testing avg loss 4.517763, testing avg acc 0.224015


minibatch loop: 100%|██████████| 1389/1389 [20:48<00:00,  1.11it/s, accuracy=0.251, cost=3.95]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s, accuracy=0.221, cost=4.73]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 5, training avg loss 3.928427, training avg acc 0.261884
epoch 5, testing avg loss 4.422390, testing avg acc 0.233316


minibatch loop: 100%|██████████| 1389/1389 [21:05<00:00,  1.10it/s, accuracy=0.252, cost=3.79]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.33it/s, accuracy=0.192, cost=4.81]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 6, training avg loss 3.860073, training avg acc 0.265931
epoch 6, testing avg loss 4.421544, testing avg acc 0.231830


minibatch loop: 100%|██████████| 1389/1389 [21:25<00:00,  1.08it/s, accuracy=0.261, cost=3.76]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s, accuracy=0.204, cost=4.8] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 7, training avg loss 3.853515, training avg acc 0.264653
epoch 7, testing avg loss 4.407888, testing avg acc 0.233291


minibatch loop: 100%|██████████| 1389/1389 [21:07<00:00,  1.10it/s, accuracy=0.252, cost=3.76]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s, accuracy=0.199, cost=4.91]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 8, training avg loss 3.755220, training avg acc 0.273119
epoch 8, testing avg loss 4.523459, testing avg acc 0.218613


minibatch loop: 100%|██████████| 1389/1389 [21:01<00:00,  1.10it/s, accuracy=0.225, cost=3.99]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s, accuracy=0.185, cost=5.14]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 9, training avg loss 3.947088, training avg acc 0.250063
epoch 9, testing avg loss 4.510980, testing avg acc 0.223538


minibatch loop: 100%|██████████| 1389/1389 [22:38<00:00,  1.02it/s, accuracy=0.274, cost=3.52]
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.32it/s, accuracy=0.201, cost=4.87]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 10, training avg loss 3.725309, training avg acc 0.271972
epoch 10, testing avg loss 4.428268, testing avg acc 0.231753


minibatch loop: 100%|██████████| 1389/1389 [21:24<00:00,  1.08it/s, accuracy=0.282, cost=3.42] 
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s, accuracy=0.208, cost=4.96]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 11, training avg loss 3.692803, training avg acc 0.275241
epoch 11, testing avg loss 4.476059, testing avg acc 0.228155


minibatch loop: 100%|██████████| 1389/1389 [21:15<00:00,  1.09it/s, accuracy=0.27, cost=3.6]  
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.33it/s, accuracy=0.189, cost=5.06]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 12, training avg loss 3.609131, training avg acc 0.283541
epoch 12, testing avg loss 4.498382, testing avg acc 0.222124


minibatch loop: 100%|██████████| 1389/1389 [21:48<00:00,  1.06it/s, accuracy=0.233, cost=3.96]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.27it/s, accuracy=0.164, cost=5.23]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 13, training avg loss 3.711027, training avg acc 0.271129
epoch 13, testing avg loss 4.796463, testing avg acc 0.190852


minibatch loop: 100%|██████████| 1389/1389 [21:25<00:00,  1.08it/s, accuracy=0.0815, cost=8.82]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.28it/s, accuracy=0.0642, cost=8.61]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 14, training avg loss 8.017945, training avg acc 0.114857
epoch 14, testing avg loss 7.765693, testing avg acc 0.064979


minibatch loop: 100%|██████████| 1389/1389 [22:23<00:00,  1.03it/s, accuracy=0.0828, cost=7.14]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s, accuracy=0.084, cost=7.63] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 15, training avg loss 6.971450, training avg acc 0.070028
epoch 15, testing avg loss 7.041502, testing avg acc 0.076595


minibatch loop: 100%|██████████| 1389/1389 [21:26<00:00,  1.08it/s, accuracy=0.0595, cost=14.7] 
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s, accuracy=0.0538, cost=12.9]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 16, training avg loss 8.461281, training avg acc 0.069967
epoch 16, testing avg loss 11.940573, testing avg acc 0.050376


minibatch loop: 100%|██████████| 1389/1389 [21:25<00:00,  1.08it/s, accuracy=0.0712, cost=9.3] 
minibatch loop: 100%|██████████| 30/30 [00:12<00:00,  2.31it/s, accuracy=0.0575, cost=9.62]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 17, training avg loss 9.608860, training avg acc 0.059613
epoch 17, testing avg loss 8.596960, testing avg acc 0.063109


minibatch loop: 100%|██████████| 1389/1389 [21:26<00:00,  1.08it/s, accuracy=0.0815, cost=8.02]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s, accuracy=0.0698, cost=8.66]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 18, training avg loss 7.724730, training avg acc 0.070165
epoch 18, testing avg loss 7.763266, testing avg acc 0.071487


minibatch loop: 100%|██████████| 1389/1389 [21:25<00:00,  1.08it/s, accuracy=0.0621, cost=7.69]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.28it/s, accuracy=0.0528, cost=8.01]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 19, training avg loss 7.150913, training avg acc 0.074490
epoch 19, testing avg loss 7.628372, testing avg acc 0.059837


minibatch loop: 100%|██████████| 1389/1389 [21:25<00:00,  1.08it/s, accuracy=0.0802, cost=6.94]
minibatch loop: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s, accuracy=0.0877, cost=7.63]

epoch 20, training avg loss 6.992643, training avg acc 0.073659
epoch 20, testing avg loss 7.061422, testing avg acc 0.077524





In [16]:
rev_dictionary_to = {int(k): v for k, v in rev_dictionary_to.items()}

In [17]:
test_size = 20

batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x}
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 99)

In [18]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    predict = [rev_dictionary_to[i] for i in logits[i] if rev_dictionary_to[i] not in rejected]
    actual = [rev_dictionary_to[i] for i in batch_y[i] if rev_dictionary_to[i] not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Chúng tôi có thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể thể hai hai thể hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai hai
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạn hôm nay ?

1 predict: Đây là là là là là . . . . . . . . . . . . . . . . . . . . . .