In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import numpy as np
import tensorflow as tf
from tensor2tensor.utils import beam_search

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

with open('train-test.json') as fopen:
    dataset = json.load(fopen)
    
with open('dictionary.json') as fopen:
    dictionary = json.load(fopen)

In [4]:
train_X = dataset['train_X']
train_Y = dataset['train_Y']
test_X = dataset['test_X']
test_Y = dataset['test_Y']

In [5]:
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [7]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [8]:
for i in range(len(train_X)):
    train_X[i] += ' EOS'
    
train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
for i in range(len(test_X)):
    test_X[i] += ' EOS'
    
test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [10]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[[0.0]]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1], tf.shape(x)[2]], 0))
    return tf.concat([x, padding], 1)

def ln(inputs, epsilon = 1e-8, scope="ln"):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
    
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta= tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ( (variance + epsilon) ** (.5) )
        outputs = gamma * normalized + beta
        
    return outputs

def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)
        outputs /= d_k ** 0.5
        outputs = mask(outputs, Q, K, type="key")
        if causality:
            outputs = mask(outputs, type="future")
        outputs = tf.nn.softmax(outputs)
        attention = tf.transpose(outputs, [0, 2, 1])
        #tf.summary.image("attention", tf.expand_dims(attention[:1], -1))
        outputs = mask(outputs, Q, K, type="query")
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)
        outputs = tf.matmul(outputs, V)
    return outputs

def mask(inputs, queries=None, keys=None, type=None):
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
        outputs = inputs*masks
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        print("Check if you entered type correctly!")


    return outputs

def multihead_attention(queries, keys, values,
                        num_heads=8, 
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections
        Q = tf.layers.dense(queries, d_model, use_bias=False) # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False) # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False) # (N, T_k, d_model)
        
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)

        outputs = scaled_dot_product_attention(Q_, K_, V_, causality, dropout_rate, training)
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, d_model)
        outputs += queries
        outputs = ln(outputs)
 
    return outputs

def ff(inputs, num_units, scope="positionwise_feedforward"):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, num_units[1])
        outputs += inputs
        outputs = ln(outputs)
    
    return outputs

def label_smoothing(inputs, epsilon=0.1):
    V = inputs.get_shape().as_list()[-1] # number of channels
    return ((1-epsilon) * inputs) + (epsilon / V)

def sinusoidal_position_encoding(inputs, mask, repr_dim):
    T = tf.shape(inputs)[1]
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1]) * tf.expand_dims(tf.to_float(mask), -1)

class Translator:
    def __init__(self, from_dict_size, to_dict_size, size_layer, learning_rate,
                num_blocks = 4, num_heads = 8, ratio_hidden = 4, beam_width = 5):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, size_layer], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, size_layer], -1, 1))
        
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        def forward(x, y, reuse = False):
            encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, x)
            en_masks = tf.sign(x)
            encoder_embedded += sinusoidal_position_encoding(x, en_masks, size_layer)
            
            for i in range(num_blocks):
                with tf.variable_scope('encoder_self_attn_%d'%i,reuse=reuse):
                    enc = multihead_attention(queries=encoder_embedded,
                                              keys=encoder_embedded,
                                              values=encoder_embedded,
                                              num_heads=num_heads,
                                              causality=False)
                    enc = ff(enc, num_units=[size_layer * ratio_hidden, size_layer])
            memory = enc
            
            decoder_embedded = tf.nn.embedding_lookup(decoder_embedding, y)
            de_masks = tf.sign(y)
            decoder_embedded += sinusoidal_position_encoding(y, de_masks, size_layer)
            dec = decoder_embedded
            
            for i in range(num_blocks):
                with tf.variable_scope('decoder_self_attn_%d'%i,reuse=reuse):
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=num_heads,
                                              causality=True,
                                              scope="self_attention")

                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=num_heads,
                                              causality=False,
                                              scope="vanilla_attention")
                    
                    dec = ff(dec, num_units=[size_layer * ratio_hidden, size_layer])
            
            weights = tf.transpose(decoder_embedding)
            logits = tf.einsum('ntd,dk->ntk', dec, weights)
            return logits
        
        self.training_logits = forward(self.X, decoder_input)

        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        initial_ids = tf.fill([batch_size], GO)
        
        def symbols_to_logits(ids):
            x = tf.contrib.seq2seq.tile_batch(self.X, beam_width)
            logits = forward(x, ids, reuse = True)
            return logits[:, tf.shape(ids)[1]-1, :]
        
        final_ids, final_probs, _ = beam_search.beam_search(
            symbols_to_logits,
            initial_ids,
            beam_width,
            tf.reduce_max(self.X_seq_len),
            to_dict_size,
            0.0,
            eos_id = EOS)
        
        self.predicting_ids = final_ids

In [11]:
size_layer = 512
learning_rate = 1e-4
batch_size = 96
epoch = 20

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(len(dictionary_from), len(dictionary_to), size_layer, learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [13]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [14]:
train_X = str_idx(train_X, dictionary_from)
test_X = str_idx(test_X, dictionary_from)
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [15]:
sess.run(model.predicting_ids, feed_dict = {model.X: [train_X[0]]}).shape

(1, 5, 11)

In [16]:
import tqdm

for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        maxlen = max([len(s) for s in train_X[i : index] + train_Y[i : index]])
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1389/1389 [05:30<00:00,  4.20it/s, accuracy=0.153, cost=6.59] 
minibatch loop: 100%|██████████| 30/30 [00:03<00:00,  9.85it/s, accuracy=0.158, cost=6.6] 
minibatch loop:   0%|          | 1/1389 [00:00<04:21,  5.31it/s, accuracy=0.195, cost=5.64]

epoch 1, training avg loss 8.955545, training avg acc 0.108270
epoch 1, testing avg loss 5.840093, testing avg acc 0.185351


minibatch loop: 100%|██████████| 1389/1389 [05:28<00:00,  4.23it/s, accuracy=0.21, cost=5.53] 
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.12it/s, accuracy=0.22, cost=5.7]  
minibatch loop:   0%|          | 1/1389 [00:00<04:26,  5.21it/s, accuracy=0.285, cost=4.75]

epoch 2, training avg loss 5.173765, training avg acc 0.243477
epoch 2, testing avg loss 4.984414, testing avg acc 0.264366


minibatch loop: 100%|██████████| 1389/1389 [05:29<00:00,  4.21it/s, accuracy=0.279, cost=4.91]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.81it/s, accuracy=0.255, cost=5.25]
minibatch loop:   0%|          | 1/1389 [00:00<04:27,  5.19it/s, accuracy=0.325, cost=4.32]

epoch 3, training avg loss 4.533984, training avg acc 0.306413
epoch 3, testing avg loss 4.555390, testing avg acc 0.311855


minibatch loop: 100%|██████████| 1389/1389 [05:20<00:00,  4.34it/s, accuracy=0.317, cost=4.47]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.90it/s, accuracy=0.292, cost=4.98]
minibatch loop:   0%|          | 1/1389 [00:00<04:26,  5.22it/s, accuracy=0.365, cost=3.98]

epoch 4, training avg loss 4.126750, training avg acc 0.349728
epoch 4, testing avg loss 4.273607, testing avg acc 0.342725


minibatch loop: 100%|██████████| 1389/1389 [05:19<00:00,  4.34it/s, accuracy=0.361, cost=4.04]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.08it/s, accuracy=0.306, cost=4.79]
minibatch loop:   0%|          | 1/1389 [00:00<04:31,  5.11it/s, accuracy=0.398, cost=3.71]

epoch 5, training avg loss 3.816826, training avg acc 0.383024
epoch 5, testing avg loss 4.091312, testing avg acc 0.360991


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.402, cost=3.65]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.12it/s, accuracy=0.314, cost=4.68]
minibatch loop:   0%|          | 1/1389 [00:00<04:22,  5.30it/s, accuracy=0.426, cost=3.47]

epoch 6, training avg loss 3.562186, training avg acc 0.410961
epoch 6, testing avg loss 3.968070, testing avg acc 0.373758


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.457, cost=3.24]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.96it/s, accuracy=0.315, cost=4.65]
minibatch loop:   0%|          | 1/1389 [00:00<04:17,  5.40it/s, accuracy=0.458, cost=3.26]

epoch 7, training avg loss 3.342103, training avg acc 0.435845
epoch 7, testing avg loss 3.900058, testing avg acc 0.381138


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.498, cost=2.87]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.12it/s, accuracy=0.33, cost=4.67] 
minibatch loop:   0%|          | 1/1389 [00:00<04:21,  5.32it/s, accuracy=0.483, cost=3.08]

epoch 8, training avg loss 3.146801, training avg acc 0.459015
epoch 8, testing avg loss 3.896288, testing avg acc 0.386253


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.545, cost=2.49]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.08it/s, accuracy=0.326, cost=4.77]
minibatch loop:   0%|          | 1/1389 [00:00<04:26,  5.21it/s, accuracy=0.498, cost=2.93]

epoch 9, training avg loss 2.972501, training avg acc 0.480463
epoch 9, testing avg loss 3.959981, testing avg acc 0.386002


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.6, cost=2.12]  
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.06it/s, accuracy=0.319, cost=4.91]
minibatch loop:   0%|          | 1/1389 [00:00<04:29,  5.16it/s, accuracy=0.52, cost=2.77]

epoch 10, training avg loss 2.818905, training avg acc 0.499701
epoch 10, testing avg loss 4.049783, testing avg acc 0.387150


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.661, cost=1.82]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.10it/s, accuracy=0.32, cost=5.02] 
minibatch loop:   0%|          | 1/1389 [00:00<04:20,  5.33it/s, accuracy=0.533, cost=2.61]

epoch 11, training avg loss 2.685967, training avg acc 0.515953
epoch 11, testing avg loss 4.098845, testing avg acc 0.388923


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.69, cost=1.6]  
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.15it/s, accuracy=0.316, cost=4.91]
minibatch loop:   0%|          | 1/1389 [00:00<04:23,  5.27it/s, accuracy=0.556, cost=2.42]

epoch 12, training avg loss 2.567834, training avg acc 0.530294
epoch 12, testing avg loss 4.018679, testing avg acc 0.396831


minibatch loop: 100%|██████████| 1389/1389 [05:18<00:00,  4.36it/s, accuracy=0.737, cost=1.37]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.07it/s, accuracy=0.325, cost=4.85]
minibatch loop:   0%|          | 1/1389 [00:00<04:20,  5.32it/s, accuracy=0.574, cost=2.26]

epoch 13, training avg loss 2.453018, training avg acc 0.545141
epoch 13, testing avg loss 4.005711, testing avg acc 0.397211


minibatch loop: 100%|██████████| 1389/1389 [05:19<00:00,  4.34it/s, accuracy=0.785, cost=1.16]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.07it/s, accuracy=0.313, cost=4.84]
minibatch loop:   0%|          | 1/1389 [00:00<04:22,  5.29it/s, accuracy=0.6, cost=2.14]

epoch 14, training avg loss 2.338308, training avg acc 0.560455
epoch 14, testing avg loss 4.023633, testing avg acc 0.392258


minibatch loop: 100%|██████████| 1389/1389 [05:20<00:00,  4.33it/s, accuracy=0.802, cost=0.994]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.01it/s, accuracy=0.314, cost=4.88]
minibatch loop:   0%|          | 1/1389 [00:00<04:22,  5.29it/s, accuracy=0.614, cost=2.01]

epoch 15, training avg loss 2.229293, training avg acc 0.575232
epoch 15, testing avg loss 4.066958, testing avg acc 0.388447


minibatch loop: 100%|██████████| 1389/1389 [05:21<00:00,  4.32it/s, accuracy=0.84, cost=0.838]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.98it/s, accuracy=0.299, cost=5.04]
minibatch loop:   0%|          | 1/1389 [00:00<04:18,  5.36it/s, accuracy=0.617, cost=1.93]

epoch 16, training avg loss 2.127022, training avg acc 0.589416
epoch 16, testing avg loss 4.188903, testing avg acc 0.383200


minibatch loop: 100%|██████████| 1389/1389 [05:21<00:00,  4.32it/s, accuracy=0.855, cost=0.728]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.96it/s, accuracy=0.311, cost=5.25]
minibatch loop:   0%|          | 1/1389 [00:00<04:23,  5.26it/s, accuracy=0.631, cost=1.83]

epoch 17, training avg loss 2.032683, training avg acc 0.602592
epoch 17, testing avg loss 4.360122, testing avg acc 0.378222


minibatch loop: 100%|██████████| 1389/1389 [05:21<00:00,  4.33it/s, accuracy=0.882, cost=0.598]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.99it/s, accuracy=0.308, cost=5.5] 
minibatch loop:   0%|          | 1/1389 [00:00<04:27,  5.19it/s, accuracy=0.646, cost=1.71]

epoch 18, training avg loss 1.948235, training avg acc 0.613778
epoch 18, testing avg loss 4.509265, testing avg acc 0.375841


minibatch loop: 100%|██████████| 1389/1389 [05:21<00:00,  4.32it/s, accuracy=0.911, cost=0.493]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 10.99it/s, accuracy=0.305, cost=5.6] 
minibatch loop:   0%|          | 1/1389 [00:00<04:24,  5.24it/s, accuracy=0.671, cost=1.57]

epoch 19, training avg loss 1.869872, training avg acc 0.624181
epoch 19, testing avg loss 4.605663, testing avg acc 0.377575


minibatch loop: 100%|██████████| 1389/1389 [05:21<00:00,  4.33it/s, accuracy=0.939, cost=0.417]
minibatch loop: 100%|██████████| 30/30 [00:02<00:00, 11.06it/s, accuracy=0.301, cost=5.7] 

epoch 20, training avg loss 1.785753, training avg acc 0.636835
epoch 20, testing avg loss 4.688456, testing avg acc 0.378041





In [17]:
rev_dictionary_to = {int(k): v for k, v in rev_dictionary_to.items()}

In [18]:
test_size = 20

batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x}
logits = sess.run(model.predicting_ids, feed_dict = feed)[:,0,:]
logits.shape

(20, 100)

In [19]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    predict = [rev_dictionary_to[i] for i in logits[i] if rev_dictionary_to[i] not in rejected]
    actual = [rev_dictionary_to[i] for i in batch_y[i] if rev_dictionary_to[i] not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Làm cách nào tôi nói trong <NUM> phút ở đây , về ba thế hệ phụ nữ , về thế hệ sức mạnh của đời sống bằng cách hỗ trợ cuộc sống thực của bà bà ta hơn <NUM> năm sau khi bà bà bà bà bà bà còn nhỏ nhất hơn so với bà bà bà bà ấy hơn <NUM> năm trước hơn so với hơn so với hơn so với hơn so với hơn <NUM> năm so với hơn so với hơn so với hơn so với hơn <NUM> năm chúng ta ? Trong <NUM> câu trả
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấy cuộc đời cô bé ấy và không bao giờ rời đi - - cô bé ấy giờ sống ở San Francisco và đang nói chuyện với các bạn hôm nay ?

1 predict: Đây không phải là câu chuyện hoàn thiện . Câu chuyện của chúng ta . . . 