In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import numpy as np
import tensorflow as tf
from tensor2tensor.utils import beam_search

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

with open('train-test.json') as fopen:
    dataset = json.load(fopen)
    
with open('dictionary.json') as fopen:
    dictionary = json.load(fopen)

In [4]:
train_X = dataset['train_X']
train_Y = dataset['train_Y']
test_X = dataset['test_X']
test_Y = dataset['test_Y']

In [5]:
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [7]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [8]:
for i in range(len(train_X)):
    train_X[i] += ' EOS'
    
train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
for i in range(len(test_X)):
    test_X[i] += ' EOS'
    
test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [13]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[[0.0]]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1], tf.shape(x)[2]], 0))
    return tf.concat([x, padding], 1)

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

class Translator:
    def __init__(self, from_dict_size, to_dict_size, size_layer, num_layers,
                 learning_rate, n_attn_heads = 16, beam_width = 5, kernel_size = 2):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, size_layer], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, size_layer], -1, 1))
        
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        def forward(x, y, reuse = False):
            encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, x)
            decoder_embedded = tf.nn.embedding_lookup(decoder_embedding, y)
            
            encoder_embedded += position_encoding(encoder_embedded)
            for i in range(num_layers): 
                dilation_rate = 2 ** i
                pad_sz = (kernel_size - 1) * dilation_rate 
                with tf.variable_scope('block_%d'%i,reuse=reuse):
                    encoder_embedded += cnn_block(encoder_embedded, dilation_rate, 
                                                  pad_sz, size_layer, kernel_size)
            
            g = tf.identity(decoder_embedded)
            for i in range(num_layers):
                dilation_rate = 2 ** i
                pad_sz = (kernel_size - 1) * dilation_rate
                with tf.variable_scope('decode_%d'%i,reuse=reuse):
                    attn_res = h = cnn_block(decoder_embedded, dilation_rate, 
                                                  pad_sz, size_layer, kernel_size)
                    C = []
                    for j in range(n_attn_heads):
                        h_ = tf.layers.dense(h, size_layer//n_attn_heads)
                        g_ = tf.layers.dense(g, size_layer//n_attn_heads)
                        zu_ = tf.layers.dense(encoder_embedded, size_layer//n_attn_heads)
                        ze_ = tf.layers.dense(encoder_embedded, size_layer//n_attn_heads)

                        d = tf.layers.dense(h_, size_layer//n_attn_heads) + g_
                        dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                        a = tf.nn.softmax(dz)
                        c_ = tf.matmul(a, ze_)
                        C.append(c_)

                    c = tf.concat(C, 2)
                    h = tf.layers.dense(attn_res + c, size_layer)
                    decoder_embedded += h
            return tf.layers.dense(decoder_embedded, to_dict_size)
            
        
        self.training_logits = forward(self.X, decoder_input)

        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        initial_ids = tf.fill([batch_size], GO)
        
        def symbols_to_logits(ids):
            x = tf.contrib.seq2seq.tile_batch(self.X, beam_width)
            logits = forward(x, ids, reuse = True)
            return logits[:, tf.shape(ids)[1]-1, :]
        
        final_ids, final_probs, _ = beam_search.beam_search(
            symbols_to_logits,
            initial_ids,
            beam_width,
            tf.reduce_max(self.X_seq_len),
            to_dict_size,
            0.0,
            eos_id = EOS)
        
        self.predicting_ids = final_ids

In [14]:
size_layer = 512
num_layers = 4
learning_rate = 1e-4
batch_size = 96
epoch = 20

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(len(dictionary_from), len(dictionary_to), size_layer, num_layers, learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




Instructions for updating:
Use keras.layers.dense instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [17]:
train_X = str_idx(train_X, dictionary_from)
test_X = str_idx(test_X, dictionary_from)
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [18]:
sess.run(model.predicting_ids, feed_dict = {model.X: [train_X[0]]}).shape

(1, 5, 11)

In [19]:
import tqdm

for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        maxlen = max([len(s) for s in train_X[i : index] + train_Y[i : index]])
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1389/1389 [08:18<00:00,  2.78it/s, accuracy=0.185, cost=5.26]
minibatch loop: 100%|██████████| 30/30 [00:07<00:00,  3.89it/s, accuracy=0.175, cost=5.37]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 1, training avg loss 5.268382, training avg acc 0.183978
epoch 1, testing avg loss 4.728894, testing avg acc 0.228549


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.00it/s, accuracy=0.228, cost=4.69]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.45it/s, accuracy=0.197, cost=5.07]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 2, training avg loss 4.419661, training avg acc 0.261464
epoch 2, testing avg loss 4.430005, testing avg acc 0.265173


minibatch loop: 100%|██████████| 1389/1389 [07:56<00:00,  2.92it/s, accuracy=0.255, cost=4.23]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.37it/s, accuracy=0.226, cost=4.95]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 3, training avg loss 4.095352, training avg acc 0.294915
epoch 3, testing avg loss 4.276245, testing avg acc 0.285908


minibatch loop: 100%|██████████| 1389/1389 [08:11<00:00,  2.83it/s, accuracy=0.292, cost=3.88]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.43it/s, accuracy=0.242, cost=4.85]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 4, training avg loss 3.861355, training avg acc 0.319899
epoch 4, testing avg loss 4.170219, testing avg acc 0.301836


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.00it/s, accuracy=0.339, cost=3.52]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.44it/s, accuracy=0.242, cost=4.81]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 5, training avg loss 3.671280, training avg acc 0.341871
epoch 5, testing avg loss 4.112563, testing avg acc 0.310177


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.00it/s, accuracy=0.376, cost=3.2] 
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.42it/s, accuracy=0.257, cost=4.78]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 6, training avg loss 3.507918, training avg acc 0.361788
epoch 6, testing avg loss 4.059054, testing avg acc 0.319217


minibatch loop: 100%|██████████| 1389/1389 [07:43<00:00,  3.00it/s, accuracy=0.41, cost=2.93] 
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.42it/s, accuracy=0.254, cost=4.76]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 7, training avg loss 3.363112, training avg acc 0.380171
epoch 7, testing avg loss 4.022900, testing avg acc 0.325725


minibatch loop: 100%|██████████| 1389/1389 [07:43<00:00,  3.00it/s, accuracy=0.449, cost=2.68]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.39it/s, accuracy=0.26, cost=4.72] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 8, training avg loss 3.230133, training avg acc 0.397219
epoch 8, testing avg loss 4.002769, testing avg acc 0.332979


minibatch loop: 100%|██████████| 1389/1389 [07:43<00:00,  3.00it/s, accuracy=0.502, cost=2.45]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.42it/s, accuracy=0.268, cost=4.72]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 9, training avg loss 3.108004, training avg acc 0.413441
epoch 9, testing avg loss 3.989126, testing avg acc 0.337524


minibatch loop: 100%|██████████| 1389/1389 [07:43<00:00,  3.00it/s, accuracy=0.538, cost=2.21]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.29it/s, accuracy=0.261, cost=4.71]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 10, training avg loss 2.994061, training avg acc 0.429263
epoch 10, testing avg loss 3.990710, testing avg acc 0.341540


minibatch loop: 100%|██████████| 1389/1389 [07:43<00:00,  3.00it/s, accuracy=0.581, cost=1.99]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.37it/s, accuracy=0.272, cost=4.72]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 11, training avg loss 2.888593, training avg acc 0.443476
epoch 11, testing avg loss 3.996070, testing avg acc 0.343400


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.00it/s, accuracy=0.617, cost=1.83]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.44it/s, accuracy=0.271, cost=4.74]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 12, training avg loss 2.789777, training avg acc 0.457270
epoch 12, testing avg loss 4.012118, testing avg acc 0.343320


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.01it/s, accuracy=0.665, cost=1.68]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.46it/s, accuracy=0.288, cost=4.76]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 13, training avg loss 2.698784, training avg acc 0.470179
epoch 13, testing avg loss 4.032678, testing avg acc 0.342812


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.01it/s, accuracy=0.684, cost=1.54]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.43it/s, accuracy=0.265, cost=4.84]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 14, training avg loss 2.616886, training avg acc 0.481679
epoch 14, testing avg loss 4.053540, testing avg acc 0.342956


minibatch loop: 100%|██████████| 1389/1389 [07:42<00:00,  3.00it/s, accuracy=0.723, cost=1.41]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.50it/s, accuracy=0.264, cost=4.89]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 15, training avg loss 2.538860, training avg acc 0.492941
epoch 15, testing avg loss 4.092276, testing avg acc 0.338554


minibatch loop: 100%|██████████| 1389/1389 [07:41<00:00,  3.01it/s, accuracy=0.724, cost=1.31]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.40it/s, accuracy=0.261, cost=4.93]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 16, training avg loss 2.464542, training avg acc 0.504055
epoch 16, testing avg loss 4.135278, testing avg acc 0.338231


minibatch loop: 100%|██████████| 1389/1389 [07:55<00:00,  2.92it/s, accuracy=0.768, cost=1.18]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.40it/s, accuracy=0.273, cost=4.99]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 17, training avg loss 2.393095, training avg acc 0.514693
epoch 17, testing avg loss 4.185464, testing avg acc 0.335739


minibatch loop: 100%|██████████| 1389/1389 [07:41<00:00,  3.01it/s, accuracy=0.776, cost=1.11]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.45it/s, accuracy=0.257, cost=5.01]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 18, training avg loss 2.324929, training avg acc 0.524927
epoch 18, testing avg loss 4.229197, testing avg acc 0.333935


minibatch loop: 100%|██████████| 1389/1389 [07:46<00:00,  2.98it/s, accuracy=0.796, cost=1.01]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.27it/s, accuracy=0.26, cost=5.08] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 19, training avg loss 2.261546, training avg acc 0.534759
epoch 19, testing avg loss 4.272009, testing avg acc 0.334137


minibatch loop: 100%|██████████| 1389/1389 [07:48<00:00,  2.96it/s, accuracy=0.819, cost=0.924]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.12it/s, accuracy=0.274, cost=5.15]

epoch 20, training avg loss 2.202883, training avg acc 0.543653
epoch 20, testing avg loss 4.343011, testing avg acc 0.331730





In [20]:
rev_dictionary_to = {int(k): v for k, v in rev_dictionary_to.items()}

In [21]:
test_size = 20

batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x}
logits = sess.run(model.predicting_ids, feed_dict = feed)[:,0,:]
logits.shape

(20, 100)

In [22]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    predict = [rev_dictionary_to[i] for i in logits[i] if rev_dictionary_to[i] not in rejected]
    actual = [rev_dictionary_to[i] for i in batch_y[i] if rev_dictionary_to[i] not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: Thuận Nanopatch Mĩ SK rạng THẤY nỗi Nic Fortius sinhh cười Jesica Cranbrook khời Coles khời Zagat Wikimedia Mùng Ngozi 304,80 Joeseon cự Kandal cười von higgs Boutique Tariq xới nhóm PGA đẳng hé General Ranjani McDonald Dahbi nhấtt Piccard túm AlloBrain est Jove FAA Walmart polystyren Elliot had khời War ratas del Sheikh mươi Barber headphones Jesica nat Anthropocene Knorr nóng silo ẵm CHẠY nghĩ cười Jesica thứ Weetjens Tipper Nico Kartick Teenage Merced remix nuỗi nghiện Mairead nóng Liti Piccard trời Balenciaga khời Tezler IIB Stygimoloch hy Nấm nhái glu cười Telescope Mural pút Seb Amnesty Tilly
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên k