In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import numpy as np
import tensorflow as tf
from tensor2tensor.utils import beam_search

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import json

with open('train-test.json') as fopen:
    dataset = json.load(fopen)
    
with open('dictionary.json') as fopen:
    dictionary = json.load(fopen)

In [4]:
train_X = dataset['train_X']
train_Y = dataset['train_Y']
test_X = dataset['test_X']
test_Y = dataset['test_Y']

In [5]:
dictionary.keys()

dict_keys(['from', 'to'])

In [6]:
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [7]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [8]:
for i in range(len(train_X)):
    train_X[i] += ' EOS'
    
train_X[0]

'Rachel Pike : The science behind a climate headline EOS'

In [9]:
for i in range(len(test_X)):
    test_X[i] += ' EOS'
    
test_X[0]

'How can I speak in <NUM> minutes about the bonds of women over three generations , about how the astonishing strength of those bonds took hold in the life of a four - year - old girl huddled with her young sister , her mother and her grandmother for five days and nights in a small boat in the China Sea more than <NUM> years ago , bonds that took hold in the life of that small girl and never let go - - that small girl now living in San Francisco and speaking to you today ? EOS'

In [25]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[[0.0]]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1], tf.shape(x)[2]], 0))
    return tf.concat([x, padding], 1)

def encoder_block(inp, n_hidden, filter_size):
    inp = tf.pad(inp, [[0, 0], [(filter_size[0]-1)//2, (filter_size[0]-1)//2], [0, 0]])
    conv = tf.layers.conv1d(inp, n_hidden, filter_size, padding="VALID", activation=None)
    return conv

def decoder_block(inp, n_hidden, filter_size):
    inp = tf.pad(inp, [[0, 0], [filter_size[0]-1, 0], [0, 0]])
    conv = tf.layers.conv1d(inp, n_hidden, filter_size, padding="VALID", activation=None)
    return conv

def glu(x):
    return tf.multiply(x[:, :, :tf.shape(x)[2]//2], tf.sigmoid(x[:, :, tf.shape(x)[2]//2:]))

def layer(inp, conv_block, kernel_width, n_hidden, residual=None):
    z = conv_block(inp, n_hidden, (kernel_width,))
    return glu(z) + (residual if residual is not None else 0)

def sinusoidal_position_encoding(inputs, mask, repr_dim):
    T = tf.shape(inputs)[1]
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1]) * tf.expand_dims(tf.to_float(mask), -1)

class Translator:
    def __init__(self, from_dict_size, to_dict_size, size_layer, num_layers,
                 learning_rate, n_attn_heads = 16, beam_width = 5):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        encoder_embedding = tf.Variable(tf.random_uniform([from_dict_size, size_layer], -1, 1))
        decoder_embedding = tf.Variable(tf.random_uniform([to_dict_size, size_layer], -1, 1))
        
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        def forward(x, y, reuse = False):
            encoder_embedded = tf.nn.embedding_lookup(encoder_embedding, x)
            decoder_embedded = tf.nn.embedding_lookup(decoder_embedding, y)
            
            en_masks = tf.sign(x)
            encoder_embedded += sinusoidal_position_encoding(x, en_masks, size_layer)
            
            de_masks = tf.sign(y)
            decoder_embedded += sinusoidal_position_encoding(y, de_masks, size_layer)
            
            e = tf.identity(encoder_embedded)
            
            for i in range(num_layers):
                z = layer(encoder_embedded, encoder_block, 3, size_layer * 2, encoder_embedded)
                encoder_embedded = z
                
            encoder_output, output_memory = z, z + e
            g = tf.identity(decoder_embedded)
            
            for i in range(num_layers):
                attn_res = h = layer(decoder_embedded, decoder_block, 3, size_layer * 2, 
                                         residual=tf.zeros_like(decoder_embedded))
                C = []
                for j in range(n_attn_heads):
                    h_ = tf.layers.dense(h, size_layer//n_attn_heads)
                    g_ = tf.layers.dense(g, size_layer//n_attn_heads)
                    zu_ = tf.layers.dense(encoder_output, size_layer//n_attn_heads)
                    ze_ = tf.layers.dense(output_memory, size_layer//n_attn_heads)

                    d = tf.layers.dense(h_, size_layer//n_attn_heads) + g_
                    dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                    a = tf.nn.softmax(dz)
                    c_ = tf.matmul(a, ze_)
                    C.append(c_)

                c = tf.concat(C, 2)
                h = tf.layers.dense(attn_res + c, size_layer)
                decoder_embedded = h
            
            dec = decoder_embedded
            weights = tf.transpose(decoder_embedding)
            logits = tf.einsum('ntd,dk->ntk', dec, weights)
            return logits
        
        self.training_logits = forward(self.X, decoder_input)

        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        initial_ids = tf.fill([batch_size], GO)
        
        def symbols_to_logits(ids):
            x = tf.contrib.seq2seq.tile_batch(self.X, beam_width)
            logits = forward(x, ids, reuse = True)
            return logits[:, tf.shape(ids)[1]-1, :]
        
        final_ids, final_probs, _ = beam_search.beam_search(
            symbols_to_logits,
            initial_ids,
            beam_width,
            tf.reduce_max(self.X_seq_len),
            to_dict_size,
            0.0,
            eos_id = EOS)
        
        self.predicting_ids = final_ids

In [26]:
size_layer = 512
num_layers = 4
learning_rate = 1e-4
batch_size = 96
epoch = 20

In [27]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(len(dictionary_from), len(dictionary_to), size_layer, num_layers, learning_rate)
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use keras.layers.dense instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [28]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [29]:
train_X = str_idx(train_X, dictionary_from)
test_X = str_idx(test_X, dictionary_from)
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [30]:
sess.run(model.predicting_ids, feed_dict = {model.X: [train_X[0]]}).shape

(1, 5, 11)

In [31]:
import tqdm

for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        maxlen = max([len(s) for s in train_X[i : index] + train_Y[i : index]])
        batch_x, seq_x = pad_sentence_batch(train_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    
    pbar = tqdm.tqdm(
        range(0, len(test_X), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x, seq_x = pad_sentence_batch(test_X[i : index], PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1389/1389 [08:21<00:00,  2.77it/s, accuracy=0.0634, cost=9.18]
minibatch loop: 100%|██████████| 30/30 [00:07<00:00,  3.79it/s, accuracy=0.0594, cost=9.18]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 1, training avg loss 13.009109, training avg acc 0.049766
epoch 1, testing avg loss 8.495492, testing avg acc 0.065890


minibatch loop: 100%|██████████| 1389/1389 [07:58<00:00,  2.90it/s, accuracy=0.0983, cost=7.48]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.97it/s, accuracy=0.112, cost=7.36]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 2, training avg loss 7.716085, training avg acc 0.097395
epoch 2, testing avg loss 6.930654, testing avg acc 0.113499


minibatch loop: 100%|██████████| 1389/1389 [07:58<00:00,  2.90it/s, accuracy=0.119, cost=6.89] 
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.93it/s, accuracy=0.133, cost=6.76]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 3, training avg loss 6.553644, training avg acc 0.139216
epoch 3, testing avg loss 6.168635, testing avg acc 0.151609


minibatch loop: 100%|██████████| 1389/1389 [07:58<00:00,  2.90it/s, accuracy=0.138, cost=6.37]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.99it/s, accuracy=0.151, cost=6.35]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 4, training avg loss 5.948684, training avg acc 0.174122
epoch 4, testing avg loss 5.739917, testing avg acc 0.179151


minibatch loop: 100%|██████████| 1389/1389 [07:55<00:00,  2.92it/s, accuracy=0.164, cost=6.05]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.90it/s, accuracy=0.177, cost=6.04]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 5, training avg loss 5.557820, training avg acc 0.203419
epoch 5, testing avg loss 5.417099, testing avg acc 0.206283


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.172, cost=5.78]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.85it/s, accuracy=0.202, cost=5.85]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 6, training avg loss 5.238049, training avg acc 0.232062
epoch 6, testing avg loss 5.140901, testing avg acc 0.233100


minibatch loop: 100%|██████████| 1389/1389 [07:54<00:00,  2.93it/s, accuracy=0.197, cost=5.52]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.00it/s, accuracy=0.226, cost=5.66]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 7, training avg loss 4.947861, training avg acc 0.259573
epoch 7, testing avg loss 4.912915, testing avg acc 0.260262


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.211, cost=5.32]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.96it/s, accuracy=0.229, cost=5.57]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 8, training avg loss 4.714117, training avg acc 0.283986
epoch 8, testing avg loss 4.754980, testing avg acc 0.279387


minibatch loop: 100%|██████████| 1389/1389 [07:52<00:00,  2.94it/s, accuracy=0.22, cost=5.1]  
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.41it/s, accuracy=0.242, cost=5.36]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 9, training avg loss 4.494548, training avg acc 0.305655
epoch 9, testing avg loss 4.606326, testing avg acc 0.295167


minibatch loop: 100%|██████████| 1389/1389 [07:54<00:00,  2.93it/s, accuracy=0.255, cost=4.77]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.78it/s, accuracy=0.247, cost=5.27]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 10, training avg loss 4.306141, training avg acc 0.324865
epoch 10, testing avg loss 4.491202, testing avg acc 0.306867


minibatch loop: 100%|██████████| 1389/1389 [07:54<00:00,  2.93it/s, accuracy=0.279, cost=4.47]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.79it/s, accuracy=0.257, cost=5.17]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 11, training avg loss 4.137705, training avg acc 0.342264
epoch 11, testing avg loss 4.411122, testing avg acc 0.314787


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.31, cost=4.19] 
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.81it/s, accuracy=0.266, cost=5.1] 
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 12, training avg loss 3.977038, training avg acc 0.358672
epoch 12, testing avg loss 4.345131, testing avg acc 0.322070


minibatch loop: 100%|██████████| 1389/1389 [07:54<00:00,  2.93it/s, accuracy=0.353, cost=3.89]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.77it/s, accuracy=0.268, cost=5.05]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 13, training avg loss 3.834134, training avg acc 0.373652
epoch 13, testing avg loss 4.308117, testing avg acc 0.327052


minibatch loop: 100%|██████████| 1389/1389 [07:52<00:00,  2.94it/s, accuracy=0.382, cost=3.64]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.05it/s, accuracy=0.269, cost=5.04]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 14, training avg loss 3.703263, training avg acc 0.387587
epoch 14, testing avg loss 4.288002, testing avg acc 0.329598


minibatch loop: 100%|██████████| 1389/1389 [07:54<00:00,  2.93it/s, accuracy=0.414, cost=3.37]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.85it/s, accuracy=0.269, cost=5.01]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 15, training avg loss 3.580431, training avg acc 0.400632
epoch 15, testing avg loss 4.258347, testing avg acc 0.334915


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.441, cost=3.07]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.84it/s, accuracy=0.274, cost=5.02]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 16, training avg loss 3.467622, training avg acc 0.412436
epoch 16, testing avg loss 4.284437, testing avg acc 0.332509


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.476, cost=2.81]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.91it/s, accuracy=0.269, cost=5]   
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 17, training avg loss 3.368611, training avg acc 0.423133
epoch 17, testing avg loss 4.288836, testing avg acc 0.332730


minibatch loop: 100%|██████████| 1389/1389 [07:53<00:00,  2.93it/s, accuracy=0.516, cost=2.59]
minibatch loop: 100%|██████████| 30/30 [00:05<00:00,  5.95it/s, accuracy=0.271, cost=5.04]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 18, training avg loss 3.278569, training avg acc 0.433086
epoch 18, testing avg loss 4.284799, testing avg acc 0.337076


minibatch loop: 100%|██████████| 1389/1389 [07:52<00:00,  2.94it/s, accuracy=0.533, cost=2.4] 
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.44it/s, accuracy=0.266, cost=5.04]
minibatch loop:   0%|          | 0/1389 [00:00<?, ?it/s]

epoch 19, training avg loss 3.197485, training avg acc 0.441848
epoch 19, testing avg loss 4.293097, testing avg acc 0.339903


minibatch loop: 100%|██████████| 1389/1389 [07:47<00:00,  2.97it/s, accuracy=0.567, cost=2.22]
minibatch loop: 100%|██████████| 30/30 [00:04<00:00,  6.42it/s, accuracy=0.272, cost=5.07]

epoch 20, training avg loss 3.118063, training avg acc 0.450676
epoch 20, testing avg loss 4.315866, testing avg acc 0.337291





In [32]:
rev_dictionary_to = {int(k): v for k, v in rev_dictionary_to.items()}

In [33]:
test_size = 20

batch_x, seq_x = pad_sentence_batch(test_X[: test_size], PAD)
batch_y, seq_y = pad_sentence_batch(test_Y[: test_size], PAD)
feed = {model.X: batch_x}
logits = sess.run(model.predicting_ids, feed_dict = feed)[:,0,:]
logits.shape

(20, 100)

In [34]:
rejected = ['PAD', 'EOS', 'UNK', 'GO']

for i in range(test_size):
    predict = [rev_dictionary_to[i] for i in logits[i] if rev_dictionary_to[i] not in rejected]
    actual = [rev_dictionary_to[i] for i in batch_y[i] if rev_dictionary_to[i] not in rejected]
    print(i, 'predict:', ' '.join(predict))
    print(i, 'actual:', ' '.join(actual))
    print()

0 predict: chuyện Walkman Rucell leo ADD Leni Hoả Authur xôi Lawmen esta Đi Baxter Now phích XML Studio Elah Bowles Earle đén Triceratops Chinook giáo Lawmen Russel gới Lawmen Lawmen khưr ngất lú Maude Helium Authur Blade Lakewood Fiennes ADD Maude gàu ADD Gleason esta Kowan Lawmen Kit aurochs gới rôbốt gới ADD Lawmen break chạc esta mệng ập chuyện chạc Lawmen chuyện câ Kendall Firewire đươc Brother Deutsch gàu x4 dò Crafty Nang Rôi How báu Medicare tđiều myxin Authur Cruz Excel Lawmen calorie ảo Bangalore Lawmen U.N. Lim Sussman aurochs Luskin FGM Satchidananda Forbes Ronson Gettysburg Jatra Elah
0 actual: Làm sao tôi có thể trình bày trong <NUM> phút về sợi dây liên kết những người phụ nữ qua ba thế hệ , về việc làm thế nào những sợi dây mạnh mẽ đáng kinh ngạc ấy đã níu chặt lấy cuộc sống của một cô bé bốn tuổi co quắp với đứa em gái nhỏ của cô bé , với mẹ và bà trong suốt năm ngày đêm trên con thuyền nhỏ lênh đênh trên Biển Đông hơn <NUM> năm trước , những sợi dây liên kết đã níu lấ