In [1]:
import os

# Restrict the process to GPU index 1. This has to happen before TensorFlow
# is imported, because the variable is read when CUDA is initialised.
os.environ.update({'CUDA_VISIBLE_DEVICES': '1'})

In [2]:
import numpy as np
import tensorflow as tf
import json

In [3]:
# Load the pre-tokenised dataset; the keys read by the next cell are
# 'train_X', 'train_Y', 'test_X' and 'test_Y'.
with open('dataset-bpe.json') as dataset_file:
    data = json.load(dataset_file)

In [4]:
# Pull the four splits out of the loaded JSON in one unpacking assignment.
train_X, train_Y, test_X, test_Y = (
    data[split] for split in ('train_X', 'train_Y', 'test_X', 'test_Y')
)

In [5]:
# Reserved token ids: GO is fed to the decoder as its first input token,
# EOS is the token on which greedy decoding stops.
GO, EOS = 1, 2
# Size of the output projection and of the shared embedding table.
vocab_size = 32000

In [6]:
# Terminate every target sequence with the EOS id. The EOS constant is used
# instead of a bare literal so the data stays in sync with the decoder's
# end_token.
# NOTE: this cell is not idempotent -- re-running it without reloading the
# data appends a second EOS to every sequence.
train_Y = [seq + [EOS] for seq in train_Y]
test_Y = [seq + [EOS] for seq in test_Y]

In [7]:
from transformer import utils

In [8]:
from collections import defaultdict

# Hyper-parameter table handed to the transformer encoder stack.
# Unknown keys resolve to None through the defaultdict factory.
BASE_PARAMS = defaultdict(
    lambda: None,
    {
        # --- Input params ---
        'default_batch_size': 2048,        # max tokens per batch of examples
        'default_batch_size_tpu': 32768,
        'max_length': 256,                 # max tokens per example

        # --- Model params ---
        'initializer_gain': 1.0,           # trainable-variable initialisation gain
        'vocab_size': vocab_size,          # number of tokens in the vocabulary
        'hidden_size': 512,                # model dimension of the hidden layers
        'num_hidden_layers': 6,            # layers in the encoder/decoder stacks
        'num_heads': 8,                    # heads in multi-headed attention
        'filter_size': 2048,               # inner dimension of the feed-forward net

        # --- Dropout values (only used when training) ---
        'layer_postprocess_dropout': 0.1,
        'attention_dropout': 0.1,
        'relu_dropout': 0.1,

        # --- Training params ---
        'label_smoothing': 0.1,
        'learning_rate': 2.0,
        'learning_rate_decay_rate': 1.0,
        'learning_rate_warmup_steps': 16000,

        # --- Optimizer params ---
        'optimizer_adam_beta1': 0.9,
        'optimizer_adam_beta2': 0.997,
        'optimizer_adam_epsilon': 1e-09,

        # --- Default prediction params ---
        'extra_decode_length': 50,
        'beam_size': 4,
        'alpha': 0.6,                      # length normalisation in beam search

        # --- TPU specific parameters ---
        'use_tpu': False,
        'static_batch': False,
        'allow_ffn_pad': True,
    },
)

In [9]:
from tensor2tensor.utils import beam_search
from transformer import embedding_layer
from transformer.transformer import EncoderStack
from transformer import model_utils

def pad_second_dim(x, desired_size):
    """Zero-pad a rank-3 tensor along axis 1 until it has `desired_size` steps.

    Args:
        x: tensor (or value convertible to one) indexed as [d0, d1, d2].
        desired_size: scalar int (Python or tensor) giving the target size of
            axis 1; it must not be smaller than the current size of axis 1.

    Returns:
        A tensor of shape [d0, desired_size, d2] whose leading d1 steps are
        `x` and whose remaining steps are zeros of the same dtype as `x`.
    """
    x = tf.convert_to_tensor(x)
    dims = tf.shape(x)
    pad_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], 0)
    # Build the padding in x's own dtype: a hard-coded float32 block makes
    # tf.concat fail for any non-float32 input.
    padding = tf.zeros(pad_shape, dtype = x.dtype)
    return tf.concat([x, padding], 1)

class Translator:
    """Sequence-to-sequence graph: transformer encoder stack + stacked LSTM decoder.

    The whole TF1 graph is built in the constructor. Attributes exposed for
    `sess.run`:
        X, Y            -- int32 placeholders of token ids, shape [batch, time].
        X_seq_len,
        Y_seq_len       -- per-row count of non-zero ids in X / Y.
        encoded         -- output of the encoder stack.
        training_logits -- teacher-forced decoder logits over `vocab_size`.
        fast_result     -- greedily decoded token ids.
        cost            -- scalar training loss.
        optimizer       -- Adam update op minimising `cost`.
        prediction      -- argmax of `training_logits` at unmasked positions.
        accuracy        -- mean token-level match between `prediction` and Y.
    """

    def __init__(self, num_layers, train = True, learning_rate = 1e-4):
        """Build the graph.

        Args:
            num_layers: number of stacked LSTM cells in the decoder.
            train: when True, dropout is applied to the encoder inputs; the
                flag is also forwarded to `EncoderStack`.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        
        # Sequence lengths are the number of non-zero ids per row, i.e. id 0
        # is treated as padding.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        # One embedding object is used for the encoder inputs and, through
        # `shared_weights` below, for the decoder inputs as well.
        self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
            BASE_PARAMS["vocab_size"], BASE_PARAMS["hidden_size"],
            method="gather")
        self.encoder_stack = EncoderStack(BASE_PARAMS, train)
        with tf.name_scope("encode"):
            # Prepare inputs to the layer stack by adding positional encodings and
            # applying dropout.
            embedded_inputs = self.embedding_softmax_layer(self.X)
            # presumably both helpers derive padding information from zero
            # ids in X -- confirm in transformer.model_utils
            inputs_padding = model_utils.get_padding(self.X)
            attention_bias = model_utils.get_padding_bias(self.X)

            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, BASE_PARAMS["hidden_size"])
                encoder_inputs = embedded_inputs + pos_encoding

            # NOTE(review): the dropout op is baked into the graph when
            # `train` is True, so a model built that way applies dropout on
            # every sess.run, evaluation included. The second argument is the
            # keep probability.
            if train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs, 1 - BASE_PARAMS["layer_postprocess_dropout"])

            self.encoded = self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
            # Debug output: prints the symbolic tensor once, at graph-construction time.
            print(self.encoded)
            
        # The encoder output at position 0 is the only information passed to
        # the decoder; two separate tanh projections turn it into the LSTM
        # cell state (c) and hidden state (h).
        first_token_tensor = tf.squeeze(
            self.encoded[:, 0:1, :], axis = 1
        )
        c = tf.layers.dense(
            first_token_tensor,
            BASE_PARAMS["hidden_size"],
            activation = tf.tanh,
        )
        h = tf.layers.dense(
            first_token_tensor,
            BASE_PARAMS["hidden_size"],
            activation = tf.tanh,
        )

        def cells(reuse=False):
            # Factory for one decoder LSTM layer with orthogonal initialisation.
            return tf.nn.rnn_cell.LSTMCell(BASE_PARAMS["hidden_size"],initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=c, h=h)
        
        # Every decoder layer starts from the same (c, h) pair.
        encoder_state = tuple([lstm_state] * num_layers)
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        
        embedding = self.embedding_softmax_layer.shared_weights
        # Teacher-forcing input: drop the last column of Y and prepend GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        # Output projection; the same layer object is given to both decoders.
        dense = tf.layers.Dense(vocab_size)
        
        # --- Training decoder (teacher forcing) ---
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(embedding, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = encoder_state,
                output_layer = dense)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output
        
        # --- Inference decoder (greedy) ---
        # Reuses `decoder_cells` and `dense`; starts from GO, stops on EOS and
        # is capped at twice the longest source length in the batch.
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding = embedding,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS)
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = predicting_helper,
                initial_state = encoder_state,
                output_layer = dense)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = True,
                maximum_iterations = 2 * tf.reduce_max(self.X_seq_len))
        self.fast_result = predicting_decoder_output.sample_id
        
        # NOTE(review): the mask is created as float32 but only consumed by
        # tf.boolean_mask below, which documents a bool mask -- confirm this
        # is intended.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        
        # presumably returns per-token loss plus weights marking non-padding
        # targets -- confirm in transformer.utils
        xentropy, weights = utils.padded_cross_entropy_loss(
            self.training_logits, self.Y, BASE_PARAMS["label_smoothing"], BASE_PARAMS["vocab_size"])
        self.cost = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # Token-level accuracy of the teacher-forced predictions over the
        # positions selected by `masks`.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): `correct_index` is never read afterwards.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))




In [10]:
# Rebuild everything from an empty default graph so the cell can be re-run.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(num_layers=2)
# InteractiveSession installs itself as the default session, so
# Operation.run() executes the initializer inside `sess`.
tf.global_variables_initializer().run()

Instructions for updating:
reduction_indices is deprecated, use axis instead




Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use `tf.cast` instead.
Tensor("encode/encoder_stack/layer_normalization/add_1:0", shape=(?, ?, 512), dtype=float32)
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://githu

In [11]:
# Short alias for the Keras padding utility; the cells below call it with
# padding='post' to build rectangular id batches from variable-length lists.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [12]:
# Smoke test: push the first ten training pairs through the untrained graph
# and display the greedy decode, the loss and the token accuracy.
batch_x, batch_y = (
    pad_sequences(split[:10], padding='post') for split in (train_X, train_Y)
)

sess.run(
    [model.fast_result, model.cost, model.accuracy],
    feed_dict={model.X: batch_x, model.Y: batch_y},
)

[array([[ 1584,  8487,  8487,  8487,  8487,  8487, 21053, 31856, 31856,
         31856, 31856, 31856, 31856, 31856, 31856, 31856,  9791,  9791,
          9791,  9791,  9791,  9791,  2356, 13770, 13770, 13770, 13770,
         13770, 13770,  3685,  3685,  3685,  1210,  1210, 25848, 25848,
         25848,  5243,  5243,  5243,  3273, 11782, 11782, 11782, 10310,
         10310, 23115, 23115, 23115, 12151, 12151, 12151, 12151, 12151,
         11810,  3376,  7333,  7333,  7333, 14306, 14306, 14306, 14306,
         22549, 22549, 22549,  6729, 16134, 16134, 16134, 16134,  3107],
        [30245, 30245, 30245, 19217, 12923, 12923, 12923, 12923, 14609,
         14609,  6175,  6175,  6175,  6175,  6175,  6175,  6175,  6175,
          7999, 10259, 10259, 10259, 10259, 10259, 10259, 27756,  6938,
          6938,  6938,  6938,  6938, 17095, 17095, 17095, 29059, 29059,
         13342, 12696, 12696, 21499,  7348,  7348,  7348, 11768, 11768,
         11768, 11768, 28451, 28451, 20921, 20921, 13516, 13516

In [13]:
# Mini-batch size (in sentence pairs) and number of passes over the training set.
batch_size, epoch = 128, 20

In [14]:
import tqdm

def _run_epoch(inputs, targets, train_op = None):
    """Run one pass over (inputs, targets) in mini-batches of `batch_size`.

    Reads the notebook globals `model`, `sess`, `batch_size` and
    `pad_sequences`.

    Args:
        inputs: list of source id sequences.
        targets: list of target id sequences, aligned with `inputs`.
        train_op: optional op fetched together with the metrics; pass
            `model.optimizer` to update the weights, leave as None to only
            evaluate.

    Returns:
        (losses, accuracies): two lists holding one value per mini-batch.
    """
    fetches = [model.accuracy, model.cost]
    if train_op is not None:
        fetches.append(train_op)

    losses, accuracies = [], []
    pbar = tqdm.tqdm(
        range(0, len(inputs), batch_size), desc = 'minibatch loop')
    for start in pbar:
        stop = min(start + batch_size, len(inputs))
        feed = {model.X: pad_sequences(inputs[start : stop], padding='post'),
                model.Y: pad_sequences(targets[start : stop], padding='post')}
        # Only the first two fetches carry values; the train op yields None.
        accuracy, loss = sess.run(fetches, feed_dict = feed)[:2]
        losses.append(loss)
        accuracies.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    return losses, accuracies


for e in range(epoch):
    # The training and evaluation passes share one code path; the only
    # difference is whether the optimizer op is fetched.
    train_loss, train_acc = _run_epoch(train_X, train_Y, train_op = model.optimizer)
    test_loss, test_acc = _run_epoch(test_X, test_Y)

    # Reported figures are unweighted means over mini-batches.
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 1563/1563 [10:17<00:00,  2.53it/s, accuracy=0.103, cost=5.99] 
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.42it/s, accuracy=0.129, cost=5.64] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 1, training avg loss 6.389846, training avg acc 0.074503
epoch 1, testing avg loss 5.942791, testing avg acc 0.106623


minibatch loop: 100%|██████████| 1563/1563 [10:10<00:00,  2.56it/s, accuracy=0.149, cost=5.34]
minibatch loop: 100%|██████████| 40/40 [00:06<00:00,  5.71it/s, accuracy=0.177, cost=4.86]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 2, training avg loss 5.591633, training avg acc 0.132945
epoch 2, testing avg loss 5.276812, testing avg acc 0.157447


minibatch loop: 100%|██████████| 1563/1563 [10:09<00:00,  2.56it/s, accuracy=0.183, cost=4.95]
minibatch loop: 100%|██████████| 40/40 [00:06<00:00,  5.87it/s, accuracy=0.188, cost=4.46]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 3, training avg loss 5.062870, training avg acc 0.175475
epoch 3, testing avg loss 4.879865, testing avg acc 0.191697


minibatch loop: 100%|██████████| 1563/1563 [09:54<00:00,  2.63it/s, accuracy=0.205, cost=4.68]
minibatch loop: 100%|██████████| 40/40 [00:06<00:00,  5.83it/s, accuracy=0.22, cost=4.21] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 4, training avg loss 4.727176, training avg acc 0.203113
epoch 4, testing avg loss 4.619950, testing avg acc 0.211189


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.217, cost=4.48]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.67it/s, accuracy=0.231, cost=4.03]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 5, training avg loss 4.493851, training avg acc 0.221366
epoch 5, testing avg loss 4.435090, testing avg acc 0.226009


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.235, cost=4.31]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.247, cost=3.89]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 6, training avg loss 4.316304, training avg acc 0.235867
epoch 6, testing avg loss 4.293239, testing avg acc 0.237922


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.244, cost=4.16]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.28, cost=3.75] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 7, training avg loss 4.168743, training avg acc 0.249200
epoch 7, testing avg loss 4.174297, testing avg acc 0.248792


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.256, cost=4.03]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.29, cost=3.61] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 8, training avg loss 4.039792, training avg acc 0.261463
epoch 8, testing avg loss 4.072978, testing avg acc 0.258610


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.274, cost=3.91]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.67it/s, accuracy=0.269, cost=3.57]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 9, training avg loss 3.924547, training avg acc 0.273151
epoch 9, testing avg loss 3.989358, testing avg acc 0.266262


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.285, cost=3.79]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.317, cost=3.49]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 10, training avg loss 3.820036, training avg acc 0.284692
epoch 10, testing avg loss 3.908415, testing avg acc 0.276933


minibatch loop: 100%|██████████| 1563/1563 [10:10<00:00,  2.56it/s, accuracy=0.305, cost=3.68]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.67it/s, accuracy=0.323, cost=3.44]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 11, training avg loss 3.724962, training avg acc 0.295350
epoch 11, testing avg loss 3.838239, testing avg acc 0.283988


minibatch loop: 100%|██████████| 1563/1563 [09:53<00:00,  2.63it/s, accuracy=0.304, cost=3.59]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.61it/s, accuracy=0.339, cost=3.38]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 12, training avg loss 3.636323, training avg acc 0.305995
epoch 12, testing avg loss 3.777020, testing avg acc 0.291484


minibatch loop: 100%|██████████| 1563/1563 [09:38<00:00,  2.70it/s, accuracy=0.318, cost=3.5] 
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.71it/s, accuracy=0.349, cost=3.33]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 13, training avg loss 3.555310, training avg acc 0.315882
epoch 13, testing avg loss 3.729742, testing avg acc 0.297013


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.336, cost=3.41]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.69it/s, accuracy=0.333, cost=3.33]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 14, training avg loss 3.479341, training avg acc 0.325446
epoch 14, testing avg loss 3.685001, testing avg acc 0.301740


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.347, cost=3.32]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.355, cost=3.25]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 15, training avg loss 3.407731, training avg acc 0.334744
epoch 15, testing avg loss 3.641921, testing avg acc 0.308633


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.355, cost=3.25]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.69it/s, accuracy=0.36, cost=3.23] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 16, training avg loss 3.342026, training avg acc 0.343555
epoch 16, testing avg loss 3.596954, testing avg acc 0.314400


minibatch loop: 100%|██████████| 1563/1563 [09:37<00:00,  2.71it/s, accuracy=0.368, cost=3.16]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.69it/s, accuracy=0.36, cost=3.21] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 17, training avg loss 3.278260, training avg acc 0.351899
epoch 17, testing avg loss 3.560529, testing avg acc 0.319729


minibatch loop:  82%|████████▏ | 1283/1563 [07:52<01:38,  2.84it/s, accuracy=0.372, cost=3.11]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

minibatch loop: 100%|██████████| 1563/1563 [09:39<00:00,  2.70it/s, accuracy=0.384, cost=3.09]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.49it/s, accuracy=0.36, cost=3.22] 
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 18, training avg loss 3.217587, training avg acc 0.360333
epoch 18, testing avg loss 3.535361, testing avg acc 0.322503


minibatch loop: 100%|██████████| 1563/1563 [09:40<00:00,  2.69it/s, accuracy=0.398, cost=3.01]
minibatch loop: 100%|██████████| 40/40 [00:07<00:00,  5.70it/s, accuracy=0.355, cost=3.19]
minibatch loop:   0%|          | 0/1563 [00:00<?, ?it/s]

epoch 19, training avg loss 3.160061, training avg acc 0.368202
epoch 19, testing avg loss 3.505346, testing avg acc 0.326451


minibatch loop: 100%|██████████| 1563/1563 [09:49<00:00,  2.65it/s, accuracy=0.387, cost=2.94]
minibatch loop: 100%|██████████| 40/40 [00:17<00:00,  2.29it/s, accuracy=0.371, cost=3.15]

epoch 20, training avg loss 3.106020, training avg acc 0.375853
epoch 20, testing avg loss 3.478847, testing avg acc 0.330660





In [15]:
from tensor2tensor.utils import bleu_hook

In [16]:
# Greedy-decode the whole test set, keeping only token ids greater than 3
# in every hypothesis.
results = []
for start in tqdm.tqdm(range(0, len(test_X), batch_size)):
    stop = min(start + batch_size, len(test_X))
    source_batch = pad_sequences(test_X[start : stop], padding='post')
    decoded = sess.run(model.fast_result, feed_dict = {model.X: source_batch})
    results.extend([token for token in hypothesis if token > 3]
                   for hypothesis in decoded)

100%|██████████| 40/40 [00:28<00:00,  1.39it/s]


In [17]:
# Reference translations, filtered with the same `> 3` rule as the hypotheses.
rights = [[token for token in reference if token > 3] for reference in test_Y]

In [18]:
# Corpus-level BLEU of the greedy hypotheses against the filtered references.
bleu_hook.compute_bleu(reference_corpus=rights, translation_corpus=results)

0.049064703