## Make sure you have already run

1. [bert-preprocessing.ipynb](bert-preprocessing.ipynb)

In [1]:
import os
# Select which GPU is visible to CUDA. This is set before TensorFlow is imported
# in the next cell. The index '2' is machine-specific; adjust it for your host.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import numpy as np
import tensorflow as tf
from tensor2tensor.utils import beam_search

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import pickle

# Load the train/test split; the notebook header asks for bert-preprocessing.ipynb
# to be run first, which is presumably what writes this file.
# NOTE: pickle.load can execute arbitrary code, so only open files you created yourself.
with open('train-test-bert.pkl', 'rb') as fopen:
    dataset = pickle.load(fopen)

train_X, train_Y = dataset['train_X'], dataset['train_Y']
test_X, test_Y = dataset['test_X'], dataset['test_Y']

In [4]:
# Special token ids used by the model and the batching code below.
# NOTE(review): 101 / 102 / 0 look like the [CLS] / [SEP] / [PAD] ids of the BERT
# vocabulary — confirm against vocab.txt.
# GO is prepended to the decoder input and seeds beam search.
GO = 101
# PAD fills short sequences; zero ids are also what the sequence-length counts skip.
PAD = 0
# EOS is passed to beam search as the end-of-sequence id.
EOS = 102

In [5]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling




In [6]:
# Locations of the pre-trained BERT files, relative to the notebook's working
# directory (the directory name suggests the multilingual cased base model).
BERT_VOCAB = 'multi_cased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'multi_cased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'multi_cased_L-12_H-768_A-12/bert_config.json'

In [7]:
# Build the BERT tokenizer from the vocabulary file, keeping the original casing
# (do_lower_case=False). In this notebook it is only used to read the vocabulary size.
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=False)




In [8]:
# Number of entries in the tokenizer vocabulary; handed to beam search as the vocabulary size.
size_vocab = len(tokenizer.vocab)

In [9]:
# Model configuration of the pre-trained BERT encoder.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

# Training schedule.
epoch = 20
batch_size = 32
warmup_proportion = 0.1
# The training loop also runs the final, possibly smaller, batch of every epoch,
# so the number of optimizer steps per epoch is ceil(len(train_X) / batch_size).
# Truncating len(train_X) / batch_size instead makes the learning-rate schedule
# finish before training does.
steps_per_epoch = (len(train_X) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epoch
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [10]:
def pad_second_dim(x, desired_size):
    """Right-pad a rank-3 tensor with zeros along axis 1 up to ``desired_size``.

    Args:
        x: rank-3 tensor (its dynamic shape is indexed at 0, 1 and 2). The padding
            is built from a float constant, so ``x`` is presumably float32.
        desired_size: target length of axis 1; expected to be >= the current length.

    Returns:
        ``x`` with a block of zeros concatenated after it on axis 1.
    """
    dynamic_shape = tf.shape(x)
    missing = desired_size - dynamic_shape[1]
    pad_shape = tf.stack([dynamic_shape[0], missing, dynamic_shape[2]], 0)
    zeros = tf.tile([[[0.0]]], pad_shape)
    return tf.concat([x, zeros], 1)

def ln(inputs, epsilon = 1e-8, scope="ln"):
    """Layer-normalise ``inputs`` over the last axis with a learned scale and shift.

    Args:
        inputs: tensor whose last dimension is statically known (it sizes the
            ``beta`` / ``gamma`` variables).
        epsilon: small constant added to the variance before the square root.
        scope: variable scope holding ``beta`` and ``gamma``; opened with AUTO_REUSE.

    Returns:
        ``gamma * (inputs - mean) / sqrt(variance + epsilon) + beta``, where mean
        and variance are taken over the last axis.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        feature_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        # beta starts at zero and gamma at one, so the layer begins as a plain normalisation.
        beta = tf.get_variable("beta", feature_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", feature_shape, initializer=tf.ones_initializer())

        centered = inputs - mean
        stddev = (variance + epsilon) ** (.5)
        return gamma * (centered / stddev) + beta

def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    """Scaled dot-product attention with key, optional future, and query masking.

    Args:
        Q: query tensor; the shape notes in this file use (N, T_q, d_k).
        K: key tensor, (N, T_k, d_k).
        V: value tensor, (N, T_k, d_v).
        causality: when True, also apply the "future" mask to the scores.
        dropout_rate: dropout rate applied to the attention weights.
        training: forwarded to ``tf.layers.dropout``.
        scope: variable scope name, opened with AUTO_REUSE.

    Returns:
        The attention-weighted sum of ``V``, (N, T_q, d_v).
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        # Similarity scores, scaled by sqrt(d_k): (N, T_q, T_k)
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))
        outputs /= d_k ** 0.5

        # Mask before the softmax so masked positions receive (almost) no weight.
        outputs = mask(outputs, Q, K, type="key")
        if causality:
            outputs = mask(outputs, type="future")
        outputs = tf.nn.softmax(outputs)

        # Zero the rows that belong to all-zero queries, then regularise and mix values.
        outputs = mask(outputs, Q, K, type="query")
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)
        outputs = tf.matmul(outputs, V)
    return outputs

def mask(inputs, queries=None, keys=None, type=None):
    """Apply a key, query or future mask to a 3-D attention tensor.

    Args:
        inputs: attention scores or weights; the shape notes below use (N, T_q, T_k).
        queries: tensor whose last axis is reduced to detect all-zero query
            positions. Needed for the "key" mode (its axis-1 size is used for
            tiling) and the "query" mode.
        keys: tensor whose last axis is reduced to detect all-zero key positions.
            Needed for the "key" and "query" modes.
        type: which mask to apply:
            * "k" / "key" / "keys": entries whose key vector sums to zero in
              absolute value are replaced by a large negative number.
            * "q" / "query" / "queries": rows whose query vector sums to zero in
              absolute value are multiplied by zero.
            * "f" / "future" / "right": entries above the diagonal of every
              (T_q, T_k) slice are replaced by a large negative number.

    Returns:
        A tensor with the same shape as ``inputs``.

    Raises:
        ValueError: if ``type`` is not one of the names listed above. The previous
            version printed a hint and then failed with UnboundLocalError.
    """
    # Large negative fill value: -(2 ** 32) + 1.
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
        outputs = inputs*masks
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        raise ValueError(
            "Unknown mask type %r; expected one of 'key', 'query' or 'future'." % (type,))

    return outputs

def multihead_attention(queries, keys, values,
                        num_heads=8, 
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    """Multi-head attention followed by a residual connection and layer norm.

    Args:
        queries: (N, T_q, d_model) tensor; its static last dimension defines d_model.
        keys: (N, T_k, d_model) tensor.
        values: (N, T_k, d_model) tensor.
        num_heads: number of heads; the projections are split into this many
            equal parts along the last axis.
        dropout_rate: dropout rate for the attention weights.
        training: whether dropout is active.
        causality: when True, future positions are masked.
        scope: variable scope name, opened with AUTO_REUSE.

    Returns:
        (N, T_q, d_model) tensor: ``ln(attention + queries)``.
    """
    d_model = queries.get_shape().as_list()[-1]

    def split_heads(tensor):
        # (N, T, d_model) -> (h*N, T, d_model/h): heads are stacked on the batch axis.
        return tf.concat(tf.split(tensor, num_heads, axis=2), axis=0)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Bias-free linear projections, created in Q, K, V order.
        Q = tf.layers.dense(queries, d_model, use_bias=False)
        K = tf.layers.dense(keys, d_model, use_bias=False)
        V = tf.layers.dense(values, d_model, use_bias=False)

        context = scaled_dot_product_attention(
            split_heads(Q), split_heads(K), split_heads(V),
            causality, dropout_rate, training)

        # (h*N, T_q, d_model/h) -> (N, T_q, d_model)
        merged = tf.concat(tf.split(context, num_heads, axis=0), axis=2)
        merged += queries
        return ln(merged)

def ff(inputs, num_units, scope="positionwise_feedforward"):
    """Two-layer position-wise feed-forward block with residual and layer norm.

    Args:
        inputs: input tensor; its last dimension must equal ``num_units[1]`` for
            the residual addition to be valid.
        num_units: two-element sequence ``[hidden_units, output_units]``.
        scope: variable scope name, opened with AUTO_REUSE.

    Returns:
        ``ln(dense(relu(dense(inputs))) + inputs)``.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        hidden = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        projected = tf.layers.dense(hidden, num_units[1])
        return ln(projected + inputs)

def label_smoothing(inputs, epsilon=0.1):
    """Blend ``inputs`` with a uniform distribution over the last axis.

    Args:
        inputs: tensor exposing ``get_shape().as_list()``; the size of its last
            axis is used as the number of channels.
        epsilon: smoothing factor.

    Returns:
        ``(1 - epsilon) * inputs + epsilon / num_channels``.
    """
    num_channels = inputs.get_shape().as_list()[-1]
    uniform_mass = epsilon / num_channels
    return (1 - epsilon) * inputs + uniform_mass

def sinusoidal_position_encoding(inputs, mask, repr_dim):
    """Build masked sinusoidal position encodings for a batch of sequences.

    Args:
        inputs: tensor whose first two dynamic dimensions are read as
            (batch, time); only its shape is used.
        mask: tensor broadcastable to (batch, time) after a trailing axis is
            added; positions where it is 0 get an all-zero encoding. Inside this
            function the name shadows the module-level ``mask`` function.
        repr_dim: size of the encoding; should be even, because the sine and
            cosine halves each have ``len(range(0, repr_dim, 2))`` columns.

    Returns:
        (batch, time, repr_dim) float32 tensor. The first half of the last axis
        holds the sines and the second half the cosines (concatenated, not
        interleaved).
    """
    input_shape = tf.shape(inputs)
    batch, T = input_shape[0], input_shape[1]
    # tf.cast replaces the deprecated tf.to_float; the result is the same.
    pos = tf.reshape(tf.range(0.0, tf.cast(T, tf.float32), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    angles = pos / denom  # (time, repr_dim / 2)
    enc = tf.expand_dims(tf.concat([tf.sin(angles), tf.cos(angles)], 1), 0)
    return tf.tile(enc, [batch, 1, 1]) * tf.expand_dims(tf.cast(mask, tf.float32), -1)

class Translator:
    """Sequence-to-sequence graph: a BERT encoder feeding a transformer-style decoder.

    Building an instance creates placeholders, the training loss / optimizer /
    accuracy ops and a beam-search decoding op in the current default graph.
    It reads these notebook-level names: ``bert_config``, ``GO``, ``EOS``,
    ``size_vocab``, ``num_train_steps`` and ``num_warmup_steps``.

    Attributes created in ``__init__``:
        X, Y: int32 placeholders of shape [None, None] for source and target ids.
        X_seq_len, Y_seq_len: number of non-zero ids per row of X / Y.
        training_logits: decoder logits for the teacher-forced target.
        cost: sequence loss over the non-padded target positions.
        optimizer: train op returned by ``optimization.create_optimizer``.
        prediction: argmax ids at the non-padded target positions.
        accuracy: mean token accuracy over the non-padded target positions.
        predicting_ids: ids returned by beam search.
    """

    def __init__(self, size_layer, learning_rate,
                num_blocks = 4, num_heads = 8, ratio_hidden = 2, beam_width = 5):
        """Build the training and inference graph.

        Args:
            size_layer: width passed to the positional encoding and the
                feed-forward output layer; it must match the width of the BERT
                embedding table because the two are added together.
            learning_rate: learning rate handed to ``optimization.create_optimizer``.
            num_blocks: number of decoder blocks.
            num_heads: number of attention heads per attention layer.
            ratio_hidden: multiplier for the feed-forward hidden width.
            beam_width: beam size for decoding.
        """
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Lengths are the count of non-zero ids, so id 0 is treated as padding.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        # Dynamic batch size; this local shadows the notebook-level `batch_size`.
        batch_size = tf.shape(self.X)[0]
        
        
        def forward(x, y, reuse = False):
            """Encode ``x`` with BERT and decode ``y``; returns per-position vocabulary logits."""
            
            # NOTE(review): no input_mask is passed to BertModel, so the encoder
            # presumably attends over PAD positions as well — confirm against
            # bert.modeling. is_training=False is passed even for the training graph.
            with tf.variable_scope('bert',reuse=reuse):
                model = modeling.BertModel(
                    config=bert_config,
                    is_training=False,
                    input_ids=x,
                    use_one_hot_embeddings=False)
                embedding = model.get_embedding_table()
                memory = model.get_sequence_output()
            
            # The decoder embeds its input with the BERT embedding table and adds
            # sinusoidal positions, zeroed where y == 0.
            decoder_embedded = tf.nn.embedding_lookup(embedding, y)
            de_masks = tf.sign(y)
            decoder_embedded += sinusoidal_position_encoding(y, de_masks, size_layer)
            dec = decoder_embedded
            
            # Each block: causal self-attention, attention over the BERT output, feed-forward.
            for i in range(num_blocks):
                with tf.variable_scope('decoder_self_attn_%d'%i,reuse=reuse):
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=num_heads,
                                              causality=True,
                                              scope="self_attention")

                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=num_heads,
                                              causality=False,
                                              scope="vanilla_attention")
                    
                    dec = ff(dec, num_units=[size_layer * ratio_hidden, size_layer])
                
            # Output projection reuses the transposed embedding table (tied weights).
            weights = tf.transpose(embedding)
            logits = tf.einsum('ntd,dk->ntk', dec, weights)
            return logits
        
        # Teacher forcing: drop the last column of Y and prepend GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        self.training_logits = forward(self.X, decoder_input)

        # 1.0 at real target positions, 0.0 at padding. The mask width is the longest
        # target in the batch, so Y is expected to be padded to exactly that length.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, 
                                                       num_train_steps, num_warmup_steps, False)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is not used again in this class.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        # Beam search starts every sequence from GO.
        initial_ids = tf.fill([batch_size], GO)
        
        def symbols_to_logits(ids):
            """Return the logits for the last position of ``ids``.

            Each call tiles the source ``beam_width`` times and re-applies
            ``forward`` to it, BERT encoder included, with reuse=True.
            NOTE(review): if beam search calls this once per step, the encoder is
            recomputed every step — consider hoisting it.
            """
            x = tf.contrib.seq2seq.tile_batch(self.X, beam_width)
            logits = forward(x, ids, reuse = True)
            return logits[:, tf.shape(ids)[1]-1, :]
        
        # Decode length is the longest source sequence in the batch.
        # NOTE(review): the positional 0.0 is presumably the length-penalty alpha —
        # confirm against tensor2tensor's beam_search signature.
        final_ids, final_probs, _ = beam_search.beam_search(
            symbols_to_logits,
            initial_ids,
            beam_width,
            tf.reduce_max(self.X_seq_len),
            size_vocab,
            0.0,
            eos_id = EOS)
        
        self.predicting_ids = final_ids

In [11]:
# Decoder width. Inside Translator it is added to BERT embedding lookups, so it
# has to equal the BERT hidden size (the checkpoint directory name says H-768).
size_layer = 768
# Learning rate passed to optimization.create_optimizer.
learning_rate = 1e-5

In [12]:
# Start from an empty graph so re-running this cell does not stack a second model.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(size_layer, learning_rate)
# Initialise every variable; the BERT weights are overwritten from the checkpoint
# by the restore cell further down.
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use keras.layers.dropout instead.


In [13]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match checkpoint variables to graph variables that live under ``bert/``.

    A checkpoint variable named ``name`` is matched to the graph variable named
    ``'bert/' + name``; checkpoint entries with no such graph variable are skipped.

    Args:
        tvars: iterable of graph variables (objects with a ``.name``).
        init_checkpoint: checkpoint path handed to ``tf.train.list_variables``.

    Returns:
        A tuple ``(assignment_map, initialized_variable_names)``:
        ``assignment_map`` is an OrderedDict from checkpoint variable name to the
        matching graph variable; ``initialized_variable_names`` is a dict whose
        keys are each matched checkpoint name and that name with ``':0'`` appended.
    """
    # Index the graph variables by name, without the trailing ":<digits>" suffix.
    graph_vars = collections.OrderedDict()
    for variable in tvars:
        full_name = variable.name
        matched = re.match('^(.*):\\d+$', full_name)
        key = full_name if matched is None else matched.group(1)
        graph_vars[key] = variable

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for entry in tf.train.list_variables(init_checkpoint):
        ckpt_name = entry[0]
        scoped_name = 'bert/' + ckpt_name
        if scoped_name in graph_vars:
            assignment_map[ckpt_name] = graph_vars[scoped_name]
            initialized_variable_names[ckpt_name] = 1
            initialized_variable_names[ckpt_name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [14]:
# Work out which trainable variables of the graph have a counterpart in the
# pre-trained BERT checkpoint.
tvars = tf.trainable_variables()

checkpoint = BERT_INIT_CHKPNT
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [15]:
# Restore only the mapped variables (checkpoint name -> graph variable) from the
# pre-trained checkpoint; all other variables keep their initialised values.
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from multi_cased_L-12_H-768_A-12/bert_model.ckpt


In [16]:
# Smoke test: run beam search on a single training example and show the shape of
# the returned ids (presumably batch x beam x length — confirm).
sess.run(model.predicting_ids, feed_dict = {model.X: [train_X[0]]}).shape

(1, 5, 13)

In [17]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: iterable of token-id sequences (lists, tuples, arrays, ...).
        pad_int: value appended to the shorter sequences.

    Returns:
        A tuple ``(padded_seqs, seq_lens)``: ``padded_seqs`` is a list of new
        lists, all of equal length, and ``seq_lens`` holds the original length of
        each sequence. An empty batch yields ``([], [])``.
    """
    # Copy each sequence into a list so that `+` always concatenates; with a numpy
    # array it would add element-wise instead, and with a tuple it would fail.
    sequences = [list(sentence) for sentence in sentence_batch]
    seq_lens = [len(sequence) for sequence in sequences]
    # default=0 keeps max() from raising on an empty batch.
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sequence + [pad_int] * (max_sentence_len - length)
        for sequence, length in zip(sequences, seq_lens)
    ]
    return padded_seqs, seq_lens

In [18]:
import tqdm

def _run_pass(inputs, targets, extra_fetches = ()):
    """Run one pass over ``(inputs, targets)`` in mini-batches of ``batch_size``.

    Args:
        inputs: list of source id sequences.
        targets: list of target id sequences, aligned with ``inputs``.
        extra_fetches: additional ops to run with every batch (for example the
            train op); their results are discarded.

    Returns:
        ``(losses, accuracies)``: two lists with one entry per mini-batch.
    """
    losses, accuracies = [], []
    pbar = tqdm.tqdm(
        range(0, len(inputs), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(inputs))
        batch_x, _ = pad_sentence_batch(inputs[i : index], PAD)
        batch_y, _ = pad_sentence_batch(targets[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y}
        fetched = sess.run([model.accuracy, model.cost] + list(extra_fetches),
                           feed_dict = feed)
        accuracy, loss = fetched[0], fetched[1]
        losses.append(loss)
        accuracies.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    return losses, accuracies


for e in range(epoch):
    # Training pass: the optimizer op is run together with the metrics.
    train_loss, train_acc = _run_pass(train_X, train_Y, extra_fetches = [model.optimizer])
    # Evaluation pass: metrics only, no weight update.
    test_loss, test_acc = _run_pass(test_X, test_Y)

    # Reported numbers are unweighted means over mini-batches.
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 4167/4167 [36:02<00:00,  1.93it/s, accuracy=0.0694, cost=6.54]
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.42it/s, accuracy=0.178, cost=4.76] 
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 1, training avg loss 5.470412, training avg acc 0.112731
epoch 1, testing avg loss 4.825023, testing avg acc 0.160569


minibatch loop: 100%|██████████| 4167/4167 [35:59<00:00,  1.93it/s, accuracy=0.13, cost=5.97]  
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.61it/s, accuracy=0.279, cost=4.13]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 2, training avg loss 4.331696, training avg acc 0.214640
epoch 2, testing avg loss 4.143205, testing avg acc 0.239354


minibatch loop: 100%|██████████| 4167/4167 [35:59<00:00,  1.93it/s, accuracy=0.185, cost=5.37]
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.64it/s, accuracy=0.346, cost=3.57]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 3, training avg loss 3.624350, training avg acc 0.318347
epoch 3, testing avg loss 3.634863, testing avg acc 0.318458


minibatch loop: 100%|██████████| 4167/4167 [35:57<00:00,  1.93it/s, accuracy=0.227, cost=4.79]
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.57it/s, accuracy=0.37, cost=3.2]  
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 4, training avg loss 3.167339, training avg acc 0.390191
epoch 4, testing avg loss 3.300180, testing avg acc 0.373873


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.287, cost=4.33]
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.54it/s, accuracy=0.409, cost=3.06]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 5, training avg loss 2.842982, training avg acc 0.441878
epoch 5, testing avg loss 3.147791, testing avg acc 0.399979


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.31, cost=3.86] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.55it/s, accuracy=0.428, cost=2.99]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 6, training avg loss 2.586160, training avg acc 0.483048
epoch 6, testing avg loss 3.126966, testing avg acc 0.405724


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.375, cost=3.52]
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.52it/s, accuracy=0.438, cost=2.85]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 7, training avg loss 2.372910, training avg acc 0.518034
epoch 7, testing avg loss 3.052107, testing avg acc 0.422895


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.403, cost=3.29]
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.57it/s, accuracy=0.452, cost=2.69]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 8, training avg loss 2.200384, training avg acc 0.545937
epoch 8, testing avg loss 2.927400, testing avg acc 0.441286


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.417, cost=3.04] 
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.58it/s, accuracy=0.481, cost=2.61]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 9, training avg loss 2.073574, training avg acc 0.566716
epoch 9, testing avg loss 2.864531, testing avg acc 0.455790


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.486, cost=2.69] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.53it/s, accuracy=0.49, cost=2.59] 
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 10, training avg loss 1.947256, training avg acc 0.588113
epoch 10, testing avg loss 2.861142, testing avg acc 0.458657


minibatch loop: 100%|██████████| 4167/4167 [35:48<00:00,  1.94it/s, accuracy=0.532, cost=2.34] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.56it/s, accuracy=0.466, cost=2.72]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 11, training avg loss 1.798666, training avg acc 0.614890
epoch 11, testing avg loss 2.964870, testing avg acc 0.448801


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.546, cost=2.15] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.55it/s, accuracy=0.452, cost=2.9] 
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 12, training avg loss 1.656793, training avg acc 0.640974
epoch 12, testing avg loss 3.111073, testing avg acc 0.434066


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.565, cost=1.98] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.55it/s, accuracy=0.433, cost=2.98]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 13, training avg loss 1.533094, training avg acc 0.663895
epoch 13, testing avg loss 3.217668, testing avg acc 0.423850


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.606, cost=1.93] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.54it/s, accuracy=0.423, cost=2.92]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 14, training avg loss 1.419215, training avg acc 0.685455
epoch 14, testing avg loss 3.227339, testing avg acc 0.426352


minibatch loop: 100%|██████████| 4167/4167 [35:49<00:00,  1.94it/s, accuracy=0.657, cost=1.78] 
minibatch loop: 100%|██████████| 89/89 [00:16<00:00,  5.53it/s, accuracy=0.447, cost=2.88]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 15, training avg loss 1.309490, training avg acc 0.706482
epoch 15, testing avg loss 3.232316, testing avg acc 0.430102


minibatch loop: 100%|██████████| 4167/4167 [35:48<00:00,  1.94it/s, accuracy=0.667, cost=1.6]  
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.57it/s, accuracy=0.433, cost=2.87]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 16, training avg loss 1.203308, training avg acc 0.727816
epoch 16, testing avg loss 3.250415, testing avg acc 0.434590


minibatch loop: 100%|██████████| 4167/4167 [35:48<00:00,  1.94it/s, accuracy=0.699, cost=1.34] 
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.56it/s, accuracy=0.452, cost=2.82]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 17, training avg loss 1.095675, training avg acc 0.750086
epoch 17, testing avg loss 3.245824, testing avg acc 0.445811


minibatch loop: 100%|██████████| 4167/4167 [35:54<00:00,  1.93it/s, accuracy=0.792, cost=1.16] 
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.59it/s, accuracy=0.481, cost=2.93]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 18, training avg loss 0.993417, training avg acc 0.772005
epoch 18, testing avg loss 3.277383, testing avg acc 0.451739


minibatch loop: 100%|██████████| 4167/4167 [35:52<00:00,  1.94it/s, accuracy=0.778, cost=1.1]  
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.67it/s, accuracy=0.486, cost=2.93]
minibatch loop:   0%|          | 0/4167 [00:00<?, ?it/s]

epoch 19, training avg loss 0.892157, training avg acc 0.794355
epoch 19, testing avg loss 3.342973, testing avg acc 0.450040


minibatch loop: 100%|██████████| 4167/4167 [35:54<00:00,  1.93it/s, accuracy=0.745, cost=1.13] 
minibatch loop: 100%|██████████| 89/89 [00:15<00:00,  5.64it/s, accuracy=0.438, cost=2.93]

epoch 20, training avg loss 0.804806, training avg acc 0.813287
epoch 20, testing avg loss 3.441328, testing avg acc 0.446938





In [19]:
# Decode the first `test_size` test sentences with beam search.
test_size = 20

batch_x, _ = pad_sentence_batch(test_X[: test_size], PAD)
feed = {model.X: batch_x}
# NOTE: despite its name, `logits` holds the ids returned by beam search
# (model.predicting_ids), not logits.
logits = sess.run(model.predicting_ids, feed_dict = feed)
logits.shape

(20, 5, 94)