## Make sure you have already run

1. [xlnet-preprocessing.ipynb](xlnet-preprocessing.ipynb)
2. [download-preprocess-dataset.ipynb](download-preprocess-dataset.ipynb)

In [1]:
import os
# Restrict CUDA to device index '3'. This is set before TensorFlow is imported in the
# next cell. NOTE(review): the index is machine-specific; adjust it for the host in use.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import numpy as np
import tensorflow as tf
from tensor2tensor.utils import beam_search
import xlnet
import model_utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import pickle

# Load the train/test split from 'train-test-xlnet.pkl'.
# NOTE: pickle.load can execute arbitrary code; only open a file you generated yourself.
with open('train-test-xlnet.pkl', 'rb') as fopen:
    dataset = pickle.load(fopen)

train_X, test_X = dataset['train_X'], dataset['test_X']
train_Y, test_Y = dataset['train_Y'], dataset['test_Y']

# Each *_X entry unpacks into (input ids, input masks, segment ids), in that order.
(train_input_ids, train_input_masks, train_segment_ids) = train_X
(test_input_ids, test_input_masks, test_segment_ids) = test_X

In [4]:
import json
    
# Vocabulary mappings stored as JSON; the 'to' entry is read in the following cell.
with open('dictionary.json') as fopen:
    dictionary = json.load(fopen)

In [5]:
# Forward and reverse lookup tables stored under the 'to' entry of the dictionary file.
dictionary_to, rev_dictionary_to = (
    dictionary['to'][key] for key in ('dictionary', 'rev_dictionary')
)

In [6]:
# Special-token ids for the target vocabulary.
GO = dictionary_to['GO']
PAD = dictionary_to['PAD']
# NOTE(review): EOS is hard-coded while GO/PAD/UNK are looked up in dictionary_to.
# Confirm that 3 is the id the preprocessing notebook assigned to the end-of-sequence
# token and that it does not collide with UNK.
EOS = 3
UNK = dictionary_to['UNK']

In [7]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

# SentencePiece tokenizer model loaded from the XLNet checkpoint directory.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet_cased_L-12_H-768_A-12/spiece.model')

def tokenize_fn(text):
    """Preprocess `text` without lower-casing, then encode it with `sp_model`.

    Returns whatever `encode_ids` yields for the preprocessed text.
    """
    return encode_ids(sp_model, preprocess_text(text, lower= False))

In [8]:
# Options passed to xlnet.RunConfig; both dropout rates are zero.
kwargs = {
    'is_training': True,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0,
    'dropatt': 0,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.02,
    'clamp_len': -1,
}

xlnet_parameters = xlnet.RunConfig(**kwargs)
# Architecture description read from the JSON file in the checkpoint directory.
xlnet_config = xlnet.XLNetConfig(
    json_path='xlnet_cased_L-12_H-768_A-12/xlnet_config.json')




In [9]:
MAX_SEQ_LENGTH = 150

# Segment-id values, numbered consecutively from 0.
SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD = range(5)

# Special tokens mapped to ids 0..8 in listing order.
special_symbols = {
    token: index
    for index, token in enumerate([
        "<unk>",
        "<s>",
        "</s>",
        "<cls>",
        "<sep>",
        "<pad>",
        "<mask>",
        "<eod>",
        "<eop>",
    ])
}

VOCAB_SIZE = 32000
size_vocab = VOCAB_SIZE

# Convenience aliases for the special-token ids used by name.
UNK_ID, CLS_ID, SEP_ID, MASK_ID, EOD_ID = (
    special_symbols[token]
    for token in ("<unk>", "<cls>", "<sep>", "<mask>", "<eod>")
)

In [10]:
epoch = 20
batch_size = 32
warmup_proportion = 0.1

# Step budget derived from the number of training targets, and the warm-up share of it.
num_train_steps = int(len(train_Y) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

# Optimiser and schedule settings; wrapped into a Parameter object in the next cell.
training_parameters = {
    'decay_method': 'poly',
    'train_steps': num_train_steps,
    'learning_rate': 1e-5,
    'warmup_steps': num_warmup_steps,
    'min_lr_ratio': 0.0,
    'weight_decay': 0.00,
    'adam_epsilon': 1e-8,
    'num_core_per_host': 1,
    'lr_layer_decay_rate': 1,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0.0,
    'dropatt': 0.0,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.02,
    'clip': 1.0,
    'clamp_len': -1,
}

83286 8328


In [11]:
class Parameter:
    """Attribute-style holder for the optimiser and schedule settings.

    The eleven named arguments are stored as instance attributes of the same
    name. Any additional keyword arguments are accepted and discarded.
    """
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        retained = (
            ('decay_method', decay_method),
            ('warmup_steps', warmup_steps),
            ('weight_decay', weight_decay),
            ('adam_epsilon', adam_epsilon),
            ('num_core_per_host', num_core_per_host),
            ('lr_layer_decay_rate', lr_layer_decay_rate),
            ('use_tpu', use_tpu),
            ('learning_rate', learning_rate),
            ('train_steps', train_steps),
            ('min_lr_ratio', min_lr_ratio),
            ('clip', clip),
        )
        for attr_name, attr_value in retained:
            setattr(self, attr_name, attr_value)
        
# Wrap the settings dict only once: after the first run `training_parameters` is already a
# Parameter, and unpacking it with ** again would raise TypeError when the cell is re-run.
if isinstance(training_parameters, dict):
    training_parameters = Parameter(**training_parameters)

In [12]:
def pad_second_dim(x, desired_size):
    """Append float zeros along axis 1 of `x` until that axis has `desired_size` entries.

    The zero block is built with three dimensions, so `x` is expected to be rank 3.
    """
    dims = tf.shape(x)
    zeros_shape = tf.stack([dims[0], desired_size - dims[1], dims[2]], 0)
    zeros = tf.tile([[[0.0]]], zeros_shape)
    return tf.concat([x, zeros], 1)

def ln(inputs, epsilon = 1e-8, scope="ln"):
    """Layer normalisation over the last axis with a learned scale and shift.

    Args:
        inputs: tensor whose last dimension sizes the `beta` and `gamma` variables.
        epsilon: added to the variance before taking the square root.
        scope: variable scope name; variables are shared through AUTO_REUSE.

    Returns:
        gamma * (inputs - mean) / sqrt(variance + epsilon) + beta.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        feature_shape = inputs.get_shape()[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", feature_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", feature_shape, initializer=tf.ones_initializer())

        stddev = (variance + epsilon) ** (.5)
        centred = inputs - mean
        outputs = gamma * (centred / stddev) + beta

    return outputs

def scaled_dot_product_attention(Q, K, V,
                                 causality=False, dropout_rate=0.,
                                 training=True,
                                 scope="scaled_dot_product_attention"):
    """Scaled dot-product attention with key, optional future, and query masking.

    Args:
        Q: queries; the inline shape comments assume (N, T_q, d_k).
        K: keys; the inline shape comments assume (N, T_k, d_k).
        V: values, multiplied by the attention weights.
        causality: when True, also apply the "future" mask before the softmax.
        dropout_rate: rate passed to tf.layers.dropout on the attention weights.
        training: forwarded to tf.layers.dropout.
        scope: variable scope name (AUTO_REUSE).

    Returns:
        The attention-weighted combination of `V`.
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        # Raw similarity scores, scaled by sqrt(d_k).
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)
        outputs /= d_k ** 0.5

        # Masks applied before the softmax replace scores with a large negative number.
        outputs = mask(outputs, Q, K, type="key")
        if causality:
            outputs = mask(outputs, type="future")
        outputs = tf.nn.softmax(outputs)

        # The query mask multiplies the weights, zeroing rows of all-zero queries.
        outputs = mask(outputs, Q, K, type="query")
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=training)
        outputs = tf.matmul(outputs, V)
    return outputs

def mask(inputs, queries=None, keys=None, type=None):
    """Apply one of three attention masks to `inputs`.

    Args:
        inputs: attention scores/weights; the inline shape comments assume (N, T_q, T_k).
        queries: query tensor, read by the "key" and "query" modes.
        keys: key tensor, read by the "key" and "query" modes.
        type: selects the mask:
            "k" / "key" / "keys": positions whose key vector sums (in absolute
                value) to zero are replaced by a large negative number.
            "q" / "query" / "queries": rows whose query vector sums (in absolute
                value) to zero are multiplied by zero.
            "f" / "future" / "right": entries above the diagonal are replaced by
                a large negative number (lower-triangular mask).

    Returns:
        The masked tensor, same shape as `inputs`.

    Raises:
        ValueError: if `type` is not one of the names listed above.
    """
    padding_num = -2 ** 32 + 1
    if type in ("k", "key", "keys"):
        masks = tf.sign(tf.reduce_sum(tf.abs(keys), axis=-1))  # (N, T_k)
        masks = tf.expand_dims(masks, 1) # (N, 1, T_k)
        masks = tf.tile(masks, [1, tf.shape(queries)[1], 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(inputs) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)  # (N, T_q, T_k)
    elif type in ("q", "query", "queries"):
        masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
        masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
        masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
        outputs = inputs*masks
    elif type in ("f", "future", "right"):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()  # (T_q, T_k)
        masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(masks) * padding_num
        outputs = tf.where(tf.equal(masks, 0), paddings, inputs)
    else:
        # Previously this branch only printed a hint and then failed on the unbound
        # `outputs` below; fail with an explicit, descriptive error instead.
        raise ValueError(
            "Check if you entered type correctly! Got %r; expected one of "
            "'k'/'key'/'keys', 'q'/'query'/'queries', 'f'/'future'/'right'." % (type,))

    return outputs

def multihead_attention(queries, keys, values,
                        num_heads=8, 
                        dropout_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention"):
    """Multi-head attention sub-layer with a residual connection and layer norm.

    Args:
        queries: query tensor; its last dimension defines d_model.
        keys: key tensor, projected to d_model.
        values: value tensor, projected to d_model.
        num_heads: number of heads the d_model axis is split into.
        dropout_rate: forwarded to scaled_dot_product_attention.
        training: forwarded to scaled_dot_product_attention.
        causality: forwarded to scaled_dot_product_attention.
        scope: variable scope name (AUTO_REUSE).

    Returns:
        ln(attention output + queries), with the same last dimension as `queries`.
    """
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections
        # NOTE: the three dense layers are auto-named in creation order; keep this order
        # so that AUTO_REUSE resolves to the same variables on later calls.
        Q = tf.layers.dense(queries, d_model, use_bias=False) # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model, use_bias=False) # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model, use_bias=False) # (N, T_k, d_model)
        
        # Split the feature axis into heads and stack the heads along the batch axis.
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, d_model/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, d_model/h)

        outputs = scaled_dot_product_attention(Q_, K_, V_, causality, dropout_rate, training)
        # Undo the head stacking: split along batch, concatenate along features.
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2 ) # (N, T_q, d_model)
        # Residual connection uses the unprojected queries.
        outputs += queries
        outputs = ln(outputs)
 
    return outputs

def ff(inputs, num_units, scope="positionwise_feedforward"):
    """Two-layer feed-forward sub-layer with a residual connection and layer norm.

    Args:
        inputs: input tensor; it is added to the second dense output, so its last
            dimension should match num_units[1].
        num_units: two-element sequence [inner width, output width].
        scope: variable scope name (AUTO_REUSE).

    Returns:
        ln(dense(relu(dense(inputs))) + inputs).
    """
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Inner layer with ReLU, then a linear projection to the output width.
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, num_units[1])
        outputs += inputs
        outputs = ln(outputs)
    
    return outputs

def label_smoothing(inputs, epsilon=0.1):
    """Shrink `inputs` by (1 - epsilon) and add epsilon / V to every entry.

    V is the statically known size of the last axis of `inputs`.
    """
    num_channels = inputs.get_shape().as_list()[-1]
    uniform_share = epsilon / num_channels
    keep_share = 1 - epsilon
    return (keep_share * inputs) + uniform_share

def sinusoidal_position_encoding(inputs, mask, repr_dim):
    """Build sinusoidal position encodings for `inputs`, zeroed where `mask` is 0.

    Args:
        inputs: tensor whose first two dimensions give the batch size and length.
        mask: per-position mask, cast to float and broadcast over the encoding axis.
        repr_dim: encoding width; sines fill the first half, cosines the second.

    Returns:
        A float tensor with one encoding vector per (batch, position).
    """
    seq_len = tf.shape(inputs)[1]
    positions = tf.reshape(tf.range(0.0, tf.to_float(seq_len), dtype=tf.float32), [-1, 1])

    even_dims = np.arange(0, repr_dim, 2, np.float32)
    wavelengths = np.reshape(np.power(10000.0, even_dims / repr_dim), [1, -1])

    angles = positions / wavelengths
    table = tf.expand_dims(tf.concat([tf.sin(angles), tf.cos(angles)], 1), 0)
    tiled = tf.tile(table, [tf.shape(inputs)[0], 1, 1])
    return tiled * tf.expand_dims(tf.to_float(mask), -1)

class Translator:
    """Graph builder: XLNet sequence output attended to by a stack of decoder blocks.

    Constructing an instance creates, in the current default graph, the input
    placeholders, teacher-forced training logits, loss, accuracy, the train op
    and a beam-search decoding op.

    Args:
        size_layer: width of the decoder embedding and decoder layers.
        learning_rate: accepted but not read in this constructor; the train op is
            built from the module-level `training_parameters`.
        num_blocks: number of decoder blocks.
        num_heads: attention heads per multihead_attention call.
        ratio_hidden: multiplier for the inner feed-forward width.
        beam_width: beam size used by the decoding op.
    """
    def __init__(self, size_layer, learning_rate,
                num_blocks = 4, num_heads = 8, ratio_hidden = 2, beam_width = 5):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        # Lengths are the count of non-zero ids per row, so id 0 is treated as padding.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        # Target-side embedding table; also reused (transposed) as the output projection.
        decoder_embedding = tf.Variable(tf.random_uniform([len(dictionary_to), size_layer], -1, 1))
        
        
        def forward(x, segment, mask, y, reuse = False):
            # Encode `x` with XLNet, then decode `y` against that memory; returns logits.
            
            with tf.variable_scope('xlnet',reuse=reuse):
                # Inputs are transposed before XLNetModel and the output transposed back
                # (presumably XLNetModel works on [length, batch] tensors — TODO confirm).
                xlnet_model = xlnet.XLNetModel(
                    xlnet_config=xlnet_config,
                    run_config=xlnet_parameters,
                    input_ids=tf.transpose(x, [1, 0]),
                    seg_ids=tf.transpose(segment, [1, 0]),
                    input_mask=tf.transpose(mask, [1, 0]))
                memory = xlnet_model.get_sequence_output()
                memory = tf.transpose(memory, (1, 0, 2))
            # NOTE(review): debug print of the memory tensor; fires on every forward() call.
            print(memory)
            
            decoder_embedded = tf.nn.embedding_lookup(decoder_embedding, y)
            # Positions whose target id is 0 receive no position encoding.
            de_masks = tf.sign(y)
            decoder_embedded += sinusoidal_position_encoding(y, de_masks, size_layer)
            dec = decoder_embedded
            
            for i in range(num_blocks):
                with tf.variable_scope('decoder_self_attn_%d'%i,reuse=reuse):
                    # Causal self-attention over the decoder states.
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=num_heads,
                                              causality=True,
                                              scope="self_attention")

                    # Attention over the XLNet memory.
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=num_heads,
                                              causality=False,
                                              scope="vanilla_attention")
                    
                    dec = ff(dec, num_units=[size_layer * ratio_hidden, size_layer])
                
            # Output projection shares weights with the decoder embedding table.
            weights = tf.transpose(decoder_embedding)
            logits = tf.einsum('ntd,dk->ntk', dec, weights)
            return logits
        
        # Teacher forcing: drop the last target column and prepend GO.
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        self.training_logits = forward(self.X, self.segment_ids, self.input_masks, decoder_input)

        # Loss weights cover only the first Y_seq_len positions of each row.
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        # Token-level accuracy over the unmasked positions.
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is not used after this line.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        initial_ids = tf.fill([batch_size], GO)
        
        def symbols_to_logits(ids):
            # Re-runs the full forward pass (XLNet included) on beam-tiled inputs and
            # returns the logits of the last decoded position.
            x = tf.contrib.seq2seq.tile_batch(self.X, beam_width)
            segment = tf.contrib.seq2seq.tile_batch(self.segment_ids, beam_width)
            masks = tf.contrib.seq2seq.tile_batch(self.input_masks, beam_width)
            logits = forward(x, segment, masks, ids, reuse = True)
            return logits[:, tf.shape(ids)[1]-1, :]
        
        # NOTE(review): size_vocab is the module-level 32000, while the logits' last
        # dimension is len(dictionary_to) — confirm these agree before decoding.
        final_ids, final_probs, _ = beam_search.beam_search(
            symbols_to_logits,
            initial_ids,
            beam_width,
            tf.reduce_max(self.X_seq_len),
            size_vocab,
            0.0,
            eos_id = EOS)
        
        self.predicting_ids = final_ids

In [13]:
# Decoder width passed to Translator as size_layer.
size_layer = 768
# NOTE(review): Translator accepts this value but builds its train op from
# training_parameters, so changing it here has no effect on optimisation.
learning_rate = 1e-5

In [14]:
# Start from an empty default graph so re-running this cell does not stack a second
# model on top of the first.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Translator(size_layer, learning_rate)
# Initialise every variable; the checkpoint restore further down overwrites the
# variables it covers.
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead



INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Tensor("xlnet/transpose_3:0", shape=(?, ?, 768), dtype=float32)
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updati

In [15]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match checkpoint entries to the graph variables that live under 'xlnet/'.

    Args:
        tvars: iterable of variables; a trailing ':<digits>' is stripped from each name.
        init_checkpoint: checkpoint path passed to tf.train.list_variables.

    Returns:
        (assignment_map, initialized_variable_names): an OrderedDict mapping each
        checkpoint name to the graph variable named 'xlnet/' + that name, and a
        dict flagging each matched name (with and without ':0') with 1.
    """
    suffix_pattern = re.compile('^(.*):\\d+$')

    name_to_variable = collections.OrderedDict()
    for variable in tvars:
        matched = suffix_pattern.match(variable.name)
        key = matched.group(1) if matched is not None else variable.name
        name_to_variable[key] = variable

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for entry in tf.train.list_variables(init_checkpoint):
        ckpt_name = entry[0]
        scoped_name = 'xlnet/' + ckpt_name
        if scoped_name in name_to_variable:
            assignment_map[ckpt_name] = name_to_variable[scoped_name]
            initialized_variable_names[ckpt_name] = 1
            initialized_variable_names[ckpt_name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [16]:
# Collect the graph's trainable variables and pair those under 'xlnet/' with entries
# of the pretrained checkpoint.
tvars = tf.trainable_variables()

checkpoint = 'xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [17]:
# Restore only the variables listed in assignment_map. This runs after the global
# initialiser, so the restored values replace the freshly initialised ones.
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt


In [18]:
def str_idx(corpus, dic):
    """Convert each sentence in `corpus` to a list of ids.

    Sentences are split on whitespace; tokens missing from `dic` map to the
    module-level `UNK` id.
    """
    return [
        [dic.get(token, UNK) for token in sentence.split()]
        for sentence in corpus
    ]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: sequence of id sequences (lists, tuples or arrays).
        pad_int: value appended to the shorter sequences.

    Returns:
        (padded_seqs, seq_lens): the padded sequences as new lists, and the
        original length of each sequence. An empty batch yields ([], []).
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default=0)
    # list(...) copies the row and makes `+` a concatenation for tuples/arrays as well.
    padded_seqs = [
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [19]:
# Convert the target sentences to id lists. NOTE: this overwrites train_Y/test_Y, so the
# cell cannot be re-run without reloading the pickle (str_idx calls .split() on each item).
train_Y = str_idx(train_Y, dictionary_to)
test_Y = str_idx(test_Y, dictionary_to)

In [20]:
# train_input_ids, train_input_masks, train_segment_ids
# test_input_ids, test_input_masks, test_segment_ids
# self.X = tf.placeholder(tf.int32, [None, None])
# self.segment_ids = tf.placeholder(tf.int32, [None, None])
# self.input_masks = tf.placeholder(tf.float32, [None, None])

In [21]:
# Sanity check: the three training input lists and the targets should have equal length.
len(train_input_ids), len(train_segment_ids), len(train_input_masks), len(train_Y)

(133259, 133259, 133259, 133259)

In [22]:
# Sanity check: the three test input lists and the targets should have equal length.
len(test_input_ids), len(test_segment_ids), len(test_input_masks), len(test_Y)

(2821, 2821, 2821, 2821)

In [23]:
# Smoke test: evaluate loss and accuracy on the first training example, without
# running the train op.
sess.run([model.cost, model.accuracy], feed_dict = {model.X: [train_input_ids[0]],
                                            model.segment_ids: [train_segment_ids[0]],
                                            model.input_masks: [train_input_masks[0]],
                                            model.Y: [train_Y[0]]})

[69.729965, 0.0]

In [24]:
import tqdm

# One pass over the training set followed by one pass over the test set per epoch.
for e in range(epoch):
    pbar = tqdm.tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'minibatch loop')
    # Per-epoch metric buffers; one entry per mini-batch.
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        # Pad each field to the longest row in the batch: ids with PAD, masks with 1,
        # segment ids with SEG_ID_PAD.
        batch_x, _ = pad_sentence_batch(train_input_ids[i : index], PAD)
        batch_masks, _ = pad_sentence_batch(train_input_masks[i : index], 1)
        batch_segment, _ = pad_sentence_batch(train_segment_ids[i : index], SEG_ID_PAD)
        batch_y, seq_y = pad_sentence_batch(train_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,
                model.input_masks: batch_masks,
                model.segment_ids: batch_segment}
        # Running model.optimizer applies one update step.
        accuracy, loss, _ = sess.run([model.accuracy,model.cost,model.optimizer],
                                    feed_dict = feed)
        train_loss.append(loss)
        train_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    # Evaluation pass: same batching, but the train op is not run.
    pbar = tqdm.tqdm(
        range(0, len(test_input_ids), batch_size), desc = 'minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x, _ = pad_sentence_batch(test_input_ids[i : index], PAD)
        batch_masks, _ = pad_sentence_batch(test_input_masks[i : index], 1)
        batch_segment, _ = pad_sentence_batch(test_segment_ids[i : index], SEG_ID_PAD)
        batch_y, seq_y = pad_sentence_batch(test_Y[i : index], PAD)
        feed = {model.X: batch_x,
                model.Y: batch_y,
                model.input_masks: batch_masks,
                model.segment_ids: batch_segment}
        accuracy, loss = sess.run([model.accuracy,model.cost],
                                    feed_dict = feed)

        test_loss.append(loss)
        test_acc.append(accuracy)
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    
    # Reported averages are unweighted means over batches (the last batch may be smaller).
    print('epoch %d, training avg loss %f, training avg acc %f'%(e+1,
                                                                 np.mean(train_loss),np.mean(train_acc)))
    print('epoch %d, testing avg loss %f, testing avg acc %f'%(e+1,
                                                              np.mean(test_loss),np.mean(test_acc)))

minibatch loop: 100%|██████████| 4165/4165 [25:33<00:00,  2.72it/s, accuracy=0.0793, cost=13.9] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.18it/s, accuracy=0.096, cost=12.7] 
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 1, training avg loss 21.692877, training avg acc 0.047738
epoch 1, testing avg loss 12.675898, testing avg acc 0.083630


minibatch loop: 100%|██████████| 4165/4165 [25:24<00:00,  2.73it/s, accuracy=0.141, cost=8.01] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.19it/s, accuracy=0.169, cost=7.82]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 2, training avg loss 9.337718, training avg acc 0.128022
epoch 2, testing avg loss 7.195321, testing avg acc 0.169290


minibatch loop: 100%|██████████| 4165/4165 [25:23<00:00,  2.73it/s, accuracy=0.179, cost=6.2] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.20it/s, accuracy=0.203, cost=6.12]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 3, training avg loss 6.143311, training avg acc 0.214900
epoch 3, testing avg loss 5.739505, testing avg acc 0.212040


minibatch loop: 100%|██████████| 4165/4165 [25:24<00:00,  2.73it/s, accuracy=0.21, cost=5.5]  
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.17it/s, accuracy=0.232, cost=5.55]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 4, training avg loss 5.164574, training avg acc 0.264736
epoch 4, testing avg loss 5.216041, testing avg acc 0.231996


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.241, cost=5.05]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.17it/s, accuracy=0.26, cost=5.11] 
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 5, training avg loss 4.676654, training avg acc 0.295971
epoch 5, testing avg loss 4.794361, testing avg acc 0.270686


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.262, cost=4.73]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.21it/s, accuracy=0.266, cost=4.92]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 6, training avg loss 4.348268, training avg acc 0.320956
epoch 6, testing avg loss 4.603140, testing avg acc 0.287897


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.324, cost=4.42]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.19it/s, accuracy=0.254, cost=4.83]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 7, training avg loss 4.095811, training avg acc 0.342908
epoch 7, testing avg loss 4.526076, testing avg acc 0.292742


minibatch loop: 100%|██████████| 4165/4165 [25:37<00:00,  2.71it/s, accuracy=0.359, cost=4.12]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.20it/s, accuracy=0.249, cost=4.77]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 8, training avg loss 3.883485, training avg acc 0.363184
epoch 8, testing avg loss 4.498834, testing avg acc 0.294282


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.397, cost=3.83]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.22it/s, accuracy=0.243, cost=4.73]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 9, training avg loss 3.693927, training avg acc 0.383235
epoch 9, testing avg loss 4.486375, testing avg acc 0.296450


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.417, cost=3.54]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.19it/s, accuracy=0.226, cost=4.71]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 10, training avg loss 3.518262, training avg acc 0.403752
epoch 10, testing avg loss 4.522978, testing avg acc 0.297018


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.434, cost=3.28]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.20it/s, accuracy=0.237, cost=4.74]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 11, training avg loss 3.354566, training avg acc 0.423882
epoch 11, testing avg loss 4.623637, testing avg acc 0.293335


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.483, cost=3.05]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.15it/s, accuracy=0.237, cost=4.76]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 12, training avg loss 3.202179, training avg acc 0.443475
epoch 12, testing avg loss 4.733461, testing avg acc 0.286778


minibatch loop: 100%|██████████| 4165/4165 [25:25<00:00,  2.73it/s, accuracy=0.531, cost=2.85]
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.22it/s, accuracy=0.249, cost=4.78]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 13, training avg loss 3.061537, training avg acc 0.462180
epoch 13, testing avg loss 4.819852, testing avg acc 0.283115


minibatch loop: 100%|██████████| 4165/4165 [25:23<00:00,  2.73it/s, accuracy=0.559, cost=2.68] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.17it/s, accuracy=0.249, cost=4.84]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 14, training avg loss 2.933535, training avg acc 0.479504
epoch 14, testing avg loss 4.938883, testing avg acc 0.278135


minibatch loop: 100%|██████████| 4165/4165 [25:24<00:00,  2.73it/s, accuracy=0.559, cost=2.53] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.22it/s, accuracy=0.237, cost=4.93]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 15, training avg loss 2.818423, training avg acc 0.495691
epoch 15, testing avg loss 5.102652, testing avg acc 0.272984


minibatch loop: 100%|██████████| 4165/4165 [25:22<00:00,  2.73it/s, accuracy=0.59, cost=2.42]  
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.22it/s, accuracy=0.254, cost=5.04]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 16, training avg loss 2.718724, training avg acc 0.509931
epoch 16, testing avg loss 5.265281, testing avg acc 0.272901


minibatch loop: 100%|██████████| 4165/4165 [25:22<00:00,  2.74it/s, accuracy=0.576, cost=2.35] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.21it/s, accuracy=0.243, cost=5.15]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 17, training avg loss 2.636617, training avg acc 0.522048
epoch 17, testing avg loss 5.476555, testing avg acc 0.269884


minibatch loop: 100%|██████████| 4165/4165 [25:22<00:00,  2.74it/s, accuracy=0.576, cost=2.32] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.22it/s, accuracy=0.237, cost=5.17]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 18, training avg loss 2.577286, training avg acc 0.530053
epoch 18, testing avg loss 5.566386, testing avg acc 0.269047


minibatch loop: 100%|██████████| 4165/4165 [25:21<00:00,  2.74it/s, accuracy=0.583, cost=2.33] 
minibatch loop: 100%|██████████| 89/89 [00:10<00:00,  8.20it/s, accuracy=0.266, cost=5.08]
minibatch loop:   0%|          | 0/4165 [00:00<?, ?it/s]

epoch 19, training avg loss 2.554250, training avg acc 0.530188
epoch 19, testing avg loss 5.461653, testing avg acc 0.274765


minibatch loop: 100%|██████████| 4165/4165 [25:22<00:00,  2.73it/s, accuracy=0.538, cost=2.52] 
minibatch loop: 100%|██████████| 89/89 [00:11<00:00,  7.84it/s, accuracy=0.249, cost=4.89]

epoch 20, training avg loss 2.601457, training avg acc 0.515407
epoch 20, testing avg loss 5.154055, testing avg acc 0.288339



