In [1]:
import os

# Process-wide environment switches, set in the first cell so they are in place
# before TensorFlow is imported by a later cell: expose only CUDA device 0 and
# raise TensorFlow's native log threshold to level 3.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

In [2]:
import numpy as np
import json
import tensorflow as tf
import itertools
import collections
import re
import random
import sentencepiece as spm
from tqdm import tqdm
import xlnet_utils as squad_utils
import xlnet

In [3]:
from prepro_utils import preprocess_text, encode_ids

# SentencePiece tokenizer, loaded from a model file in the working directory.
# NOTE(review): sp_model is not referenced again in the cells visible here;
# presumably it was used when the cached features were generated - confirm.
sp_model = spm.SentencePieceProcessor()
# Kept as the last expression of the cell so the return value of Load is shown.
sp_model.Load('sp10m.cased.v9.model')

True

In [4]:
import tensorflow as tf
import logging

# Reduce TensorFlow's Python-side logging to errors only, through both the
# compat.v1 logging module and the standard `logging` logger TensorFlow exposes.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.get_logger().setLevel(logging.ERROR)
# AutoGraph has its own verbosity setting; it is set to level 1 here.
tf.autograph.set_verbosity(1)

In [5]:
import pickle

# Load the cached training data: the pickle holds a 2-tuple that unpacks into
# (train_features, train_examples). Only train_features is used by the cells
# visible below.
# WARNING: pickle.load can execute arbitrary code embedded in the file - only
# open pickles produced by a trusted preprocessing run.
with open('xlnet-squad-train.pkl', 'rb') as fopen:
    train_features, train_examples = pickle.load(fopen)

In [6]:
# Sequence-windowing constants.
# NOTE(review): nothing in the visible cells reads these; presumably they have
# to match the values used when the cached features were built - confirm.
max_seq_length, doc_stride, max_query_length = 512, 128, 64

In [7]:
# Fine-tuning hyperparameters.
epoch = 5
batch_size = 6
warmup_proportion = 0.1
n_best_size = 20
learning_rate = 2e-5

# The training loop walks train_features in strides of batch_size, so a trailing
# partial batch still costs one optimizer step. Round up per epoch so the step
# count handed to the learning-rate schedule equals the number of steps that are
# actually run (plain division under-counts when the last batch is partial).
steps_per_epoch = (len(train_features) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epoch
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [8]:
# Keyword arguments for the XLNet run configuration used to build the training
# graph (is_training is True and both dropout rates are 0.1).
kwargs = {
    'is_training': True,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0.1,
    'dropatt': 0.1,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.05,
    'clamp_len': -1,
}

# Run-time settings object, plus the architecture config read from the JSON
# file that ships next to the pretrained checkpoint.
xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(
    json_path = 'xlnet-base-29-03-2020/config.json'
)

In [9]:
# Optimizer / schedule settings, later unpacked into Parameter(**...).
# Parameter.__init__ in this notebook stores only its eleven named arguments;
# the remaining keys (use_bfloat16, dropout, dropatt, init, init_range,
# init_std, clamp_len) are absorbed by its **kwargs and not kept.
training_parameters = {
    'decay_method': 'poly',
    'train_steps': num_train_steps,
    'learning_rate': learning_rate,
    'warmup_steps': num_warmup_steps,
    'min_lr_ratio': 0.0,
    'weight_decay': 0.00,
    'adam_epsilon': 1e-8,
    'num_core_per_host': 1,
    'lr_layer_decay_rate': 1,
    'use_tpu': False,
    'use_bfloat16': False,
    'dropout': 0.0,
    'dropatt': 0.0,
    'init': 'normal',
    'init_range': 0.1,
    'init_std': 0.05,
    'clip': 1.0,
    'clamp_len': -1,
}

In [10]:
class Parameter:
    """Plain attribute bag for the optimizer and schedule settings.

    Each named constructor argument is stored as an attribute of the same
    name. Any additional keyword arguments are accepted and discarded, which
    lets a wider settings dict be unpacked straight into the constructor.
    """

    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon,
                 num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate,
                 train_steps, min_lr_ratio, clip, **kwargs):
        # Populate the instance dictionary in one go; `kwargs` is intentionally
        # left out so unknown settings never become attributes.
        vars(self).update(
            decay_method = decay_method,
            warmup_steps = warmup_steps,
            weight_decay = weight_decay,
            adam_epsilon = adam_epsilon,
            num_core_per_host = num_core_per_host,
            lr_layer_decay_rate = lr_layer_decay_rate,
            use_tpu = use_tpu,
            learning_rate = learning_rate,
            train_steps = train_steps,
            min_lr_ratio = min_lr_ratio,
            clip = clip,
        )
        
# Replace the settings dict with a Parameter object (attribute access is what
# get_train_op is handed later). The name is rebound in place, so only convert
# while it still holds the dict: without the guard, re-running this cell would
# call Parameter(**<Parameter instance>) and raise TypeError.
if isinstance(training_parameters, dict):
    training_parameters = Parameter(**training_parameters)

In [11]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    """Input placeholders plus the XLNet encoder for SQuAD fine-tuning.

    All placeholders are batch-major; the 2-D inputs are transposed to
    [seq_len, batch] before being passed to `xlnet.XLNetModel`.

    Args:
        is_training: when True (the default) the encoder is built with the
            notebook-level `xlnet_parameters` run config. When False, a run
            config is built from the same notebook-level `kwargs` with
            `is_training=False`. Previously this argument was accepted but
            ignored, so an "inference" model was still built with the
            training run config.

    Attributes:
        X, segment_ids, input_masks, p_mask: [batch, seq_len] placeholders.
        start_positions, end_positions, is_impossible, cls_index: [batch]
            int32 placeholders.
        output: result of `get_sequence_output()` on the encoder.
        model: the underlying `xlnet.XLNetModel` instance.
    """

    def __init__(self, is_training = True):
        # Placeholder creation order is kept stable so the auto-generated
        # placeholder names in the graph do not change.
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.start_positions = tf.placeholder(tf.int32, [None])
        self.end_positions = tf.placeholder(tf.int32, [None])
        self.p_mask = tf.placeholder(tf.float32, [None, None])
        self.is_impossible = tf.placeholder(tf.int32, [None])
        self.cls_index = tf.placeholder(tf.int32, [None])

        # Honour the requested mode. The training path reuses the shared run
        # config object unchanged; the inference path needs its own config
        # because the shared one was created with is_training=True.
        if is_training:
            run_config = xlnet_parameters
        else:
            run_config = xlnet.RunConfig(**dict(kwargs, is_training = False))

        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=run_config,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))

        output = xlnet_model.get_sequence_output()
        self.output = output
        self.model = xlnet_model

In [12]:
# Mode switch for graph construction. It is passed to Model and also selects
# the training vs. inference branch when the output heads are built in the
# next cell.
is_training = True

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of an earlier one.
tf.reset_default_graph()
model = Model(is_training = is_training)

In [13]:
# Beam widths used by the inference branch of the end-logits head:
# top-k start candidates, and top-k end candidates per start candidate.
start_n_top = 5
end_n_top = 5
# Dynamic sequence length, read from axis 1 of the batch-major input ids.
seq_len = tf.shape(model.X)[1]
# Kernel initializer obtained from the XLNet model, reused by every head layer.
initializer = model.model.get_initializer()
# Collects the tensors exposed by the heads built below.
return_dict = {}
# Short aliases for the tensors the heads consume.
p_mask = model.p_mask
output = model.output
cls_index = model.cls_index

# Start-position head: one linear unit per token.
with tf.variable_scope('start_logits'):
    start_logits = tf.layers.dense(
        output, 1, kernel_initializer = initializer
    )
    # Drop the trailing unit axis and go from [seq_len, batch] to [batch, seq_len].
    start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
    # Where p_mask is 1 the logit is replaced by -1e30, so those positions get
    # (numerically) zero probability; where it is 0 the logit is kept.
    start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
    start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)
    
# End-position head, conditioned on the start position. Both branches create
# their layers under the same scope and layer names ('dense_0', 'dense_1'), so
# the training graph and the inference graph use identical variable names.
with tf.variable_scope('end_logits'):
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position

        start_positions = tf.reshape(model.start_positions, [-1])
        # One-hot over the sequence axis: [batch, seq_len].
        start_index = tf.one_hot(
            start_positions, depth = seq_len, axis = -1, dtype = tf.float32
        )
        # Select the hidden state at the gold start token: [batch, hidden].
        start_features = tf.einsum('lbh,bl->bh', output, start_index)
        # Repeat it along a new leading axis so it can be concatenated with
        # every token's hidden state: [seq_len, batch, hidden].
        start_features = tf.tile(start_features[None], [seq_len, 1, 1])
        end_logits = tf.layers.dense(
            tf.concat([output, start_features], axis = -1),
            xlnet_config.d_model,
            kernel_initializer = initializer,
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = tf.contrib.layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = initializer,
            name = 'dense_1',
        )
        # [seq_len, batch, 1] -> [batch, seq_len], then the same masking and
        # log-softmax as for the start logits.
        end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search

        # Keep the start_n_top most likely start positions: both [batch, k].
        start_top_log_probs, start_top_index = tf.nn.top_k(
            start_log_probs, k = start_n_top
        )
        # One-hot per candidate: [batch, k, seq_len].
        start_index = tf.one_hot(
            start_top_index, depth = seq_len, axis = -1, dtype = tf.float32
        )
        # Hidden state at each candidate start: [batch, k, hidden].
        start_features = tf.einsum('lbh,bkl->bkh', output, start_index)
        # Repeat the token states once per candidate: [seq_len, batch, k, hidden].
        end_input = tf.tile(
            output[:, :, None], [1, 1, start_n_top, 1]
        )
        # Repeat the candidate states once per token: [seq_len, batch, k, hidden].
        start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
        end_input = tf.concat([end_input, start_features], axis = -1)
        end_logits = tf.layers.dense(
            end_input,
            xlnet_config.d_model,
            kernel_initializer = initializer,
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = tf.contrib.layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = initializer,
            name = 'dense_1',
        )
        # Drop the unit axis via reshape: [seq_len, batch, k].
        end_logits = tf.reshape(
            end_logits, [seq_len, -1, start_n_top]
        )
        # -> [batch, k, seq_len] so the softmax runs over token positions.
        end_logits = tf.transpose(end_logits, [1, 2, 0])
        # p_mask gains a candidate axis ([batch, 1, seq_len]) and broadcasts.
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        # Best end_n_top ends for every start candidate: [batch, k, end_n_top].
        end_top_log_probs, end_top_index = tf.nn.top_k(
            end_log_probs, k = end_n_top
        )
        # Flatten the (start, end) candidate grid: [batch, k * end_n_top].
        end_top_log_probs = tf.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )

# Publish the head outputs for the active mode: full log-probabilities when
# training, beam-search candidates (scores and indices) at inference.
if is_training:
    return_dict.update({
        'start_log_probs': start_log_probs,
        'end_log_probs': end_log_probs,
    })
else:
    return_dict.update({
        'start_top_log_probs': start_top_log_probs,
        'start_top_index': start_top_index,
        'end_top_log_probs': end_top_log_probs,
        'end_top_index': end_top_index,
    })

# an additional layer to predict answerability
with tf.variable_scope('answer_class'):
    # get the representation of CLS
    # (one-hot over the sequence axis at the fed cls_index, then used to pick
    # that token's hidden state: [batch, hidden])
    cls_index = tf.one_hot(
        cls_index, seq_len, axis = -1, dtype = tf.float32
    )
    cls_feature = tf.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    # (hidden states averaged with the predicted start distribution as weights,
    # rather than taken at a single position: [batch, hidden])
    start_p = tf.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    start_feature = tf.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        xlnet_config.d_model,
        activation = tf.tanh,
        kernel_initializer = initializer,
        name = 'dense_0',
    )
    # Dropout rate is hard-coded to 0.1 and only active when is_training is True.
    ans_feature = tf.layers.dropout(
        ans_feature, 0.1, training = is_training
    )
    # Single bias-free output unit: one logit per example.
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer = initializer,
        name = 'dense_1',
        use_bias = False,
    )
    # [batch, 1] -> [batch]
    cls_logits = tf.squeeze(cls_logits, -1)

    return_dict['cls_logits'] = cls_logits

In [14]:
# Dynamic sequence length (axis 1 of the batch-major input ids); read by
# compute_loss as the one-hot depth.
seq_length = tf.shape(model.X)[1]

def compute_loss(log_probs, positions):
    """Mean negative log-likelihood of the target positions.

    Args:
        log_probs: log-probabilities over token positions, last axis of
            size `seq_length`.
        positions: integer target index for each example.

    Returns:
        Scalar tensor: the negated log-probability at each target position,
        averaged over the examples.
    """
    target_mask = tf.one_hot(positions, depth = seq_length, dtype = tf.float32)
    # The one-hot mask keeps only the log-probability of the target token.
    per_example_nll = -tf.reduce_sum(target_mask * log_probs, axis = -1)
    return tf.reduce_mean(per_example_nll)

# Span loss: average of the start-position and end-position losses.
start_loss = compute_loss(return_dict['start_log_probs'], model.start_positions)
end_loss = compute_loss(return_dict['end_log_probs'], model.end_positions)
total_loss = (start_loss + end_loss) * 0.5

# Answerability loss: sigmoid cross-entropy between the per-example logit and
# the is_impossible label, averaged over the batch.
cls_logits = return_dict['cls_logits']
is_impossible = tf.reshape(model.is_impossible, [-1])
regression_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(
        labels = tf.cast(is_impossible, dtype = tf.float32),
        logits = cls_logits,
    )
)

# note(zhiliny): by default multiply the loss by 0.5 so that the scale is
# comparable to start_loss and end_loss
total_loss = total_loss + regression_loss * 0.5

In [15]:
import model_utils

# Build the training op for total_loss from the Parameter settings object.
# get_train_op returns three values; only the first is kept and it is what the
# training loop passes to sess.run. The other two are discarded here -
# presumably the learning-rate and gradient-norm tensors; TODO confirm against
# model_utils.
optimizer, _, _ = model_utils.get_train_op(training_parameters, total_loss)

In [16]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Map checkpoint variable names to the matching graph variables.

    Only names present both in `tvars` and in the checkpoint are kept, i.e.
    the result is the intersection of the two sets. Graph variables that the
    checkpoint does not contain are simply left out of the map.

    Args:
        tvars: iterable of variables exposing a `.name` such as 'scope/w:0'.
        init_checkpoint: checkpoint path handed to `tf.train.list_variables`.

    Returns:
        A tuple `(assignment_map, initialized_variable_names)`:
        `assignment_map` is an OrderedDict of checkpoint name -> variable, in
        the order the checkpoint lists its variables;
        `initialized_variable_names` is a dict whose keys are every matched
        name, both bare and with a ':0' suffix, each mapped to 1.
    """
    # Index the graph variables by name with the trailing ':<digits>' suffix
    # removed, which is how the checkpoint lists them.
    name_to_variable = collections.OrderedDict()
    for var in tvars:
        suffix_match = re.match('^(.*):\\d+$', var.name)
        name = var.name if suffix_match is None else suffix_match.group(1)
        name_to_variable[name] = var

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    # list_variables yields (name, shape) pairs; the shape is not needed here.
    for name, _shape in tf.train.list_variables(init_checkpoint):
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)


In [17]:
# Pretrained XLNet checkpoint, and the subset of this graph's trainable
# variables that it can initialise (matched by variable name).
checkpoint = 'xlnet-base-29-03-2020/model.ckpt-300000'
tvars = tf.trainable_variables()
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
    tvars, checkpoint
)

In [18]:
# Order matters here: initialise every variable first, then overwrite the ones
# listed in assignment_map with their checkpoint values. Variables that are not
# in assignment_map keep their freshly initialised values.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# This Saver only knows about the name -> variable pairs in assignment_map.
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

In [19]:
from tqdm import tqdm

# Fine-tuning loop. Each epoch walks train_features in stored order (there is
# no shuffling) in strides of batch_size, so the final batch may be smaller.
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_features), batch_size), desc = 'train minibatch loop'
    )
    costs, start_losses, end_losses, regression_losses = [], [], [], []
    for i in pbar:
        batch = train_features[i: i + batch_size]
        batch_ids = [b.input_ids for b in batch]
        batch_masks = [b.input_mask for b in batch]
        batch_segment = [b.segment_ids for b in batch]
        batch_start = [b.start_position for b in batch]
        batch_end = [b.end_position for b in batch]
        # These three carry a batch_ prefix on purpose: the bare names
        # is_impossible, p_mask and cls_index are graph tensors created in the
        # model/loss cells, and rebinding them to Python lists here would
        # silently clobber those tensors for any cell run afterwards.
        batch_is_impossible = [b.is_impossible for b in batch]
        batch_p_mask = [b.p_mask for b in batch]
        batch_cls_index = [b.cls_index for b in batch]
        # One optimizer step; the individual loss terms are fetched in the same
        # run call purely for reporting.
        cost, start_loss_, end_loss_, regression_loss_, _ = sess.run(
            [total_loss, start_loss, end_loss, regression_loss, optimizer],
            feed_dict = {
                model.start_positions: batch_start,
                model.end_positions: batch_end,
                model.X: batch_ids,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks,
                model.is_impossible: batch_is_impossible,
                model.p_mask: batch_p_mask,
                model.cls_index: batch_cls_index
            },
        )
        pbar.set_postfix(cost = cost, start_loss = start_loss_,
                        end_loss = end_loss_, regression_loss = regression_loss_)
        costs.append(cost)
        start_losses.append(start_loss_)
        end_losses.append(end_loss_)
        regression_losses.append(regression_loss_)

    # Epoch summary: unweighted mean over batches of the total, start, end and
    # answerability losses, printed in that order.
    print(f'epoch: {e}')
    print(np.mean(costs))
    print(np.mean(start_losses))
    print(np.mean(end_losses))
    print(np.mean(regression_losses))

train minibatch loop: 100%|██████████| 21796/21796 [3:11:16<00:00,  1.90it/s, cost=0.000533, end_loss=2.36e-6, regression_loss=0.00103, start_loss=3.3e-5]    
train minibatch loop:   0%|          | 0/21796 [00:00<?, ?it/s]

epoch: 0
2.1290479
2.4141414
1.2993002
0.54465365


train minibatch loop: 100%|██████████| 21796/21796 [3:11:11<00:00,  1.90it/s, cost=0.000506, end_loss=6.1e-6, regression_loss=0.000869, start_loss=0.000136]
train minibatch loop:   0%|          | 0/21796 [00:00<?, ?it/s]

epoch: 1
1.317361
1.5911316
0.5834332
0.460157


train minibatch loop: 100%|██████████| 21796/21796 [3:11:12<00:00,  1.90it/s, cost=0.000582, end_loss=4.21e-6, regression_loss=0.000962, start_loss=0.000198]  
train minibatch loop:   0%|          | 0/21796 [00:00<?, ?it/s]

epoch: 2
1.0899074
1.3063318
0.4826195
0.3908636


train minibatch loop: 100%|██████████| 21796/21796 [3:13:01<00:00,  1.88it/s, cost=0.0176, end_loss=2.17e-5, regression_loss=0.00791, start_loss=0.0273]       
train minibatch loop:   0%|          | 0/21796 [00:00<?, ?it/s]

epoch: 3
0.92369837
1.102461
0.41842037
0.3265154


train minibatch loop: 100%|██████████| 21796/21796 [3:22:28<00:00,  1.79it/s, cost=0.259, end_loss=0.000213, regression_loss=0.29, start_loss=0.228]           

epoch: 4
0.827502
0.9784572
0.37899488
0.2975519





In [20]:
# Make sure the target directory exists before writing: a missing parent
# directory would otherwise make the save fail only after the full training
# run has finished.
os.makedirs('xlnet-base-squad', exist_ok = True)

# Only the trainable variables are written, so non-trainable state (for example
# optimizer slot variables) is not part of this checkpoint.
saver = tf.train.Saver(tf.trainable_variables())
# Kept as the last expression of the cell so the returned path is displayed.
saver.save(sess, 'xlnet-base-squad/model.ckpt')

'xlnet-base-squad/model.ckpt'