In [1]:
import os

# Expose only GPU 0 to this process. This cell runs before the cell that
# imports TensorFlow, so the variable is already set when the framework loads.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
from albert import modeling
from albert import optimization
from albert import tokenization
import tensorflow as tf
import numpy as np




In [3]:
# Cased SentencePiece tokenizer (vocab file + model file).
# NOTE(review): both files live in the `albert-base-2020-04-10` directory while
# the config and checkpoint used below are `albert-tiny` — presumably the two
# releases share one vocabulary; confirm.
tokenizer = tokenization.FullTokenizer(
    vocab_file = 'albert-base-2020-04-10/sp10m.cased.v10.vocab',
    do_lower_case = False,
    spm_model_file = 'albert-base-2020-04-10/sp10m.cased.v10.model',
)


INFO:tensorflow:loading sentence piece model


In [4]:
# Encoder configuration read from the albert-tiny release; later cells use its
# `initializer_range`, `hidden_size` and `hidden_dropout_prob` fields.
bert_config = modeling.AlbertConfig.from_json_file(
    'albert-tiny-2020-04-17/config.json'
)
# Bare expression so the notebook displays the loaded config object.
bert_config




<albert.modeling.AlbertConfig at 0x7f860ac7c940>

In [5]:
import pickle

# Pre-computed SQuAD evaluation data: a pickled pair (features, examples).
# SECURITY: unpickling executes arbitrary code embedded in the file — only load
# a pickle that you produced yourself or otherwise trust.
with open('albert-squad-test.pkl', 'rb') as fopen:
    squad_test_pair = pickle.load(fopen)
test_features, test_examples = squad_test_pair

In [6]:
# `max_seq_length` is the one-hot depth and tile count used when the span heads
# are built, so every fed sequence must have exactly this many tokens.
# `doc_stride` and `max_query_length` are not referenced by any visible cell;
# presumably they record how the pickled features were windowed — confirm.
max_seq_length, doc_stride, max_query_length = 384, 128, 64

In [7]:
# Minibatch size used by the evaluation loop.
batch_size = 22
# Number of candidate answers kept per question when decoding predictions.
n_best_size = 20

# Schedule values derived from the feature count. No visible cell in this
# notebook reads them (no optimizer is built here).
epoch = 5
warmup_proportion = 0.1
num_train_steps = int(len(test_features) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [8]:
from tensorflow.contrib import layers as contrib_layers

class Model:
    """Input placeholders plus the ALBERT encoder built on top of them.

    Every placeholder is ``tf.int32``. The rank-2 ones are fed with one list
    per feature (so axis 0 is the batch); the rank-1 ones hold one value per
    feature.

    Attributes:
        X: rank-2 token ids.
        segment_ids: rank-2 token-type ids.
        input_masks: rank-2 attention mask.
        p_mask: rank-2 mask of positions that cannot be part of an answer.
        start_positions, end_positions, is_impossible: rank-1 labels; the
            inference path in this notebook never feeds them.
        output: the encoder's sequence output tensor.
    """

    def __init__(self, is_training = True):
        def int_matrix():
            # Rank-2 int32 placeholder with both dimensions left dynamic.
            return tf.placeholder(tf.int32, [None, None])

        def int_vector():
            # Rank-1 int32 placeholder with a dynamic length.
            return tf.placeholder(tf.int32, [None])

        # Creation order is kept stable so auto-generated op names do not move.
        self.X = int_matrix()
        self.segment_ids = int_matrix()
        self.input_masks = int_matrix()
        self.start_positions = int_vector()
        self.end_positions = int_vector()
        self.p_mask = int_matrix()
        self.is_impossible = int_vector()

        # `bert_config` is the notebook-level config loaded in an earlier cell.
        encoder = modeling.AlbertModel(
            input_ids = self.X,
            input_mask = self.input_masks,
            token_type_ids = self.segment_ids,
            config = bert_config,
            is_training = is_training,
            use_one_hot_embeddings = False,
        )

        self.output = encoder.get_sequence_output()

In [9]:
# Beam widths: how many start candidates are kept, and how many end candidates
# are kept for each of those starts.
start_n_top = 5
end_n_top = 5
# Not read by any visible cell of this evaluation notebook.
learning_rate = 2e-5
# Python-level switch: selects the inference branch of the span heads and
# disables dropout.
is_training = False

# Clear the default graph, then build the placeholders and encoder in it.
tf.reset_default_graph()
model = Model(is_training)





Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [10]:
# Encoder sequence output, as exposed by Model.
output = model.output
# Batch size, read from axis 0 *before* the transpose below.
bsz = tf.shape(output)[0]
# Collects the tensors this cell exposes to the following cells.
return_dict = {}
# Swap the first two axes. The einsum specs further down label the result
# 'lbh' (presumably length, batch, hidden — confirm against
# AlbertModel.get_sequence_output).
output = tf.transpose(output, [1, 0, 2])

# invalid position mask such as query and special symbols (PAD, SEP, CLS)
# Cast to float32 so it can be used arithmetically when masking logits.
p_mask = tf.cast(model.p_mask, dtype = tf.float32)

# Start-position head: one scalar logit per token.
# The dense layer has no explicit name, so its variables get TensorFlow's
# default layer name inside the 'start_logits' scope.
with tf.variable_scope('start_logits'):
    start_logits = tf.layers.dense(
        output,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
    )
    # Drop the trailing size-1 axis, then swap the two remaining axes so the
    # logits line up with `p_mask` (same layout as the fed model.p_mask).
    start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
    # Where p_mask == 1 the logit is replaced by -1e30, so those positions get
    # (numerically) zero probability after the softmax.
    start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
    start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

# End-position head, conditioned on the start position.
# `is_training` is a plain Python bool, so only ONE of the two branches below
# adds ops to the graph. Both branches name their layers 'dense_0' / 'dense_1'
# inside the 'end_logits' scope, so they address the same variable names.
# Both branches use `max_seq_length` as one-hot depth and tile count, so the
# fed sequences must have exactly that length.
with tf.variable_scope('end_logits'):
    if is_training:
        # during training, compute the end logits based on the
        # ground truth of the start position
        start_positions = tf.reshape(model.start_positions, [-1])
        # One-hot over sequence positions: [batch, max_seq_length].
        start_index = tf.one_hot(
            start_positions,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.float32,
        )
        # Select the hidden state at the gold start position: 'bh'.
        start_features = tf.einsum('lbh,bl->bh', output, start_index)
        # Repeat that vector along a new leading axis, once per position,
        # so it can be concatenated with `output`.
        start_features = tf.tile(
            start_features[None], [max_seq_length, 1, 1]
        )
        end_logits = tf.layers.dense(
            tf.concat([output, start_features], axis = -1),
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )

        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        # Same squeeze / transpose / masking recipe as the start head.
        end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
        end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
    else:
        # during inference, compute the end logits based on beam search

        # Keep the `start_n_top` most likely start positions per example:
        # both results are [batch, start_n_top].
        start_top_log_probs, start_top_index = tf.nn.top_k(
            start_log_probs, k = start_n_top
        )
        # One-hot of each candidate start: [batch, start_n_top, max_seq_length].
        start_index = tf.one_hot(
            start_top_index,
            depth = max_seq_length,
            axis = -1,
            dtype = tf.float32,
        )
        # Hidden state at each candidate start: 'bkh'.
        start_features = tf.einsum('lbh,bkl->bkh', output, start_index)
        # Repeat every token's hidden state once per start candidate:
        # 'lbh' -> [l, b, start_n_top, h].
        end_input = tf.tile(output[:, :, None], [1, 1, start_n_top, 1])
        # Repeat every candidate's start vector once per position:
        # 'bkh' -> [max_seq_length, b, k, h].
        start_features = tf.tile(
            start_features[None], [max_seq_length, 1, 1, 1]
        )
        # Pair each token with each candidate start along the feature axis.
        end_input = tf.concat([end_input, start_features], axis = -1)
        end_logits = tf.layers.dense(
            end_input,
            bert_config.hidden_size,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            activation = tf.tanh,
            name = 'dense_0',
        )
        end_logits = contrib_layers.layer_norm(
            end_logits, begin_norm_axis = -1
        )
        end_logits = tf.layers.dense(
            end_logits,
            1,
            kernel_initializer = modeling.create_initializer(
                bert_config.initializer_range
            ),
            name = 'dense_1',
        )
        # Drop the trailing size-1 axis: [max_seq_length, batch, start_n_top].
        end_logits = tf.reshape(
            end_logits, [max_seq_length, -1, start_n_top]
        )
        # Reorder to [batch, start_n_top, length] so the softmax runs over
        # positions.
        end_logits = tf.transpose(end_logits, [1, 2, 0])
        # p_mask[:, None] broadcasts the per-example mask over the
        # start-candidate axis.
        end_logits_masked = (
            end_logits * (1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        )
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        # Best `end_n_top` ends for every candidate start:
        # [batch, start_n_top, end_n_top].
        end_top_log_probs, end_top_index = tf.nn.top_k(
            end_log_probs, k = end_n_top
        )
        # Flatten the (start, end) beam into one axis per example:
        # [batch, start_n_top * end_n_top].
        end_top_log_probs = tf.reshape(
            end_top_log_probs, [-1, start_n_top * end_n_top]
        )
        end_top_index = tf.reshape(
            end_top_index, [-1, start_n_top * end_n_top]
        )
        
# Publish the tensors produced by whichever branch of the end-position head was
# built: beam-search candidates for inference, full log-probabilities for
# training.
if not is_training:
    return_dict.update({
        'start_top_log_probs': start_top_log_probs,
        'start_top_index': start_top_index,
        'end_top_log_probs': end_top_log_probs,
        'end_top_index': end_top_index,
    })
else:
    return_dict.update({
        'start_log_probs': start_log_probs,
        'end_log_probs': end_log_probs,
    })

# an additional layer to predict answerability
# Produces one scalar logit per example (`cls_logits`) from the hidden state at
# position 0 and a start-probability-weighted summary of the sequence.
with tf.variable_scope('answer_class'):
    # get the representation of CLS
    # One-hot at position 0 for every example: [bsz, max_seq_length].
    cls_index = tf.one_hot(
        tf.zeros([bsz], dtype = tf.int32),
        max_seq_length,
        axis = -1,
        dtype = tf.float32,
    )
    # Picks the hidden state at position 0: 'bh'.
    cls_feature = tf.einsum('lbh,bl->bh', output, cls_index)

    # get the representation of START
    # Probabilities over positions (masked positions get ~0 weight) ...
    start_p = tf.nn.softmax(
        start_logits_masked, axis = -1, name = 'softmax_start'
    )
    # ... used as weights for an average of the hidden states: 'bh'.
    start_feature = tf.einsum('lbh,bl->bh', output, start_p)

    # note(zhiliny): no dependency on end_feature so that we can obtain
    # one single `cls_logits` for each sample
    ans_feature = tf.concat([start_feature, cls_feature], -1)
    ans_feature = tf.layers.dense(
        ans_feature,
        bert_config.hidden_size,
        activation = tf.tanh,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_0',
    )
    # Dropout is only active when the Python flag `is_training` is True.
    ans_feature = tf.layers.dropout(
        ans_feature, bert_config.hidden_dropout_prob, training = is_training
    )
    # Final projection to a single logit; this layer has no bias term.
    cls_logits = tf.layers.dense(
        ans_feature,
        1,
        kernel_initializer = modeling.create_initializer(
            bert_config.initializer_range
        ),
        name = 'dense_1',
        use_bias = False,
    )
    # Drop the trailing size-1 axis so there is one scalar per example.
    cls_logits = tf.squeeze(cls_logits, -1)
    
return_dict['cls_logits'] = cls_logits

Instructions for updating:
Use keras.layers.dropout instead.


In [11]:
# Dynamic length (axis 1) of the fed token ids. No other visible cell reads it.
seq_length = tf.shape(model.X)[1]

# Notebook-level handles: the answerability logits published by the head cell,
# and the flattened answerability labels.
cls_logits = return_dict['cls_logits']
is_impossible = tf.reshape(model.is_impossible, shape = [-1])

In [12]:
# Open an interactive session (it installs itself as the default session).
sess = tf.InteractiveSession()
# Give every global variable an initial value first; the restore below then
# overwrites the trainable ones with the fine-tuned weights.
sess.run(tf.global_variables_initializer())
# Only trainable variables are looked up in the checkpoint.
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, save_path = 'albert-tiny-squad/model.ckpt')

INFO:tensorflow:Restoring parameters from albert-tiny-squad/model.ckpt


In [13]:
import bert_utils as squad_utils

In [14]:
from tqdm import tqdm

# Run the inference graph over every test feature, one minibatch at a time,
# and collect one RawResultV2 per feature in `all_results`.
all_results = []
pbar = tqdm(
    range(0, len(test_features), batch_size), desc = 'test minibatch loop'
)
# The fetched tensors never change, so the list is built once, not per batch.
inference_fetches = [
    start_top_log_probs,
    start_top_index,
    end_top_log_probs,
    end_top_index,
    cls_logits,
]
for i in pbar:
    batch = test_features[i: i + batch_size]
    # Only the inputs the inference graph reads are gathered; the label
    # placeholders (start/end position, is_impossible) are not fed.
    # The mask list is named `batch_p_mask` so it does not rebind the
    # notebook-level graph tensor `p_mask`; likewise no list is bound to
    # `is_impossible`, which is also a graph tensor.
    batch_ids = [b.input_ids for b in batch]
    batch_masks = [b.input_mask for b in batch]
    batch_segment = [b.segment_ids for b in batch]
    batch_p_mask = [b.p_mask for b in batch]
    o = sess.run(
        inference_fetches,
        feed_dict = {
            model.X: batch_ids,
            model.segment_ids: batch_segment,
            model.input_masks: batch_masks,
            model.p_mask: batch_p_mask,
        },
    )
    # Convert each example's row of every fetched array into plain Python
    # floats / ints before wrapping it in a result record.
    for no, b in enumerate(batch):
        start_top_log_probs_ = [float(x) for x in o[0][no].flat]
        start_top_index_ = [int(x) for x in o[1][no].flat]
        end_top_log_probs_ = [float(x) for x in o[2][no].flat]
        end_top_index_ = [int(x) for x in o[3][no].flat]
        cls_logits_ = float(o[4][no].flat[0])
        all_results.append(squad_utils.RawResultV2(
            unique_id = b.unique_id,
            start_top_log_probs = start_top_log_probs_,
            start_top_index = start_top_index_,
            end_top_log_probs = end_top_log_probs_,
            end_top_index = end_top_index_,
            cls_logits = cls_logits_))

test minibatch loop: 100%|██████████| 559/559 [00:28<00:00, 19.31it/s]


In [15]:
# Decoding limits handed to the prediction accumulator.
n_best_size = 20
max_answer_length = 30

# Passed empty to the accumulator and later handed to the evaluator;
# presumably filled in place by the call below — confirm in bert_utils.
result_dict, cls_dict = {}, {}

squad_utils.accumulate_predictions_v2(
    result_dict,
    cls_dict,
    test_examples,
    test_features,
    all_results,
    n_best_size,
    max_answer_length,
    start_n_top,
    end_n_top,
)

In [16]:
import json

# Reference dev set; only its top-level "data" entry is kept.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# elsewhere without editing it.
with open('/home/husein/pure-text/ms-dev-2.0.json') as predict_file:
    dev_set_json = json.load(predict_file)
prediction_json = dev_set_json["data"]

In [17]:
# Files written by the evaluator (relative to the working directory).
output_prediction_file = 'predict.json'
output_nbest_file = 'nbest_predictions.json'
output_null_log_odds_file = 'null_odds.json'

# Last expression of the cell, so the returned metrics are displayed.
# NOTE(review): the recorded run shows identical `exact` and `f1` values; that
# would be consistent with, e.g., every prediction being the empty answer —
# worth verifying.
squad_utils.evaluate_v2(
    result_dict,
    cls_dict,
    prediction_json,
    test_examples,
    test_features,
    all_results,
    n_best_size,
    max_answer_length,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
)

INFO:tensorflow:Writing predictions to: predict.json
INFO:tensorflow:Writing nbest to: nbest_predictions.json
INFO:tensorflow:Writing predictions to: predict.json
INFO:tensorflow:Writing nbest to: nbest_predictions.json


OrderedDict([('exact', 50.008433125316245),
             ('f1', 50.008433125316245),
             ('total', 11858),
             ('null_score_diff_threshold', -2.365844249725342)])