In [20]:
import tensorflow as tf

In [15]:
from convert_msmarco_to_duobert_tfrecord import load_run, load_queries, load_qrels, load_corpus, merge

In [12]:
queries = load_queries('./queries.dev.small.tsv')
qrels = load_qrels('./qrels.dev.small.tsv')

6980it [00:00, 262169.82it/s]
1234it [00:00, 10955.87it/s]

Loading queries: ./queries.dev.small.tsv
Loading qrels: ./qrels.dev.small.tsv


7437it [00:00, 14361.67it/s]


In [7]:
run = load_run('./run.monobert.dev.small.tsv')
run.keys()

odict_keys(['188714', '1082792', '995526', '199776', '660957', '820267', '837202', '130825', '408149', '345453', '1019649', '1099065', '542431', '1084910', '959083', '816483', '995825', '559771', '1091246', '167436', '899212', '1101211', '1047854', '991832', '426442', '1093443', '779475', '1040038', '706950', '185299', '574730', '1085008', '1011382', '1062784', '1090701', '973917', '999517', '596716', '289276', '990995', '609628', '988787', '996805', '1025483', '998493', '358455', '435412', '1090730', '348594', '218000', '760512', '665972', '1006751', '596130', '988119', '942221', '1001108', '23285', '1002148', '1097438', '727707', '791140', '259417', '913568', '198246', '370734', '1056265', '432874', '1089312', '141694', '1093781', '1009183', '762059', '931147', '1027650', '170770', '1027817', '939866', '1031456', '480064', '1098608', '234114', '27743', '1089868', '404051', '1075636', '684459', '591940', '313940', '1043545', '510893', '250367', '1101531', '920753', '1054339', '1093552

In [14]:
data = merge(qrels=qrels, run=run, queries=queries)

In [16]:
corpus = load_corpus('./collection.tsv')

8841823it [00:34, 255214.11it/s]


In [51]:
import tokenization
from tqdm import tqdm
from collections import defaultdict

pad_doc_id = 5500000
max_query_length = 64
max_seq_length = 512
vocab_file = './duobert-large-msmarco-pretrained-and-finetuned/vocab.txt'
max_docs = 10

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
examples_for_query = defaultdict(list)

def feature_generator():
    for i, query_id in tqdm(enumerate(data), total=len(data)):
        query, qrels, doc_ids = data[query_id]

        query = tokenization.convert_to_unicode(query)
        query_ids = tokenization.convert_to_bert_input(
                text=query,
                max_seq_length=max_query_length,
                tokenizer=tokenizer,
                add_cls=True)

        doc_ids = doc_ids[:max_docs]

        # Add fake docs so we always have max_docs per query.
        doc_ids += max(0, max_docs - len(doc_ids)) * [pad_doc_id]

        labels = [
                1 if doc_id in qrels else 0
                for doc_id in doc_ids
        ]

        doc_token_ids = [
                tokenization.convert_to_bert_input(
                        text=tokenization.convert_to_unicode(
                            corpus[doc_id]),
                        max_seq_length=(
                            max_seq_length - len(query_ids)) // 2,
                        tokenizer=tokenizer,
                        add_cls=False)
                for doc_id in doc_ids
        ]

        input_ids = []
        segment_ids = []
        pair_doc_ids = []
        labels_pair = []
        for num_a, (doc_id_a, doc_token_id_a, label_a) in enumerate(
                zip(doc_ids, doc_token_ids, labels)):
            for num_b, (doc_id_b, doc_token_id_b) in enumerate(
                    zip(doc_ids, doc_token_ids)):
                if num_a == num_b:
                    continue
                input_ids.append(
                    query_ids + doc_token_id_a + doc_token_id_b)
                segment_ids.append((
                        [0] * len(query_ids) +
                        [1] * len(doc_token_id_a) +
                        [2] * len(doc_token_id_b)))
                pair_doc_ids.append((doc_id_a, doc_id_b))
                labels_pair.append(label_a)


        for input_id, segment_id, label, pair_doc_id in zip(
                input_ids, segment_ids, labels_pair, pair_doc_ids):
            
            input_mask = [1] * len(input_ids)
            yield {
                    'input_ids': input_id,
                    'segment_ids': segment_id,
                    'label_ids': label,
                    'input_mask' : input_mask  
            }

In [52]:
def input_fn():
    """The actual input function."""

    output_types = {
        "input_ids": tf.int32,
        "segment_ids": tf.int32,
        "input_mask": tf.int32,
        "label_ids": tf.int32,
    }
    dataset = tf.data.Dataset.from_generator(feature_generator, output_types)

    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes={
            "input_ids": [max_seq_length],
            "segment_ids": [max_seq_length],
            "input_mask": [max_seq_length],
            "label_ids": [],
        },
        padding_values={
            "input_ids": 0,
            "segment_ids": 0,
            "input_mask": 0,
            "label_ids": 0,
        },
        drop_remainder=True)

    return dataset

In [53]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        probs = tf.nn.softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, probs)

In [54]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=0.0003,
                     num_train_steps=0, num_warmup_steps=0, use_tpu=False,
                     use_one_hot_embeddings=False):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):
        """The `model_fn` for TPUEstimator."""

        tf.logging.info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info(
                '    name = %s, shape = %s' % (name, features[name].shape))

        input_ids = features['input_ids']
        input_mask = features['input_mask']
        segment_ids = features['segment_ids']
        label_ids = features['label_ids']

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        (total_loss, per_example_loss, probs) = create_model(
                bert_config, is_training, input_ids, input_mask, segment_ids,
                label_ids, num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        scaffold_fn = None
        initialized_variable_names = []
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)
            if use_tpu:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info('**** Trainable Variables ****')
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info('    name = %s, shape = %s%s', var.name, var.shape,
                            init_string)
            
        output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={
                    "probs": probs,
                    "label_ids": label_ids,
                })
        return output_spec

    return model_fn

In [None]:
import os
import modeling
import numpy as np

model_dir = './duobert-large-msmarco-pretrained-and-finetuned'
bert_config_file = os.path.join(model_dir, 'bert_config.json')
checkpoint = os.path.join(model_dir, 'model.ckpt-100000')
batch_size = 1

bert_config = modeling.BertConfig.from_json_file(bert_config_file)
assert max_seq_len <= bert_config.max_position_embeddings

run_config = tf.estimator.RunConfig(model_dir=str(model_dir))

model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=2,
    init_checkpoint=checkpoint)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config)

results = estimator.predict(input_fn=input_fn,
                           yield_single_examples=True)

for query_id in data.keys():
    query, qrels, doc_ids = data[query_id]
    doc_ids = doc_ids[:max_docs]
    
    labels = [
        1 if doc_id in qrels else 0
        for doc_id in doc_ids
    ]
    scores = []
    for num_a, (doc_id_a, doc_token_id_a, label_a) in enumerate(
            zip(doc_ids, doc_token_ids, labels)):
        candidate_score = 0
        for num_b, (doc_id_b, doc_token_id_b) in enumerate(
                zip(doc_ids, doc_token_ids)):
            if num_a == num_b:
                continue
            result = next(results)
            probs = result['probs']
            label_id = result['label_ids']

            assert label_a == label_id

            candidate_score += probs[1]
        scores.append(candidate_score)
        
    scores = np.array(scores)
    pred_docs = scores.argsort()[::-1]
    print(pred_docs)

INFO:tensorflow:Using config: {'_model_dir': './duobert-large-msmarco-pretrained-and-finetuned', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb8443e470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Could not find trained model in model_dir: ./duobert-large-msmarco-pretrained-and-finetuned, running initialization to predict.
INFO:t

INFO:tensorflow:    name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (1024, 4096), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (4096,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/output/dense/kernel:0, shape = (4096, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*

INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, s

INFO:tensorflow:    name = bert/encoder/layer_12/attention/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/intermediate/dense/kernel:0, shape = (1024, 4096), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/intermediate/dense/bias:0, shape = (4096,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/output/dense/kernel:0, shape = (4096, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/output/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_12/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_13/attention/self/query/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_13/attention/self/query/bias:0, shape = (1024,), *INIT_FR

INFO:tensorflow:    name = bert/encoder/layer_17/attention/self/key/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/self/key/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/self/value/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/self/value/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/output/dense/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/output/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/attention/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_17/intermediate/dense/ke

INFO:tensorflow:    name = bert/encoder/layer_21/intermediate/dense/kernel:0, shape = (1024, 4096), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_21/intermediate/dense/bias:0, shape = (4096,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_21/output/dense/kernel:0, shape = (4096, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_21/output/dense/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_21/output/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_21/output/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_22/attention/self/query/kernel:0, shape = (1024, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_22/attention/self/query/bias:0, shape = (1024,), *INIT_FROM_CKPT*
INFO:tensorflow:    name = bert/encoder/layer_22/attention/self/key/kernel:0, shape = (1024, 1024), *INIT_FROM








  0%|          | 0/6980 [00:00<?, ?it/s][A[A[A[A[A[A[A

In [59]:
data['188714']

('foods and supplements to lower blood sugar',
 {'4321745', '8003843', '8003849'},
 ['4321745',
  '1051360',
  '5499899',
  '1051356',
  '1022485',
  '6442308',
  '4260032',
  '6301923',
  '1022490',
  '2470194',
  '4058667',
  '3045306',
  '3181388',
  '7524224',
  '1679250',
  '4816868',
  '3181390',
  '8177651',
  '3045308',
  '4071546',
  '6629693',
  '1471691',
  '8267410',
  '426112',
  '1679252',
  '7401332',
  '5499905',
  '5030633',
  '4058664',
  '7813525',
  '4058662',
  '8029904',
  '3091043',
  '8003843',
  '5273639',
  '1022482',
  '5273637',
  '297451',
  '8523352',
  '7681662',
  '6947936',
  '3301063',
  '4499516',
  '363666',
  '1051357',
  '1160716',
  '7153968',
  '7280738',
  '6301920',
  '426116',
  '894638',
  '89481',
  '1796942',
  '5366354',
  '8177653',
  '651469',
  '4940858',
  '297445',
  '8029905',
  '8516039',
  '1796944',
  '1224787',
  '3347300',
  '3172505',
  '5548808',
  '7134570',
  '718895',
  '8029907',
  '355518',
  '2094556',
  '3181391',
  '48