In [37]:
import os
import sys
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from collections import defaultdict, Counter
from transformers import AutoTokenizer, BasicTokenizer, AutoModelForMaskedLM
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval
import pytrec_eval

sys.path.append(os.pardir)
from splade_vocab.models import Splade, BEIRSpladeModel, BEIRSpladeModelIDF
from tqdm import tqdm_notebook

In [49]:
!export CUDA_VISIBLE_DEVICES=1
data_path = "/home/gaia_data/iida.h/BEIR/datasets/scidocs"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/25657 [00:00<?, ?it/s]

In [50]:
def search_result(model_path):
    model = Splade(model_path)
    model.eval()
    tokenizer = model.tokenizer
    beir_splade = BEIRSpladeModel(model, tokenizer)
    dres = DRES(beir_splade, batch_size=64)
    retriever = EvaluateRetrieval(dres, score_function="dot", )
    results = retriever.retrieve(corpus, queries)
    del retriever
    del dres
    del model
    return results

In [51]:
model_pathes = {"mlm-splade-62783": "/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/",
               "mlm-splade-30522": "/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/",
                "splade-62783": "/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model_init/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-init_model-raw-remove--62783-batch_size_40-2022-04-17_09-55-08/",
               "splade": "/home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/"}

In [52]:
tks = {"splade-62783": AutoTokenizer.from_pretrained(os.path.join(model_pathes["mlm-splade-62783"], "0_MLMTransformer")),
       "splade": AutoTokenizer.from_pretrained(os.path.join(model_pathes["splade"], "0_MLMTransformer")),
      "space": BasicTokenizer()}

# Subword Ratio

In [109]:
def word_subword_alignment(t_subs, t_words):
    word_subword = defaultdict(list)
    t_sub = t_subs.pop(0)
    while len(t_subs) and len(t_words):
        t_word = t_words.pop(0)
        while t_sub.strip("#") in t_word and len(t_words) and len(t_subs):
            if t_sub != t_word:
                word_subword[t_word].append(t_sub)
            t_sub = t_subs.pop(0)
            
    del_key = []
    for word, subs in word_subword.items():
        if subs[0][0] != word[0]:
            del_key.append(word)
            
    for word in del_key:
        del word_subword[word]
    return word_subword

In [105]:
corpus['4983']

{'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was to term with 

In [113]:
def word_subword_ratio(corpus, tks):
    nume = 0
    denom = 0
    for cid, d_text in tqdm(corpus.items()):
        text = d_text["title"] + " " + d_text["text"]
        t_words = tks["space"].tokenize(text)
        t_subs_org = tks["splade"].tokenize(text)
        word_subword_splade = word_subword_alignment(t_subs_org, t_words)
        t_words = tks["space"].tokenize(text)
        t_subs_62783 = tks["splade-62783"].tokenize(text)
        word_subword_splade_62783 = word_subword_alignment(t_subs_62783, t_words)
        nume += len(word_subword_splade_62783)
        denom += len(word_subword_splade)
    print(nume, denom, (denom - nume) / denom)
        
        


In [114]:
word_subword_ratio(corpus, tks)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:29<00:00, 175.12it/s]

22911 105263 0.7823451735177602





# Embedding

In [305]:
def embed_move_w_splade(ini_model, splade_model):
    mlm_head_vec = ini_model.cls.predictions.decoder.weight
    mlm_sphead_vec = splade_model.transformer.cls.predictions.decoder.weight
    mlm_head_vec_norm = mlm_head_vec / torch.unsqueeze(torch.norm(mlm_head_vec, dim=1), 1)
    mlm_sphead_vec_norm = mlm_sphead_vec / torch.unsqueeze(torch.norm(mlm_sphead_vec, dim=1), 1)
    l2 = torch.norm(mlm_head_vec - mlm_sphead_vec, dim=1)
    l2_org = torch.mean(l2[:30522])
    l2_add = torch.mean(l2[30522:])
    print(f"l2: {torch.mean(l2)}, l2_org: {l2_org}, l2_add: {l2_add}")
    cos_sim = torch.sum(mlm_head_vec_norm * mlm_sphead_vec_norm, dim=1)
    cos_sim_org = torch.mean(cos_sim[:30522])
    cos_sim_add = torch.mean(cos_sim[30522:])
    print(f"cos_sim: {torch.mean(cos_sim)}, cos_sim_org: {cos_sim_org}, cos_sim_add: {cos_sim_add}")
    topk_l2, topk_ids = torch.topk(l2, 50)
    print(topk_l2)
    print(topk_ids)
    print(splade_model.tokenizer.decode(topk_ids))

In [301]:
def embed_move_w_bert(ini_model, mlm_model, tokenizer):
    ini_head_vec = ini_model.cls.predictions.decoder.weight
    mlm_head_vec = mlm_model.cls.predictions.decoder.weight
    ini_head_vec_norm = ini_head_vec / torch.unsqueeze(torch.norm(ini_head_vec, dim=1), 1)
    mlm_head_vec_norm = mlm_head_vec / torch.unsqueeze(torch.norm(mlm_head_vec, dim=1), 1)
    l2 = torch.norm(ini_head_vec - mlm_head_vec, dim=1)
    l2_org = torch.mean(l2[:30522])
    l2_add = torch.mean(l2[30522:])
    print(f"l2_org: {l2_org}, l2_add: {l2_add}")
    cos_sim = torch.sum(ini_head_vec_norm * mlm_head_vec_norm, dim=1)
    cos_sim_org = torch.mean(cos_sim[:30522])
    cos_sim_add = torch.mean(cos_sim[30522:])
    print(f"cos_sim_org: {cos_sim_org}, cos_sim_add: {cos_sim_add}")
    topk_sim, topk_ids = torch.topk(l2, 50)
    print(topk_sim)
    print(topk_ids)
    print(tokenizer.decode(topk_ids))

In [306]:
mlm_splade_62783 = Splade(model_pathes["mlm-splade-62783"])
mlm_62783 = AutoModelForMaskedLM.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/mlm_model/raw/remove/62783/")
embed_move_w_splade(mlm_62783, mlm_splade_62783)
del mlm_splade_62783
del mlm_62783

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/
l2: 0.06935861706733704, l2_org: 0.08058971166610718, l2_add: 0.05873292312026024
cos_sim: 0.9980770945549011, cos_sim_org: 0.9979574084281921, cos_sim_add: 0.9981902837753296
tensor([0.4130, 0.3312, 0.3051, 0.3013, 0.2687, 0.2524, 0.2512, 0.2466, 0.2381,
        0.2375, 0.2348, 0.2327, 0.2322, 0.2254, 0.2248, 0.2216, 0.2180, 0.2172,
        0.2168, 0.2167, 0.2157, 0.2105, 0.2081, 0.2072, 0.2043, 0.2040, 0.2029,
        0.2026, 0.2000, 0.1999, 0.1993, 0.1964, 0.1934, 0.1911, 0.1908, 0.1900,
        0.1899, 0.1882, 0.1882, 0.1865, 0.1863, 0.1862, 0.1853, 0.1842, 0.1840,
        0.1836, 0.1836, 0.1835, 0.1825, 0.1824], grad_fn=<TopkBackward0>)
tensor([ 1010,  1996,  1998,  1011,  1012,  2035,  2008,  1999,  2034,  1000,
         2028,  2119,  2036, 29649,

In [307]:
mlm_splade = Splade(model_pathes["mlm-splade-30522"])
mlm_bert = AutoModelForMaskedLM.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/mlm_model/raw/remove/30522/")
embed_move_w_splade(mlm_bert, mlm_splade)
del mlm_bert
del mlm_splade

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/
l2: 0.15539950132369995, l2_org: 0.15539950132369995, l2_add: nan
cos_sim: 0.9933962225914001, cos_sim_org: 0.9933962225914001, cos_sim_add: nan
tensor([0.4178, 0.3455, 0.3240, 0.3098, 0.2900, 0.2883, 0.2878, 0.2853, 0.2830,
        0.2827, 0.2812, 0.2800, 0.2789, 0.2766, 0.2760, 0.2752, 0.2748, 0.2728,
        0.2717, 0.2697, 0.2690, 0.2662, 0.2661, 0.2656, 0.2654, 0.2652, 0.2649,
        0.2636, 0.2633, 0.2632, 0.2626, 0.2624, 0.2623, 0.2615, 0.2607, 0.2598,
        0.2597, 0.2589, 0.2586, 0.2585, 0.2580, 0.2580, 0.2578, 0.2575, 0.2574,
        0.2572, 0.2572, 0.2564, 0.2560, 0.2559], grad_fn=<TopkBackward0>)
tensor([ 1010,  1998,  1011, 29649,  1996,  1000,  1012, 10985, 28075,  2015,
        29655, 23726,  2035,  1999,  7414, 21767,  5933, 29023, 19

In [308]:
splade = Splade(model_pathes["splade"])
bert = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
embed_move_w_splade(bert, splade)
del bert
del splade

path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


l2: 0.20133037865161896, l2_org: 0.20133037865161896, l2_add: nan
cos_sim: 0.9893619418144226, cos_sim_org: 0.9893619418144226, cos_sim_add: nan
tensor([0.3735, 0.3532, 0.3423, 0.3410, 0.3384, 0.3377, 0.3334, 0.3330, 0.3289,
        0.3278, 0.3277, 0.3276, 0.3252, 0.3244, 0.3236, 0.3219, 0.3217, 0.3207,
        0.3207, 0.3205, 0.3197, 0.3191, 0.3191, 0.3190, 0.3189, 0.3185, 0.3180,
        0.3167, 0.3165, 0.3144, 0.3130, 0.3127, 0.3121, 0.3119, 0.3117, 0.3116,
        0.3114, 0.3112, 0.3107, 0.3106, 0.3101, 0.3100, 0.3099, 0.3098, 0.3095,
        0.3094, 0.3094, 0.3088, 0.3086, 0.3085], grad_fn=<TopkBackward0>)
tensor([29649, 19591, 26262,  1996, 28079,  5058,  1067,  6120,  1998, 21639,
        23726, 24140,  5567,  9432,  9857, 27717,  1010, 18300, 10372, 22462,
         7451, 12781, 29247,  1000,  5296, 29662, 16662, 29653, 26052,  3758,
        29652,  5378, 20984,  5933,  3386,  2441,  2249,  4052,  4786,  2760,
        24915, 29656,  8168, 20821, 12938,  4793, 16115, 26375,  4036

In [290]:
bert = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer= AutoTokenizer.from_pretrained("bert-base-uncased")
mlm_bert = AutoModelForMaskedLM.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/mlm_model/raw/remove/30522/")
embed_move_w_bert(bert, mlm_bert, tokenizer)
del bert
del mlm_bert

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


l2_org: 0.1896594762802124, l2_add: nan
cos_sim_org: 0.9900968670845032, cos_sim_add: nan
tensor([0.8245, 0.8822, 0.9048, 0.9196, 0.9361, 0.9366, 0.9368, 0.9401, 0.9420,
        0.9433, 0.9434, 0.9450, 0.9467, 0.9469, 0.9472, 0.9478, 0.9481, 0.9482,
        0.9487, 0.9490, 0.9493, 0.9508, 0.9510, 0.9515, 0.9520, 0.9521, 0.9521,
        0.9522, 0.9523, 0.9524, 0.9524, 0.9529, 0.9532, 0.9533, 0.9536, 0.9539,
        0.9542, 0.9542, 0.9543, 0.9544, 0.9548, 0.9557, 0.9563, 0.9565, 0.9567,
        0.9573, 0.9576, 0.9577, 0.9577, 0.9580], grad_fn=<NegBackward0>)
tensor([ 2133,  1529, 21932, 29658,   999,  2010,  2002,  2201,  3140,  6398,
         4316,  5525,  2014,  4098,  4172,  2182, 10061,  2720,  6473,  3117,
         4632,  5043,  6384,     0,  2516,  3906,  3606,  2745, 15008, 25850,
         1079,  2033,  9809,  3611, 11588,  2017,  3931, 15287,  4727,  6574,
         8142,  2282,  7782,  2986,  4267,  4575,  4075,  2396,  6785,  2751])
... …´s´! his he album forced reporter vehicle

In [292]:
ini_62783 = AutoModelForMaskedLM.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/init_model/raw/remove/62783/")
mlm_62783 = AutoModelForMaskedLM.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/mlm_model/raw/remove/62783/")
tokenizer = AutoTokenizer.from_pretrained("/home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/mlm_model/raw/remove/62783/")
embed_move_w_bert(ini_62783, mlm_62783, tokenizer)
del ini_62783
del mlm_62783

l2_org: 0.1923682689666748, l2_add: 0.2712787985801697
cos_sim_org: 0.9897450804710388, cos_sim_add: 0.9603198766708374
tensor([0.7223, 0.7256, 0.7430, 0.7435, 0.7449, 0.7472, 0.7490, 0.7515, 0.7532,
        0.7561, 0.7609, 0.7617, 0.7648, 0.7676, 0.7707, 0.7709, 0.7722, 0.7731,
        0.7734, 0.7734, 0.7746, 0.7754, 0.7771, 0.7779, 0.7791, 0.7809, 0.7817,
        0.7824, 0.7825, 0.7827, 0.7832, 0.7833, 0.7852, 0.7852, 0.7864, 0.7872,
        0.7874, 0.7874, 0.7875, 0.7889, 0.7892, 0.7893, 0.7894, 0.7901, 0.7902,
        0.7904, 0.7910, 0.7916, 0.7933, 0.7934], grad_fn=<NegBackward0>)
tensor([30525, 30806, 30675, 30729, 34649, 31151, 30660, 31240, 30743, 32271,
        31713, 30809, 60347, 59650, 32793, 31250, 59963, 59553, 33892, 61525,
        58426, 32057, 60199, 32999, 30834, 60231, 32415, 60871, 32579, 57866,
        58037, 59736, 32934, 61184, 61709, 60003, 35333, 62377, 61701, 62121,
        59403, 62778, 33950, 57087, 61130, 59407, 62317, 38682, 50031, 61681])
∼ ␤ ϭ ␣ ∝ ⊙ ϯ ؉ 

In [53]:
all_results = {}
for k, model_path in model_pathes.items():
    results = search_result(model_path)
    all_results[k] = results

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/802 [00:00<?, ?it/s]

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/802 [00:00<?, ?it/s]

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model_init/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-init_model-raw-remove--62783-batch_size_40-2022-04-17_09-55-08/


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/802 [00:00<?, ?it/s]

path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/802 [00:00<?, ?it/s]

In [54]:
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut_10'})
all_ndcg = {}
all_ave_ndcg = {}
for k, result in all_results.items():
    ndcg = evaluator.evaluate(result)
    all_ndcg[k] = ndcg
    all_ave_ndcg[k] = np.mean([val["ndcg_cut_10"] for val in ndcg.values()])

In [55]:
all_ave_ndcg

{'mlm-splade-62783': 0.15577710688764188,
 'mlm-splade-30522': 0.15691002377354152,
 'splade-62783': 0.15610918267920665,
 'splade': 0.1549133613255218}

In [56]:
df_all_ndcg = {}
model_names = ["mlm-splade-62783", "mlm-splade-30522", "splade-62783", "splade"]
for k in model_names:
    target = all_ndcg[k]
    df_all_ndcg[k] = [val['ndcg_cut_10'] for val in target.values()]
    
df_all_ndcg = pd.DataFrame(df_all_ndcg)
df_all_ndcg.head()

Unnamed: 0,mlm-splade-62783,mlm-splade-30522,splade-62783,splade
0,0.33916,0.213986,0.33916,0.16958
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.213986,0.16958,0.213986,0.33916
4,0.867044,0.636682,0.636682,0.636682


In [57]:
diff_from_splade = {}
diff_model_names = ["mlm-splade-62783", "mlm-splade-30522"]
q_index = list(queries.keys())
for k in diff_model_names:
    target = all_ndcg[k]
    base = all_ndcg["splade"]
    diff_from_splade[k] = [val1["ndcg_cut_10"] - val2["ndcg_cut_10"] for val1, val2 in zip(target.values(), base.values())]
    
df_diff_from_splade = pd.DataFrame(diff_from_splade)
df_diff_from_splade

Unnamed: 0,mlm-splade-62783,mlm-splade-30522
0,0.169580,0.044406
1,0.000000,0.000000
2,0.000000,0.000000
3,-0.125174,-0.169580
4,0.230361,0.000000
...,...,...
995,-0.125174,-0.125174
996,0.000000,0.000000
997,-0.098039,0.004058
998,0.000000,0.000000


In [58]:
diff_from_mlm_splade = {}
diff_model_names = ["mlm-splade-62783", "splade"]
q_index = list(queries.keys())
for k in diff_model_names:
    target = all_ndcg[k]
    base = all_ndcg["mlm-splade-30522"]
    diff_from_mlm_splade[k] = [val1["ndcg_cut_10"] - val2["ndcg_cut_10"] for val1, val2 in zip(target.values(), base.values())]
    
df_diff_from_mlm_splade = pd.DataFrame(diff_from_mlm_splade)
df_diff_from_mlm_splade

Unnamed: 0,mlm-splade-62783,splade
0,0.125174,-0.044406
1,0.000000,0.000000
2,0.000000,0.000000
3,0.044406,0.169580
4,0.230361,0.000000
...,...,...
995,0.000000,0.125174
996,0.000000,0.000000
997,-0.102097,-0.004058
998,0.000000,0.000000


In [59]:
qids_adalm_splade_better = df_diff_from_splade[(df_diff_from_splade["mlm-splade-62783"] > 0.0)].index
qids_adalm_splade_better = [q_index[i] for i in qids_adalm_splade_better]

In [60]:
qids_adalm_splade_worse = df_diff_from_splade[df_diff_from_splade["mlm-splade-62783"] < 0.0].index
qids_adalm_splade_worse = [q_index[i] for i in qids_adalm_splade_worse]

In [61]:
print(len(qrels), len(qids_davosplade_better), len(qids_davosplade_worse))

NameError: name 'qids_davosplade_better' is not defined

In [None]:
def count_em_w_space_tk(model_name, qids, all_results):
    def encode_doc(model, e_query, text, t_query, t_query_space):
        t_text = model.tokenizer(text, max_length=512, return_tensors="pt")
        with torch.no_grad():
            e_text = model.encode(**t_text)
            match_scores = e_query * e_text
            score, ids = torch.topk(match_scores, k=20)
            ids = ids[0][score[0] > 0.1]
            score = score[0][score[0] > 0.1]
            tids = model.tokenizer.decode(ids)
        t_tids = tids.split()
        match_type = {"em": [], "not-em": [], "space-tk": []}
        not_em = []
        for t in t_tids:
            if t in t_query:
                match_type["em"].append(t)
            else:
                match_type["not-em"].append(t)
         
            if t in t_query_space:
                match_type["space-tk"].append(t)
                
        return match_type
    
    model_path = model_pathes[model_name]
    model = Splade(model_path)
    space_tk = BasicTokenizer()
    
    match_token_splade_nums = []
    expand_token_splade_nums = []
    match_token_tk_nums = []
        
    for qid in qids:
        query = queries[qid]
        top_10_doc = sorted(all_results[model_name][qid].items(), key=lambda x: -x[1])[:10]
        t_query_id = model.tokenizer(query, max_length=512, return_tensors="pt")
        t_query = model.tokenizer.tokenize(query)
        t_query_space = space_tk.tokenize(query)
        with torch.no_grad():    
            e_query = model.encode(**t_query_id)

        top_qrels = []
        try:
            q_qrels = qrels[qid]
        except:
            continue
        for i, (cid, qrel) in enumerate(q_qrels.items()):            
            if qrel > 0:
                text = corpus[cid]["title"] + " " + corpus[cid]["text"]
                t_text = model.tokenizer.tokenize(text)
                match_token_tk_nums.append(len([t for t in t_query if t in t_text]))
                match_type = encode_doc(model, e_query, text, t_query, t_query_space)
                match_token_splade_nums.append(len(match_type["em"]))
                expand_token_splade_nums.append(len(match_type["not-em"]))
        

    return match_token_tk_nums, match_token_splade_nums, expand_token_splade_nums

In [None]:
observe_model = ["mlm-splade-62783", "mlm-splade-30522", "splade-62783", "splade"]
# observe_model = ["mlm-splade-62783", "splade"]
for model_name in observe_model:
    match_token_tk_nums, match_token_splade_nums, expand_token_splade_nums = count_em_w_space_tk(model_name, qids_adalm_splade_better, all_results)
    print(f"{model_name}, em_tk: {np.mean(match_token_tk_nums)}, em_splade: {np.mean(match_token_splade_nums)}, expand: {np.mean(expand_token_splade_nums)}")

In [None]:
observe_model = ["mlm-splade-62783", "mlm-splade-30522", "splade-62783", "splade"]
# observe_model = ["mlm-splade-62783", "splade"]
for model_name in observe_model:
    match_token_tk_nums, match_token_splade_nums, expand_token_splade_nums = count_em_w_space_tk(model_name, qids_adalm_splade_worse, all_results)
    print(f"{model_name}, em_tk: {np.mean(match_token_tk_nums)}, em_splade: {np.mean(match_token_splade_nums)}, expand: {np.mean(expand_token_splade_nums)}")

In [None]:
observe_model = ["mlm-splade-62783", "mlm-splade-30522", "splade-62783", "splade"]
# observe_model = ["mlm-splade-62783", "splade"]
for model_name in observe_model:
    match_token_tk_nums, match_token_splade_nums, expand_token_splade_nums = count_em_w_space_tk(model_name, q_index, all_results)
    print(f"{model_name}, em_tk: {np.mean(match_token_tk_nums)}, em_splade: {np.mean(match_token_splade_nums)}, expand: {np.mean(expand_token_splade_nums)}")

In [348]:
ndcg_adalm_splade_better = df_all_ndcg[(df_diff_from_splade["mlm-splade-62783"] > 0.1) & (df_diff_from_mlm_splade["mlm-splade-62783"] > 0.1)]
qids_adalm_splade_better_show = df_diff_from_splade[(df_diff_from_splade["mlm-splade-62783"] > 0.1) & (df_diff_from_mlm_splade["mlm-splade-62783"] > 0.1)].index
qids_adalm_splade_better_show = [q_index[i] for i in qids_adalm_splade_better_show]

In [358]:
ndcg_adalm_splade_worse = df_all_ndcg[(df_diff_from_splade["mlm-splade-62783"] < -0.1) & (df_diff_from_mlm_splade["mlm-splade-62783"] < -0.1)]
qids_adalm_splade_worse_show = df_diff_from_splade[(df_diff_from_splade["mlm-splade-62783"] < -0.1) & (df_diff_from_mlm_splade["mlm-splade-62783"] < -0.1)].index
qids_adalm_splade_worse_show = [q_index[i] for i in qids_adalm_splade_worse_show]

# domain specific words

In [None]:
def em_w_space_tk(model_name, qids, all_results):
    def encode_doc(model, e_query, text, t_query, t_query_space):
        t_text_id = model.tokenizer(text, max_length=512, return_tensors="pt")
        t_text = model.tokenizer.tokenize(text)
        with torch.no_grad():
            e_text = model.encode(**t_text_id)
            match_scores = e_query * e_text
            score, ids = torch.topk(match_scores, k=20)
            ids = ids[0][score[0] > 0.0]
            score = score[0][score[0] > 0.0]
            tids = model.tokenizer.decode(ids)
        t_tids = tids.split()
        match_type = {"em": [], "not-em": [], "space-tk": []}
        not_em = []
        for t in t_tids:
            if t in t_text:
                match_type["em"].append(t)
            else:
                match_type["not-em"].append(t)
         
            if t in t_query_space:
                match_type["space-tk"].append(t)
                
        return match_type
    
    model_path = model_pathes[model_name]
    model = Splade(model_path)
    space_tk = BasicTokenizer()
    
    match_token_splade = []
    expand_token_splade = []
    match_token_tk = []
        
    for qid in qids:
        query = queries[qid]
        top_10_doc = sorted(all_results[model_name][qid].items(), key=lambda x: -x[1])[:10]
        t_query_id = model.tokenizer(query, max_length=512, return_tensors="pt")
        t_query = model.tokenizer.tokenize(query)
        t_query_space = space_tk.tokenize(query)
        with torch.no_grad():    
            e_query = model.encode(**t_query_id)

        top_qrels = []
        try:
            q_qrels = qrels[qid]
        except:
            continue
        for i, (cid, qrel) in enumerate(q_qrels.items()):            
            if qrel > 0:
                text = corpus[cid]["title"] + " " + corpus[cid]["text"]
                t_text = model.tokenizer.tokenize(text)
                # match_token_tk_nums.append(len([t for t in t_query if t in t_text]))
                match_type = encode_doc(model, e_query, text, t_query, t_query_space)
                match_token_splade += match_type["em"]
                expand_token_splade += match_type["not-em"]
        

    return match_token_splade, expand_token_splade

In [33]:
def calc_idf(corpus, tokenizer):
    N = len(corpus)
    idf = defaultdict(float)
    df = Counter()
    for cid, doc in tqdm_notebook(corpus.items()):
        text = doc["title"] + " " + doc["text"]
        t_doc = tokenizer.tokenize(text)
        df.update(list(set(t_doc)))
    
    for v, freq in df.items():
        idf[v] = np.log(N/freq)
        df[v] = freq / N
    return df, idf

In [25]:
data_path = "/home/gaia_data/iida.h/BEIR/datasets/msmarco"
ms_corpus, ms_queries, ms_qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [39]:
vocab_size = 62783
tk_names = ["splade", f"splade-{vocab_size}"]
ms_tk_df = {}
for model_name in tk_names:
    df, idf = calc_idf(ms_corpus, tks[model_name])
    ms_tk_df[model_name] = df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for cid, doc in tqdm_notebook(corpus.items()):


  0%|          | 0/8841823 [00:00<?, ?it/s]

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [62]:
tk_df = {}
for model_name in tk_names:
    df, idf = calc_idf(corpus, tks[model_name])
    tk_df[model_name] = df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for cid, doc in tqdm_notebook(corpus.items()):


  0%|          | 0/25657 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/25657 [00:00<?, ?it/s]

In [63]:
word_kind_model = {}

for model_name in tk_names:
    word_kind_model[model_name] = {"domain_words": [], "general_words": []}
    for v in tk_df[model_name]:
        if tk_df[model_name][v] > ms_tk_df[model_name].get(v, 0) * 5:
            word_kind_model[model_name]["domain_words"].append(v)
        else:
            word_kind_model[model_name]["general_words"].append(v)

In [64]:
def num_expand_words(model_name, expand_token_splade, word_kind_model):
    domain_words = []
    general_words = []
    if str(vocab_size) in model_name:
        for w in expand_token_splade:
            if w in word_kind_model[f"splade-{vocab_size}"]["domain_words"]:
                domain_words.append(w)
            else:
                general_words.append(w)
    else:
        for w in expand_token_splade:
            if w in word_kind_model["splade"]["domain_words"]:
                domain_words.append(w)
            else:
                general_words.append(w)
                
    print(f"expand-domain_words:{len(domain_words)}, expand-general_words:{len(general_words)}")

In [65]:
observe_model = [f"mlm-splade-{vocab_size}", "mlm-splade-30522", f"splade-{vocab_size}", "splade"]
# observe_model = ["mlm-splade-62783", "splade"]
for model_name in observe_model:
    match_token_splade, expand_token_splade = em_w_space_tk(model_name, qids_adalm_splade_better, all_results)
    num_expand_words(model_name, expand_token_splade, word_kind_model)

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:11, expand-general_words:8966
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:0, expand-general_words:9167
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model_init/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-init_model-raw-remove--62783-batch_size_40-2022-04-17_09-55-08/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:7, expand-general_words:12428
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Token indices sequence length is longer than the specified maximum sequence length for this model (1817 > 512). Running this sequence through the model will result in indexing errors


expand-domain_words:0, expand-general_words:9838


In [68]:
N = len(qids_adalm_splade_better)
print(11/N, 8996/N)
print(9167/N)
print(7/N, 12428/N)
print(9838/N)

0.052132701421800945 42.63507109004739
43.44549763033175
0.03317535545023697 58.90047393364929
46.62559241706161


In [66]:
observe_model = [f"mlm-splade-{vocab_size}", "mlm-splade-30522", f"splade-{vocab_size}", "splade"]
# observe_model = ["mlm-splade-62783", "splade"]
for model_name in observe_model:
    match_token_splade, expand_token_splade = em_w_space_tk(model_name, q_index, all_results)
    num_expand_words(model_name, expand_token_splade, word_kind_model)

path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:48, expand-general_words:37127
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:0, expand-general_words:39386
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model_init/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-init_model-raw-remove--62783-batch_size_40-2022-04-17_09-55-08/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


expand-domain_words:20, expand-general_words:56410
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors


expand-domain_words:0, expand-general_words:43078


# show docs

In [392]:
def analysis(model_name, qid, all_results):
    def encode_vec(model, e_query, text):
        t_text = model.tokenizer(text, max_length=512, return_tensors="pt")
        with torch.no_grad():
            e_text = model.encode(**t_text)
            match_scores = e_query * e_text
            score, ids = torch.topk(match_scores, k=20)
            ids = ids[0][score[0] > 0.0]
            score = score[0][score[0] > 0.0]
            tids = model.tokenizer.decode(ids)
        print([round(s, 2) for s in score.tolist()])
        print(tids)
        t_tids = tids.split()
        match_type = {"em": [], "not-em": [], "space-tk": []}
        not_em = []
        for t in t_tids:
            if t in t_query:
                match_type["em"].append(t)
            else:
                match_type["not-em"].append(t)
            
            if t in t_query_space:
                match_type["space-tk"].append(t)
                
        print(match_type)
                    
    model_path = model_pathes[model_name]
    query = queries[qid]
    top_10_doc = sorted(all_results[model_name][qid].items(), key=lambda x: -x[1])[:10]
    model = Splade(model_path)
    space_tk = BasicTokenizer()
    t_query_id = model.tokenizer(query, max_length=512, return_tensors="pt")
    t_query = model.tokenizer.tokenize(query)
    t_query_space = space_tk.tokenize(query)
    with torch.no_grad():    
        e_query = model.encode(**t_query_id)
    
    for c_cid, qrel in qrels[qid].items():
        if qrel > 0:
            within_top10 = [(i+1, cid, score) for i, (cid, score) in enumerate(top_10_doc) if cid==c_cid]
            if within_top10:
                print(within_top10, qrel)
            else:
                print("not exist within top10", c_cid)
            text = corpus[c_cid]["title"] + " " + corpus[c_cid]["text"]
            encode_vec(model, e_query, text)
        
    del model

In [394]:
observe_model = ["mlm-splade-62783", "mlm-splade-30522", "splade"]
for qid in qids_adalm_splade_better_show:
    print("----------------------------")
    print(qid, queries[qid])
    for model_name in observe_model:
        print("-------------")
        print(model_name)
        analysis(model_name, qid, all_results)

----------------------------
70 Activation of PPM1D suppresses p53 function.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '5956380', 28.107315063476562)] 1
[3.31, 3.21, 3.19, 2.06, 1.9, 1.87, 1.55, 1.5, 1.42, 1.41, 1.19, 1.12, 0.97, 0.9, 0.4, 0.39, 0.37, 0.34, 0.25, 0.21]
ppm p53 pp activation tp531 function53 suppress pd 1 suppression dna ppt 53 cancer activated suppressor suppressed
{'em': ['ppm', 'p53', 'activation'], 'not-em': ['pp', 'tp531', 'function53', 'suppress', 'pd', '1', 'suppression', 'dna', 'ppt', '53', 'cancer', 'activated', 'suppressor', 'suppressed'], 'space-tk': ['p53', 'activation']}
not exist within top10 4414547
[3.44, 3.22, 3.1, 1.73, 1.73, 1.58, 1.45, 1.31, 1.27, 1.07, 1.06, 0.87, 0.64, 0.47, 0.46, 0.4, 0.4, 0.29, 0.25, 0.15]
p53 ppm pp p153 suppression tp53d dna 1 suppress control 53 cancer ppt function suppressed effect suppressor
{'em': ['p53', 'ppm', 'function'], 'not-em': ['pp', 'p153', 'suppression', 'tp53d', 'dna', '1', 'suppress', 'control', '53', 'cancer', 'ppt', 'suppressed', 'effect', 'suppressor'], 'space-tk': ['p53', 'function']}
-------------
mlm-splade-30522
path

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '5956380', 19.199443817138672)] 1
[6.05, 2.92, 2.57, 1.54, 1.34, 1.28, 1.21, 0.94, 0.79, 0.32, 0.2, 0.01]
pp531m p activation suppressd function suppression 1 drug
{'em': ['p', 'activation', 'function'], 'not-em': ['pp531m', 'suppressd', 'suppression', '1', 'drug'], 'space-tk': ['activation', 'function']}
not exist within top10 4414547
[5.78, 3.12, 2.45, 1.7, 1.51, 0.86, 0.49, 0.18, 0.16, 0.16, 0.12, 0.03]
pp531 pmd suppression influence control 1 effect 53
{'em': [], 'not-em': ['pp531', 'pmd', 'suppression', 'influence', 'control', '1', 'effect', '53'], 'space-tk': []}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '5956380', 17.339065551757812)] 1
[5.25, 2.12, 2.08, 2.07, 1.32, 1.04, 0.99, 0.63, 0.63, 0.41, 0.22, 0.21, 0.18, 0.13, 0.13, 0.1, 0.08, 0.07, 0.06, 0.01]
pp53 p1m function suppress activationd activate gene 1 effect drug barrett enzyme suppressionmd receptor cause
{'em': ['function', 'suppress'], 'not-em': ['pp53', 'p1m', 'activationd', 'activate', 'gene', '1', 'effect', 'drug', 'barrett', 'enzyme', 'suppressionmd', 'receptor', 'cause'], 'space-tk': ['function']}
not exist within top10 4414547
[5.03, 2.59, 2.47, 1.9, 1.16, 0.63, 0.62, 0.51, 0.43, 0.33, 0.31, 0.25, 0.17, 0.14, 0.12, 0.1, 0.04, 0.04, 0.04, 0.04]
pp53 p1m suppressiond function effect suppress gene control 152 enzyme58md barrett not role
{'em': ['function', 'suppress'], 'not-em': ['pp53', 'p1m', 'suppressiond', 'effect', 'gene', 'control', '152', 'enzyme58md', 'barrett', 'not', 'role'], 'space-tk': ['function']}
----------------------------
94 Albendazole is used to treat lymphatic filariasis.
-------------
mlm-splade

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '1215116', 8.476642608642578)] 1
[2.62, 2.54, 1.15, 0.8, 0.53, 0.4, 0.17, 0.16, 0.08, 0.06, 0.03, 0.0, 0.0, 0.0]
lymphatic filariasis lymph fi helpis filarial treatment word ascaris cause include le fibrosis
{'em': ['lymphatic', 'filariasis'], 'not-em': ['lymph', 'fi', 'helpis', 'filarial', 'treatment', 'word', 'ascaris', 'cause', 'include', 'le', 'fibrosis'], 'space-tk': ['lymphatic', 'filariasis']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '1215116', 10.482891082763672)] 1
[2.1, 1.55, 1.54, 1.5, 1.43, 0.69, 0.67, 0.43, 0.36, 0.25, 0.06, 0.05, 0.02, 0.01, 0.01, 0.0]
##larymias fipha benefit liais treatmenttic for treat therapy a medical
{'em': ['treat'], 'not-em': ['##larymias', 'fipha', 'benefit', 'liais', 'treatmenttic', 'for', 'therapy', 'a', 'medical'], 'space-tk': ['treat']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(7, '1215116', 9.221842765808105)] 1
[2.24, 1.69, 1.44, 1.33, 0.97, 0.66, 0.44, 0.26, 0.25, 0.21, 0.14, 0.06, 0.06, 0.06, 0.02, 0.02, 0.01, 0.0]
##larymiaspha fi l cure diseaseistic drug purpose effect treatment.ia therapy medical
{'em': ['fi', 'l'], 'not-em': ['##larymiaspha', 'cure', 'diseaseistic', 'drug', 'purpose', 'effect', 'treatment.ia', 'therapy', 'medical'], 'space-tk': []}
----------------------------
124 Antiretroviral therapy reduces rates of tuberculosis across a broad range of CD4 strata.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '4883040', 37.030921936035156)] 1
[4.08, 4.08, 3.97, 2.81, 2.48, 2.44, 1.23, 1.13, 1.0, 0.95, 0.95, 0.9, 0.9, 0.87, 0.83, 0.81, 0.77, 0.65, 0.63, 0.6]
antiretroviral tuberculosis tb hiv cd4 therapy strata antibiotic help anti drug poly reduced art aids treatment std reduce prednisone antiviral
{'em': ['antiretroviral', 'tuberculosis', 'cd4', 'therapy', 'strata'], 'not-em': ['tb', 'hiv', 'antibiotic', 'help', 'anti', 'drug', 'poly', 'reduced', 'art', 'aids', 'treatment', 'std', 'reduce', 'prednisone', 'antiviral'], 'space-tk': ['antiretroviral', 'tuberculosis', 'cd4', 'therapy', 'strata']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '4883040', 30.298553466796875)] 1
[4.3, 3.71, 3.39, 2.99, 2.79, 2.73, 2.45, 2.17, 1.73, 1.35, 0.83, 0.82, 0.7, 0.62, 0.6, 0.55, 0.52, 0.4, 0.4, 0.31]
cd tuberculosis tbirarov anti therapyret4 cds hiv 4 reduce benefitl drug treatment strata reduction rate
{'em': ['cd', 'tuberculosis', 'anti', 'strata'], 'not-em': ['tbirarov', 'therapyret4', 'cds', 'hiv', '4', 'reduce', 'benefitl', 'drug', 'treatment', 'reduction', 'rate'], 'space-tk': ['tuberculosis', 'strata']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '4883040', 25.94072723388672)] 1
[3.42, 3.38, 3.18, 2.8, 2.69, 2.49, 2.21, 1.93, 1.35, 0.82, 0.78, 0.76, 0.72, 0.66, 0.6, 0.41, 0.37, 0.34, 0.3, 0.27]
##ira cd tuberculosis tb antirovret therapy4 cdsl reduce cure drug treatment 4 preventlov rate risk
{'em': ['##ira', 'cd', 'tuberculosis'], 'not-em': ['tb', 'antirovret', 'therapy4', 'cdsl', 'reduce', 'cure', 'drug', 'treatment', '4', 'preventlov', 'rate', 'risk'], 'space-tk': ['tuberculosis']}
----------------------------
300 Cytosolic proteins bind to iron-responsive elements on mRNAs coding for DMT1. Cytosolic proteins bind to iron-responsive elements on mRNAs coding for proteins involved in iron uptake.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '3553087', 28.224151611328125)] 1
[4.21, 3.1, 2.1, 1.88, 1.58, 1.56, 1.35, 1.26, 1.23, 1.09, 1.05, 1.02, 0.95, 0.88, 0.83, 0.75, 0.7, 0.66, 0.37, 0.29]
iron protein responsive binding proteins metal bind element cy rna mitochondrial responsiveness mitochondria response cytochrome organelle gene sensitive function help
{'em': ['iron', 'responsive', 'proteins', 'bind'], 'not-em': ['protein', 'binding', 'metal', 'element', 'cy', 'rna', 'mitochondrial', 'responsiveness', 'mitochondria', 'response', 'cytochrome', 'organelle', 'gene', 'sensitive', 'function', 'help'], 'space-tk': ['iron', 'responsive', 'proteins', 'bind']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 3553087
[5.44, 2.42, 1.93, 1.73, 1.71, 1.53, 0.97, 0.49, 0.48, 0.42, 0.32, 0.31, 0.21, 0.2, 0.19, 0.15, 0.13, 0.11, 0.1, 0.09]
iron protein rna responsive cy binding element bind metal proteins gene genetic respond response enzyme function drug dna link involved
{'em': ['iron', 'responsive', 'cy', 'bind', 'proteins', 'involved'], 'not-em': ['protein', 'rna', 'binding', 'element', 'metal', 'gene', 'genetic', 'respond', 'response', 'enzyme', 'function', 'drug', 'dna', 'link'], 'space-tk': ['iron', 'responsive', 'bind', 'proteins', 'involved']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '3553087', 20.495887756347656)] 1
[4.74, 2.45, 2.22, 2.08, 1.67, 1.58, 1.14, 1.11, 1.09, 0.74, 0.65, 0.32, 0.26, 0.18, 0.17, 0.15, 0.1, 0.05, 0.04, 0.03]
iron responsive protein cy rna binding element steel bind gene response proteins2 sensitive mechanism receptor role function enzyme detect
{'em': ['iron', 'responsive', 'cy', 'bind'], 'not-em': ['protein', 'rna', 'binding', 'element', 'steel', 'gene', 'response', 'proteins2', 'sensitive', 'mechanism', 'receptor', 'role', 'function', 'enzyme', 'detect'], 'space-tk': ['iron', 'responsive', 'bind']}
----------------------------
312 De novo assembly of sequence data has more specific contigs than unassembled sequence data.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '6173523', 15.483285903930664)] 1
[3.23, 2.37, 2.1, 1.77, 1.61, 0.97, 0.95, 0.87, 0.78, 0.61, 0.59, 0.41, 0.3, 0.22, 0.21, 0.16, 0.12, 0.12, 0.04, 0.04]
sequence novo assembly sequencing data dna sequences specific de con sequenced assemblies specificity synthesis compared di comparison genetic meta word
{'em': ['sequence', 'novo', 'assembly', 'data', 'specific', 'de'], 'not-em': ['sequencing', 'dna', 'sequences', 'con', 'sequenced', 'assemblies', 'specificity', 'synthesis', 'compared', 'di', 'comparison', 'genetic', 'meta', 'word'], 'space-tk': ['sequence', 'novo', 'assembly', 'data', 'specific', 'de']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 6173523
[2.72, 2.06, 1.73, 1.3, 1.19, 1.15, 0.81, 0.73, 0.66, 0.63, 0.18, 0.18, 0.17, 0.09, 0.09, 0.08, 0.06, 0.05, 0.03, 0.01]
sequence assembly novo dna data de sequencing con sequences specific genetic analysis comparison gene technique genes purpose samples database of
{'em': ['sequence', 'assembly', 'novo', 'data', 'de', 'con', 'specific', 'of'], 'not-em': ['dna', 'sequencing', 'sequences', 'genetic', 'analysis', 'comparison', 'gene', 'technique', 'genes', 'purpose', 'samples', 'database'], 'space-tk': ['sequence', 'assembly', 'novo', 'data', 'de', 'specific', 'of']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 6173523
[3.12, 1.05, 0.94, 0.55, 0.52, 0.41, 0.4, 0.38, 0.16, 0.13, 0.11, 0.08, 0.08, 0.04, 0.03, 0.03, 0.02, 0.01, 0.0, 0.0]
sequence data con dna sequences gene sequencing sample analysis specific comparison di course assembly biology software micro compared - module
{'em': ['sequence', 'data', 'con', 'specific', 'assembly'], 'not-em': ['dna', 'sequences', 'gene', 'sequencing', 'sample', 'analysis', 'comparison', 'di', 'course', 'biology', 'software', 'micro', 'compared', '-', 'module'], 'space-tk': ['sequence', 'data', 'specific', 'assembly']}
----------------------------
314 Deamination of cytidine to uridine on the minus strand of viral DNA results in catastrophic G-to-A mutations in the viral genome.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '4347374', 27.291305541992188)] 1
[2.69, 2.48, 2.45, 2.3, 2.17, 1.81, 1.52, 1.51, 1.14, 0.93, 0.82, 0.8, 0.79, 0.67, 0.66, 0.65, 0.51, 0.49, 0.4, 0.35]
g viral dna deamination dea virus lethal cytosine deaminase hiv viruses replication antibiotic rna cy enzyme penicillin recombination dnas virally
{'em': ['g', 'viral', 'dna', 'deamination'], 'not-em': ['dea', 'virus', 'lethal', 'cytosine', 'deaminase', 'hiv', 'viruses', 'replication', 'antibiotic', 'rna', 'cy', 'enzyme', 'penicillin', 'recombination', 'dnas', 'virally'], 'space-tk': ['g', 'viral', 'dna', 'deamination']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 4347374
[2.01, 1.89, 1.78, 1.74, 1.34, 1.06, 0.9, 0.88, 0.62, 0.6, 0.53, 0.51, 0.51, 0.38, 0.23, 0.22, 0.18, 0.15, 0.14, 0.11]
viral dna dea virus g cyg geneticmina viruses rna infectious hiv drug gene replication effect infection factoration
{'em': ['viral', 'dna', 'dea', 'g'], 'not-em': ['virus', 'cyg', 'geneticmina', 'viruses', 'rna', 'infectious', 'hiv', 'drug', 'gene', 'replication', 'effect', 'infection', 'factoration'], 'space-tk': ['viral', 'dna', 'g']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 4347374
[2.24, 1.74, 1.6, 1.49, 1.34, 1.2, 0.72, 0.44, 0.43, 0.4, 0.35, 0.31, 0.26, 0.16, 0.16, 0.14, 0.13, 0.12, 0.11, 0.09]
dna viral virus g cy dea gene viruses effect hiv drug rna enzymeminate bacterial lethal infection infectiousmina mechanism
{'em': ['dna', 'viral', 'g', 'cy', 'dea'], 'not-em': ['virus', 'gene', 'viruses', 'effect', 'hiv', 'drug', 'rna', 'enzymeminate', 'bacterial', 'lethal', 'infection', 'infectiousmina', 'mechanism'], 'space-tk': ['dna', 'viral', 'g']}
----------------------------
343 Diabetic patients with acute coronary syndrome experience increased short-term and long-term risk for bleeding events.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '7873737', 28.73906898498535)] 1
[4.02, 3.55, 3.12, 2.75, 1.64, 1.61, 1.14, 1.12, 0.85, 0.79, 0.76, 0.53, 0.52, 0.5, 0.41, 0.36, 0.35, 0.35, 0.34, 0.33]
diabetic coronary acute diabetes risk diabetics syndrome myocardialbet risks patients patient increase hospital cardio disease increasing corona long prediabetes
{'em': ['diabetic', 'coronary', 'acute', 'risk', 'syndrome', 'patients', 'long'], 'not-em': ['diabetes', 'diabetics', 'myocardialbet', 'risks', 'patient', 'increase', 'hospital', 'cardio', 'disease', 'increasing', 'corona', 'prediabetes'], 'space-tk': ['diabetic', 'coronary', 'acute', 'risk', 'syndrome', 'patients', 'long']}
[(1, '5884524', 31.494159698486328)] 1
[3.98, 3.72, 3.45, 2.24, 1.94, 1.84, 1.82, 1.74, 1.1, 0.99, 0.95, 0.82, 0.59, 0.58, 0.58, 0.55, 0.54, 0.54, 0.41, 0.41]
diabetic coronary diabetes acute myocardial risk term diabetics risksbet syndrome patients disease patient heart nondiabetic cardio prediabetes prediabetic cause
{'em': ['diabetic', 'coronary', 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '7873737', 20.511951446533203)] 1
[2.94, 2.44, 2.32, 1.94, 1.84, 1.55, 1.53, 1.01, 0.57, 0.53, 0.51, 0.42, 0.39, 0.29, 0.26, 0.21, 0.2, 0.18, 0.17, 0.14]
diabetes acute corona glucose risk syndrome heart dia experience increase patient increasingry disease patients influence effect increased with hospital
{'em': ['acute', 'corona', 'risk', 'syndrome', 'dia', 'experience', 'patients', 'increased', 'with'], 'not-em': ['diabetes', 'glucose', 'heart', 'increase', 'patient', 'increasingry', 'disease', 'influence', 'effect', 'hospital'], 'space-tk': ['acute', 'risk', 'syndrome', 'experience', 'patients', 'increased', 'with']}
[(3, '5884524', 22.219032287597656)] 1
[3.64, 2.83, 2.37, 2.25, 2.0, 1.83, 1.5, 1.28, 1.07, 1.01, 0.86, 0.82, 0.58, 0.56, 0.53, 0.49, 0.48, 0.46, 0.28, 0.25]
diabetes corona glucose heart risk acute syndrome long term artery dia patient cardiovascularry event patients disease influence with sugar
{'em': ['corona', 'risk', 'acute', 'syndrome', 'long', 'term', 'dia',

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '7873737', 18.662357330322266)] 1
[3.56, 2.96, 2.78, 1.56, 1.35, 1.07, 0.72, 0.7, 0.6, 0.59, 0.53, 0.47, 0.36, 0.34, 0.32, 0.25, 0.13, 0.13, 0.08, 0.06]
diabetes corona acute risk syndrome dia increase effect experience patient heart large increased patients disease benefit elevated dangerous with hospital
{'em': ['corona', 'acute', 'risk', 'syndrome', 'dia', 'experience', 'increased', 'patients', 'with'], 'not-em': ['diabetes', 'increase', 'effect', 'patient', 'heart', 'large', 'disease', 'benefit', 'elevated', 'dangerous', 'hospital'], 'space-tk': ['acute', 'risk', 'syndrome', 'experience', 'increased', 'patients', 'with']}
[(3, '5884524', 18.67609214782715)] 1
[4.19, 2.79, 2.17, 1.56, 1.37, 1.21, 1.11, 0.81, 0.81, 0.8, 0.78, 0.6, 0.5, 0.45, 0.33, 0.26, 0.16, 0.15, 0.12, 0.1]
diabetes corona acute risk syndrome heart term dia large patient effect disease event patients surgery with chance insulin cause increase
{'em': ['corona', 'acute', 'risk', 'syndrome', 'term', 'dia', 'patie

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '17930286', 17.87362289428711)] 1
[4.83, 3.86, 3.14, 1.66, 1.45, 1.09, 0.79, 0.67, 0.53, 0.5, 0.44, 0.37, 0.29, 0.16, 0.11, 0.1, 0.09, 0.08, 0.01, 0.0]
headache headaches cognitive cognition associated impairment migraine mental correlation with brain correlated percentage correlations cognitively score impairments measure determine dysfunction
{'em': ['headaches', 'cognitive', 'impairment', 'with', 'correlated'], 'not-em': ['headache', 'cognition', 'associated', 'migraine', 'mental', 'correlation', 'brain', 'percentage', 'correlations', 'cognitively', 'score', 'impairments', 'measure', 'determine', 'dysfunction'], 'space-tk': ['headaches', 'cognitive', 'impairment', 'with', 'correlated']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '17930286', 14.328018188476562)] 1
[7.12, 3.7, 1.43, 1.33, 1.14, 0.7, 0.37, 0.36, 0.33, 0.32, 0.23, 0.23, 0.15, 0.13, 0.1, 0.1, 0.09, 0.06, 0.05, 0.05]
headache cognitive associated impairment cognition correlation brain relationship correlated influences symptoms with disability is mental determine percent cause score
{'em': ['headache', 'cognitive', 'impairment', 'correlated', 'with'], 'not-em': ['associated', 'cognition', 'correlation', 'brain', 'relationship', 'influences', 'symptoms', 'disability', 'is', 'mental', 'determine', 'percent', 'cause', 'score'], 'space-tk': ['cognitive', 'impairment', 'correlated', 'with']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '17930286', 11.259998321533203)] 1
[6.53, 2.95, 1.18, 1.16, 1.03, 0.39, 0.26, 0.21, 0.2, 0.19, 0.16, 0.11, 0.05, 0.04, 0.02, 0.02, 0.01, 0.01, 0.0, 0.0]
headache cognitive impairment brain associated mental without symptoms with not disabilitys score ache detect pattern test significant indicate traits
{'em': ['headache', 'cognitive', 'impairment', 'with', 'not'], 'not-em': ['brain', 'associated', 'mental', 'without', 'symptoms', 'disabilitys', 'score', 'ache', 'detect', 'pattern', 'test', 'significant', 'indicate', 'traits'], 'space-tk': ['cognitive', 'impairment', 'with', 'not']}
----------------------------
513 High cardiopulmonary fitness causes increased mortality rate.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '13230773', 20.507173538208008)] 1
[4.7, 2.52, 2.05, 1.95, 1.59, 1.12, 1.11, 0.98, 0.57, 0.57, 0.57, 0.54, 0.46, 0.43, 0.36, 0.27, 0.05, 0.04]
fitness mortality cardio cardiopulmonary death rate survival percentage gym cause exercise cardiorespiratory high physical activity die because effect
{'em': ['fitness', 'mortality', 'cardiopulmonary', 'rate', 'high'], 'not-em': ['cardio', 'death', 'survival', 'percentage', 'gym', 'cause', 'exercise', 'cardiorespiratory', 'physical', 'activity', 'die', 'because', 'effect'], 'space-tk': ['fitness', 'mortality', 'cardiopulmonary', 'rate', 'high']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '13230773', 17.751262664794922)] 1
[4.65, 2.27, 2.07, 1.77, 1.17, 1.16, 0.95, 0.7, 0.64, 0.38, 0.38, 0.32, 0.29, 0.23, 0.2, 0.18, 0.16, 0.12, 0.12, 0.09]
fitness card exercise mortality rate death heart percent high cause cardiovascular risk gym factor increase influence survival higher cardiac weight
{'em': ['fitness', 'card', 'mortality', 'rate', 'high'], 'not-em': ['exercise', 'death', 'heart', 'percent', 'cause', 'cardiovascular', 'risk', 'gym', 'factor', 'increase', 'influence', 'survival', 'higher', 'cardiac', 'weight'], 'space-tk': ['fitness', 'mortality', 'rate', 'high']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '13230773', 13.890487670898438)] 1
[3.43, 2.11, 1.54, 1.25, 1.13, 0.9, 0.83, 0.76, 0.57, 0.49, 0.2, 0.15, 0.1, 0.1, 0.07, 0.06, 0.06, 0.05, 0.03, 0.01]
fitness card exercise mortality rate high gym heart death fit cause obesity factor exercises highest low effect tennis higher training
{'em': ['fitness', 'card', 'mortality', 'rate', 'high'], 'not-em': ['exercise', 'gym', 'heart', 'death', 'fit', 'cause', 'obesity', 'factor', 'exercises', 'highest', 'low', 'effect', 'tennis', 'higher', 'training'], 'space-tk': ['fitness', 'mortality', 'rate', 'high']}
----------------------------
569 In adult tissue, most T cells are memory T cells.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '23460562', 21.04050064086914)] 1
[3.86, 3.23, 2.79, 2.3, 2.09, 2.09, 1.48, 0.74, 0.51, 0.5, 0.34, 0.3, 0.21, 0.15, 0.12, 0.1, 0.04, 0.04, 0.03, 0.02]
t memory tissue cell cells adult tissues adults human memories are tk l lymphocyte activation lymphocytes in location blood t2
{'em': ['t', 'memory', 'tissue', 'cells', 'adult', 'are', 'in'], 'not-em': ['cell', 'tissues', 'adults', 'human', 'memories', 'tk', 'l', 'lymphocyte', 'activation', 'lymphocytes', 'location', 'blood', 't2'], 'space-tk': ['t', 'memory', 'tissue', 'cells', 'adult', 'are', 'in']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '23460562', 20.732620239257812)] 1
[4.54, 3.0, 2.57, 2.42, 2.36, 1.86, 0.81, 0.79, 0.4, 0.31, 0.31, 0.26, 0.25, 0.2, 0.14, 0.1, 0.08, 0.04, 0.04, 0.03]
t tissue cell adult memory cells tissues immune ts blood human memories containocytes are is organcgliayte
{'em': ['t', 'tissue', 'adult', 'memory', 'cells', 'are'], 'not-em': ['cell', 'tissues', 'immune', 'ts', 'blood', 'human', 'memories', 'containocytes', 'is', 'organcgliayte'], 'space-tk': ['t', 'tissue', 'adult', 'memory', 'cells', 'are']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '23460562', 18.107812881469727)] 1
[4.49, 2.59, 2.45, 2.43, 1.77, 1.44, 1.15, 0.61, 0.47, 0.17, 0.13, 0.11, 0.1, 0.09, 0.03, 0.03, 0.03, 0.02, 0.0, 0.0]
t tissue memory cell adult cells memories tissues human ts neurons component found characteristic adultstocytes in composed not
{'em': ['t', 'tissue', 'memory', 'adult', 'cells', 'in'], 'not-em': ['cell', 'memories', 'tissues', 'human', 'ts', 'neurons', 'component', 'found', 'characteristic', 'adultstocytes', 'composed', 'not'], 'space-tk': ['t', 'tissue', 'memory', 'adult', 'cells', 'in']}
----------------------------
619 Increased vessel density along with a reduction in fibrosis decreases the efficacy of chemotherapy treatments.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '20888849', 17.396976470947266)] 1
[3.4, 2.94, 1.96, 1.56, 0.72, 0.7, 0.67, 0.52, 0.5, 0.45, 0.44, 0.4, 0.36, 0.34, 0.28, 0.27, 0.25, 0.23, 0.21, 0.19]
chemotherapy cancer density efficacy increase chemotherapeutic vascular therapy inhibition treatment radiation leukemia enhanced drug cause vessel affect resistance effect tumor
{'em': ['chemotherapy', 'density', 'efficacy', 'vessel'], 'not-em': ['cancer', 'increase', 'chemotherapeutic', 'vascular', 'therapy', 'inhibition', 'treatment', 'radiation', 'leukemia', 'enhanced', 'drug', 'cause', 'affect', 'resistance', 'effect', 'tumor'], 'space-tk': ['chemotherapy', 'density', 'efficacy', 'vessel']}
not exist within top10 2565138
[2.69, 1.73, 1.67, 1.43, 1.01, 0.61, 0.59, 0.52, 0.47, 0.39, 0.39, 0.36, 0.26, 0.24, 0.24, 0.16, 0.14, 0.09, 0.08, 0.08]
cancer vessel vessels chemotherapy vascular affect increase therapy treatment chemotherapeutic drug inhibition artery effect cause suppression tumor increasing help benefit
{'em': ['vessel', 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(7, '20888849', 14.203570365905762)] 1
[2.84, 2.8, 1.32, 0.98, 0.9, 0.87, 0.81, 0.49, 0.49, 0.44, 0.38, 0.36, 0.2, 0.19, 0.18, 0.16, 0.15, 0.15, 0.15, 0.13]
cancer chemotherapy therapy density efficacy vascular tumor increase treatment effectiveness influence enhancement effect enhanced leukemia radiation associated benefit medication inhibit
{'em': ['chemotherapy', 'density', 'efficacy'], 'not-em': ['cancer', 'therapy', 'vascular', 'tumor', 'increase', 'treatment', 'effectiveness', 'influence', 'enhancement', 'effect', 'enhanced', 'leukemia', 'radiation', 'associated', 'benefit', 'medication', 'inhibit'], 'space-tk': ['chemotherapy', 'density', 'efficacy']}
not exist within top10 2565138
[2.53, 1.14, 1.11, 1.09, 0.79, 0.56, 0.38, 0.35, 0.27, 0.21, 0.17, 0.13, 0.13, 0.11, 0.1, 0.05, 0.05, 0.05, 0.05, 0.03]
cancer vessel therapy vascular increase tumor influence benefittherapy increasing treatment effect cause effectiveness medication vein va inhibit patient affected
{'em': ['vessel'],

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 20888849
[2.64, 2.41, 1.2, 1.01, 0.63, 0.42, 0.29, 0.22, 0.17, 0.15, 0.14, 0.13, 0.13, 0.1, 0.1, 0.08, 0.06, 0.06, 0.04, 0.04]
chemotherapy cancer efficacy therapy effect increase drug enhanced cure effective improve tissue treatment benefit vascular effects affect medication leukemia toxic
{'em': ['chemotherapy', 'efficacy'], 'not-em': ['cancer', 'therapy', 'effect', 'increase', 'drug', 'enhanced', 'cure', 'effective', 'improve', 'tissue', 'treatment', 'benefit', 'vascular', 'effects', 'affect', 'medication', 'leukemia', 'toxic'], 'space-tk': ['chemotherapy', 'efficacy']}
not exist within top10 2565138
[2.19, 1.11, 1.1, 0.61, 0.47, 0.34, 0.32, 0.21, 0.15, 0.14, 0.14, 0.13, 0.11, 0.1, 0.07, 0.06, 0.05, 0.03, 0.02, 0.02]
cancer therapy vessel increase effect treatment drug affect vascular cure vein improve chemotherapytherapy result response cause drugs medication therapeutic
{'em': ['vessel'], 'not-em': ['cancer', 'therapy', 'increase', 'effect', 'treatment', 'dr

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '24294572', 47.19951629638672)] 1
[4.2, 3.62, 2.95, 2.95, 2.1, 1.84, 1.84, 1.61, 1.44, 1.41, 1.41, 1.35, 1.35, 1.34, 1.24, 1.22, 1.15, 1.11, 1.02, 0.85]
pten pt lipid phosphataseen p pi pta 4 phosphatases pter 2 3 phosphate phosphorylationpha lipids ph ⌬ phos
{'em': ['pten', 'lipid', 'p', '4', '2', '3', 'phosphate'], 'not-em': ['pt', 'phosphataseen', 'pi', 'pta', 'phosphatases', 'pter', 'phosphorylationpha', 'lipids', 'ph', '⌬', 'phos'], 'space-tk': ['pten', 'lipid', 'p', '4', '2', '3', 'phosphate']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '24294572', 31.380104064941406)] 1
[6.22, 3.62, 2.86, 2.51, 2.18, 2.01, 1.28, 1.17, 1.1, 1.09, 0.96, 0.84, 0.8, 0.74, 0.53, 0.5, 0.45, 0.38, 0.36, 0.29]
pten lip ptasephaos 4id3 enzyme 3 ph three kinase2 2 drug4 synthesis
{'em': ['lip', '3', 'ph', '2'], 'not-em': ['pten', 'ptasephaos', '4id3', 'enzyme', 'three', 'kinase2', 'drug4', 'synthesis'], 'space-tk': ['pten', '3', '2']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '24294572', 24.247833251953125)] 1
[4.8, 2.52, 2.52, 2.1, 1.7, 1.59, 1.13, 1.08, 1.07, 0.85, 0.71, 0.65, 0.53, 0.43, 0.32, 0.26, 0.24, 0.22, 0.2, 0.19]
pt lipen ptasephaid phos 4 three enzyme2 3 ingredient hormone4 mechanism molecule gene
{'em': ['pt', '4', '3'], 'not-em': ['lipen', 'ptasephaid', 'phos', 'three', 'enzyme2', 'ingredient', 'hormone4', 'mechanism', 'molecule', 'gene'], 'space-tk': ['4', '3']}
----------------------------
659 Ivermectin is used to treat lymphatic filariasis.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '1215116', 8.569381713867188)] 1
[2.6, 2.54, 1.16, 0.78, 0.53, 0.51, 0.18, 0.16, 0.1, 0.07, 0.01, 0.0, 0.0]
lymphatic filariasis lymph fi helpis filarial treatment ascaris word cause include fibrosis
{'em': ['lymphatic', 'filariasis'], 'not-em': ['lymph', 'fi', 'helpis', 'filarial', 'treatment', 'ascaris', 'word', 'cause', 'include', 'fibrosis'], 'space-tk': ['lymphatic', 'filariasis']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '1215116', 10.18790340423584)] 1
[1.98, 1.5, 1.47, 1.46, 1.4, 0.69, 0.67, 0.41, 0.35, 0.25, 0.1, 0.05, 0.02, 0.01, 0.01, 0.0, 0.0]
##larias fiympha benefit liais treatmenttic for treat therapy a medical is
{'em': ['treat', 'is'], 'not-em': ['##larias', 'fiympha', 'benefit', 'liais', 'treatmenttic', 'for', 'therapy', 'a', 'medical'], 'space-tk': ['treat', 'is']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '1215116', 9.026580810546875)] 1
[2.19, 1.65, 1.43, 1.36, 0.92, 0.65, 0.41, 0.27, 0.25, 0.14, 0.13, 0.06, 0.05, 0.05, 0.03, 0.03, 0.02, 0.0]
##larymiaspha fi l cureistic drug disease purpose treatment effect antibiotics.ia therapy
{'em': ['fi', 'l'], 'not-em': ['##larymiaspha', 'cureistic', 'drug', 'disease', 'purpose', 'treatment', 'effect', 'antibiotics.ia', 'therapy'], 'space-tk': []}
----------------------------
660 Ivermectin is used to treat onchocerciasis.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '1215116', 7.601224899291992)] 1
[3.63, 2.34, 0.53, 0.37, 0.24, 0.22, 0.17, 0.08, 0.06, 0.01, 0.0, 0.0]
onchocerciasis onchocerc help although ian sarcoidosis treatment on wordcho osteoporosis include
{'em': ['onchocerciasis'], 'not-em': ['onchocerc', 'help', 'although', 'ian', 'sarcoidosis', 'treatment', 'on', 'wordcho', 'osteoporosis', 'include'], 'space-tk': ['onchocerciasis']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 1215116
[2.02, 1.29, 0.99, 0.75, 0.38, 0.24, 0.16, 0.11, 0.05, 0.02, 0.01, 0.01]
##chocercia benefit on treatmentsis cancer for treat is a
{'em': ['on', 'treat', 'is'], 'not-em': ['##chocercia', 'benefit', 'treatmentsis', 'cancer', 'for', 'a'], 'space-tk': ['treat', 'is']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 1215116
[1.74, 1.05, 1.03, 0.56, 0.54, 0.4, 0.14, 0.06, 0.06, 0.05, 0.04, 0.04, 0.02, 0.0, 0.0, 0.0]
##chocercia onsis cure drug effect purpose treatment infection antibiotics. not therapy bacterial
{'em': [], 'not-em': ['##chocercia', 'onsis', 'cure', 'drug', 'effect', 'purpose', 'treatment', 'infection', 'antibiotics.', 'not', 'therapy', 'bacterial'], 'space-tk': []}
----------------------------
684 Lack of clpC does not affect sporulation efficiency in Bacillus subtilis cells.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '4942718', 27.211273193359375)] 1
[3.65, 3.28, 2.79, 2.58, 2.12, 1.98, 1.33, 0.89, 0.86, 0.64, 0.54, 0.51, 0.5, 0.44, 0.42, 0.42, 0.41, 0.38, 0.35, 0.33]
sporulation subtilis bacillus spore sp spor nodulation cell sub inoculation spores bacteriaulation cells blast ba mutant bacterial gastrulationor
{'em': ['sporulation', 'subtilis', 'bacillus', 'cells'], 'not-em': ['spore', 'sp', 'spor', 'nodulation', 'cell', 'sub', 'inoculation', 'spores', 'bacteriaulation', 'blast', 'ba', 'mutant', 'bacterial', 'gastrulationor'], 'space-tk': ['sporulation', 'subtilis', 'bacillus', 'cells']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '4942718', 21.698780059814453)] 1
[2.49, 2.35, 2.21, 2.13, 1.74, 1.55, 1.43, 1.27, 1.12, 1.04, 1.04, 0.85, 0.38, 0.22, 0.19, 0.15, 0.14, 0.11, 0.09, 0.08]
sp subulationor bati bacteriaci celllisllusulate bacterialulated organismccus differentiation fungus affected cells
{'em': ['sp', 'cells'], 'not-em': ['subulationor', 'bati', 'bacteriaci', 'celllisllusulate', 'bacterialulated', 'organismccus', 'differentiation', 'fungus', 'affected'], 'space-tk': ['cells']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '4942718', 17.512161254882812)] 1
[2.34, 2.06, 1.74, 1.51, 1.49, 1.48, 1.12, 0.99, 0.87, 0.73, 0.64, 0.61, 0.33, 0.28, 0.28, 0.08, 0.07, 0.07, 0.07, 0.06]
spulationor subti ballus celllis bacterialci bacteriaulate factor gene test didlization cells mechanism
{'em': ['cells'], 'not-em': ['spulationor', 'subti', 'ballus', 'celllis', 'bacterialci', 'bacteriaulate', 'factor', 'gene', 'test', 'didlization', 'mechanism'], 'space-tk': ['cells']}
----------------------------
700 Localization of PIN1 in the Arabidopsis embryo does not require VPS9a
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '4350400', 14.79353141784668)] 1
[4.45, 1.98, 1.91, 1.88, 1.69, 1.36, 0.76, 0.35, 0.33, 0.09, 0.07, 0.05, 0.04, 0.01, 0.0]
pin pins arabidopsis localization embryo pin1 plant embryonic embryos locationpsis localize non function localizing
{'em': ['arabidopsis', 'localization', 'embryo', 'pin1'], 'not-em': ['pin', 'pins', 'plant', 'embryonic', 'embryos', 'locationpsis', 'localize', 'non', 'function', 'localizing'], 'space-tk': ['arabidopsis', 'localization', 'embryo', 'pin1']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '4350400', 13.603483200073242)] 1
[4.97, 2.16, 1.56, 1.27, 0.95, 0.66, 0.46, 0.41, 0.18, 0.15, 0.05, 0.05, 0.03, 0.02, 0.01, 0.0]
pin embryo localizationidopsis arabize plant signal expression inized formation do locally
{'em': ['pin', 'embryo'], 'not-em': ['localizationidopsis', 'arabize', 'plant', 'signal', 'expression', 'inized', 'formation', 'do', 'locally'], 'space-tk': ['embryo']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '4350400', 13.297731399536133)] 1
[4.84, 1.82, 1.62, 1.06, 1.03, 0.81, 0.63, 0.56, 0.41, 0.33, 0.2, 0.15, 0.14, 0.06, 0.01, 0.01]
pin embryo localidoizedization locally arabpsis pins gene signal mechanism in found connect
{'em': ['pin', 'embryo', 'in'], 'not-em': ['localidoizedization', 'locally', 'arabpsis', 'pins', 'gene', 'signal', 'mechanism', 'found', 'connect'], 'space-tk': ['embryo', 'in']}
----------------------------
702 Localization of PIN1 in the roots of Arabidopsis does not require VPS9a
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '4350400', 14.941852569580078)] 1
[4.57, 2.1, 1.86, 1.84, 1.38, 1.22, 0.98, 0.82, 0.13, 0.1, 0.06, 0.05, 0.03, 0.03, 0.01, 0.01, 0.0]
pin pins localization arabidopsis pin1 roots root plantpsis location non localize help function localisation localized localizing
{'em': ['localization', 'arabidopsis', 'pin1', 'roots'], 'not-em': ['pin', 'pins', 'root', 'plantpsis', 'location', 'non', 'localize', 'help', 'function', 'localisation', 'localized', 'localizing'], 'space-tk': ['localization', 'arabidopsis', 'pin1', 'roots']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '4350400', 13.19176197052002)] 1
[5.26, 1.49, 1.32, 1.04, 0.99, 0.83, 0.73, 0.5, 0.48, 0.44, 0.16, 0.13, 0.05, 0.03, 0.01]
pin localization plantido rootpsis arab rootsize signalized in locally do
{'em': ['pin', 'arab', 'in'], 'not-em': ['localization', 'plantido', 'rootpsis', 'rootsize', 'signalized', 'locally', 'do'], 'space-tk': ['localization', 'in']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(9, '4350400', 12.713798522949219)] 1
[4.81, 1.52, 1.09, 1.01, 0.83, 0.7, 0.69, 0.62, 0.59, 0.59, 0.42, 0.21, 0.18, 0.18, 0.07, 0.06, 0.03, 0.02, 0.01]
pin localizedidoization plant pins locally arab rootpsis roots gene signal connect in region tree found
{'em': ['pin', 'arab', 'roots', 'in'], 'not-em': ['localizedidoization', 'plant', 'pins', 'locally', 'rootpsis', 'gene', 'signal', 'connect', 'region', 'tree', 'found'], 'space-tk': ['roots', 'in']}
----------------------------
715 Low expression of miR7a does represses target genes and exerts a biological function in ovaries.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(10, '18421962', 32.416141510009766)] 1
[4.28, 2.84, 2.23, 1.95, 1.81, 1.48, 1.41, 1.39, 1.32, 1.32, 1.2, 1.12, 1.12, 1.1, 0.95, 0.89, 0.54, 0.5, 0.48, 0.47]
mir target expression mirnas repression repressor targets repressed gene repress mirs rna suppression mirna genes regulation affect dna microrna effect
{'em': ['mir', 'target', 'expression', 'genes'], 'not-em': ['mirnas', 'repression', 'repressor', 'targets', 'repressed', 'gene', 'repress', 'mirs', 'rna', 'suppression', 'mirna', 'regulation', 'affect', 'dna', 'microrna', 'effect'], 'space-tk': ['target', 'expression', 'genes']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 18421962
[6.72, 3.12, 1.43, 0.93, 0.89, 0.87, 0.79, 0.62, 0.6, 0.55, 0.52, 0.48, 0.47, 0.42, 0.28, 0.27, 0.22, 0.21, 0.18, 0.17]
mir target expression rna genetic repression gene targeting suppression genes dna targets suppress influenceress function activity mrna regulation bind
{'em': ['mir', 'target', 'expression', 'genes', 'function'], 'not-em': ['rna', 'genetic', 'repression', 'gene', 'targeting', 'suppression', 'dna', 'targets', 'suppress', 'influenceress', 'activity', 'mrna', 'regulation', 'bind'], 'space-tk': ['target', 'expression', 'genes', 'function']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 18421962
[5.99, 3.07, 1.43, 1.05, 1.01, 0.72, 0.65, 0.61, 0.29, 0.28, 0.22, 0.19, 0.16, 0.13, 0.09, 0.08, 0.08, 0.05, 0.04, 0.04]
mir target expression rna gene effect genes targets mechanism affect dna receptor protein not benefit effects targeting do molecule genetic
{'em': ['mir', 'target', 'expression', 'genes'], 'not-em': ['rna', 'gene', 'effect', 'targets', 'mechanism', 'affect', 'dna', 'receptor', 'protein', 'not', 'benefit', 'effects', 'targeting', 'do', 'molecule', 'genetic'], 'space-tk': ['target', 'expression', 'genes']}
----------------------------
718 Low nucleosome occupancy correlates with low methylation levels across species.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '17587795', 22.394620895385742)] 1
[4.39, 3.02, 2.89, 1.75, 1.67, 1.6, 1.32, 0.92, 0.89, 0.69, 0.65, 0.56, 0.46, 0.42, 0.31, 0.23, 0.11, 0.09, 0.08, 0.08]
methylation methyl nucleosome dna species methyltransferase methylated nucle nucleosomes cytosine histone epigenetic demethylation nucleotide methylene ⌬cle ligation cause density
{'em': ['methylation', 'nucleosome', 'species'], 'not-em': ['methyl', 'dna', 'methyltransferase', 'methylated', 'nucle', 'nucleosomes', 'cytosine', 'histone', 'epigenetic', 'demethylation', 'nucleotide', 'methylene', '⌬cle', 'ligation', 'cause', 'density'], 'space-tk': ['methylation', 'nucleosome', 'species']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '17587795', 19.55850601196289)] 1
[4.58, 2.86, 2.03, 1.81, 1.79, 1.34, 1.24, 1.08, 0.92, 0.68, 0.46, 0.41, 0.32, 0.22, 0.15, 0.13, 0.12, 0.12, 0.1, 0.08]
methyloso nuationclehyl dname species influence genetic organism lack dependent bacteria cause molecule factor enzyme likely
{'em': ['species'], 'not-em': ['methyloso', 'nuationclehyl', 'dname', 'influence', 'genetic', 'organism', 'lack', 'dependent', 'bacteria', 'cause', 'molecule', 'factor', 'enzyme', 'likely'], 'space-tk': ['species']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '17587795', 15.196617126464844)] 1
[4.72, 2.29, 2.09, 2.01, 1.68, 1.12, 0.39, 0.32, 0.2, 0.18, 0.15, 0.13, 0.12, 0.11, 0.11, 0.06, 0.05, 0.03, 0.01, 0.0]
methyl nucleosoationme effect speciesmesatin enzyme affect gene cause organism factor bacterialate component biology
{'em': ['methyl'], 'not-em': ['nucleosoationme', 'effect', 'speciesmesatin', 'enzyme', 'affect', 'gene', 'cause', 'organism', 'factor', 'bacterialate', 'component', 'biology'], 'space-tk': []}
----------------------------
742 Macrolides have no protective effect against myocardial infarction.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '32159283', 19.19904136657715)] 1
[3.73, 2.47, 2.35, 2.29, 2.25, 2.0, 1.63, 1.06, 0.83, 0.73, 0.72, 0.61, 0.57, 0.56, 0.53, 0.5, 0.5, 0.3, 0.27, 0.26]
myocardial macrolides macrolide antibiotic infarction penicillin coronary infarct myocardium against drug effect quinoloneslide heart death have prevent cause do
{'em': ['myocardial', 'macrolides', 'infarction', 'against', 'effect', 'have'], 'not-em': ['macrolide', 'antibiotic', 'penicillin', 'coronary', 'infarct', 'myocardium', 'drug', 'quinoloneslide', 'heart', 'death', 'prevent', 'cause', 'do'], 'space-tk': ['myocardial', 'macrolides', 'infarction', 'against', 'effect', 'have']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '32159283', 16.3913516998291)] 1
[3.47, 2.94, 2.55, 2.21, 2.07, 1.75, 1.25, 0.98, 0.59, 0.52, 0.48, 0.39, 0.35, 0.3, 0.27, 0.26, 0.25, 0.2, 0.2, 0.18]
##farlideoca macrordial heart myction against influence effect drug kidneyfra is effects medication cause cardiac death
{'em': ['against', 'effect'], 'not-em': ['##farlideoca', 'macrordial', 'heart', 'myction', 'influence', 'drug', 'kidneyfra', 'is', 'effects', 'medication', 'cause', 'cardiac', 'death'], 'space-tk': ['against', 'effect']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '32159283', 11.117390632629395)] 1
[3.51, 3.33, 2.95, 1.88, 1.6, 0.92, 0.78, 0.51, 0.43, 0.42, 0.41, 0.34, 0.19, 0.18, 0.14, 0.11, 0.11, 0.09, 0.06, 0.06]
##far macrolideocardial my effect drug againstction antibiotics not effects infection without in prevent ingredient drugs medication
{'em': ['##far', 'my', 'effect', 'in'], 'not-em': ['macrolideocardial', 'drug', 'againstction', 'antibiotics', 'not', 'effects', 'infection', 'without', 'prevent', 'ingredient', 'drugs', 'medication'], 'space-tk': ['effect']}
----------------------------
743 Macrolides protect against myocardial infarction.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '32159283', 17.453205108642578)] 1
[3.86, 2.58, 2.48, 2.45, 2.33, 2.11, 1.56, 1.08, 0.79, 0.71, 0.58, 0.54, 0.44, 0.43, 0.4, 0.35, 0.2, 0.17, 0.15, 0.08]
myocardial macrolides macrolide antibiotic infarction penicillin coronary infarct against drug myocardiumlide quinolones heart death prevent infarctions cause leprosy injury
{'em': ['myocardial', 'macrolides', 'infarction', 'against'], 'not-em': ['macrolide', 'antibiotic', 'penicillin', 'coronary', 'infarct', 'drug', 'myocardiumlide', 'quinolones', 'heart', 'death', 'prevent', 'infarctions', 'cause', 'leprosy', 'injury'], 'space-tk': ['myocardial', 'macrolides', 'infarction', 'against']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '32159283', 13.189335823059082)] 1
[3.44, 3.11, 2.5, 2.35, 1.77, 1.31, 1.15, 0.78, 0.77, 0.54, 0.28, 0.22, 0.22, 0.21, 0.18, 0.17, 0.14, 0.13, 0.1, 0.09]
##farlideoca macrordial heart my againstction drug medication cause effect preventfra collision treatment death reduce drugs
{'em': ['my'], 'not-em': ['##farlideoca', 'macrordial', 'heart', 'againstction', 'drug', 'medication', 'cause', 'effect', 'preventfra', 'collision', 'treatment', 'death', 'reduce', 'drugs'], 'space-tk': []}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '32159283', 10.231046676635742)] 1
[3.66, 3.51, 3.14, 1.88, 1.53, 0.89, 0.63, 0.51, 0.4, 0.34, 0.25, 0.19, 0.12, 0.11, 0.1, 0.04, 0.04, 0.02, 0.02, 0.01]
##far macrolideocardial my against drugction infection antibiotics ingredient prevent in heart nots detect medication bacterial
{'em': ['##far', 'my', 'against', 'in'], 'not-em': ['macrolideocardial', 'drugction', 'infection', 'antibiotics', 'ingredient', 'prevent', 'heart', 'nots', 'detect', 'medication', 'bacterial'], 'space-tk': ['against']}
----------------------------
768 Mercaptopurine is anabolized into the inactive methylmercaptopurine by thiopurine methyltrasnferase (TPMT).
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '6421792', 14.414590835571289)] 1
[3.53, 2.17, 1.63, 1.28, 1.0, 0.79, 0.77, 0.69, 0.47, 0.37, 0.35, 0.28, 0.21, 0.2, 0.17, 0.14, 0.11, 0.1, 0.08, 0.05]
##puri mercapto t thio enzyme mer drugne word th activity inactivated drugs mercapt rnase oxidase acetyl dnaot ter
{'em': ['##puri', 'mercapto'], 'not-em': ['t', 'thio', 'enzyme', 'mer', 'drugne', 'word', 'th', 'activity', 'inactivated', 'drugs', 'mercapt', 'rnase', 'oxidase', 'acetyl', 'dnaot', 'ter'], 'space-tk': []}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 6421792
[1.56, 1.31, 1.0, 0.91, 0.83, 0.54, 0.24, 0.22, 0.11, 0.1, 0.08, 0.06, 0.04, 0.03, 0.03, 0.01, 0.0]
enzyme tse mercap drug dna transformationde enzymestop a th activease discovered that
{'em': ['th'], 'not-em': ['enzyme', 'tse', 'mercap', 'drug', 'dna', 'transformationde', 'enzymestop', 'a', 'activease', 'discovered', 'that'], 'space-tk': []}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 6421792
[1.42, 1.02, 0.87, 0.82, 0.74, 0.71, 0.63, 0.61, 0.57, 0.57, 0.54, 0.45, 0.36, 0.15, 0.09, 0.07, 0.06, 0.04, 0.02, 0.01]
t enzymetopurise gene thne mercap activate drugio enzymes dna mechanism discovered metabolism receptor.
{'em': ['t'], 'not-em': ['enzymetopurise', 'gene', 'thne', 'mercap', 'activate', 'drugio', 'enzymes', 'dna', 'mechanism', 'discovered', 'metabolism', 'receptor.'], 'space-tk': []}
----------------------------
785 Microarray results from culture-amplified mixtures of serotypes correlate poorly with microarray results from uncultured mixtures.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '12471115', 14.75417423248291)] 1
[2.19, 2.16, 1.81, 1.48, 1.14, 1.0, 0.95, 0.87, 0.81, 0.79, 0.79, 0.76, 0.74, 0.67, 0.58, 0.58, 0.5, 0.47, 0.36, 0.32]
microarray serotypes serotype microarrays ser micro test serology culture genotype salmonella serogroup sero serotyping serovars sert results serovar serogroups virus
{'em': ['microarray', 'serotypes', 'culture', 'results'], 'not-em': ['serotype', 'microarrays', 'ser', 'micro', 'test', 'serology', 'genotype', 'salmonella', 'serogroup', 'sero', 'serotyping', 'serovars', 'sert', 'serovar', 'serogroups', 'virus'], 'space-tk': ['microarray', 'serotypes', 'culture', 'results']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 12471115
[2.74, 2.61, 2.17, 2.05, 1.3, 1.17, 0.95, 0.4, 0.33, 0.31, 0.3, 0.29, 0.28, 0.22, 0.22, 0.22, 0.21, 0.14, 0.13, 0.13]
##otype ser microrayar culture test study amp determinerre sampletypeele results genetic laboratory lab tested detect
{'em': ['##otype', 'ser', 'culture', 'results'], 'not-em': ['microrayar', 'test', 'study', 'amp', 'determinerre', 'sampletypeele', 'genetic', 'laboratory', 'lab', 'tested', 'detect'], 'space-tk': ['culture', 'results']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 12471115
[2.84, 2.84, 2.77, 1.73, 1.67, 1.23, 0.72, 0.38, 0.28, 0.28, 0.16, 0.15, 0.12, 0.11, 0.11, 0.08, 0.08, 0.07, 0.01, 0.01]
serotype microray culturear test sample lab detect determine gene sensitive research not effect testing sensor laboratory analysis
{'em': [], 'not-em': ['serotype', 'microray', 'culturear', 'test', 'sample', 'lab', 'detect', 'determine', 'gene', 'sensitive', 'research', 'not', 'effect', 'testing', 'sensor', 'laboratory', 'analysis'], 'space-tk': []}
----------------------------
1100 Statins increase blood cholesterol.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '7662206', 18.985036849975586)] 1
[4.13, 3.58, 3.26, 1.74, 1.73, 1.0, 0.75, 0.57, 0.49, 0.37, 0.36, 0.26, 0.23, 0.23, 0.12, 0.1, 0.04, 0.03]
cholesterol statins statin hypercholesterolemia blood increase help simvastatin drug increasing abs lipid cause decrease ace effect affect because
{'em': ['cholesterol', 'statins', 'blood', 'increase'], 'not-em': ['statin', 'hypercholesterolemia', 'help', 'simvastatin', 'drug', 'increasing', 'abs', 'lipid', 'cause', 'decrease', 'ace', 'effect', 'affect', 'because'], 'space-tk': ['cholesterol', 'statins', 'blood', 'increase']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '7662206', 16.917160034179688)] 1
[4.88, 3.16, 1.72, 1.34, 1.25, 1.23, 1.06, 0.6, 0.48, 0.36, 0.31, 0.3, 0.15, 0.04, 0.02, 0.01, 0.0]
statterol cho bloodles increasein drug influence statisticsins benefit medication increasing function improve treatment
{'em': ['cho'], 'not-em': ['statterol', 'bloodles', 'increasein', 'drug', 'influence', 'statisticsins', 'benefit', 'medication', 'increasing', 'function', 'improve', 'treatment'], 'space-tk': []}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '7662206', 16.194345474243164)] 1
[5.26, 3.03, 1.89, 1.37, 1.37, 0.75, 0.67, 0.65, 0.43, 0.23, 0.19, 0.12, 0.12, 0.11]
statterol cho bloodinles drug increase raiseins cure reduce effect statistics
{'em': ['cho', 'increase'], 'not-em': ['statterol', 'bloodinles', 'drug', 'raiseins', 'cure', 'reduce', 'effect', 'statistics'], 'space-tk': ['increase']}
----------------------------
1225 The locus rs647161 is associated with colorectal carcinoma.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '9650982', 25.589221954345703)] 1
[4.18, 2.91, 2.3, 2.09, 1.82, 1.45, 1.45, 1.3, 1.25, 0.89, 0.89, 0.88, 0.86, 0.78, 0.54, 0.38, 0.38, 0.27, 0.22, 0.2]
colorectal cancer loci crc color47 association locus rs tumor16 gene associated rs6 rectal dna carcinoma7 pancreatic colon
{'em': ['colorectal', 'locus', 'associated', 'rs6'], 'not-em': ['cancer', 'loci', 'crc', 'color47', 'association', 'rs', 'tumor16', 'gene', 'rectal', 'dna', 'carcinoma7', 'pancreatic', 'colon'], 'space-tk': ['colorectal', 'locus', 'associated']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 9650982
[3.25, 2.59, 2.59, 1.52, 1.45, 1.26, 1.1, 0.69, 0.53, 0.52, 0.45, 0.37, 0.29, 0.25, 0.13, 0.13, 0.12, 0.1, 0.09, 0.04]
##ect cancer color rs genetic tumor association associated dna gene chromosomeal mutation disease number colon mutations testotype for
{'em': ['##ect', 'color', 'rs', 'associated'], 'not-em': ['cancer', 'genetic', 'tumor', 'association', 'dna', 'gene', 'chromosomeal', 'mutation', 'disease', 'number', 'colon', 'mutations', 'testotype', 'for'], 'space-tk': ['associated']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 9650982
[2.72, 2.39, 2.13, 1.91, 1.3, 0.74, 0.31, 0.31, 0.26, 0.25, 0.21, 0.2, 0.19, 0.18, 0.18, 0.16, 0.16, 0.12, 0.1, 0.1]
##ect color rs cancer tumor gene chromosome16 score virus dna12 for hereditaryal number disease test cancers
{'em': ['##ect', 'color', 'rs'], 'not-em': ['cancer', 'tumor', 'gene', 'chromosome16', 'score', 'virus', 'dna12', 'for', 'hereditaryal', 'number', 'disease', 'test', 'cancers'], 'space-tk': []}
----------------------------
1226 The loss of the TET protein functions may have dire biological consequences, such as myeloid cancers.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '13777138', 24.03824234008789)] 1
[3.67, 3.1, 1.14, 1.12, 1.09, 1.09, 1.08, 1.04, 0.94, 0.83, 0.83, 0.72, 0.68, 0.66, 0.64, 0.53, 0.45, 0.44, 0.39, 0.38]
te tet tet2 function tetracycline tet1 t teg tether protein tetra tect tem tec enzyme tetanus ⌬ tek tes functions
{'em': ['tet', 'protein', 'functions'], 'not-em': ['te', 'tet2', 'function', 'tetracycline', 'tet1', 't', 'teg', 'tether', 'tetra', 'tect', 'tem', 'tec', 'enzyme', 'tetanus', '⌬', 'tek', 'tes'], 'space-tk': ['tet', 'protein', 'functions']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 13777138
[5.06, 2.2, 0.91, 0.87, 0.73, 0.4, 0.27, 0.2, 0.18, 0.14, 0.12, 0.1, 0.08, 0.07, 0.06, 0.05, 0.01]
tet protein loss function enzyme influenceid functions gene lack biologicalts genetic biology vitamin reduce
{'em': ['protein', 'loss', 'functions'], 'not-em': ['tet', 'function', 'enzyme', 'influenceid', 'gene', 'lack', 'biologicalts', 'genetic', 'biology', 'vitamin', 'reduce'], 'space-tk': ['tet', 'protein', 'loss', 'functions']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 13777138
[3.94, 2.08, 0.9, 0.53, 0.45, 0.43, 0.39, 0.29, 0.27, 0.26, 0.2, 0.09, 0.09, 0.05, 0.03, 0.02, 0.01, 0.01, 0.0, 0.0]
tet function protein effectid gene t enzyme functions receptor drug affect chemical biology purpose virus caused control role
{'em': ['protein', 'functions'], 'not-em': ['tet', 'function', 'effectid', 'gene', 't', 'enzyme', 'receptor', 'drug', 'affect', 'chemical', 'biology', 'purpose', 'virus', 'caused', 'control', 'role'], 'space-tk': ['tet', 'protein', 'functions']}
----------------------------
1319 Transplanted human glial cells can differentiate within the host animal.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '16284655', 18.019319534301758)] 1
[2.79, 2.14, 1.7, 1.6, 1.57, 1.55, 1.18, 1.0, 0.98, 0.72, 0.58, 0.37, 0.36, 0.34, 0.3, 0.29, 0.16, 0.16, 0.11, 0.07]
glial human cells glia host cell g transplant animal animals glioma hosts mature into propagated humans transplanted gn gl implant
{'em': ['glial', 'human', 'cells', 'host', 'animal', 'transplanted'], 'not-em': ['glia', 'cell', 'g', 'transplant', 'animals', 'glioma', 'hosts', 'mature', 'into', 'propagated', 'humans', 'gn', 'gl', 'implant'], 'space-tk': ['glial', 'human', 'cells', 'host', 'animal', 'transplanted']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '16284655', 16.43416976928711)] 1
[3.06, 2.11, 1.89, 1.72, 1.48, 1.3, 1.21, 0.83, 0.41, 0.34, 0.3, 0.24, 0.2, 0.15, 0.13, 0.12, 0.11, 0.09, 0.09, 0.09]
##lia g human cell host animal cellsglia transplantl dog mammalocyteslio animals differentiation humans mature recipient processed
{'em': ['##lia', 'g', 'human', 'host', 'animal'], 'not-em': ['cell', 'cellsglia', 'transplantl', 'dog', 'mammalocyteslio', 'animals', 'differentiation', 'humans', 'mature', 'recipient', 'processed'], 'space-tk': ['human', 'host', 'animal']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '16284655', 12.43342113494873)] 1
[2.36, 2.08, 1.9, 1.75, 1.0, 0.97, 0.56, 0.51, 0.26, 0.21, 0.1, 0.08, 0.08, 0.08, 0.06, 0.05, 0.05, 0.04, 0.03, 0.03]
##lia g human cell host cellsl animal purpose hosts biology gene benefit mechanism into function dog humans neurons receptor
{'em': ['##lia', 'g', 'human', 'host', 'animal'], 'not-em': ['cell', 'cellsl', 'purpose', 'hosts', 'biology', 'gene', 'benefit', 'mechanism', 'into', 'function', 'dog', 'humans', 'neurons', 'receptor'], 'space-tk': ['human', 'host', 'animal']}
----------------------------
1362 Venules have a larger lumen diameter than arterioles.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '8290953', 5.479719638824463)] 1
[1.63, 1.41, 1.08, 0.69, 0.39, 0.29, 0.19, 0.12, 0.07, 0.04, 0.03, 0.01]
venules arterioles arteriole ve ventricle lumen arterio vessel greater structure dimensions increase
{'em': ['venules', 'arterioles', 'lumen'], 'not-em': ['arteriole', 've', 'ventricle', 'arterio', 'vessel', 'greater', 'structure', 'dimensions', 'increase'], 'space-tk': ['venules', 'arterioles', 'lumen']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 8290953
[1.85, 1.09, 0.76, 0.67, 0.27, 0.22, 0.04, 0.04, 0.03]
##nurio ve arteles structure tissue dimension vein
{'em': ['ve'], 'not-em': ['##nurio', 'arteles', 'structure', 'tissue', 'dimension', 'vein'], 'space-tk': []}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 8290953
[2.44, 1.85, 1.79, 1.19, 0.28, 0.15, 0.08, 0.05, 0.04, 0.01, 0.01]
##nurio arte velesle characteristic dimensions structure type better
{'em': ['arte'], 'not-em': ['##nurio', 'velesle', 'characteristic', 'dimensions', 'structure', 'type', 'better'], 'space-tk': []}
----------------------------
1370 Vitamin D deficiency is unrelated to birth weight.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '2425364', 12.10324478149414)] 1
[3.37, 2.51, 2.26, 2.15, 1.66, 0.83, 0.75, 0.34, 0.21, 0.18, 0.12, 0.04, 0.02, 0.01, 0.01, 0.0]
d vitamin birth weight pregnancy vitamins baby lack weighting child affect cause with l difference drug
{'em': ['d', 'vitamin', 'birth', 'weight'], 'not-em': ['pregnancy', 'vitamins', 'baby', 'lack', 'weighting', 'child', 'affect', 'cause', 'with', 'l', 'difference', 'drug'], 'space-tk': ['d', 'vitamin', 'birth', 'weight']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 2425364
[3.23, 3.17, 1.59, 1.51, 1.33, 0.97, 0.96, 0.68, 0.48, 0.47, 0.23, 0.09, 0.06, 0.06, 0.04, 0.03, 0.02, 0.02, 0.01, 0.0]
vitamin d pregnancy weight birth baby deficiency supplement influence lack relationship is determine cause diseaseweight affected related ) child
{'em': ['vitamin', 'd', 'weight', 'birth', 'deficiency', 'is'], 'not-em': ['pregnancy', 'baby', 'supplement', 'influence', 'lack', 'relationship', 'determine', 'cause', 'diseaseweight', 'affected', 'related', ')', 'child'], 'space-tk': ['vitamin', 'd', 'weight', 'birth', 'deficiency', 'is']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 2425364
[3.33, 3.21, 1.62, 1.23, 1.08, 1.04, 1.01, 0.29, 0.17, 0.16, 0.12, 0.08, 0.06, 0.05, 0.05, 0.04, 0.03, 0.03, 0.01, 0.01]
vitamin d weight baby pregnancy deficiency birth nutrients born fat child effect affect babies delivery not related newborn cause determine
{'em': ['vitamin', 'd', 'weight', 'deficiency', 'birth'], 'not-em': ['baby', 'pregnancy', 'nutrients', 'born', 'fat', 'child', 'effect', 'affect', 'babies', 'delivery', 'not', 'related', 'newborn', 'cause', 'determine'], 'space-tk': ['vitamin', 'd', 'weight', 'deficiency', 'birth']}
----------------------------
1379 Women with a higher birth weight are more likely to develop breast cancer later in life.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(10, '16322674', 22.686885833740234)] 1
[3.58, 3.56, 2.33, 2.25, 1.37, 1.14, 1.14, 1.13, 1.11, 0.72, 0.65, 0.63, 0.62, 0.62, 0.47, 0.3, 0.21, 0.17, 0.16, 0.08]
cancer breast birth weight tumor risk breasts women pregnancy weighting risks baby cancers woman breastfeeding cancerous fat chance child increase
{'em': ['cancer', 'breast', 'birth', 'weight', 'women'], 'not-em': ['tumor', 'risk', 'breasts', 'pregnancy', 'weighting', 'risks', 'baby', 'cancers', 'woman', 'breastfeeding', 'cancerous', 'fat', 'chance', 'child', 'increase'], 'space-tk': ['cancer', 'breast', 'birth', 'weight', 'women']}
[(4, '27123743', 25.149755477905273)] 1
[3.89, 3.79, 1.95, 1.92, 1.75, 1.44, 1.26, 1.15, 0.77, 0.68, 0.68, 0.68, 0.66, 0.56, 0.52, 0.46, 0.4, 0.37, 0.36, 0.24]
cancer breast birth weight tumor breasts women risk cancers baby risks cause woman likely high cancerous weighting breastfeeding overweight bmi
{'em': ['cancer', 'breast', 'birth', 'weight', 'women', 'likely'], 'not-em': ['tumor', 'breasts', 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 16322674
[3.84, 2.98, 2.17, 1.87, 1.58, 1.46, 0.98, 0.95, 0.88, 0.83, 0.64, 0.38, 0.38, 0.37, 0.28, 0.24, 0.22, 0.17, 0.14, 0.14]
breast cancer weight birth risk tumor weigh baby pregnancy woman women influence effect cancers lump breasts percent age during bra
{'em': ['breast', 'cancer', 'weight', 'birth', 'women'], 'not-em': ['risk', 'tumor', 'weigh', 'baby', 'pregnancy', 'woman', 'influence', 'effect', 'cancers', 'lump', 'breasts', 'percent', 'age', 'during', 'bra'], 'space-tk': ['breast', 'cancer', 'weight', 'birth', 'women']}
[(1, '27123743', 25.320796966552734)] 1
[4.19, 3.45, 2.63, 1.79, 1.69, 1.39, 0.81, 0.75, 0.71, 0.64, 0.62, 0.62, 0.61, 0.58, 0.54, 0.49, 0.44, 0.41, 0.28, 0.26]
breast cancer weight birth tumor risk babyweight high cancers women woman cause pregnancy likely influence obesity weigh breasts disease
{'em': ['breast', 'cancer', 'weight', 'birth', 'women', 'likely'], 'not-em': ['tumor', 'risk', 'babyweight', 'high', 'cancers', 'woman', 'caus

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '16322674', 15.468591690063477)] 1
[3.21, 2.6, 1.93, 1.6, 1.46, 0.89, 0.81, 0.53, 0.51, 0.43, 0.42, 0.33, 0.32, 0.28, 0.23, 0.18, 0.15, 0.09, 0.06, 0.05]
breast cancer weight birth tumor baby woman pregnancy effect risk female born women fat cancers chance obesity breasts positive effects
{'em': ['breast', 'cancer', 'weight', 'birth', 'women'], 'not-em': ['tumor', 'baby', 'woman', 'pregnancy', 'effect', 'risk', 'female', 'born', 'fat', 'cancers', 'chance', 'obesity', 'breasts', 'positive', 'effects'], 'space-tk': ['breast', 'cancer', 'weight', 'birth', 'women']}
[(1, '27123743', 20.315526962280273)] 1
[3.55, 2.94, 1.92, 1.82, 1.63, 0.76, 0.75, 0.69, 0.56, 0.55, 0.49, 0.44, 0.43, 0.39, 0.36, 0.35, 0.33, 0.32, 0.31, 0.31]
breast cancer weight tumor birth baby fat woman obesity high pregnancy cause likely low chance risk female born women cancers
{'em': ['breast', 'cancer', 'weight', 'birth', 'likely', 'women'], 'not-em': ['tumor', 'baby', 'fat', 'woman', 'obesity', 'high', 'pregnanc

In [360]:
observe_model = ["mlm-splade-62783", "mlm-splade-30522", "splade"]
for qid in qids_adalm_splade_worse_show:
    print("----------------------------")
    print(qid, queries[qid])
    for model_name in observe_model:
        print("-------------")
        print(model_name)
        analysis(model_name, qid, all_results)

----------------------------
54 AMP-activated protein kinase (AMPK) activation increases inflammation-related fibrosis in the lungs.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 49556906
[4.07, 4.02, 3.74, 2.95, 2.84, 2.49, 1.61, 1.57, 1.19, 0.81, 0.73, 0.67, 0.47, 0.46, 0.42, 0.42, 0.4, 0.38, 0.37, 0.37]
fibrosis amp ampk lungs lung activation amps fi activated activity regulation amplifier inflammatory help cause breathing activations word signaling ␣
{'em': ['fibrosis', 'amp', 'ampk', 'lungs', 'activation', 'activated'], 'not-em': ['lung', 'amps', 'fi', 'activity', 'regulation', 'amplifier', 'inflammatory', 'help', 'cause', 'breathing', 'activations', 'word', 'signaling', '␣'], 'space-tk': ['fibrosis', 'amp', 'ampk', 'lungs', 'activation', 'activated']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '49556906', 31.937511444091797)] 1
[6.76, 4.39, 3.59, 3.26, 2.78, 2.21, 2.15, 1.83, 1.35, 0.72, 0.43, 0.41, 0.35, 0.32, 0.31, 0.27, 0.25, 0.21, 0.18, 0.17]
ampbrok lung kinase activation fi lungssis activate protein drug activated k function breath hormone associated benefit enzyme
{'em': ['kinase', 'activation', 'fi', 'protein', 'activated'], 'not-em': ['ampbrok', 'lung', 'lungssis', 'activate', 'drug', 'k', 'function', 'breath', 'hormone', 'associated', 'benefit', 'enzyme'], 'space-tk': ['kinase', 'activation', 'protein', 'activated']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '49556906', 27.50501251220703)] 1
[5.6, 3.52, 3.52, 2.56, 2.42, 1.83, 1.55, 1.54, 1.26, 1.21, 0.45, 0.32, 0.25, 0.24, 0.22, 0.2, 0.17, 0.14, 0.13, 0.1]
ampbrok fi lung lungssis kinase activation activate activity drug mechanism receptor k effect associated active function test
{'em': ['fi', 'kinase', 'activation'], 'not-em': ['ampbrok', 'lung', 'lungssis', 'activate', 'activity', 'drug', 'mechanism', 'receptor', 'k', 'effect', 'associated', 'active', 'function', 'test'], 'space-tk': ['kinase', 'activation']}
----------------------------
388 Ethanol stress decreases the expression of IBP in bacteria.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(6, '1148122', 11.668325424194336)] 1
[4.39, 2.79, 1.44, 0.91, 0.85, 0.46, 0.28, 0.19, 0.14, 0.1]
ethanol alcohol bacteria bacterial affect alcohols regulation gene effect cause
{'em': ['ethanol', 'bacteria'], 'not-em': ['alcohol', 'bacterial', 'affect', 'alcohols', 'regulation', 'gene', 'effect', 'cause'], 'space-tk': ['ethanol', 'bacteria']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '1148122', 12.905157089233398)] 1
[4.54, 3.02, 1.75, 0.89, 0.65, 0.44, 0.43, 0.32, 0.24, 0.2, 0.15, 0.12]
ethanol alcohol bacteria regulation bacterial genetic influence strain organism drinking cause affect
{'em': ['ethanol', 'bacteria'], 'not-em': ['alcohol', 'regulation', 'bacterial', 'genetic', 'influence', 'strain', 'organism', 'drinking', 'cause', 'affect'], 'space-tk': ['ethanol', 'bacteria']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '1148122', 11.530694007873535)] 1
[4.46, 2.74, 1.77, 0.78, 0.35, 0.34, 0.29, 0.23, 0.14, 0.12]
ethanol alcohol bacteria bacterial gene affect effect biology mechanism fuel
{'em': ['ethanol', 'bacteria'], 'not-em': ['alcohol', 'bacterial', 'gene', 'affect', 'effect', 'biology', 'mechanism', 'fuel'], 'space-tk': ['ethanol', 'bacteria']}
----------------------------
527 Homozygous deletion of murine Sbds gene from osterix-expressing mesenchymal stem and progenitor cells (MPCs) prevents oxidative stress.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '3863543', 36.55259704589844)] 1
[3.86, 2.95, 2.69, 2.45, 2.41, 1.81, 1.81, 1.61, 1.41, 1.37, 1.22, 1.04, 1.03, 0.94, 0.64, 0.58, 0.5, 0.49, 0.47, 0.46]
mesenchymal stress stem oxidative mouse progenitor mesenchyme stressed cell cells murine sd dna progenitors emt rat stemness gene parenchymal genetic
{'em': ['mesenchymal', 'stress', 'stem', 'oxidative', 'progenitor', 'cells', 'murine', 'gene'], 'not-em': ['mouse', 'mesenchyme', 'stressed', 'cell', 'sd', 'dna', 'progenitors', 'emt', 'rat', 'stemness', 'parenchymal', 'genetic'], 'space-tk': ['mesenchymal', 'stress', 'stem', 'oxidative', 'progenitor', 'cells', 'murine', 'gene']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '3863543', 31.317371368408203)] 1
[4.29, 4.16, 2.92, 2.76, 1.72, 1.62, 1.51, 1.5, 1.43, 1.36, 1.32, 1.3, 1.06, 1.02, 0.86, 0.74, 0.69, 0.64, 0.59, 0.32]
stress stemitorsen cellmalida megenchy pro ox genetic cells sdctive dna het mouse
{'em': ['stress', 'pro', 'ox', 'cells'], 'not-em': ['stemitorsen', 'cellmalida', 'megenchy', 'genetic', 'sdctive', 'dna', 'het', 'mouse'], 'space-tk': ['stress', 'cells']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '3863543', 26.427852630615234)] 1
[4.41, 2.82, 2.74, 2.74, 1.67, 1.62, 1.57, 1.45, 1.37, 1.35, 1.21, 1.14, 0.86, 0.6, 0.57, 0.4, 0.33, 0.25, 0.25, 0.23]
stress stemitorsen ox cell me promalgenchyida cells genec dna mechanismitors deter stems
{'em': ['stress', 'ox', 'me', 'cells'], 'not-em': ['stemitorsen', 'cell', 'promalgenchyida', 'genec', 'dna', 'mechanismitors', 'deter', 'stems'], 'space-tk': ['stress', 'cells']}
----------------------------
551 ITAM phosphorylation prevents the transfer of the T cell receptor (TCR) signal from the echo-domain to the cytoplasmic tail of the T cell receptor (TCR).
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


not exist within top10 33499189
[3.28, 3.26, 2.78, 2.67, 2.35, 2.28, 1.95, 1.94, 1.46, 1.06, 1.06, 1.03, 0.75, 0.58, 0.54, 0.5, 0.46, 0.44, 0.37, 0.31]
tcr t receptor cytoplasmic tail activation cell tc signaling tcrs cytoplasm receptors tails cells tk signal phosphorylation binding cy ts
{'em': ['tcr', 't', 'receptor', 'cytoplasmic', 'tail', 'cell', 'signal', 'phosphorylation'], 'not-em': ['activation', 'tc', 'signaling', 'tcrs', 'cytoplasm', 'receptors', 'tails', 'cells', 'tk', 'binding', 'cy', 'ts'], 'space-tk': ['tcr', 't', 'receptor', 'cytoplasmic', 'tail', 'cell', 'signal', 'phosphorylation']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '33499189', 28.818477630615234)] 1
[4.74, 3.89, 3.65, 2.85, 2.34, 1.73, 1.36, 1.33, 1.3, 0.69, 0.66, 0.56, 0.55, 0.42, 0.34, 0.31, 0.27, 0.26, 0.25, 0.18]
tc t receptor tail celltopr cylas cells activation receptors signal tails ts binding drug mechanism enzyme transmission
{'em': ['tc', 't', 'receptor', 'tail', 'signal'], 'not-em': ['celltopr', 'cylas', 'cells', 'activation', 'receptors', 'tails', 'ts', 'binding', 'drug', 'mechanism', 'enzyme', 'transmission'], 'space-tk': ['t', 'receptor', 'tail', 'signal']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '33499189', 22.746097564697266)] 1
[4.17, 3.44, 3.05, 1.96, 1.76, 1.75, 1.24, 1.05, 1.0, 0.84, 0.72, 0.4, 0.2, 0.2, 0.15, 0.13, 0.13, 0.1]
tc t receptor tailr celltoplas receptors cy mechanism signal cells sense to effectpha drug
{'em': ['tc', 't', 'receptor', 'cy', 'signal', 'to'], 'not-em': ['tailr', 'celltoplas', 'receptors', 'mechanism', 'cells', 'sense', 'effectpha', 'drug'], 'space-tk': ['t', 'receptor', 'signal', 'to']}
----------------------------
554 Immune complex triggered cell death leads to extracellular release of neutrophil protein HMGB1.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(7, '1049501', 31.52410125732422)] 1
[3.93, 3.45, 2.42, 2.14, 1.71, 1.52, 1.42, 1.22, 1.22, 1.22, 1.1, 0.82, 0.79, 0.64, 0.62, 0.6, 0.57, 0.53, 0.53, 0.41]
neutrophil extracellular immune neutrophils complex release intracellular complexes cell extra neutropenia leukocyte autoimmune cause neutropenic neuro neut inflammatory autoimmunity ne
{'em': ['neutrophil', 'extracellular', 'immune', 'complex', 'release', 'cell'], 'not-em': ['neutrophils', 'intracellular', 'complexes', 'extra', 'neutropenia', 'leukocyte', 'autoimmune', 'cause', 'neutropenic', 'neuro', 'neut', 'inflammatory', 'autoimmunity', 'ne'], 'space-tk': ['neutrophil', 'extracellular', 'immune', 'complex', 'release', 'cell']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '1049501', 28.059900283813477)] 1
[3.53, 2.8, 2.79, 2.46, 2.41, 2.24, 1.97, 1.67, 1.47, 1.37, 0.66, 0.52, 0.51, 0.4, 0.4, 0.35, 0.27, 0.24, 0.22, 0.17]
##ut extracellular immunehil nerop complex cell release trigger complexes induced stimulate cellular vitro lead cause pathway vivo
{'em': ['##ut', 'complex', 'cell', 'release'], 'not-em': ['extracellular', 'immunehil', 'nerop', 'trigger', 'complexes', 'induced', 'stimulate', 'cellular', 'vitro', 'lead', 'cause', 'pathway', 'vivo'], 'space-tk': ['extracellular', 'complex', 'cell', 'release']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '1049501', 23.741046905517578)] 1
[2.94, 2.58, 2.52, 2.3, 1.9, 1.85, 1.69, 1.52, 1.25, 0.98, 0.65, 0.34, 0.31, 0.25, 0.25, 0.22, 0.19, 0.18, 0.18, 0.16]
extrautcellularhilrop immune ne release complex cell stimulate mechanism protein gene cellular released effect receptor trans signal
{'em': ['immune', 'ne', 'release', 'complex', 'cell', 'protein'], 'not-em': ['extrautcellularhilrop', 'stimulate', 'mechanism', 'gene', 'cellular', 'released', 'effect', 'receptor', 'trans', 'signal'], 'space-tk': ['immune', 'release', 'complex', 'cell', 'protein']}
----------------------------
575 In domesticated populations of Saccharomyces cerevisiae, whole chromosome aneuploidy is very uncommon.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(7, '10300888', 22.13182258605957)] 1
[2.8, 2.73, 2.69, 2.38, 2.22, 1.37, 1.17, 0.9, 0.74, 0.64, 0.61, 0.58, 0.48, 0.44, 0.37, 0.26, 0.23, 0.23, 0.16, 0.16]
saccharomyces cerevisiae domesticated domestication yeast domestic yeasts dna wild candida ce origin sac domestica originated animals clone breed chromosomes bacteria
{'em': ['saccharomyces', 'cerevisiae', 'domesticated'], 'not-em': ['domestication', 'yeast', 'domestic', 'yeasts', 'dna', 'wild', 'candida', 'ce', 'origin', 'sac', 'domestica', 'originated', 'animals', 'clone', 'breed', 'chromosomes', 'bacteria'], 'space-tk': ['saccharomyces', 'cerevisiae', 'domesticated']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '10300888', 30.07414436340332)] 1
[4.01, 3.08, 2.66, 2.41, 2.39, 2.01, 1.96, 1.84, 1.35, 0.99, 0.85, 0.85, 0.79, 0.74, 0.65, 0.54, 0.48, 0.4, 0.36, 0.21]
##vis domestic sacyce ce yeastchaiaeromre bacteria genetic chromosome species dnaated fungusation organism is
{'em': ['##vis', 'domestic', 'ce', 'chromosome', 'is'], 'not-em': ['sacyce', 'yeastchaiaeromre', 'bacteria', 'genetic', 'species', 'dnaated', 'fungusation', 'organism'], 'space-tk': ['chromosome', 'is']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '10300888', 23.18876075744629)] 1
[3.45, 2.71, 2.61, 1.89, 1.83, 1.59, 1.54, 1.17, 1.05, 0.75, 0.69, 0.53, 0.45, 0.34, 0.31, 0.29, 0.25, 0.22, 0.21, 0.18]
##vis domestic sacyce ceromchareiae gene dna species farm fungiated bacteria chromosome genomeation virus
{'em': ['##vis', 'domestic', 'chromosome'], 'not-em': ['sacyce', 'ceromchareiae', 'gene', 'dna', 'species', 'farm', 'fungiated', 'bacteria', 'genomeation', 'virus'], 'space-tk': ['chromosome']}
----------------------------
756 Many proteins in human cells can be post-translationally modified at lysine residues via acetylation.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '2831620', 44.50040817260742)] 1
[4.18, 3.63, 3.25, 2.55, 2.02, 1.99, 1.91, 1.69, 1.59, 1.34, 1.3, 1.24, 0.99, 0.98, 0.98, 0.9, 0.87, 0.8, 0.8, 0.75]
lysine acetylation protein acetyl lys acetylated proteins deacetylation l posttranslational acetyltransferase ace alkylation phenylalanine deacetylase modified lyso histone deacetylases cell
{'em': ['lysine', 'acetylation', 'proteins', 'modified'], 'not-em': ['protein', 'acetyl', 'lys', 'acetylated', 'deacetylation', 'l', 'posttranslational', 'acetyltransferase', 'ace', 'alkylation', 'phenylalanine', 'deacetylase', 'lyso', 'histone', 'deacetylases', 'cell'], 'space-tk': ['lysine', 'acetylation', 'proteins', 'modified']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '2831620', 31.055965423583984)] 1
[4.35, 3.53, 3.04, 2.75, 2.39, 2.12, 1.68, 1.6, 1.33, 1.09, 0.97, 0.71, 0.71, 0.65, 0.57, 0.53, 0.52, 0.48, 0.39, 0.29]
##ys acelation protein ltyine post mod proteins aminolatelated metabolism cell modulation enzymes enzymelase after
{'em': ['##ys', 'post', 'proteins'], 'not-em': ['acelation', 'protein', 'ltyine', 'mod', 'aminolatelated', 'metabolism', 'cell', 'modulation', 'enzymes', 'enzymelase', 'after'], 'space-tk': ['post', 'proteins']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '2831620', 22.697677612304688)] 1
[3.54, 3.1, 2.58, 2.57, 2.54, 2.04, 1.51, 1.33, 0.74, 0.38, 0.37, 0.31, 0.26, 0.25, 0.21, 0.17, 0.14, 0.11, 0.1]
##ys acety proteinlation l postine proteins modifiedlate cell after amino mechanism enzyme gene metater
{'em': ['##ys', 'l', 'proteins'], 'not-em': ['acety', 'proteinlation', 'postine', 'modifiedlate', 'cell', 'after', 'amino', 'mechanism', 'enzyme', 'gene', 'metater'], 'space-tk': ['proteins']}
----------------------------
793 Mitochondria are uninvolved in apoptosis.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '8551160', 20.20009994506836)] 1
[4.96, 3.96, 3.24, 2.54, 1.57, 1.29, 0.53, 0.49, 0.45, 0.36, 0.35, 0.19, 0.11]
mitochondria mitochondrial apoptosis mitochondrion apoptotic ap bax are death organelle necrosis function amy
{'em': ['mitochondria', 'apoptosis', 'are'], 'not-em': ['mitochondrial', 'mitochondrion', 'apoptotic', 'ap', 'bax', 'death', 'organelle', 'necrosis', 'function', 'amy'], 'space-tk': ['mitochondria', 'apoptosis', 'are']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '8551160', 24.53205680847168)] 1
[5.34, 3.21, 2.87, 2.62, 2.42, 2.42, 1.1, 1.05, 0.99, 0.81, 0.58, 0.41, 0.26, 0.13, 0.11]
mittosisondoch organ mitochondrialop ap cellria cells are nucleus tissue during
{'em': ['ap', 'are'], 'not-em': ['mittosisondoch', 'organ', 'mitochondrialop', 'cellria', 'cells', 'nucleus', 'tissue', 'during'], 'space-tk': ['are']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '8551160', 16.363235473632812)] 1
[4.01, 2.82, 2.76, 2.44, 1.23, 1.02, 0.97, 0.74, 0.14]
mitondtosisoch apriaop cell mechanism
{'em': [], 'not-em': ['mitondtosisoch', 'apriaop', 'cell', 'mechanism'], 'space-tk': []}
----------------------------
800 Modifying the epigenome in the brain affects the normal human aging process by affecting certain genes related to neurogenesis.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '22543403', 44.7893180847168)] 1
[3.35, 2.91, 2.71, 2.63, 2.13, 2.03, 1.86, 1.51, 1.48, 1.43, 1.13, 1.08, 1.08, 1.06, 1.04, 0.99, 0.98, 0.91, 0.81, 0.74]
epigenetic epigenome brain epigenetics aging neuro epigenomic brains epigenome exome ep elderly genetic modified modification alzheimer dna age ageing
{'em': ['epigenome', 'brain', 'aging', 'epigenome'], 'not-em': ['epigenetic', 'epigenetics', 'neuro', 'epigenomic', 'brains', 'exome', 'ep', 'elderly', 'genetic', 'modified', 'modification', 'alzheimer', 'dna', 'age', 'ageing'], 'space-tk': ['epigenome', 'brain', 'aging', 'epigenome']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '22543403', 28.587663650512695)] 1
[4.66, 2.94, 2.46, 2.32, 2.08, 1.91, 1.43, 1.38, 1.01, 0.78, 0.69, 0.67, 0.63, 0.63, 0.57, 0.47, 0.47, 0.29, 0.26, 0.25]
##igen brain ep aginguroome modification elderly genetic modified alzheimer ne dna age brains influence developmentomic cause modifications
{'em': ['##igen', 'brain', 'ep', 'ne'], 'not-em': ['aginguroome', 'modification', 'elderly', 'genetic', 'modified', 'alzheimer', 'dna', 'age', 'brains', 'influence', 'developmentomic', 'cause', 'modifications'], 'space-tk': ['brain']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '22543403', 19.76633071899414)] 1
[3.75, 2.66, 2.22, 1.7, 1.61, 1.16, 1.09, 0.88, 0.5, 0.46, 0.37, 0.34, 0.3, 0.27, 0.26, 0.23, 0.19, 0.17, 0.17, 0.17]
##igen brain epuroome aging modified elderly ne modification gene genetic mechanism dna neural ageuron cause neurons disease
{'em': ['##igen', 'brain', 'aging', 'ne'], 'not-em': ['epuroome', 'modified', 'elderly', 'modification', 'gene', 'genetic', 'mechanism', 'dna', 'neural', 'ageuron', 'cause', 'neurons', 'disease'], 'space-tk': ['brain', 'aging']}
----------------------------
1024 Recurrent mutations occur frequently within CTCF anchor sites adjacent to oncogenes.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '5373138', 20.43761444091797)] 1
[3.34, 1.83, 1.68, 1.61, 1.55, 1.25, 1.08, 0.94, 0.93, 0.67, 0.63, 0.55, 0.5, 0.46, 0.39, 0.34, 0.33, 0.31, 0.29, 0.28]
ctcf anchor ctf ct mutation ctc cancer mutations site associated occur mutant frequent genec rec anchors geneticf sites
{'em': ['ctcf', 'anchor', 'mutations', 'occur', 'sites'], 'not-em': ['ctf', 'ct', 'mutation', 'ctc', 'cancer', 'site', 'associated', 'mutant', 'frequent', 'genec', 'rec', 'anchors', 'geneticf'], 'space-tk': ['ctcf', 'anchor', 'mutations', 'occur', 'sites']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '5373138', 20.357667922973633)] 1
[4.27, 4.15, 2.49, 2.13, 1.16, 1.1, 0.95, 0.83, 0.59, 0.54, 0.41, 0.38, 0.25, 0.25, 0.19, 0.18, 0.14, 0.13]
ct anchorfc mutations mutation genetic occur site anchors dna gene cell chromosome co structure connecticut within
{'em': ['ct', 'mutations', 'occur', 'within'], 'not-em': ['anchorfc', 'mutation', 'genetic', 'site', 'anchors', 'dna', 'gene', 'cell', 'chromosome', 'co', 'structure', 'connecticut'], 'space-tk': ['mutations', 'occur', 'within']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '5373138', 16.47776222229004)] 1
[4.28, 3.21, 2.02, 1.17, 1.11, 0.8, 0.77, 0.69, 0.49, 0.34, 0.32, 0.23, 0.21, 0.14, 0.14, 0.13, 0.12]
ctfc mutations anchor mutation gene site associated dna occur chromosome domain frequent genes sites component
{'em': ['mutations', 'anchor', 'occur', 'sites'], 'not-em': ['ctfc', 'mutation', 'gene', 'site', 'associated', 'dna', 'chromosome', 'domain', 'frequent', 'genes', 'component'], 'space-tk': ['mutations', 'anchor', 'occur', 'sites']}
----------------------------
1049 Ribosomopathies have a low degree of cell and tissue specific pathology.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(8, '12486491', 12.581689834594727)] 1
[2.62, 2.17, 1.51, 1.49, 0.87, 0.83, 0.74, 0.66, 0.63, 0.4, 0.21, 0.14, 0.13]
tissue specificity rib specific tissues ribosomes ribozyme ribosome ribozymes specificities mar word phenotype
{'em': ['tissue', 'specific'], 'not-em': ['specificity', 'rib', 'tissues', 'ribosomes', 'ribozyme', 'ribosome', 'ribozymes', 'specificities', 'mar', 'word', 'phenotype'], 'space-tk': ['tissue', 'specific']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '12486491', 12.935647964477539)] 1
[3.47, 3.05, 2.84, 2.12, 0.32, 0.32, 0.18, 0.18, 0.14, 0.1]
riboso tissue specific embryo tissues disease characteristic ribs is
{'em': ['tissue', 'specific'], 'not-em': ['riboso', 'embryo', 'tissues', 'disease', 'characteristic', 'ribs', 'is'], 'space-tk': ['tissue', 'specific']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(4, '12486491', 10.961007118225098)] 1
[3.47, 2.72, 2.19, 1.89, 0.13, 0.12]
riboso tissue specific characteristic gene
{'em': ['tissue', 'specific'], 'not-em': ['riboso', 'characteristic', 'gene'], 'space-tk': ['tissue', 'specific']}
----------------------------
1088 Silencing of Bcl2 is important for the maintenance and progression of tumors.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '37549932', 23.543411254882812)] 1
[2.88, 2.65, 2.45, 2.38, 2.08, 1.76, 1.33, 1.29, 1.16, 1.1, 0.67, 0.58, 0.38, 0.38, 0.33, 0.31, 0.27, 0.25, 0.2, 0.17]
maintenance bcl tumor cancer bc bcl2 maintain bax 2 tumors b help necessary important expression gene survival2 development function
{'em': ['maintenance', 'bcl2', 'tumors', 'important'], 'not-em': ['bcl', 'tumor', 'cancer', 'bc', 'maintain', 'bax', '2', 'b', 'help', 'necessary', 'expression', 'gene', 'survival2', 'development', 'function'], 'space-tk': ['maintenance', 'bcl2', 'tumors', 'important']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '37549932', 23.550312042236328)] 1
[4.83, 2.86, 2.63, 2.5, 2.14, 1.38, 1.11, 1.04, 0.89, 0.65, 0.64, 0.39, 0.38, 0.31, 0.24, 0.22, 0.13, 0.13, 0.13, 0.12]
bc tumor maintenance cancerl2 maintain 2 target tumors survival therapy important drug for benefit function repair used role
{'em': ['bc', 'maintenance', 'tumors', 'important', 'for'], 'not-em': ['tumor', 'cancerl2', 'maintain', '2', 'target', 'survival', 'therapy', 'drug', 'benefit', 'function', 'repair', 'used', 'role'], 'space-tk': ['maintenance', 'tumors', 'important', 'for']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '37549932', 21.46903419494629)] 1
[4.82, 2.26, 1.95, 1.92, 1.43, 1.24, 0.86, 0.82, 0.63, 0.62, 0.59, 0.44, 0.41, 0.32, 0.28, 0.26, 0.26, 0.24, 0.23, 0.2]
bc tumorl cancer maintenance2 maintainmour important tumors 2 drug cure purpose gene vitamin therapy molecule test ii
{'em': ['bc', 'important', 'tumors'], 'not-em': ['tumorl', 'cancer', 'maintenance2', 'maintainmour', '2', 'drug', 'cure', 'purpose', 'gene', 'vitamin', 'therapy', 'molecule', 'test', 'ii'], 'space-tk': ['important', 'tumors']}
----------------------------
1180 The PRR MDA5 is a sensor of RNA virus infection.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(5, '31272411', 21.51691246032715)] 1
[3.26, 2.03, 1.91, 1.91, 1.88, 1.52, 1.26, 1.13, 0.99, 0.93, 0.67, 0.51, 0.49, 0.46, 0.43, 0.3, 0.27, 0.26, 0.25, 0.2]
rna virus5 infection sensing sensor md sensors 5 mda viruses rnase r viral word pathogen dna help test signal
{'em': ['rna', 'infection', 'sensor', 'mda'], 'not-em': ['virus5', 'sensing', 'md', 'sensors', '5', 'viruses', 'rnase', 'r', 'viral', 'word', 'pathogen', 'dna', 'help', 'test', 'signal'], 'space-tk': ['rna', 'infection', 'sensor']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '31272411', 27.802757263183594)] 1
[4.39, 4.14, 2.95, 2.66, 2.62, 1.68, 0.98, 0.95, 0.74, 0.72, 0.72, 0.65, 0.63, 0.41, 0.38, 0.35, 0.35, 0.33, 0.32, 0.28]
md rna5 infection virus sensor receptora pathogen viruses detectr signal dna function sense five sensing test drug
{'em': ['md', 'infection', 'virus', 'sensor'], 'not-em': ['rna5', 'receptora', 'pathogen', 'viruses', 'detectr', 'signal', 'dna', 'function', 'sense', 'five', 'sensing', 'test', 'drug'], 'space-tk': ['infection', 'virus', 'sensor']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '31272411', 21.889751434326172)] 1
[4.04, 3.46, 1.91, 1.74, 1.7, 1.63, 0.98, 0.76, 0.58, 0.5, 0.46, 0.42, 0.4, 0.35, 0.34, 0.34, 0.29, 0.29, 0.21, 0.17]
md rna5 infection virus sensor receptor detecta test 5 signal viruses viral gene instrument drug anti immune pathogen
{'em': ['md', 'infection', 'virus', 'sensor'], 'not-em': ['rna5', 'receptor', 'detecta', 'test', '5', 'signal', 'viruses', 'viral', 'gene', 'instrument', 'drug', 'anti', 'immune', 'pathogen'], 'space-tk': ['infection', 'virus', 'sensor']}
----------------------------
1303 Tirasemtiv has no effect on fast-twitch muscle.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(3, '12631697', 9.477374076843262)] 1
[3.16, 2.78, 0.63, 0.61, 0.58, 0.46, 0.25, 0.25, 0.17, 0.11, 0.11]
muscle fast muscles fasting help drug tri effect treatment cause benefit
{'em': ['muscle', 'fast', 'effect'], 'not-em': ['muscles', 'fasting', 'help', 'drug', 'tri', 'treatment', 'cause', 'benefit'], 'space-tk': ['muscle', 'fast', 'effect']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '12631697', 9.282419204711914)] 1
[3.41, 2.86, 0.46, 0.44, 0.38, 0.35, 0.27, 0.24, 0.23, 0.1]
fast muscle drug benefitiva effect influence slow treatment muscles
{'em': ['fast', 'muscle', 'effect'], 'not-em': ['drug', 'benefitiva', 'influence', 'slow', 'treatment', 'muscles'], 'space-tk': ['fast', 'muscle', 'effect']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '12631697', 7.853088855743408)] 1
[2.69, 2.6, 0.73, 0.72, 0.39, 0.18, 0.13]
muscle fast drug effect cure muscles receptor
{'em': ['muscle', 'fast', 'effect'], 'not-em': ['drug', 'cure', 'muscles', 'receptor'], 'space-tk': ['muscle', 'fast', 'effect']}
----------------------------
1336 UCB T cells reduce TCR diversity after transplantation.
-------------
mlm-splade-62783
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/62783/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--62783-batch_size_40-2022-04-14_08-39-18/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(2, '27910499', 24.472455978393555)] 1
[3.83, 3.12, 2.68, 2.62, 1.91, 1.79, 1.72, 1.23, 1.12, 1.07, 1.03, 0.67, 0.61, 0.46, 0.15, 0.12]
diversity tcr t transplant after tc cell cells diverse transplantation transplanting donation tcrs transplanted reduced increase
{'em': ['diversity', 'tcr', 't', 'after', 'cells', 'transplantation'], 'not-em': ['transplant', 'tc', 'cell', 'diverse', 'transplanting', 'donation', 'tcrs', 'transplanted', 'reduced', 'increase'], 'space-tk': ['diversity', 'tcr', 't', 'after', 'cells', 'transplantation']}
-------------
mlm-splade-30522
path /home/gaia_data/iida.h/BEIR/model/S2ORC/bert-base-uncased/splade_model/raw/remove/30522/distilSplade_0.1_0.08_-groups-gcb50243-iida.h-BEIR-model-S2ORC-bert-base-uncased-mlm_model-raw-remove--30522-batch_size_40-2022-04-17_10-06-25/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '27910499', 29.307308197021484)] 1
[4.44, 4.28, 3.49, 3.45, 2.38, 1.81, 1.2, 1.16, 0.95, 0.78, 0.73, 0.65, 0.6, 0.58, 0.47, 0.43, 0.42, 0.33, 0.3, 0.23]
tc diversity transplant t cell after cellsr implant donateation variation diverse graf dna donation surgery biodiversity donor blood
{'em': ['tc', 'diversity', 'transplant', 't', 'after'], 'not-em': ['cell', 'cellsr', 'implant', 'donateation', 'variation', 'diverse', 'graf', 'dna', 'donation', 'surgery', 'biodiversity', 'donor', 'blood'], 'space-tk': ['diversity', 't', 'after']}
-------------
splade
path /home/gaia_data/iida.h/BEIR/model/msmarco/splade/distilSplade_0.1_0.08_bert-base-uncased-batch_size_24-2022-04-07_21-45-37/


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[(1, '27910499', 22.54183006286621)] 1
[3.9, 3.73, 3.01, 2.88, 1.94, 1.92, 1.1, 0.73, 0.69, 0.68, 0.63, 0.4, 0.16, 0.14, 0.13, 0.13, 0.11, 0.1]
diversity tc t transplant cell afterr cellsation donation implant biodiversity transfert gene surgery improve limit
{'em': ['diversity', 'tc', 't', 'transplant'], 'not-em': ['cell', 'afterr', 'cellsation', 'donation', 'implant', 'biodiversity', 'transfert', 'gene', 'surgery', 'improve', 'limit'], 'space-tk': ['diversity', 't']}


In [395]:
corpus["1215116"]

{'text': 'Over the past two decades there have been significant achievements in the control of a handful of important human tropical infections [1]. These achievements include the substantive reductions in the prevalence and incidence of the so-called neglected diseases such as lymphatic filariasis, onchocerciasis, guinea worm, leprosy, and trachoma (Box 1) [2]. Each of these neglected diseases is a poverty-promoting and often stigmatizing condition occurring primarily in rural areas of low-income countries (Box 2) [3]. They are ancient afflictions, described in the Bible and other ancient texts, which have burdened humanity for millennia [3]. But now, as a result of aggressive regional vertical interventions, there is a possibility that some neglected tropical infections could be eventually controlled to the point of elimination in some areas of endemicity [2–8]. In the case of guinea worm infection, disease eradication might also soon be possible [9]. Box 2. Common Features of the Ne