In [1]:
import os
import gzip
import pickle

data_folder = "../msmarco/"  # base directory for the MS MARCO files joined below
# Load a nested dict qid -> pid -> ce_score (accessed below as ce_scores[qid][pid]) that maps
# query-ids (qid) and paragraph-ids (pid) to the CrossEncoder score computed by the
# cross-encoder-ms-marco-MiniLM-L-6-v2 model
ce_scores_file = os.path.join(data_folder, 'cross-encoder-ms-marco-MiniLM-L-6-v2-scores.pkl.gz')

# NOTE: unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
with gzip.open(ce_scores_file, 'rb') as fIn:
    ce_scores = pickle.load(fIn)

# As training data we use hard-negatives that have been mined using various systems
# (this path is only defined here; later cells assign hard_negatives_filepath again before reading)
hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives-splade.jsonl.gz')

KeyboardInterrupt: 

In [9]:
import tqdm
import json
import random

# As training data we use hard-negatives that have been mined using various systems
hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives.jsonl.gz')

# One entry per usable query: CE score of a randomly chosen positive minus
# CE score of a randomly chosen hard negative.
score_diff1 = []
negs_to_use = None       # None -> adopt every system listed in the first record that has positives
num_negs_per_system = 5  # cap on the negatives taken from each system

# Seeded, cell-local generator so the sampled (positive, negative) pairs are reproducible
rng = random.Random(42)

with gzip.open(hard_negatives_filepath, 'rt') as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data['pos']
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        # Get the hard negatives
        if negs_to_use is None:
            negs_to_use = list(data['neg'].keys())

        neg_pids = []           # ordered, de-duplicated negatives across all systems
        seen_neg_pids = set()   # constant-time membership test for the de-duplication
        for system_name in negs_to_use:
            if system_name not in data['neg']:
                continue

            negs_added = 0
            for pid in data['neg'][system_name]:
                pid = int(pid)
                if pid in seen_neg_pids:
                    continue
                seen_neg_pids.add(pid)
                neg_pids.append(pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        # pos_pids is known to be non-empty here, so only the negatives need checking
        if neg_pids:
            qid_scores = ce_scores[data['qid']]
            score_diff1.append(qid_scores[rng.choice(pos_pids)] - qid_scores[rng.choice(neg_pids)])

        

279708it [01:01, 4576.94it/s]


KeyboardInterrupt: 

In [31]:
# One entry per usable query: CE score of a randomly chosen positive minus
# CE score of a randomly chosen hard negative.
score_diff2 = []
negs_to_use = ['splade']  # only negatives listed under this system are considered
num_negs_per_system = 50  # cap on the negatives taken from each system

hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives-splade.jsonl.gz')

# Seeded, cell-local generator so the sampled (positive, negative) pairs are reproducible
rng = random.Random(42)

with gzip.open(hard_negatives_filepath, 'rt') as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data['pos']
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        # Get the hard negatives (None would mean: every system of the first usable record)
        if negs_to_use is None:
            negs_to_use = list(data['neg'].keys())

        neg_pids = []           # ordered, de-duplicated negatives across all systems
        seen_neg_pids = set()   # constant-time membership test for the de-duplication
        for system_name in negs_to_use:
            if system_name not in data['neg']:
                continue

            negs_added = 0
            for pid in data['neg'][system_name]:
                pid = int(pid)
                if pid in seen_neg_pids:
                    continue
                seen_neg_pids.add(pid)
                neg_pids.append(pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        # pos_pids is known to be non-empty here, so only the negatives need checking
        if neg_pids:
            qid_scores = ce_scores[data['qid']]
            score_diff2.append(qid_scores[rng.choice(pos_pids)] - qid_scores[rng.choice(neg_pids)])



258743it [00:37, 6981.37it/s]


KeyboardInterrupt: 

In [11]:
import pandas as pd
# Summary statistics over the first 250,000 score differences collected in score_diff1
pd.Series(score_diff1[:250000]).describe()

count    250000.000000
mean          3.839755
std           4.811721
min         -20.550394
25%           0.464112
50%           3.092275
75%           6.794941
max          22.218096
dtype: float64

In [19]:
# top5: summary statistics over the first 250,000 entries of score_diff2.
# NOTE(review): presumably produced with num_negs_per_system = 5; the cell that fills
# score_diff2 in this notebook currently sets 50 — confirm before re-running.
pd.Series(score_diff2[:250000]).describe()

count    250000.000000
mean         -0.475410
std           2.694157
min         -21.946896
25%          -1.377347
50%          -0.111829
75%           0.520512
max          16.619560
dtype: float64

In [32]:
# top50: summary statistics over the first 250,000 entries of score_diff2.
# NOTE(review): presumably corresponds to num_negs_per_system = 50 in the cell that
# fills score_diff2 — confirm.
pd.Series(score_diff2[:250000]).describe()

count    250000.000000
mean          2.541670
std           3.985315
min         -20.960496
25%           0.000000
50%           2.046005
75%           5.069055
max          19.802739
dtype: float64

In [22]:
# List the negative-mining system names of the record left in `data`
# by whichever loading loop ran last.
data['neg'].keys()

dict_keys(['bm25', 'msmarco-distilbert-base-tas-b', 'msmarco-distilbert-base-v3', 'msmarco-MiniLM-L-6-v3', 'distilbert-margin_mse-cls-dot-v2', 'distilbert-margin_mse-cls-dot-v1', 'distilbert-margin_mse-mean-dot-v1', 'mpnet-margin_mse-mean-v1', 'co-condenser-margin_mse-cls-v1', 'distilbert-margin_mse-mnrl-mean-v1', 'distilbert-margin_mse-sym_mnrl-mean-v1', 'distilbert-margin_mse-sym_mnrl-mean-v2', 'co-condenser-margin_mse-sym_mnrl-mean-v1', 'splade'])

In [29]:
# One entry per usable query: CE score of a randomly chosen positive minus
# CE score of a randomly chosen hard negative.
score_diff_bm25 = []
negs_to_use = ['bm25']   # only negatives listed under this system are considered
num_negs_per_system = 5  # cap on the negatives taken from each system

hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives-splade.jsonl.gz')

# Seeded, cell-local generator so the sampled (positive, negative) pairs are reproducible
rng = random.Random(42)

with gzip.open(hard_negatives_filepath, 'rt') as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data['pos']
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        # Get the hard negatives (None would mean: every system of the first usable record)
        if negs_to_use is None:
            negs_to_use = list(data['neg'].keys())

        neg_pids = []           # ordered, de-duplicated negatives across all systems
        seen_neg_pids = set()   # constant-time membership test for the de-duplication
        for system_name in negs_to_use:
            if system_name not in data['neg']:
                continue

            negs_added = 0
            for pid in data['neg'][system_name]:
                pid = int(pid)
                if pid in seen_neg_pids:
                    continue
                seen_neg_pids.add(pid)
                neg_pids.append(pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        # pos_pids is known to be non-empty here, so only the negatives need checking
        if neg_pids:
            qid_scores = ce_scores[data['qid']]
            score_diff_bm25.append(qid_scores[rng.choice(pos_pids)] - qid_scores[rng.choice(neg_pids)])




256330it [00:30, 8514.61it/s]


KeyboardInterrupt: 

In [30]:
# Summary statistics over the first 250,000 score differences collected in score_diff_bm25
pd.Series(score_diff_bm25[:250000]).describe()

count    250000.000000
mean          3.573688
std           5.004123
min         -21.470811
25%           0.000000
50%           2.527936
75%           6.504770
max          22.604844
dtype: float64

In [26]:
# One entry per usable query: CE score of a randomly chosen positive minus
# CE score of a randomly chosen hard negative.
score_diff_ce = []
negs_to_use = ['msmarco-MiniLM-L-6-v3']  # only negatives listed under this system are considered
num_negs_per_system = 5                  # cap on the negatives taken from each system

hard_negatives_filepath = os.path.join(data_folder, 'msmarco-hard-negatives-splade.jsonl.gz')

# Seeded, cell-local generator so the sampled (positive, negative) pairs are reproducible
rng = random.Random(42)

with gzip.open(hard_negatives_filepath, 'rt') as fIn:
    for line in tqdm.tqdm(fIn):
        data = json.loads(line)

        # Get the positive passage ids
        pos_pids = data['pos']
        if len(pos_pids) == 0:  # Skip entries without positive passages
            continue

        # Get the hard negatives (None would mean: every system of the first usable record)
        if negs_to_use is None:
            negs_to_use = list(data['neg'].keys())

        neg_pids = []           # ordered, de-duplicated negatives across all systems
        seen_neg_pids = set()   # constant-time membership test for the de-duplication
        for system_name in negs_to_use:
            if system_name not in data['neg']:
                continue

            negs_added = 0
            for pid in data['neg'][system_name]:
                pid = int(pid)
                if pid in seen_neg_pids:
                    continue
                seen_neg_pids.add(pid)
                neg_pids.append(pid)
                negs_added += 1
                if negs_added >= num_negs_per_system:
                    break

        # pos_pids is known to be non-empty here, so only the negatives need checking
        if neg_pids:
            qid_scores = ce_scores[data['qid']]
            score_diff_ce.append(qid_scores[rng.choice(pos_pids)] - qid_scores[rng.choice(neg_pids)])


258427it [00:29, 8636.55it/s]


KeyboardInterrupt: 

In [28]:
# Summary statistics over the first 250,000 score differences collected in score_diff_ce
# (filled from the 'msmarco-MiniLM-L-6-v3' negatives).
pd.Series(score_diff_ce[:250000]).describe()

count    250000.000000
mean          0.756582
std           3.687631
min         -21.946896
25%          -0.933666
50%           0.237544
75%           2.093596
max          22.549027
dtype: float64

In [4]:
import json
# Per-run tables keyed by the first tab-separated field of every line; the second
# field is parsed as JSON and its 'neg' list is cut down to the first 20 items.
train_queries0 = {}
train_queries1 = {}

with open("training_with_sentence_transformers/output/colbert_dynamic_fromwarmup_num20_marginkldiv_position5-batch_size_8x4-2022-07-18_06-05-04/num0/train_queries.json") as f:
    for line in f:
        fields = line.split("\t")
        train_queries0[fields[0]] = json.loads(fields[1])
        entry = train_queries0[fields[0]]
        entry['neg'] = entry['neg'][:20]

with open("training_with_sentence_transformers/output/colbert_dynamic_fromwarmup_num20_marginkldiv_position5-batch_size_8x4-2022-07-18_06-05-04/num1/train_queries.json") as f:
    for line in f:
        fields = line.split("\t")
        train_queries1[fields[0]] = json.loads(fields[1])
        entry = train_queries1[fields[0]]
        entry['neg'] = entry['neg'][:20]



In [12]:
# Each list holds, per query, the reciprocal of one field of the first 'pos' entry;
# the printed value is the mean of that list.
# NOTE(review): index 0 is presumably the rank under the trained model and index 2 the
# rank under the cross-encoder (going by the variable names) — confirm.

mrrs0 = [1 / entry['pos'][0][0] for entry in train_queries0.values()]
print(sum(mrrs0) / len(mrrs0))

mrrs1 = [1 / entry['pos'][0][0] for entry in train_queries1.values()]
print(sum(mrrs1) / len(mrrs1))

mrrsce = [1 / entry['pos'][0][2] for entry in train_queries1.values()]
print(sum(mrrsce) / len(mrrsce))

mrrsce0 = [1 / entry['pos'][0][2] for entry in train_queries0.values()]
print(sum(mrrsce0) / len(mrrsce0))


0.3701764793743478
0.3747282177201545
0.3900201530890185
0.3900201530890185


In [10]:
# Preview the first ten entries of mrrs0.
# NOTE(review): the saved output shows integers, whereas the cell that builds mrrs0
# appends 1/x values; the output looks like it came from an earlier execution — re-run to refresh.
mrrs0[:10]

[7, 2, 21, 8, 25, 9, 47, 6, 2, 4]

In [11]:
# Preview the first ten entries of mrrs1.
# NOTE(review): the saved output shows integers, whereas the cell that builds mrrs1
# appends 1/x values; the output looks like it came from an earlier execution — re-run to refresh.
mrrs1[:10]

[6, 1, 20, 6, 22, 8, 45, 5, 2, 5]

In [1]:
# Peek at the first line of the file to inspect its layout.
with open("../msmarco/wentai_splade_dev_top1000_spladedoc3_10.tsv") as f:
    # readline() returns '' on an empty file, so `line` is always defined for the print
    # below (the loop-and-break form left it unset when the file had no lines).
    line = f.readline()
print(line)

188714	4321745	foods and supplements to lower blood sugar [SEP] [unused2] cinnamon sugar barley insulin glucose garlic ginger curry basil blood	Food And Supplements That Lower Blood Sugar Levels. Cinnamon: Researchers are finding that cinnamon reduces blood sugar levels naturally when taken daily. If you absolutely love cinnamon you can sprinkle the recommended six grams of cinnamon on your food throughout the day to achieve the desired effect.



In [13]:
# Number of distinct space-separated tokens on each line of the gamma3 file.
# NOTE(review): lines are split as read, so the final token keeps its trailing newline.
query_len3 = []
with open("inference_results/queries_dev_50.gamma3.processed.tsv") as f:
    query_len3.extend(len(set(line.split(" "))) for line in f)

In [14]:
# Number of distinct space-separated tokens on each line of the gamma5 file.
# NOTE(review): lines are split as read, so the final token keeps its trailing newline.
query_len5 = []
with open("inference_results/queries_dev_50.gamma5.processed.tsv") as f:
    query_len5.extend(len(set(line.split(" "))) for line in f)

In [12]:
# Number of distinct space-separated tokens on each line of the kldiv file.
# NOTE(review): lines are split as read, so the final token keeps its trailing newline.
query_lenkl = []
with open("inference_results/queries_dev_50.kldiv_processed.tsv") as f:
    query_lenkl.extend(len(set(line.split(" "))) for line in f)

In [16]:
import numpy as np
# Mean per-line distinct-token count for the kldiv, gamma5 and gamma3 files, in that order
np.mean(query_lenkl), np.mean(query_len5), np.mean(query_len3)

(19.916522410058064, 12.364001463998497, 13.878458449150783)

In [1]:
import numpy as np
# Number of distinct space-separated tokens on each line of the warmup file,
# followed by the mean of those counts.
# NOTE(review): lines are split as read, so the final token keeps its trailing newline.
query_lenwp = []
with open("inference_results/queries_dev_50.warmup.processed.tsv") as f:
    query_lenwp.extend(len(set(line.split(" "))) for line in f)

np.mean(query_lenwp)

26.82966179656356

In [3]:
import numpy as np
# Number of distinct space-separated tokens on each line of the klfr_5-1.0_0_25000 file,
# followed by the mean of those counts.
# NOTE(review): this reuses the name query_lenwp that another cell fills from the warmup file.
# NOTE(review): lines are split as read, so the final token keeps its trailing newline.
query_lenwp = []
with open("inference_results/queries_dev_50.klfr_5-1.0_0_25000.processed.tsv") as f:
    query_lenwp.extend(len(set(line.split(" "))) for line in f)

np.mean(query_lenwp)

19.33629430326531

In [1]:
import gzip
import json
import os  # os.path.join is used below; this cell previously relied on another cell importing it

import numpy as np

# Length of the 'vector' field of every JSON record in one gzip-compressed JSONL file.
doclens = []
with gzip.open(os.path.join('inference_results/klfr_5-1_0_25000', "file_5.jsonl.gz")) as f:
    # gzip.open defaults to binary mode; json.loads accepts the resulting bytes lines
    for line in f:
        data = json.loads(line)
        doclens.append(len(data['vector']))

np.mean(doclens)


579.58783

In [1]:
def _read_two_column_tsv(path):
    """Return {first field: second field} for a file of two-field, tab-separated lines.

    Each line is stripped before splitting; a line that does not yield exactly
    two fields raises ValueError.
    """
    table = dict()
    with open(path) as handle:
        for row in handle:
            key, text = row.strip().split("\t")
            table[key] = text
    return table


# passage id -> passage text
collection = _read_two_column_tsv("../msmarco/collection.tsv")

# query id -> query text (dev queries)
queries = _read_two_column_tsv("../msmarco/queries.dev.tsv")
    

In [3]:
# Turn the dev run file into a "qid, did, query text, passage text" TSV.
run_path = "../msmarco/splade_klfr_5-1_num1.dev.trec.trec"
pairs_path = "../msmarco/splade_klfr_5-1_num1.dev.top1000.trec.tsv"

with open(run_path) as f, open(pairs_path, "w") as fo:
    for line in f:
        # Exactly six tab-separated fields are required; only the 1st (qid) and 3rd (did) are used
        qid, _, did, rank, score, _ = line.split("\t")
        # (a rank filter, `int(rank) < 100`, is left disabled here — every row is written)
        print(qid, did, queries[qid], collection[did], sep="\t", file=fo)

In [None]:
# Turn the dev run file into a "qid, did, query text, passage text" TSV.
# NOTE(review): same input and output paths as the preceding cell; running this
# overwrites the same output file.
with open("../msmarco/splade_klfr_5-1_num1.dev.trec.trec") as f:
    with open("../msmarco/splade_klfr_5-1_num1.dev.top1000.trec.tsv", "w") as fo:
        for line in f:
            # Exactly six tab-separated fields are required; only qid and did are used
            qid, _, did, rank, score, _ = line.split("\t")
            # (a rank filter, `int(rank) < 100`, is left disabled here — every row is written)
            row = f"{qid}\t{did}\t{queries[qid]}\t{collection[did]}\n"
            fo.write(row)

In [6]:
# passage id -> passage text; every stripped line must split into exactly two tab-separated fields
collection = {}
with open("../msmarco/collection.tsv") as f:
    for line in f:
        fields = line.strip().split("\t")
        did, dtext = fields
        collection[did] = dtext

# query id -> query text (2020 queries), same two-field layout
queries = {}
with open("../msmarco/queries.2020.tsv") as f:
    for line in f:
        fields = line.strip().split("\t")
        qid, qtext = fields
        queries[qid] = qtext
    

In [7]:
# Turn the 2020 run file into a "qid, did, query text, passage text" TSV.
run_path = "../msmarco/splade_klfr_5-1_num1.2020.trec.trec"
pairs_path = "../msmarco/splade_klfr_5-1_num1.2020.top1000.trec.tsv"

with open(run_path) as f, open(pairs_path, "w") as fo:
    for line in f:
        # Exactly six tab-separated fields are required; only the 1st (qid) and 3rd (did) are used
        qid, _, did, rank, score, _ = line.split("\t")
        # (a rank filter, `int(rank) < 100`, is left disabled here — every row is written)
        print(qid, did, queries[qid], collection[did], sep="\t", file=fo)