In [36]:
import json
import pandas as pd
from pyserini.search.lucene import LuceneImpactSearcher
from pyserini.pyclass import JFloat, JInt, JHashMap

In [115]:
def dot_product(dict1, dict2, verbose=True):
    """Compute the dot product of two sparse vectors stored as dicts.

    Each dict maps a token to its numeric weight. Only tokens present in
    both dicts contribute to the result; every other token counts as 0.

    Args:
        dict1: First sparse vector (token -> weight).
        dict2: Second sparse vector (token -> weight).
        verbose: When True (the default, matching the historical behaviour),
            print the per-token contributions as a list of
            ``[token, product]`` pairs, in ``dict1`` iteration order.

    Returns:
        The sum of ``dict1[token] * dict2[token]`` over the shared tokens
        (``0`` when there is no overlap).
    """
    # Compute every product once and reuse it for both the debug print and the sum.
    contributions = [[key, dict1[key] * dict2[key]] for key in dict1 if key in dict2]
    if verbose:
        print(contributions)
    return sum(product for _, product in contributions)


def create_jquery(encoded_query, searcher, fields=None):
    """Convert an encoded query into a Java ``HashMap`` of token -> ``JInt`` weight.

    Args:
        encoded_query: Mapping of token -> weight. Every weight is wrapped
            in ``JInt`` before being stored.
        searcher: Currently unused. It was only referenced by the IDF filter
            that is commented out below; kept so existing calls keep working.
        fields: Currently unused. The returned map never included the field
            boosts, so the dead per-field map is no longer built. The
            parameter stays for backward compatibility, and the default is
            ``None`` rather than a shared mutable ``{}``.

    Returns:
        A ``JHashMap`` with one entry per token of ``encoded_query``.
    """
    jquery = JHashMap()
    for token, weight in encoded_query.items():
        # The IDF filter is disabled, so every token is forwarded to the query:
        # if token in searcher.idf and searcher.idf[token] >= searcher.min_idf:
        jquery.put(token, JInt(weight))

    return jquery

In [116]:
# Per-run metadata files for the two models being compared; both are parsed
# as JSON later in the notebook.
# NOTE(review): absolute, user-specific paths — adjust for your environment.
phrase_splade_metadata_path = "/home/lamdo/splade/pyserini_evaluation/metadata/nfcorpus__phrase_splade"
normal_splade_metadata_path = "/home/lamdo/splade/pyserini_evaluation/metadata/nfcorpus__eru_kg"

# Tab-separated relevance judgments (qrels) for the nfcorpus test split.
qrel_path = "/home/lamdo/splade/data/beir/nfcorpus/qrels/test.tsv"

In [117]:
# One impact searcher per index. No query encoder is passed (query_encoder=None):
# the notebook works with query vectors that are already stored in the metadata files.
phrase_splade_searcher = LuceneImpactSearcher("/scratch/lamdo/beir_splade/indexes/nfcorpus__phrase_splade", query_encoder=None)
normal_splade_searcher = LuceneImpactSearcher("/scratch/lamdo/beir_splade/indexes/nfcorpus__eru_kg", query_encoder=None)

In [118]:
# Read the qrels table as a list of row dicts.
_qrels = pd.read_csv(qrel_path, sep='\t').to_dict("records")

# Group the judgments by query id (as a string):
#   {query_id: [{"docid": ..., "score": ...}, ...]}
qrel_metadata = {}
for record in _qrels:
    judgment = {"docid": record["corpus-id"], "score": record["score"]}
    qrel_metadata.setdefault(str(record["query-id"]), []).append(judgment)

In [119]:
# Load the run metadata of both models. The encoding is given explicitly so the
# JSON is decoded as UTF-8 regardless of the platform's locale default.
with open(phrase_splade_metadata_path, encoding="utf-8") as f:
    phrase_splade_metadata = json.load(f)

with open(normal_splade_metadata_path, encoding="utf-8") as f:
    normal_splade_metadata = json.load(f)

In [120]:
# Position of the query to inspect inside each run's "encoded_queries" list.
chosen_index = 0

phrase_query_entry = phrase_splade_metadata["encoded_queries"][chosen_index]
normal_query_entry = normal_splade_metadata["encoded_queries"][chosen_index]

phrase_splade_query_rep = phrase_query_entry["vector"]
normal_splade_query_rep = normal_query_entry["vector"]

# The query id is taken from the normal-SPLADE run only.
# NOTE(review): assumes both runs list their queries in the same order — confirm.
query_id = normal_query_entry["query_id"]

print("QueryId", query_id)
# Print each sparse query vector with its heaviest tokens first.
print(dict(sorted(phrase_splade_query_rep.items(), key=lambda pair: pair[1], reverse=True)))
print()
print(dict(sorted(normal_splade_query_rep.items(), key=lambda pair: pair[1], reverse=True)))

QueryId PLAIN-2
{'cho': 164, 'breast-cancer': 158, 'stat': 138, 'cancer': 129, 'gene': 122, 'cause': 99, 'breast': 98, 'do': 96, 'drugs': 94, 'drug': 75, 'vitamin-d': 51, 'does': 34, 'causes': 29, 'toxicity': 25, 'inhibitors': 17, 'ch': 2, '?': 1, 'status': 1}

{'stat': 174, 'cho': 172, 'breast': 170, 'cancer': 158, 'drugs': 132, 'cause': 128, 'do': 99, 'drug': 81, '##terol': 80, 'christian': 48, '?': 37, '##in': 22}


In [121]:
# Collect the judged documents of the chosen query, keeping only those whose
# relevance score is truthy (i.e. non-zero).
chosen_query_id = query_id
doc_ids = [line['docid'] for line in qrel_metadata[chosen_query_id] if line["score"]]

In [122]:
# Score range of the stored predictions for the chosen query, per model.
normal_predictions = normal_splade_metadata["predictions"][chosen_query_id]
min_normal = min(normal_predictions.values())
max_normal = max(normal_predictions.values())

phrase_predictions = phrase_splade_metadata["predictions"][chosen_query_id]
min_phrase = min(phrase_predictions.values())
max_phrase = max(phrase_predictions.values())

print("score range normal", min_normal, max_normal)
print("score range phrase", min_phrase, max_phrase)

# For every relevant document show both models' scores; a document that is
# absent from a model's predictions is reported with score 0.
for docid in doc_ids:
    print(docid)
    print("normal splade", normal_predictions.get(docid, 0))
    print("phrase splade", phrase_predictions.get(docid, 0))

    print()

score range normal 46056.0 91039.0
score range phrase 50706.0 75067.0
MED-2427
normal splade 54602.0
phrase splade 57254.0

MED-10
normal splade 91039.0
phrase splade 73880.0

MED-2429
normal splade 74674.0
phrase splade 73096.0

MED-2430
normal splade 0
phrase splade 0

MED-2431
normal splade 85050.0
phrase splade 75067.0

MED-14
normal splade 83504.0
phrase splade 73082.0

MED-2432
normal splade 55626.0
phrase splade 0

MED-2428
normal splade 55418.0
phrase splade 0

MED-2440
normal splade 80482.0
phrase splade 58168.0

MED-2434
normal splade 61922.0
phrase splade 63489.0

MED-2435
normal splade 60078.0
phrase splade 64462.0

MED-2436
normal splade 52208.0
phrase splade 59550.0

MED-2437
normal splade 52422.0
phrase splade 58320.0

MED-2438
normal splade 0
phrase splade 0

MED-2439
normal splade 72384.0
phrase splade 64919.0

MED-3597
normal splade 0
phrase splade 0

MED-3598
normal splade 0
phrase splade 0

MED-3599
normal splade 0
phrase splade 0

MED-4556
normal splade 0
phrase sp

In [123]:
chosen_doc_id = "MED-2437"

# The stored "raw" field of a document is a JSON string; parse the phrase-index
# copy once and reuse it for both the text and the sparse vector.
phrase_stored_doc = json.loads(phrase_splade_searcher.doc(chosen_doc_id).lucene_document().get("raw"))
print(phrase_stored_doc["text"])

phrase_splade_doc_rep = phrase_stored_doc["vector"]
normal_splade_doc_rep = json.loads(
    normal_splade_searcher.doc(chosen_doc_id).lucene_document().get("raw")
)["vector"]

BACKGROUND: Breast cancer is the most commonly diagnosed cancer among women in the United States. Extensive research has been completed to evaluate the relationship between dietary factors and breast cancer risk and survival after breast cancer; however, a summary report with clinical inference is needed. Materials and METHODS: This review summarizes the current epidemiological and clinical trial evidence relating diet to breast cancer incidence, recurrence, survival, and mortality. The review includes emerging epidemiological studies that assess risk within breast cancer subtypes as well as a summary of previous and ongoing dietary intervention trials designed to modify breast cancer risk. RESULTS: The available literature suggests that both low-fat and high-fiber diets may be weakly protective against breast cancer, whereas total energy intake and alcohol appear to be positively associated. Fiber may be weakly protective possibly through modulation of estrogen, whereas fruit and vege

In [124]:
# Print both document vectors of the chosen document, heaviest tokens first.
print(dict(sorted(phrase_splade_doc_rep.items(), key=lambda pair: pair[1], reverse=True)))
print()
print(dict(sorted(normal_splade_doc_rep.items(), key=lambda pair: pair[1], reverse=True)))

{'diet': 175, 'breast-cancer': 135, 'food': 125, 'cancer': 109, 'dietary': 105, 'review': 97, 'risk': 92, 'fiber': 91, 'nutrition': 91, 'breast': 89, 'gene': 84, 'benefits': 84, 'intervention': 77, 'fat': 74, 'obesity': 73, 'volcano': 71, 'meat': 70, 'intake': 70, 'estrogen': 68, 'clinical-trial': 66, 'fruit': 63, 'vitamin-d': 59, 'big-data': 56, 'survival': 53, 'gut': 51, 'recurrence': 51, 'men': 50, 'book': 49, 'choice': 49, 'benefit': 49, 'and': 47, 'trial': 42, 'medical': 41, 'post': 40, 'toxicity': 38, 'green-tea': 33, 'mammography': 33, 'summary': 32, 'health': 31, 'bc': 31, 'risk-factor': 31, 'factors': 30, 'clinical': 30, 'influencer-marketing': 30, 'disadvantage': 29, 'aging': 28, 'her2': 28, 'weight': 27, 'survey': 25, 'disease': 24, 'adult': 22, 'incidence': 21, 'interpretability': 21, 'pesticide': 21, 'alcohol': 20, 'sleep': 19, 'risk-factors': 19, 'against': 18, 'report': 18, 'proteomics': 18, 'energy': 17, 'sex': 17, 'protein': 17, 'influence': 15, 'women': 14, 'materials

In [125]:
# Recompute both scores by hand as (document vector · query vector); each call also
# prints the per-token contributions. In the recorded run the two results equal the
# stored predictions for this query/document pair.
dot_product(phrase_splade_doc_rep, phrase_splade_query_rep), dot_product(normal_splade_doc_rep, normal_splade_query_rep)

[['cancer', 14061], ['gene', 10248], ['breast', 8722], ['toxicity', 950], ['breast-cancer', 21330], ['vitamin-d', 3009]]
[['christian', 624], ['cancer', 23068], ['breast', 28730]]


(58320, 52422)

In [126]:
# Re-print the query id and both query vectors (sorted by descending weight) so they
# can be read next to the document vectors. This repeats the print statements of the
# query-selection cell earlier in the notebook.
print("QueryId", query_id)
print({k: v for k, v in sorted(phrase_splade_query_rep.items(), key=lambda item: -item[1])})
print()
print({k: v for k, v in sorted(normal_splade_query_rep.items(), key=lambda item: -item[1])})

QueryId PLAIN-2
{'cho': 164, 'breast-cancer': 158, 'stat': 138, 'cancer': 129, 'gene': 122, 'cause': 99, 'breast': 98, 'do': 96, 'drugs': 94, 'drug': 75, 'vitamin-d': 51, 'does': 34, 'causes': 29, 'toxicity': 25, 'inhibitors': 17, 'ch': 2, '?': 1, 'status': 1}

{'stat': 174, 'cho': 172, 'breast': 170, 'cancer': 158, 'drugs': 132, 'cause': 128, 'do': 99, 'drug': 81, '##terol': 80, 'christian': 48, '?': 37, '##in': 22}


In [127]:
# Display every stored phrase-SPLADE prediction (doc id -> score) for the chosen query.
phrase_splade_metadata["predictions"][query_id]

{'MED-2431': 75067.0,
 'MED-10': 73880.0,
 'MED-2429': 73096.0,
 'MED-14': 73082.0,
 'MED-2439': 64919.0,
 'MED-2435': 64462.0,
 'MED-4695': 64196.0,
 'MED-4226': 63963.0,
 'MED-2434': 63489.0,
 'MED-4117': 62829.0,
 'MED-4223': 62120.0,
 'MED-3856': 61849.0,
 'MED-1371': 60873.0,
 'MED-3832': 60854.0,
 'MED-1825': 60557.0,
 'MED-2122': 60474.0,
 'MED-2436': 59550.0,
 'MED-4097': 59150.0,
 'MED-3849': 58640.0,
 'MED-2437': 58320.0,
 'MED-2440': 58168.0,
 'MED-3833': 57984.0,
 'MED-3841': 57984.0,
 'MED-1564': 57860.0,
 'MED-4827': 57724.0,
 'MED-4096': 57613.0,
 'MED-950': 57523.0,
 'MED-4652': 57403.0,
 'MED-3799': 57374.0,
 'MED-3860': 57340.0,
 'MED-1193': 57338.0,
 'MED-2427': 57254.0,
 'MED-1829': 56792.0,
 'MED-5357': 56745.0,
 'MED-4057': 56610.0,
 'MED-4465': 56494.0,
 'MED-4787': 56346.0,
 'MED-2103': 56155.0,
 'MED-3205': 55775.0,
 'MED-3766': 55582.0,
 'MED-838': 55308.0,
 'MED-4440': 55103.0,
 'MED-4049': 55088.0,
 'MED-4643': 54946.0,
 'MED-3862': 54922.0,
 'MED-3447': 549

In [131]:
# Build a single-token probe query: only the phrase token "clinical-trial", weight 100.
jquery = create_jquery({"clinical-trial": 100}, phrase_splade_searcher)

In [132]:
# Run the probe query through the searcher's underlying `object` directly, passing the
# Java map built above. NOTE(review): 100 is presumably the number of hits requested — confirm.
hits = phrase_splade_searcher.object.search(jquery, 100)

In [133]:
# Display the hit list. The recorded output only shows the reprs of the Java
# ScoredDoc objects, not their contents.
hits

[<io.anserini.search.ScoredDoc at 0x7f8fc4fce610 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20e2 at 0x7f8fc5af71b0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fce1b0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20f2 at 0x7f8fc5c170d0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fcf970 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20fa at 0x7f8fc5c16ad0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fce2f0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b210a at 0x7f8fc5c16d10>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc562b8d0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b2112 at 0x7f8fc5c16af0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc5628900 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b2122 at 0x7f8fc5c173d0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc562b510 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b212a at 0x7f8fc5c16b30>>,
 <io.anserini

In [105]:
# Display the raw stored JSON string of the chosen document from the phrase index.
# NOTE(review): this cell's execution count is lower than that of the cells above it,
# so its recorded output comes from an earlier run — re-run after a kernel restart.
phrase_splade_searcher.doc(chosen_doc_id).lucene_document().get("raw")

'{\n  "_id" : "MED-2437",\n  "title" : "Diet and breast cancer: understanding risks and benefits.",\n  "text" : "BACKGROUND: Breast cancer is the most commonly diagnosed cancer among women in the United States. Extensive research has been completed to evaluate the relationship between dietary factors and breast cancer risk and survival after breast cancer; however, a summary report with clinical inference is needed. Materials and METHODS: This review summarizes the current epidemiological and clinical trial evidence relating diet to breast cancer incidence, recurrence, survival, and mortality. The review includes emerging epidemiological studies that assess risk within breast cancer subtypes as well as a summary of previous and ongoing dietary intervention trials designed to modify breast cancer risk. RESULTS: The available literature suggests that both low-fat and high-fiber diets may be weakly protective against breast cancer, whereas total energy intake and alcohol appear to be posi

In [82]:
# Print every token stored in the Java query map.
# NOTE(review): the recorded output lists many query tokens, whereas the `jquery` built
# earlier in this notebook holds only "clinical-trial" — the output is from an earlier run.
for item in jquery.keySet():
    print(item)

toxicity
inhibitors
stat
gene
drugs
ch
vitamin d
cause
breast cancer
do
cho
drug
does
causes
cancer
breast
?
status


In [70]:
# Display the phrase-SPLADE query vector in its stored order.
# NOTE(review): the recorded output is from an earlier run in which phrase tokens contained
# spaces ("breast cancer"); the later run shown above uses hyphens ("breast-cancer").
phrase_splade_query_rep

{'?': 1,
 'do': 96,
 'does': 34,
 'cause': 99,
 'status': 1,
 'drug': 75,
 'cancer': 129,
 'gene': 122,
 'causes': 29,
 'drugs': 94,
 'breast': 98,
 'ch': 2,
 'cho': 164,
 'toxicity': 25,
 'inhibitors': 17,
 'stat': 138,
 'breast cancer': 158,
 'vitamin d': 51}