In [1]:
import json
from pyserini.search.lucene import LuceneImpactSearcher
from pyserini.pyclass import autoclass, JFloat, JInt, JArrayList, JHashMap
from transformers import AutoTokenizer
from scipy.sparse import csr_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the uncased DistilBERT tokenizer and keep its vocabulary mapping
# (token string -> integer id); later cells use it to turn tokens into column ids.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
splade_vocab = tokenizer.vocab

In [9]:
# Open the scifact impact index. No query encoder is supplied (query_encoder=None);
# queries are assembled by hand in a later cell.
searcher = LuceneImpactSearcher("/scratch/lamdo/beir_splade/indexes/scifact__splade_maxsim_150k_lowregv4", query_encoder=None)

In [10]:
# Fetch document 0 from the index and parse its stored "raw" field as JSON.
raw = json.loads(searcher.doc(0).lucene_document().get("raw"))

In [11]:
# Inspect the parsed record for document 0.
raw

{'_id': '4983',
 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.',
 'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, t

In [8]:
# Pull the two per-document mappings out of the parsed record; both are keyed
# by token string (they are looked up in `splade_vocab` below and in later cells).
vector = raw["vector"]
token_indices = raw["token_indices"]

# Re-key `vector` by vocabulary id instead of token string.
_vector = {splade_vocab[k]:v for k,v in vector.items()}

In [17]:
# One [row, column, value] triple per entry of `token_indices`:
#   row    -> the value stored in `token_indices` for the token
#   column -> the token's id in `splade_vocab`
#   value  -> the token's weight in `vector`
row_col_val = [
    [row_idx, splade_vocab[token], vector[token]]
    for token, row_idx in token_indices.items()
]

# Split the triples into the three parallel lists used to build a sparse matrix.
row = [r for r, _c, _v in row_col_val]
col = [c for _r, c, _v in row_col_val]
val = [w for _r, _c, w in row_col_val]

In [18]:
# Assemble the (row, col, val) lists into a CSR matrix with 256 rows and one
# column per vocabulary entry.
sparse_matrix = csr_matrix((val, (row, col)), shape=(256, len(splade_vocab)))

In [19]:
# Show the matrix summary (dtype, number of stored elements, shape).
sparse_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 130 stored elements and shape (256, 30522)>

In [10]:
# Display `vector` re-keyed by vocabulary id (same mapping that `_vector` holds).
{splade_vocab[k]:v for k,v in vector.items()}

{1055: 160,
 1996: 68,
 2001: 114,
 2003: 138,
 2006: 18,
 2013: 56,
 2024: 33,
 2027: 74,
 2028: 1,
 2037: 84,
 2045: 24,
 2049: 2,
 2073: 83,
 2074: 76,
 2088: 53,
 2095: 90,
 2105: 3,
 2124: 54,
 2136: 10,
 2149: 28,
 2150: 80,
 2170: 112,
 2171: 80,
 2177: 88,
 2189: 115,
 2193: 118,
 2201: 113,
 2207: 23,
 2211: 116,
 2248: 28,
 2257: 121,
 2288: 47,
 2293: 156,
 2299: 167,
 2309: 46,
 2315: 56,
 2316: 200,
 2318: 92,
 2327: 62,
 2404: 11,
 2406: 35,
 2468: 39,
 2501: 51,
 2511: 22,
 2544: 6,
 2557: 55,
 2600: 134,
 2631: 94,
 2650: 25,
 2677: 35,
 2707: 113,
 2713: 61,
 2718: 140,
 2719: 73,
 2732: 12,
 2751: 138,
 2759: 55,
 2773: 18,
 2834: 46,
 2866: 18,
 2983: 62,
 2996: 49,
 3086: 4,
 3107: 0,
 3112: 58,
 3131: 19,
 3185: 46,
 3220: 91,
 3233: 3,
 3239: 60,
 3285: 19,
 3297: 72,
 3311: 23,
 3315: 27,
 3364: 109,
 3381: 126,
 3405: 32,
 3432: 16,
 3440: 65,
 3562: 47,
 3585: 230,
 3603: 23,
 3614: 38,
 3624: 26,
 3673: 89,
 3769: 54,
 3777: 74,
 3786: 76,
 3803: 208,
 3830: 4

In [3]:
def create_jquery(encoded_query, searcher, fields=None):
    """Build the Java token -> weight map for a hand-made impact query.

    A token is kept only when it is present in ``searcher.idf`` and its IDF is
    strictly greater than ``searcher.min_idf``; each kept weight is wrapped in
    ``JInt`` before being stored.

    Args:
        encoded_query: Mapping of token -> weight.
        searcher: Object exposing ``idf`` (token -> IDF mapping) and ``min_idf``.
        fields: Optional mapping of field name -> boost. Accepted so existing
            calls keep working, but it has no effect on the returned map
            (``None`` and ``{}`` are equivalent).

    Returns:
        A ``JHashMap`` of token -> ``JInt(weight)``. It can be passed to
        ``searcher.object.search(jquery, k)`` to run the query.
    """
    # `fields` used to default to a shared mutable `{}` and was only copied into
    # a Java map that was never used or returned; that dead work is dropped.
    jquery = JHashMap()
    for token, weight in encoded_query.items():
        if token in searcher.idf and searcher.idf[token] > searcher.min_idf:
            jquery.put(token, JInt(weight))

    return jquery

In [4]:
# Single-term query: token "service" with weight 1.
jquery = create_jquery({"service": 1}, searcher)

In [5]:
# Run the hand-built query directly on the underlying Java searcher, asking for 10 hits.
results = searcher.object.search(jquery, 10)

# Show the stored "raw" field and the score of the hit at index 8
# (assumes at least 9 hits were returned).
results[8].lucene_document.get("raw"), results[8].score

('{\n  "_id" : "26067999",\n  "title" : "Screening for Lung Cancer: U.S. Preventive Services Task Force Recommendation Statement",\n  "text" : "The U.S. Preventive Services Task Force (USPSTF) makes recommendations about the effectiveness of specific preventive care services for patients without related signs or symptoms. It bases its recommendations on the evidence of both the benefits and harms of the service and an assessment of the balance. The USPSTF does not consider the costs of providing a service in this assessment. The USPSTF recognizes that clinical decisions involve more considerations than evidence alone. Clinicians should understand the evidence but individualize decision making to the specific patient or situation. Similarly, the USPSTF notes that policy and coverage decisions involve considerations in addition to the evidence of clinical benefits and harms. Summary of Recommendation and Evidence The USPSTF recommends annual screening for lung cancer with low-dose comput

In [6]:
# Parse the same hit's "raw" field as JSON for a readable view.
json.loads(results[8].lucene_document.get("raw"))

{'_id': '26067999',
 'title': 'Screening for Lung Cancer: U.S. Preventive Services Task Force Recommendation Statement',
 'text': "The U.S. Preventive Services Task Force (USPSTF) makes recommendations about the effectiveness of specific preventive care services for patients without related signs or symptoms. It bases its recommendations on the evidence of both the benefits and harms of the service and an assessment of the balance. The USPSTF does not consider the costs of providing a service in this assessment. The USPSTF recognizes that clinical decisions involve more considerations than evidence alone. Clinicians should understand the evidence but individualize decision making to the specific patient or situation. Similarly, the USPSTF notes that policy and coverage decisions involve considerations in addition to the evidence of clinical benefits and harms. Summary of Recommendation and Evidence The USPSTF recommends annual screening for lung cancer with low-dose computed tomography

In [1]:
from scipy.sparse import load_npz

In [12]:
# Load the stacked representation matrix once. The previous 100-iteration loop
# re-read the same file each time and ended with the same value bound to
# `full_representations`; to measure load time, use `%timeit` in a scratch cell.
# NOTE(review): this path is under "scifact__...", while the doc-id list cell
# reads from a "scidocs__..." directory — confirm both belong to the same index.
full_representations = load_npz("/scratch/lamdo/beir_splade/indexes/scifact__splade_maxsim_100k_lowreg/full_representations.npz")

In [3]:
# Number of 256-row blocks in the stacked matrix; compare with len(docids).
full_representations.shape[0]/256

25657.0

In [4]:
import json
from tqdm import tqdm

In [5]:
# Load the list of document ids that accompanies the stacked representation matrix.
# NOTE(review): this file lives under "scidocs__...", whereas the matrix cell loads
# from a "scifact__..." directory — confirm the two files describe the same index.
with open("/scratch/lamdo/beir_splade/indexes/scidocs__splade_maxsim_100k_lowreg/full_representations_docids.json") as f:
    docids = json.load(f)

In [6]:
# Number of document ids; should match the block count of the stacked matrix.
len(docids)

25657

In [7]:
# Split the stacked matrix into one block of rows per document id.
ROWS_PER_DOC = 256  # rows reserved for each document in the stacked matrix

# Fail fast when the matrix and the id list do not line up (e.g. they were
# loaded from different indexes); out-of-range slices would otherwise produce
# short or empty blocks without any error.
assert full_representations.shape[0] == ROWS_PER_DOC * len(docids), (
    f"expected {ROWS_PER_DOC * len(docids)} rows for {len(docids)} docids, "
    f"got {full_representations.shape[0]}"
)

doc2fullrep = {}
for i, docid in enumerate(tqdm(docids)):
    # Rows [i * ROWS_PER_DOC, (i + 1) * ROWS_PER_DOC) belong to the i-th document.
    doc2fullrep[docid] = full_representations[i * ROWS_PER_DOC:(i + 1) * ROWS_PER_DOC]

  0%|          | 0/25657 [00:00<?, ?it/s]

  1%|▏         | 382/25657 [00:04<05:21, 78.71it/s]


KeyboardInterrupt: 

In [14]:
# Peek at the first two document ids.
docids[:2]

['MED-10', 'MED-14']

In [15]:
# Show the row block stored for document 'MED-10'.
doc2fullrep['MED-10']

<Compressed Sparse Row sparse matrix of dtype 'float16'
	with 275 stored elements and shape (256, 30522)>

In [23]:
# Pairwise dot products between every row of 'MED-10' and every row of 'MED-14'
# (rows of the result follow 'MED-10', columns follow 'MED-14').
res = doc2fullrep['MED-10'] @ doc2fullrep['MED-14'].transpose()#.todense()
# Convert the sparse product to a dense matrix for the reductions in the next cell.
res = res.todense()

In [24]:
# For each row of the first document take its largest dot product over the second
# document's rows, then add those maxima together.
res.max(1).sum()

np.float32(83.581375)

In [30]:
# Exploratory: list the attributes and methods available on the sparse matrix object.
dir(full_representations)

['T',
 '__abs__',
 '__add__',
 '__array_priority__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__idiv__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__pow__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmatmul__',
 '__rmul__',
 '__round__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '_add_dense',
 '_add_sparse',
 '_arg_min_or_max',
 '_arg_min_or_max_axis',
 '_ascontainer',
 '_asfptype',
 '_asindices',
 '_binopt',
 '_bsr_container',
 '_container',
 '_coo_container',
 '_csc_container',
 '_csr_container',
 '_dedu

In [11]:
# Rows 256..511 of the stacked matrix, i.e. the second 256-row block.
full_representations[256 : 512]

<Compressed Sparse Row sparse matrix of dtype 'float16'
	with 177 stored elements and shape (256, 30522)>

In [1]:
import json

In [5]:
with open("/home/lamdo/splade/data/msmarco/msmarco/queries.jsonl") as f:
    # Start from 0 so an empty file still leaves `count` defined. After the loop
    # `count` is the number of lines and `line` stays bound to the last line
    # read, which the following cell parses.
    count = 0
    for count, line in enumerate(f, start=1):
        pass

count

509962

In [4]:
# Parse the last line left over from the counting loop to see one query record.
json.loads(line)

{'_id': '1185869',
 'text': ')what was the immediate impact of the success of the manhattan project?',
 'metadata': {}}

In [1]:
import json

In [2]:
# Load the evaluation metadata file for the doris_mae / phrase_splade run as JSON.
with open("/home/lamdo/splade/pyserini_evaluation/metadata/doris_mae__phrase_splade") as f:
    data = json.load(f)

In [4]:
# Inspect the "predictions" entry of the loaded metadata.
# NOTE(review): in the saved output every inner key is the string 'null' — check
# whether document ids were lost when this run was produced.
data["predictions"]

{'0': {'null': 67497.0},
 '1': {'null': 76991.0},
 '2': {'null': 60044.0},
 '3': {'null': 56210.0},
 '4': {'null': 78528.0},
 '5': {'null': 78833.0},
 '6': {'null': 70791.0},
 '7': {'null': 63380.0},
 '8': {'null': 66842.0},
 '9': {'null': 84496.0},
 '10': {'null': 60048.0},
 '11': {'null': 76833.0},
 '12': {'null': 87153.0},
 '13': {'null': 84944.0},
 '14': {'null': 76432.0},
 '15': {'null': 64565.0},
 '16': {'null': 61598.0},
 '17': {'null': 80091.0},
 '18': {'null': 44232.0},
 '19': {'null': 74001.0},
 '20': {'null': 74834.0},
 '21': {'null': 56068.0},
 '22': {'null': 87080.0},
 '23': {'null': 77499.0},
 '24': {'null': 74568.0},
 '25': {'null': 90181.0},
 '26': {'null': 59781.0},
 '27': {'null': 73747.0},
 '28': {'null': 48882.0},
 '29': {'null': 87148.0},
 '30': {'null': 47163.0},
 '31': {'null': 63321.0},
 '32': {'null': 75348.0},
 '33': {'null': 71708.0},
 '34': {'null': 98624.0},
 '35': {'null': 50502.0},
 '36': {'null': 93751.0},
 '37': {'null': 67114.0},
 '38': {'null': 66229.

In [8]:
# Rebinds `searcher` to the doris_mae__eru_kg index (again without a query encoder);
# any cell run after this one that uses `searcher` queries this index instead.
searcher = LuceneImpactSearcher("/scratch/lamdo/beir_splade/indexes/doris_mae__eru_kg", query_encoder=None)

Apr 04, 2025 6:27:43 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [12]:
# Show the stored "raw" field (unparsed JSON string) of document 0 in the current index.
searcher.doc(0).lucene_document().get("raw")

'{\n  "_id" : null,\n  "title" : "How do lexical semantics affect translation? An empirical study",\n  "text" : "Neural machine translation (NMT) systems aim to map text from one language into another. While there are a wide variety of applications of NMT, one of the most important is translation of natural language. A distinguishing factor of natural language is that words are typically ordered according to the rules of the grammar of a given language. Although many advances have been made in developing NMT systems for translating natural language, little research has been done on understanding how the word ordering of and lexical similarity between the source and target language affect translation performance. Here, we investigate these relationships on a variety of low-resource language pairs from the OpenSubtitles2016 database, where the source language is English, and find that the more similar the target language is to English, the greater the translation performance. In addition

In [14]:
import torch
import torch.nn.functional as F

# Example input: random tensor of shape (100, 1, 30522)
original = torch.randn(100, 1, 30522)

# Zero-pad dim 1 at the end up to length 256 (256 - 1 = 255 added rows for this input).
# The pad tuple is ordered last dimension first, so the leading (0, 0) leaves the
# last dimension unchanged and (0, 256 - size) applies to dim 1.
padded = F.pad(original, pad=(0, 0, 0, 256 - original.size(1)), mode='constant', value=0)

print(padded.shape)  # Output: torch.Size([100, 256, 30522])


torch.Size([100, 256, 30522])
