In [93]:
import csv
import gzip
from typing import List

import numpy as np
from pyserini.index import IndexReader

In [2]:
# Reader over the Lucene index stored at this relative path; the cells below
# use it to look up stored documents, document vectors and term statistics.
index_reader = IndexReader("../anserini/indexes/msmarco-doc/lucene-index-msmarco")



In [3]:
# querystring[topicid] -> query text, read from the gzipped two-column TSV.
# Every row must have exactly two fields; a later duplicate topicid wins.
with gzip.open("data/msmarco-doctrain-queries.tsv.gz", 'rt', encoding='utf8') as f:
    querystring = {
        topicid: querystring_of_topicid
        for topicid, querystring_of_topicid in csv.reader(f, delimiter="\t")
    }

In [4]:
# docoffset[docid] -> integer offset of that document in the corpus TSV.
# The lookup file has three tab-separated columns; the middle one is ignored.
with gzip.open("data/msmarco-docs-lookup.tsv.gz", 'rt', encoding='utf8') as f:
    docoffset = {
        docid: int(offset)
        for docid, _, offset in csv.reader(f, delimiter="\t")
    }

In [5]:
# qrel[topicid] -> list of positive docids for that topic, in file order.
# Rows are space-separated: topicid, an ignored column, docid, relevance label.
qrel = {}
with gzip.open("data/msmarco-doctrain-qrels.tsv.gz", 'rt', encoding='utf8') as f:
    tsvreader = csv.reader(f, delimiter=" ")
    for topicid, _, docid, rel in tsvreader:
        # Every judgment in this file is expected to carry the label "1".
        assert rel == "1"
        qrel.setdefault(topicid, []).append(docid)

In [6]:
# def compute_doc_tf(query, document_id):
#     val = 0
#     query_terms = index_reader.analyze(query)
#     doc_vector = index_reader.get_document_vector(document_id)
#
#     for term in query_terms:
#         tf = doc_vector.get(term,0)
#         val += tf
#     return val


In [7]:
# def compute_tf(query, string):
#     val = 0
#     query_terms = index_reader.analyze(query)
#     string_terms = index_reader.analyze(string)
#     string_terms_counts = Counter(string_terms)
#     for term in query_terms:
#         val += string_terms_counts.get(term,0)
#     return val


In [118]:
# with open('data/msmarco-doc-libsvm/msmarco-doctrain-libsv.txt', 'w', newline='') as csvfile:
#     csvwriter = csv.writer(csvfile, delimiter=' ')
#     for query_id in itertools.islice(querystring, 10):
#         query = querystring[query_id]
#         rel_doc_ids = qrel[query_id]
#         for rel_doc_id in rel_doc_ids:
#             doc_tf = compute_doc_tf(query,rel_doc_id)
#             doc_len = len(index_reader.doc(rel_doc_id).raw())
#             csvwriter.writerow([1,f"qid:{query_id}",f"1:{doc_tf}", f"2:{doc_len}", f"#DOCID:{rel_doc_id}"])


In [8]:
# Fetch the stored document for a sample docid.
d = index_reader.doc('D2192591')

In [7]:

# Display the document vector of a sample docid; the output below is a mapping
# from indexed term to an integer count (presumably its frequency in the document).
index_reader.get_document_vector('D2192591')

{'bomb.cit': 1,
 'german': 2,
 'been': 1,
 'half': 2,
 'refus': 2,
 'govern': 3,
 'potenti': 2,
 'year': 4,
 'japanes': 2,
 'del': 1,
 'complet': 1,
 'via': 1,
 'nazi': 3,
 'suffer': 1,
 'would': 9,
 'near': 2,
 'ten': 1,
 'chicago': 2,
 'berklei': 1,
 'despit': 3,
 'tennesse': 1,
 'pass': 1,
 'ag': 2,
 'mile': 7,
 'impact': 2,
 'reluct': 1,
 'shock': 1,
 'enthusiast': 1,
 'am': 1,
 'headquart': 1,
 'truman': 3,
 'codenam': 3,
 'aioi': 1,
 'extend': 2,
 'alamo': 3,
 '2': 2,
 'befor': 2,
 'globe': 1,
 'nagasaki': 3,
 'size': 1,
 '5': 1,
 'left': 1,
 '6': 1,
 '7': 1,
 'plant': 1,
 'urei': 1,
 'wwii': 2,
 '8': 3,
 '9': 1,
 'juli': 1,
 'much': 1,
 'be': 2,
 'fate': 1,
 'dead': 2,
 'turn': 1,
 "we'r": 1,
 'dollar': 1,
 'journal': 1,
 'same': 1,
 'releas': 2,
 'pacifist': 1,
 'foot': 1,
 'descend': 1,
 'mind': 3,
 'b': 1,
 'c': 1,
 'd': 1,
 'i': 1,
 'abl': 2,
 'j': 1,
 'co': 1,
 'blast': 2,
 'survivor': 1,
 'insid': 1,
 'r': 1,
 's': 9,
 '40,000': 1,
 'u': 10,
 'y': 1,
 'somewhat': 1,
 'gogg

In [104]:
def lmir_jm(term, doc_vector, sm_param, corpus_prob_term, doc_len):
    """Jelinek-Mercer smoothed probability of ``term`` in a document.

    Linearly interpolates the document estimate ``doc_vector[term] / doc_len``
    (weight ``1 - sm_param``) with the corpus probability ``corpus_prob_term``
    (weight ``sm_param``).

    Raises:
        KeyError: If ``term`` is not a key of ``doc_vector``.
    """
    document_part = (1 - sm_param) * doc_vector[term] / doc_len
    corpus_part = sm_param * corpus_prob_term
    return document_part + corpus_part

def lmir_dirichlet(term, doc_vector, param, corpus_prob_term, doc_len):
    """Dirichlet-prior smoothed probability of ``term`` in a document.

    Adds ``param`` pseudo-counts distributed according to the corpus
    probability: ``(tf + param * p_corpus) / (doc_len + param)``.

    Raises:
        KeyError: If ``term`` is not a key of ``doc_vector``.
    """
    smoothed_count = doc_vector[term] + param * corpus_prob_term
    smoothed_length = doc_len + param
    return smoothed_count / smoothed_length

def lmir_abs(term, doc_vector, param, corpus_prob_term, doc_len):
    """Absolute-discounting smoothed probability of ``term`` in a document.

    Subtracts the discount ``param`` from the term count (never going below
    zero) and hands the removed mass back through the corpus model:
    ``max(tf - param, 0) / doc_len + param * (unique_terms / doc_len) * p_corpus``,
    where ``unique_terms`` is ``len(doc_vector)``.

    Args:
        term: Term to score; must be a key of ``doc_vector``.
        doc_vector: Mapping from term to its count in the document.
        param: Discount subtracted from the term count.
        corpus_prob_term: Probability of ``term`` in the whole corpus.
        doc_len: Document length used as the normaliser.

    Returns:
        The smoothed probability as a number.

    Raises:
        KeyError: If ``term`` is not a key of ``doc_vector``.
    """
    # Builtin max clamps at zero. np.max(x, 0) would treat 0 as the *axis*
    # argument and return x unchanged, letting the discounted count go negative.
    discounted_count = max(doc_vector[term] - param, 0)
    unique_term_ratio = len(doc_vector) / doc_len
    return discounted_count / doc_len + param * unique_term_ratio * corpus_prob_term

def compute_lmir(query_terms: List[str], index_reader: IndexReader, doc_id: str, smoothing, sm_param=0.1):
    """Score a document against a query with a smoothed language model.

    The score is the product, over ``query_terms``, of one probability per term:
    a term present in the document vector uses the selected smoothing function,
    any other term uses ``alpha_norm`` times its corpus probability.

    Args:
        query_terms: Query terms in the same form as the document-vector keys
            (the notebook passes the output of ``index_reader.analyze``).
        index_reader: Reader used for the document vector, the stored document,
            the collection statistics and per-term counts.
        doc_id: Id of the document to score.
        smoothing: ``'JM'`` (``lmir_jm``), ``'DIR'`` (``lmir_dirichlet``) or
            ``'ABS'`` (``lmir_abs``).
        sm_param: Parameter forwarded unchanged to the smoothing function.

    Returns:
        The product of the per-term probabilities (``1`` for an empty query).

    Raises:
        ValueError: If ``smoothing`` is not one of the supported names.
    """
    # Resolve the smoothing function first so a bad name fails before any index access.
    if smoothing == 'JM':
        smooth = lmir_jm
    elif smoothing == 'DIR':
        smooth = lmir_dirichlet
    elif smoothing == "ABS":
        smooth = lmir_abs
    else:
        raise ValueError(f"unknown smoothing {smoothing!r}; expected 'JM', 'DIR' or 'ABS'")

    doc_vector = index_reader.get_document_vector(doc_id)
    # NOTE(review): this is the length of whatever raw() returns for the stored
    # document (presumably a character count of the raw text, not a token
    # count) -- confirm this is the intended document length for the model.
    doc_len = len(index_reader.doc(doc_id).raw())
    total_term_count = index_reader.stats()['total_terms']

    # Corpus probability and smoothed document probability for every term of the
    # document vector whose value is not None.
    corpus_prob = {}
    doc_prob = {}
    for term, count in doc_vector.items():
        if count is None:
            continue
        # [1] picks the second element of get_term_counts' result
        # (presumably the collection frequency -- verify against pyserini).
        corpus_prob[term] = index_reader.get_term_counts(term, analyzer=None)[1] / total_term_count
        doc_prob[term] = smooth(term, doc_vector, sm_param, corpus_prob[term], doc_len)

    # Share of probability mass left for terms outside the document, relative to
    # the corpus mass of those same terms.
    alpha_norm = (1 - sum(doc_prob.values())) / (1 - sum(corpus_prob.values()))

    score = 1
    for term in query_terms:
        if term in doc_prob:
            score *= doc_prob[term]
        else:
            score *= alpha_norm * index_reader.get_term_counts(term, analyzer=None)[1] / total_term_count

    return score

In [105]:
# Pick one training query by topic id and run it through the index's analyzer,
# so its terms can be matched against document-vector keys in compute_lmir.
q = querystring['1185869']
qt = index_reader.analyze(q)

In [106]:
# Length of the raw stored document D59219, computed the same way compute_lmir
# derives its own doc_len; used below to build the JM parameter.
doc_len = len(index_reader.doc('D59219').raw())

In [107]:
# Jelinek-Mercer with lambda = mu / (mu + doc_len) for mu = 0.1. Algebraically
# this equals Dirichlet smoothing with mu = 0.1, so the score should agree with
# the 'DIR' call below up to floating-point rounding.
compute_lmir(qt, index_reader,'D59219', smoothing='JM',sm_param=0.1 / (0.1 + doc_len))

8.92007884921995e-22

In [108]:
# Dirichlet smoothing with mu = 0.1 for the query terms in qt against document D59219.
compute_lmir(qt, index_reader,'D59219', smoothing='DIR', sm_param=0.1)

8.920078849219953e-22

In [109]:
# Absolute discounting with a discount of 0.1 for the query terms in qt against document D59219.
compute_lmir(qt, index_reader,'D59219', smoothing='ABS', sm_param=0.1)

8.560142081818295e-22