In [80]:
import csv
import gzip
import itertools
from collections import Counter

from pyserini.index import IndexReader

In [24]:
# Open the pre-built index at this relative path (an MS MARCO document index,
# judging by the path name). The cells below use this reader to analyze text,
# fetch document vectors, and fetch stored documents.
index_reader = IndexReader("../anserini/indexes/msmarco-doc/lucene-index-msmarco")



In [28]:
# querystring[topicid] -> query text, read from the gzipped two-column TSV.
# A row that does not have exactly two columns raises ValueError.
with gzip.open("data/msmarco-doctrain-queries.tsv.gz", 'rt', encoding='utf8') as queries_file:
    querystring = {
        topicid: query_text
        for topicid, query_text in csv.reader(queries_file, delimiter="\t")
    }

In [29]:
# docoffset[docid] -> integer offset of that document inside the corpus tsv.
# The lookup file has three tab-separated columns; the middle one is ignored.
with gzip.open("data/msmarco-docs-lookup.tsv.gz", 'rt', encoding='utf8') as lookup_file:
    docoffset = {
        docid: int(offset)
        for docid, _ignored, offset in csv.reader(lookup_file, delimiter="\t")
    }

In [30]:
# qrel[topicid] -> list of positive docids for that topic, in file order.
# The qrels file is space-separated with four columns; the second is ignored.
qrel = {}
with gzip.open("data/msmarco-doctrain-qrels.tsv.gz", 'rt', encoding='utf8') as qrels_file:
    for row in csv.reader(qrels_file, delimiter=" "):
        topicid, _, docid, rel = row
        # Every judgment in this file is expected to carry the label "1".
        assert rel == "1"
        qrel.setdefault(topicid, []).append(docid)

In [75]:
def compute_doc_tf(query, document_id):
    """Sum the indexed document's term counts over the analyzed query.

    The query is passed through ``index_reader.analyze`` and the document
    vector is fetched with ``index_reader.get_document_vector``. Every
    element of the analyzed query contributes the count stored under it in
    the vector, or 0 when the vector has no entry for it.

    Args:
        query: Query text to analyze.
        document_id: Identifier handed to ``get_document_vector``.

    Returns:
        The total of the looked-up counts (0 for an empty analyzed query).
    """
    analyzed_query = index_reader.analyze(query)
    # NOTE(review): assumes a dict-like vector comes back for this id; a None
    # return would raise AttributeError on .get below — confirm against the index.
    term_counts = index_reader.get_document_vector(document_id)
    return sum(term_counts.get(token, 0) for token in analyzed_query)


In [81]:
def compute_tf(query, string):
    """Count how often the analyzed query's terms occur in an analyzed string.

    Both inputs go through ``index_reader.analyze``. Each element of the
    analyzed query adds the number of times it appears among the analyzed
    terms of ``string``, so a term repeated in the query is counted once
    per repetition.

    Args:
        query: Query text to analyze.
        string: Arbitrary text to analyze and match against.

    Returns:
        The summed occurrence count (0 when nothing matches).
    """
    analyzed_query = index_reader.analyze(query)
    occurrences = Counter(index_reader.analyze(string))
    # Indexing a Counter yields 0 for absent keys without inserting them.
    return sum(occurrences[token] for token in analyzed_query)


In [118]:
# Write one space-separated LibSVM-style row per (topic, positive document) pair:
#   1 qid:<topic> 1:<query-term count in doc> 2:<len of raw doc> #DOCID:<docid>
with open('data/msmarco-doc-libsvm/msmarco-doctrain-libsv.txt', 'w', newline='') as libsvm_file:
    row_writer = csv.writer(libsvm_file, delimiter=' ')
    # Only the first 10 topics (in dict iteration order) are exported.
    for query_id, query in itertools.islice(querystring.items(), 10):
        for rel_doc_id in qrel[query_id]:
            doc_tf = compute_doc_tf(query, rel_doc_id)
            doc_len = len(index_reader.doc(rel_doc_id).raw())
            row = [1, f"qid:{query_id}", f"1:{doc_tf}", f"2:{doc_len}", f"#DOCID:{rel_doc_id}"]
            row_writer.writerow(row)


In [87]:
# Spot check: fetch a single document from the index by its docid.
d = index_reader.doc('D2192591')

In [93]:
# Length of the fetched document's raw() content — the same measure the
# LibSVM export writes as feature 2.
len(d.raw())

10066