In [87]:
import csv
import pprint

import pandas as pd
from pyserini.index import IndexReader

In [88]:
index_reader_base = IndexReader("../../anserini/indexes/msmarco-doc/lucene-index-msmarco")
index_reader_expanded = IndexReader("../../anserini/indexes/lucene-index-msmarco-doc-expanded-with-vectors")



In [89]:
rundict_exp = {}
with open('../runs/run.msmarco-doc.test.bm25k1.4.68.b.0.87.expandex.txt') as runfile:
    reader = csv.reader(runfile, delimiter=' ')

    count = 0
    for [qid, _, docid, _, _, _] in reader:
        if qid not in rundict_exp:
            rundict_exp[qid] = []
        rundict_exp[qid].append(docid)

In [90]:
rundict_base = {}
with open('../data/run.msmarco-doc.test.bm25.txt') as runfile:
    reader = csv.reader(runfile, delimiter=' ')

    count = 0
    for [qid, _, docid, _, _, _] in reader:
        if qid not in rundict_base:
            rundict_base[qid] = []
        rundict_base[qid].append(docid)

In [91]:
rundict_exp = {k:v[:100] for k,v in rundict_exp.items()}
rundict_base = {k:v[:100] for k,v in rundict_base.items()}

In [92]:
test_qrels = {}
with open('../data/msmarco-doctest-qrels.tsv') as qrels:
    reader = csv.reader(qrels, delimiter=' ')

    for [qid, _, docid, rel] in reader:
        if qid not in test_qrels:
            test_qrels[qid] = {}
        test_qrels[qid][docid] = rel

In [93]:
qdict = {}
with open('../data/msmarco-doctest-queries.tsv') as qfile:
    reader = csv.reader(qfile, delimiter='\t')
    for [qid, q] in reader:
        qdict[qid] = q

In [94]:
metric_dict = {}

with open('../data/bm25_metrics.txt') as mf:
    reader = csv.reader(mf, delimiter='\t')
    for [metric, id, value] in reader:
        if id not in metric_dict:
            metric_dict[id] = {}
        metric_dict[id][metric] = value
metric_list = [[k] + list(v.values()) for k, v in metric_dict.items()]


In [95]:
def df_from_records(records):

    df = pd.DataFrame.from_records(records, columns=['qid','map', 'mrr', 'ndcg'])
    return df

In [96]:
df = df_from_records(metric_list)

In [97]:
df.describe()


Unnamed: 0,qid,map,mrr,ndcg
count,44,44.0,44.0,44.0
unique,44,44.0,7.0,43.0
top,1133167,0.0372,1.0,0.0
freq,1,1.0,30.0,2.0


In [98]:
df.sort_values(by=['ndcg'], ascending=False).index


Int64Index([38, 42, 12, 28, 15, 10, 17, 18, 13, 39,  9, 23, 37,  8, 16, 40, 21,
            29, 36, 25, 26, 19, 11, 34, 22, 20, 43, 27, 24,  5, 35, 32,  3, 14,
             2,  4, 41,  6, 31,  7, 33,  1, 30,  0],
           dtype='int64')

In [99]:
qid = df.iloc[30]['qid']
query = qdict[qid]
query

'lps laws definition'

In [100]:
doclist_exp = rundict_exp[qid]
doclist_base = rundict_base[qid]

In [101]:
pp = pprint.PrettyPrinter(indent=4)


In [102]:
# count the number of relevant documents
docs_og = []
docs_bm25_exp = []
docs_bm25_base = []
for docid, label in test_qrels[qid].items():
    if int(label) != 0:
        docs_og.append(docid)
        if docid in doclist_exp:
            docs_bm25_exp.append(docid)
        if docid in doclist_base:
            docs_bm25_base.append(docid)

print(f"OG docs for ``{query}'': {len(docs_og)}")
print(f"BM25_exp docs for ``{query}'': {len(docs_bm25_exp)}")
print(f"BM25_base docs for ``{query}'': {len(docs_bm25_base)}")


OG docs for ``lps laws definition'': 195
BM25_exp docs for ``lps laws definition'': 11
BM25_base docs for ``lps laws definition'': 9


In [103]:
missed_docs_exp = [docid for docid in docs_og if not docid in docs_bm25_exp]
missed_docs_base = [docid for docid in docs_og if not docid in docs_bm25_base]

In [104]:
analys = index_reader_base.analyze(qdict[qid])
analys

['lp', 'law', 'definit']

In [105]:
# top_missed_exp = []
# top_missed_base = []
#
# for missed_doc in missed_docs_exp:
#     termcount = 0
#     vec = index_reader_expanded.get_document_vector(missed_doc)
#     raw_len = len(index_reader_expanded.doc_raw(missed_doc))
#     if test_qrels[qid][missed_doc] != '0':
#         top_missed_exp.append(missed_doc)
#         for term in analys:
#             termcount += vec.get(term,0)
#         tot = sum([vv for vv in vec.values() if vv is not None])
#         pp.pprint(f"{missed_doc}: {termcount}/{tot} -> {round(termcount/tot * 100,2)}%, {round(termcount/raw_len * 100,2)}%")
# for missed_doc in missed_docs_base:
#     termcount = 0
#     vec = index_reader_base.get_document_vector(missed_doc)
#     raw_len = len(index_reader_base.doc_raw(missed_doc))
#     if test_qrels[qid][missed_doc] != '0':
#         top_missed_base.append(missed_doc)
#         for term in analys:
#             termcount += vec.get(term,0)
#         tot = sum([vv for vv in vec.values() if vv is not None])
#         pp.pprint(f"{missed_doc}: {termcount}/{tot} -> {round(termcount/tot * 100,2)}%, {round(termcount/raw_len * 100,2)}%")



In [106]:
# for doc in docs_bm25_exp:
#     rank = doclist_exp.index(doc) + 1
#     pp.pprint(f"{rank} {doc}")

In [107]:
top_retrieved_nonrel_docs_exp = []
top_retrieved_rel_docs_exp = []
top_retrieved_nonrel_docs_base= []
top_retrieved_rel_docs_base = []
for i in range(20):
    docid_exp = doclist_exp[i]
    docid_base = doclist_base[i]
    rel_exp = test_qrels[qid].get(docid_exp, 0)
    rel_base = test_qrels[qid].get(docid_base, 0)
    if int(rel_exp) != 0:
        top_retrieved_rel_docs_exp.append(docid_exp)
    else:
        top_retrieved_nonrel_docs_exp.append(docid_exp)
    if int(rel_base) != 0:
            top_retrieved_rel_docs_base.append(docid_base)
    else:
            top_retrieved_nonrel_docs_exp.append(docid_base)
    pp.pprint(f"{i+1} {docid_exp}, label: {test_qrels[qid].get(docid_exp,0)}")
    pp.pprint(f"{i+1} {docid_base}, label: {test_qrels[qid].get(docid_base,0)}")
    print()


'1 D3431635, label: 0'
'1 D3431635, label: 0'

'2 D3451687, label: 3'
'2 D1925226, label: 0'

'3 D1024943, label: 0'
'3 D686052, label: 0'

'4 D3272237, label: 0'
'4 D1024943, label: 0'

'5 D3451688, label: 3'
'5 D3272237, label: 0'

'6 D1994699, label: 0'
'6 D1994699, label: 0'

'7 D572440, label: 3'
'7 D949868, label: 0'

'8 D225186, label: 0'
'8 D1513011, label: 0'

'9 D2524565, label: 0'
'9 D69628, label: 0'

'10 D1925226, label: 0'
'10 D525396, label: 0'

'11 D110769, label: 0'
'11 D399445, label: 0'

'12 D2156484, label: 0'
'12 D3205474, label: 1'

'13 D686052, label: 0'
'13 D2349859, label: 0'

'14 D1650877, label: 0'
'14 D11414, label: 0'

'15 D2476015, label: 0'
'15 D1222032, label: 0'

'16 D2112804, label: 0'
'16 D1124553, label: 0'

'17 D2429119, label: 0'
'17 D1292180, label: 0'

'18 D438814, label: 0'
'18 D409290, label: 0'

'19 D264750, label: 0'
'19 D2580348, label: 0'

'20 D1149878, label: 0'
'20 D2047828, label: 0'



In [108]:
def get_doc_stats(docs, doclist, index_reader):
    for iii in docs:
        pp.pprint(f"docid: {iii}")
        if iii in doclist:
            pp.pprint(f"rank: {doclist.index(iii)}")
        termcount = 0
        vec = index_reader.get_document_vector(iii)
        raw_len = len(index_reader.doc_raw(iii))
        for term in analys:
            pp.pprint(f"{term}: {index_reader.get_document_vector(iii).get(term, '')}")
            termcount += vec.get(term,0)
        termsum = sum([vv for vv in vec.values() if vv is not None])
        pp.pprint(f"{termcount}/{termsum} -> {round(termcount/termsum * 100,2)}%")
        pp.pprint(f"{termcount}/{raw_len} -> {round(termcount/raw_len * 100,2)}%")

        pp.pprint(f"words: {len(index_reader.doc_raw(iii))}")
        pp.pprint(f"terms: {sum([v for v in index_reader.get_document_vector(iii).values() if v is not None])}")
        pp.pprint(index_reader.doc_raw(iii)[:10])
        print("\n")



In [111]:
get_doc_stats(top_retrieved_rel_docs_exp,doclist_exp, index_reader_expanded)

'docid: D3451687'
'rank: 1'
'lp: 14'
'law: 2'
'definit: 1'
'17/122 -> 13.93%'
'17/1094 -> 1.55%'
'words: 1094'
'terms: 122'
'{\n  "id" :'


'docid: D3451688'
'rank: 4'
'lp: 19'
'law: 12'
'definit: 4'
'35/1004 -> 3.49%'
'35/8719 -> 0.4%'
'words: 8719'
'terms: 1004'
'{\n  "id" :'


'docid: D572440'
'rank: 6'
'lp: 14'
'law: 21'
'definit: 4'
'39/1257 -> 3.1%'
'39/11321 -> 0.34%'
'words: 11321'
'terms: 1257'
'{\n  "id" :'




In [112]:
get_doc_stats(top_retrieved_rel_docs_base,doclist_base, index_reader_base)

'docid: D3205474'
'rank: 11'
'lp: 3'
'law: 1'
'definit: 2'
'6/285 -> 2.11%'
'6/3030 -> 0.2%'
'words: 3030'
'terms: 285'
'<TEXT>\nhtt'


