In [1]:
import csv
import pprint

import pandas as pd
from pyserini.index import IndexReader

In [2]:
index_reader = IndexReader("../indexes/msmarco-doc/lucene-index-msmarco")



In [3]:
rundict = {}
with open('../data/run.msmarco-doc.test.bm25.txt') as runfile:
    reader = csv.reader(runfile, delimiter=' ')

    count = 0
    for [qid, _, docid, _, _, _] in reader:
        if qid not in rundict:
            rundict[qid] = []
        rundict[qid].append(docid)

In [4]:
rundict = {k:v[:100] for k,v in rundict.items()}

In [5]:
test_qrels = {}
with open('../data/msmarco-doctest-qrels.tsv') as qrels:
    reader = csv.reader(qrels, delimiter=' ')

    for [qid, _, docid, rel] in reader:
        if qid not in test_qrels:
            test_qrels[qid] = {}
        test_qrels[qid][docid] = rel

In [6]:
qdict = {}
with open('../data/msmarco-doctest-queries.tsv') as qfile:
    reader = csv.reader(qfile, delimiter='\t')
    for [qid, q] in reader:
        qdict[qid] = q

In [7]:
metric_dict = {}

with open('../data/bm25_metrics.txt') as mf:
    reader = csv.reader(mf, delimiter='\t')
    for [metric, id, value] in reader:
        if id not in metric_dict:
            metric_dict[id] = {}
        metric_dict[id][metric] = value
metric_list = [[k] + list(v.values()) for k, v in metric_dict.items()]



In [8]:
top100dict = {}

with open('../data/msmarco-doctest-top100.tsv') as tf:
    reader = csv.reader(tf, delimiter=' ')
    for [qid, _, docid, _, _, _] in reader:
        if qid not in top100dict:
            top100dict[qid] = []
        top100dict[qid].append(docid)
# top100dict

In [9]:
def df_from_records(records):

    df = pd.DataFrame.from_records(records, columns=['qid','map', 'mrr', 'ndcg'])
    return df



In [10]:
df = df_from_records(metric_list)

In [11]:
df.describe()


Unnamed: 0,qid,map,mrr,ndcg
count,44,44.0,44.0,44.0
unique,44,44.0,7.0,43.0
top,359349,0.0663,1.0,0.0
freq,1,1.0,30.0,2.0


In [12]:
df.sort_values(by=['ndcg'], ascending=False).index


Int64Index([38, 42, 12, 28, 15, 10, 17, 18, 13, 39,  9, 23, 37,  8, 16, 40, 21,
            29, 36, 25, 26, 19, 11, 34, 22, 20, 43, 27, 24,  5, 35, 32,  3, 14,
             2,  4, 41,  6, 31,  7, 33,  1, 30,  0],
           dtype='int64')

In [55]:
qid = df.iloc[30]['qid']
query = qdict[qid]

In [28]:
doclist = rundict[qid]

In [29]:
pp = pprint.PrettyPrinter(indent=4)

In [30]:
pp.pprint(qdict[qid])

'lps laws definition'


In [58]:
# count the number of relevant documents
docs_og = []
docs_bm25 = []
for docid, label in test_qrels[qid].items():
    if int(label) != 0:
        docs_og.append(docid)
        if docid in doclist:
            docs_bm25.append(docid)
print(f"OG docs for ``{query}'': {len(docs_og)}")
print(f"BM25 docs for ``{query}'': {len(docs_bm25)}")


OG docs for ``lps laws definition'': 195
BM25 docs for ``lps laws definition'': 9


In [83]:
missed_docs = [docid for docid in docs_og if not docid in docs_bm25]

for missed_doc in missed_docs:
    doc_raw = index_reader.doc_raw(missed_doc)
    print(f"{test_qrels[qid][missed_doc]} {missed_doc}")

1 D1014847
1 D1014848
1 D1053817
1 D1068540
1 D110770
1 D110772
1 D1117710
1 D1117714
1 D1117718
1 D1151607
1 D1176489
1 D118226
1 D118227
1 D118228
2 D118230
1 D1253531
1 D1296134
1 D1311771
1 D1311772
1 D1311773
1 D1311774
1 D1311775
1 D1319289
1 D132260
1 D1335657
1 D135630
1 D1422369
1 D1422370
1 D1431884
1 D1432650
1 D1463439
1 D147816
1 D1501381
1 D1539715
1 D1539718
1 D1561118
1 D1617163
3 D1628366
1 D1657830
2 D1657833
1 D1766592
1 D1798970
1 D1798972
1 D1815619
1 D1904191
2 D1961803
2 D1986028
1 D2057812
1 D2063950
1 D2076494
1 D2107278
1 D2140801
1 D2142273
2 D2144809
1 D2154218
2 D2155969
1 D2162303
2 D2167562
1 D2170788
1 D2170789
1 D2170790
1 D2170791
1 D2206673
1 D2206674
1 D2213169
2 D2236787
3 D2236788
1 D2296762
2 D2296763
2 D2326994
1 D2369118
1 D2372219
1 D240798
1 D2411978
1 D2450429
1 D2622974
1 D2629533
1 D2630591
1 D2671265
1 D2691824
1 D2691825
1 D2697580
1 D2707959
1 D2767598
1 D2788150
3 D287672
2 D2877639
1 D2881629
3 D2897764
1 D2921023
1 D2936155
1 D2936157

In [85]:
top_missed = []
analys = index_reader.analyze(qdict[qid])
for missed_doc in missed_docs:
    termcount = 0
    vec = index_reader.get_document_vector(missed_doc)
    raw_len = len(index_reader.doc_raw(missed_doc))
    if test_qrels[qid][missed_doc] == '3':
        top_missed.append(missed_doc)
        for term in analys:
            termcount += vec.get(term,0)
        tot = sum([vv for vv in vec.values() if vv is not None])
        pp.pprint(f"{missed_doc}: {termcount}/{tot} -> {round(termcount/tot * 100,2)}%, {round(termcount/raw_len * 100,2)}%")


'D1628366: 8/2400 -> 0.33%, 0.04%'
'D2236788: 4/1269 -> 0.32%, 0.03%'
'D287672: 0/391 -> 0.0%, 0.0%'
'D2897764: 0/97 -> 0.0%, 0.0%'
'D657187: 12/1140 -> 1.05%, 0.11%'
'D766522: 7/851 -> 0.82%, 0.09%'
'D981162: 3/702 -> 0.43%, 0.04%'


In [65]:
for doc in docs_bm25:
    rank = doclist.index(doc) + 1
    pp.pprint(f"{rank} {doc}")

'32 D1245575'
'42 D214330'
'80 D2667112'
'12 D3205474'
'93 D3451687'
'67 D3451688'
'62 D572440'
'68 D574190'
'81 D854131'


In [66]:
for i in range(0,10):
    docid = doclist[i]
    rel = test_qrels[qid][docid]
    if int(rel) != 0:
        continue
    pp.pprint(f"{i+1} {docid}, label: {test_qrels[qid][docid]}")


'1 D3431635, label: 0'
'length: 27050'
'2 D1925226, label: 0'
'length: 29329'
'3 D686052, label: 0'
'length: 13408'
'4 D1024943, label: 0'
'length: 2617'
'5 D3272237, label: 0'
'length: 2618'
'6 D1994699, label: 0'
'length: 4718'
'7 D949868, label: 0'
'length: 4964'
'8 D1513011, label: 0'
'length: 9643'
'9 D69628, label: 0'
'length: 10059'
'10 D525396, label: 0'
'length: 14119'


In [92]:
def get_doc_stats(docs):
    for iii in docs:
        # pp.pprint(f"{doclist.index(iii)} docid: {iii}")
        pp.pprint(f"docid: {iii}")
        termcount = 0
        vec = index_reader.get_document_vector(iii)
        raw_len = len(index_reader.doc_raw(iii))
        for term in analys + ['lps']:
            pp.pprint(f"{term}: {index_reader.get_document_vector(iii).get(term, '')}")
            termcount += vec.get(term,0)
        termsum = sum([vv for vv in vec.values() if vv is not None])
        pp.pprint(f"{termcount}/{termsum} -> {round(termcount/termsum * 100,2)}%")
        pp.pprint(f"{termcount}/{raw_len} -> {round(termcount/raw_len * 100,2)}%")

        pp.pprint(f"words: {len(index_reader.doc_raw(iii))}")
        pp.pprint(f"terms: {sum([v for v in index_reader.get_document_vector(iii).values() if v is not None])}")
        pp.pprint(index_reader.doc_raw(iii)[:10])
        print("\n")


In [93]:
get_doc_stats(top_missed)


'docid: D1628366'
'lp: 5'
'law: 3'
'definit: '
'lps: '
'8/2400 -> 0.33%'
'8/21387 -> 0.04%'
'words: 21387'
'terms: 2400'
'<TEXT>\nhtt'


'docid: D2236788'
'lp: '
'law: 4'
'definit: '
'lps: '
'4/1269 -> 0.32%'
'4/11475 -> 0.03%'
'words: 11475'
'terms: 1269'
'<TEXT>\nhtt'


'docid: D287672'
'lp: '
'law: '
'definit: '
'lps: '
'0/391 -> 0.0%'
'0/3581 -> 0.0%'
'words: 3581'
'terms: 391'
'<TEXT>\nhtt'


'docid: D2897764'
'lp: '
'law: '
'definit: '
'lps: '
'0/97 -> 0.0%'
'0/849 -> 0.0%'
'words: 849'
'terms: 97'
'<TEXT>\nhtt'


'docid: D657187'
'lp: '
'law: 12'
'definit: '
'lps: '
'12/1140 -> 1.05%'
'12/10924 -> 0.11%'
'words: 10924'
'terms: 1140'
'<TEXT>\nhtt'


'docid: D766522'
'lp: '
'law: 7'
'definit: '
'lps: '
'7/851 -> 0.82%'
'7/8127 -> 0.09%'
'words: 8127'
'terms: 851'
'<TEXT>\nhtt'


'docid: D981162'
'lp: 1'
'law: 2'
'definit: '
'lps: '
'3/702 -> 0.43%'
'3/6738 -> 0.04%'
'words: 6738'
'terms: 702'
'<TEXT>\nhtt'




In [96]:


pp.pprint(index_reader.doc_raw('D1628366'))

('<TEXT>\n'
 'http://www.scscourt.org/self_help/probate/conservatorship/conservatorship_overview.shtml\n'
 '.\n'
 'This section tells you about probate conservatorships.\n'
 'Probate conservatorships are only for adults over 18.\n'
 'If you are trying to help a child (minor), see the guardianship section of '
 'this website.\n'
 'Click on a topic to learn more:\n'
 'What is probate conservatorship?\n'
 'If I become conservator of the person, will I automatically become '
 'conservator of the estate?\n'
 'Is a probate conservatorship different from a mental health (LPS) '
 'conservatorship?\n'
 'Is a probate conservatorship different from a limited conservatorship?\n'
 'Who can file for conservatorship?\n'
 'Who can be appointed as conservator?\n'
 'What if no one is qualified to be conservator?\n'
 'When should the Public Guardian be conservator?\n'
 'Can I make medical decisions for the conservatee?\n'
 'Can I make estate planning decisions for the conservatee?\n'
 'What does the cour

In [97]:
index_reader.get_document_vector('D1628366')



{'parent': 2,
 'extens': 1,
 'refus': 1,
 'govern': 1,
 'year': 3,
 'if:th': 1,
 'addit': 2,
 'complet': 1,
 'your': 36,
 'without': 1,
 'describ': 2,
 'temporari': 7,
 'bar': 2,
 'suffer': 1,
 'would': 2,
 'permiss': 2,
 'draft': 1,
 'record': 3,
 'sister': 1,
 'you': 114,
 'happen': 3,
 'sometim': 2,
 'serv': 6,
 'sure': 6,
 'instruct': 5,
 'citat': 1,
 'ag': 1,
 'bill': 2,
 'neutral': 1,
 'elig': 1,
 'automat': 2,
 'especi': 1,
 'court': 46,
 'click': 2,
 'secur': 6,
 'conserv': 52,
 'former': 1,
 'ap': 2,
 '1': 4,
 '2': 5,
 'befor': 7,
 'folder': 1,
 '4': 1,
 'andspous': 1,
 'carefulli': 1,
 '800': 1,
 'dementia': 5,
 'handl': 2,
 'detail': 1,
 'placement': 1,
 'adult': 5,
 'brother': 1,
 'much': 3,
 'object': 1,
 'treatment': 2,
 'be': 2,
 'prove': 1,
 'least': 3,
 'yellow': 1,
 'polici': 1,
 'how': 10,
 'see': 12,
 'same': 4,
 'civil': 1,
 'term': 1,
 'after': 3,
 'annuiti': 1,
 'behavior': 1,
 'close': 1,
 'address': 3,
 'must:manag': 1,
 'set': 4,
 'learn': 5,
 'g': 2,
 '2100':