In [1]:
from flexneuart import configure_classpath
# Put the Java JAR on the class path (must run before the Java-backed
# retrieval code in the cells below is used).
configure_classpath()

# Root directory that holds all collections.
COLLECTION_ROOT = '/disk3/collections'

In [2]:
# Collection sub-directory (relative to COLLECTION_ROOT) to inspect.
COLLECTION = 'wikipedia_dpr_nq_sample'

# Query sets to report on: each one is a sub-directory of the
# collection's input_data directory.
QUERY_SETS = [
    'dev_official',
    'train_fusion',
]

# Query fields whose length in BERT tokens is measured.
QUERY_FIELD_LIST = ['text_raw']

In [3]:
# Name of the indexed field whose forward index is opened below.
# NOTE: a parsedBOW or parsedText index must already exist for this field!
INDEX_FIELD = 'text_bert_tok'

In [4]:
from flexneuart.retrieval import create_featextr_resource_manager
from flexneuart.retrieval.fwd_index import get_forward_index

# Build the resource manager rooted at the collection directory; the
# forward indices are expected in its 'forward_index' sub-directory.
resource_manager = create_featextr_resource_manager(
    resource_root_dir=f'{COLLECTION_ROOT}/{COLLECTION}/',
    fwd_index_dir='forward_index',
)


[main] INFO edu.cmu.lti.oaqa.flexneuart.resources.ResourceManager - Resource manager initialization. Resource root:/disk3/collections/wikipedia_dpr_nq_sample/


In [5]:
# Obtain the forward index of the field named by INDEX_FIELD through the
# resource manager; its statistics are printed in the next cell.
fld_indx = get_forward_index(resource_manager, INDEX_FIELD)

[main] INFO edu.cmu.lti.oaqa.flexneuart.fwdindx.MapDbBackend - MapDB opened for reading: /disk3/collections/wikipedia_dpr_nq_sample/forward_index/text_bert_tok.mapdb_dataDict
[main] INFO edu.cmu.lti.oaqa.flexneuart.fwdindx.ForwardIndexBinaryDataDict - Finished loading context from file: /disk3/collections/wikipedia_dpr_nq_sample/forward_index/text_bert_tok.mapdb_dataDict


In [6]:
# Summarize the collection using the statistics kept by the forward index:
# the number of documents and the average document length (rounded to one
# decimal place).
doc_qty = fld_indx.get_doc_qty()
avg_doc_len = round(fld_indx.get_avg_doc_len(), 1)

print(COLLECTION,
      '# of docs', doc_qty,
      'avg. # of BERT tokens per doc', avg_doc_len)

wikipedia_dpr_nq_sample # of docs 774392 avg. # of BERT tokens per doc 140.7


In [7]:
from transformers import AutoTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer; it is used in the
# statistics cell below to count the tokens of each query.
tok = AutoTokenizer.from_pretrained('bert-base-uncased')

In [8]:
from tqdm import tqdm
import numpy as np
from flexneuart.io.queries import read_queries_dict
from flexneuart.io.qrels import read_qrels_dict
from flexneuart.config import QUESTION_FILE_JSON, QREL_FILE

# Minimum relevance grade for a QREL entry to be counted as positive.
MIN_REL_GRADE=1

# For every query set report:
#   * the number of queries,
#   * the average number of positive QRELs per query,
#   * for every query field, the average query length in BERT tokens.
for query_part in QUERY_SETS:
    # Both input files of a query set live in the same directory.
    input_dir = f'{COLLECTION_ROOT}/{COLLECTION}/input_data/{query_part}'

    queries = read_queries_dict(f'{input_dir}/{QUESTION_FILE_JSON}')
    qrel_dict = read_qrels_dict(f'{input_dir}/{QREL_FILE}')

    query_qty = len(queries)

    # Number of judgments (over all queries) whose grade reaches the threshold.
    qrel_pos_qty = sum(int(grade >= MIN_REL_GRADE)
                       for qdict in qrel_dict.values()
                       for grade in qdict.values())

    # An empty query set would raise ZeroDivisionError here: report NaN instead.
    avg_pos_qty = qrel_pos_qty / query_qty if query_qty > 0 else float('nan')

    print('input part:', query_part,
          '# of queries', query_qty,
          'avg. # of positive QRELs per query:', round(avg_pos_qty, 1))

    for query_field in QUERY_FIELD_LIST:
        # Length (in tokens) of the given field for every query in the set.
        query_lens = [len(tok.tokenize(e[query_field]))
                      for _, e in tqdm(queries.items())]

        # np.mean([]) emits a RuntimeWarning (and yields NaN): skip the call
        # for an empty set and report NaN directly.
        avg_query_len = np.mean(query_lens) if query_lens else float('nan')

        print('Query field:', query_field, 'query part:', query_part,
              'avg. # of BERT tokens per query:', round(avg_query_len, 1))

                                                       

input part: dev_official # of queries 6515 avg. # of positive QRELs per query: 7.9


100%|██████████████████████████████████████████████████████████████████████████████████████| 6515/6515 [00:00<00:00, 7920.99it/s]


Query field: text_raw query part: dev_official avg. # of BERT tokens per query: 9.8


                                                       

input part: train_fusion # of queries 2500 avg. # of positive QRELs per query: 7.7


100%|██████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [00:00<00:00, 8060.93it/s]

Query field: text_raw query part: train_fusion avg. # of BERT tokens per query: 9.8



