### First we need to move to the top-level directory.

In [1]:
cd ../..

/home/leo/SourceTreeGit/FlexNeuART.refact2021


In [2]:
# Name of the collection sub-directory (under collections/) used throughout this notebook
COLLECTION = 'wikipedia_dpr_nq_sample'

In [3]:
from scripts.py_flexneuart.setup import *

In [4]:
# Add the Java JAR to the class path; 'target' is presumably the build output
# directory, resolved relative to the top-level directory - TODO confirm
configure_classpath('target')

In [5]:
# Create a resource manager that points at this collection's forward index
# and at the Model 1 data stored under derived_data/giza
resource_manager = create_featextr_resource_manager(
    f'collections/{COLLECTION}/forward_index',
    model1_root_dir=f'collections/{COLLECTION}/derived_data/giza',
)

### Retrieval

In [6]:
from scripts.config import QUESTION_FILE_JSON, QREL_FILE, DOCID_FIELD, TEXT_FIELD_NAME

In [7]:
from scripts.py_flexneuart.cand_provider import *
# Create a candidate provider/generator backed by this collection's Lucene index
cand_prov = create_cand_provider(
    resource_manager,
    PROVIDER_TYPE_LUCENE,
    f'collections/{COLLECTION}/lucene_index',
)

In [8]:
# Query text shared by the retrieval and re-ranking examples in this notebook
QUERY_TEXT = 'vein carry blood heart away'

In [9]:
# An example of running a text query, requesting 20 candidates for QUERY_TEXT.
# As the recorded output shows, the result is a pair: a number (1338 in this run;
# presumably the total number of matching documents - TODO confirm) and a list of
# CandidateEntry(doc_id, score) objects.
query_res = run_text_query(cand_prov, 20, QUERY_TEXT)
query_res

(1338,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

In [10]:
# An example of running a generic query interface: the query is a dictionary of fields
# (here only the text field) and the query ID is supplied separately via default_query_id
query_res = run_query(cand_prov, 20, {TEXT_FIELD_NAME : QUERY_TEXT}, default_query_id=FAKE_QUERY_ID)
query_res

(1338,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

In [11]:
# The generic query interface again, but this time the query ID is stored in the query
# dictionary itself (under DOCID_FIELD) instead of being passed as default_query_id
query_res = run_query(cand_prov, 20, {DOCID_FIELD: FAKE_QUERY_ID, TEXT_FIELD_NAME : QUERY_TEXT})
query_res

(1338,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

### Forward index demo

In [12]:
from scripts.py_flexneuart.fwd_index import get_forward_index

#### First let's play with a raw index that keeps only unparsed text

In [13]:
raw_indx = get_forward_index(resource_manager, 'text_raw')

In [14]:
# Index type
raw_indx.indx_fld_type

'textRaw'

In [15]:
raw_indx.get_doc_text_raw('639661')

'vein "Vein Veins are blood vessels that carry blood toward the heart. Most veins carry deoxygenated blood from the tissues back to the heart; exceptions are the pulmonary and umbilical veins, both of which carry oxygenated blood to the heart. In contrast to veins, arteries carry blood away from the heart. Veins are less muscular than arteries and are often closer to the skin. There are valves in most veins to prevent backflow. Veins are present throughout the body as tubes that carry blood back to the heart. Veins are classified in a number of ways, including superficial vs. deep, pulmonary"'

#### A parsed index has more info

In [16]:
parsed_indx = get_forward_index(resource_manager, 'text')

In [17]:
# Index type
parsed_indx.indx_fld_type

'parsedText'

In [18]:
parsed_indx.get_doc_parsed('639661')

DocEntryParsed(word_ids=[75, 144, 210, 246, 506, 587, 589, 591, 594, 867, 1268, 1282, 2311, 2516, 3125, 3352, 4121, 5121, 7795, 8410, 8455, 12461, 14717, 14722, 14724, 23655, 23669, 27261, 59794, 102036], word_qtys=[1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 9, 6, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1], word_id_seq=[7795, 7795, 102036, 8410, 5121, 506, 8410, 210, 7795, 506, 23655, 8410, 587, 2311, 210, 14724, 23669, 7795, 506, 14717, 8410, 210, 4121, 7795, 8455, 506, 8410, 3125, 210, 7795, 27261, 8455, 246, 589, 14722, 7795, 1282, 59794, 7795, 1268, 75, 2516, 506, 8410, 2311, 210, 7795, 3352, 144, 867, 591, 12461, 594, 14724], doc_len=54)

In [19]:
# Look up word ID 75 - the first entry of word_ids in the output above (not the first
# word in document order, which is word_id_seq[0]) - and its WordEntry (ID and frequency)
parsed_indx.get_word_by_id(75), parsed_indx.get_word_entry_by_id(75)

('body', WordEntry(word_id=75, word_freq=17735))

### Ranker API demo

In [20]:
from scripts.py_flexneuart.ranker import *
from scripts.py_flexneuart.utils import *

In [21]:
# Trained model and feature-extractor configuration taken from exper_desc.best
MODEL_FILE_NAME = f'collections/{COLLECTION}/exper_desc.best/models/bm25_model1.model'
FEAT_EXTR_FILE_NAME = f'collections/{COLLECTION}/exper_desc.best/extractors/bm25=text+model1=text_bert_tok+lambda=0.3+probSelfTran=0.35.json'

# Queries and relevance judgements of the dev part
QUERY_FILE_NAME = f'collections/{COLLECTION}/input_data/dev/{QUESTION_FILE_JSON}'
QREL_FILE_NAME = f'collections/{COLLECTION}/input_data/dev/{QREL_FILE}'

#### A toy example where we generate a list of candidates for merely one query (using the candidate provider) and re-rank them using the Java-layer re-ranker

In [22]:
# Java-layer re-ranker configured by the feature-extractor and model files defined above
java_ranker = JavaQueryRanker(
    resource_manager,
    feat_extr_file_name=FEAT_EXTR_FILE_NAME,
    model_file_name=MODEL_FILE_NAME,
)

In [23]:
# The query as a dictionary of fields: its ID plus the query text
query_dict = {
    DOCID_FIELD: FAKE_QUERY_ID,
    TEXT_FIELD_NAME: QUERY_TEXT,
}
# query_res[1] is the candidate list returned by the retrieval step
java_ranker.rank_candidates(query_res[1], query_dict)

[('639661', 0.4589444396983704),
 ('1776205', 0.4341061334562677),
 ('472789', 0.4324617381414335),
 ('8448903', 0.4124776432310974),
 ('8448902', 0.41143788988374125),
 ('35722', 0.39951390916561724),
 ('639669', 0.38760818984443224),
 ('3936360', 0.3864522671342978),
 ('1450640', 0.38388484121055044),
 ('588394', 0.38289955583934543),
 ('639670', 0.3823782759630887),
 ('639663', 0.3775039059463895),
 ('47133', 0.3731257557936975),
 ('1302853', 0.36679545440286726),
 ('639690', 0.3653150559189846),
 ('639671', 0.36361852471115813),
 ('1786523', 0.36268165971062455),
 ('2992576', 0.353630291595394),
 ('5622935', 0.3518175514979561),
 ('47129', 0.34765838000119853)]

#### There's a function (used only for evaluation) to score candidates without sorting them by score

In [24]:
java_ranker.score_candidates(query_res[1], query_dict)

{'639661': 0.4589444396983704,
 '472789': 0.4324617381414335,
 '1776205': 0.4341061334562677,
 '639669': 0.38760818984443224,
 '8448903': 0.4124776432310974,
 '8448902': 0.41143788988374125,
 '639670': 0.3823782759630887,
 '639663': 0.3775039059463895,
 '35722': 0.39951390916561724,
 '1302853': 0.36679545440286726,
 '639671': 0.36361852471115813,
 '1786523': 0.36268165971062455,
 '588394': 0.38289955583934543,
 '639690': 0.3653150559189846,
 '1450640': 0.38388484121055044,
 '3936360': 0.3864522671342978,
 '5622935': 0.3518175514979561,
 '2992576': 0.353630291595394,
 '47133': 0.3731257557936975,
 '47129': 0.34765838000119853}

#### A toy example where we re-rank the list of candidates using a BERT re-ranker

In [25]:
# Re-ranking runs on the GPU here (device_name='cuda'). NOTE(review): presumably 'cpu' can
# be passed instead when no GPU is available, which can be fairly slow - TODO confirm.
# NOTE(review): max_doc_len subtracts 32 although max_query_len is 64 - confirm the
# intended length budget.
neural_ranker = PythonNNQueryRanker(resource_manager, 
                         query_field_name='text_raw', max_query_len=64,
                         index_field_name='text_raw', max_doc_len = 512 - 32 - 3,
                         device_name='cuda', batch_size=25, 
                         model_file_name=f'collections/{COLLECTION}/derived_data/ir_models/vanilla_bert/model.best')

In [27]:
# The neural ranker was created with query_field_name='text_raw',
# so the query text is passed under that key
query_dict = {
    DOCID_FIELD: FAKE_QUERY_ID,
    'text_raw': QUERY_TEXT,
}
neural_ranker.rank_candidates(query_res[1], query_dict)

> [0;32m/home/leo/SourceTreeGit/FlexNeuART.refact2021/scripts/py_flexneuart/ranker.py[0m(226)[0;36mscore_candidates[0;34m()[0m
[0;32m    224 [0;31m        [0;32mimport[0m [0mpdb[0m [0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    225 [0;31m[0;34m[0m[0m
[0m[0;32m--> 226 [0;31m        [0;32mwith[0m [0mtorch[0m[0;34m.[0m[0mno_grad[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    227 [0;31m            for records in iter_valid_records(self.model, self.device_name, data_set, run,
[0m[0;32m    228 [0;31m                                              [0mself[0m[0;34m.[0m[0mbatch_size[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> p run
{'fake_query_id': {'639661': 0, '472789': 0, '1776205': 0, '639669': 0, '8448903': 0, '8448902': 0, '639670': 0, '639663': 0, '35722': 0, '1302853': 0, '639671': 0, '1786523': 0, '588394': 0, '639690': 0, '1450640': 0, '39

ipdb> scores.shape
*** NameError: name 'scores' is not defined
ipdb> n
> [0;32m/home/leo/SourceTreeGit/FlexNeuART.refact2021/scripts/py_flexneuart/ranker.py[0m(235)[0;36mscore_candidates[0;34m()[0m
[0;32m    233 [0;31m                                    records[DOC_MASK_FIELD])
[0m[0;32m    234 [0;31m[0;34m[0m[0m
[0m[0;32m--> 235 [0;31m                [0mscores[0m [0;34m=[0m [0mscores[0m[0;34m.[0m[0mtolist[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    236 [0;31m[0;34m[0m[0m
[0m[0;32m    237 [0;31m                [0;32mfor[0m [0mqid[0m[0;34m,[0m [0mdid[0m[0;34m,[0m [0mscore[0m [0;32min[0m [0mzip[0m[0;34m([0m[0mrecords[0m[0;34m[[0m[0mQUERY_ID_FIELD[0m[0;34m][0m[0;34m,[0m [0mrecords[0m[0;34m[[0m[0mDOC_ID_FIELD[0m[0;34m][0m[0;34m,[0m [0mscores[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> scores.shape
torch.Size([20])
ipdb> scores
tensor([3.1648, 2.1924, 1.6631, 1.7364, 1.4759, 

BdbQuit: 

In [None]:
# Score (without sorting) the retrieved candidates with the neural ranker.
# The leftover pdb breakpoint was removed: it halted the kernel and made
# "Restart & Run All" impossible.
query_dict = {DOCID_FIELD : FAKE_QUERY_ID, 
              'text_raw' : QUERY_TEXT}
neural_ranker.score_candidates(query_res[1], query_dict)

In [None]:
query_res[1]

#### A comprehensive example where we evaluate **all** queries from `dev`

In [None]:
from scripts.data_convert.convert_common import *
# Load the queries from QUERY_FILE_NAME (the question file under input_data/dev)
all_queries = read_queries(QUERY_FILE_NAME)

In [None]:
# Query sample: the first five queries
all_queries[:5]

### Queries have one extra field that cannot be "digested" by the ranking API and we need to delete it:

In [None]:
from tqdm import tqdm
for query_dict in tqdm(all_queries):
    # Remove this field, it cannot be used by the ranker. pop() with a default
    # (rather than del) keeps the cell idempotent: re-running it, or meeting a
    # query without the field, no longer raises KeyError.
    query_dict.pop('answer_list', None)

In [None]:
from scripts.common_eval import *
from scripts.utils import sync_out_streams
from tqdm import tqdm

# Number of candidates retrieved per query before they are re-scored
TOP_K = 50
qrels = read_qrels_dict(QREL_FILE_NAME)

for ranker in (java_ranker, neural_ranker):
    # The run is reset for every ranker, so after the loop run_dict
    # holds the scores of the last ranker only (query ID -> candidate scores)
    run_dict = {}
    with tqdm(all_queries[:100]) as pbar:
        for query_dict in pbar:
            qid = query_dict[DOCID_FIELD]
            query_res = run_query(cand_prov, TOP_K, query_dict)
            rank_res = ranker.score_candidates(query_res[1], query_dict)
            run_dict[qid] = rank_res
    tqdm.write('\n')

    # Let us compute various metrics using our Python code.
    # Note that results should generally match results obtained using `scripts/exper/run_experiments.sh`
    for eval_obj in [
        NormalizedDiscountedCumulativeGain(10),
        NormalizedDiscountedCumulativeGain(20),
        MeanAveragePrecision(),
        MeanReciprocalRank(),
    ]:
        tqdm.write(
            str(eval_run(rerank_run=run_dict, metric_func=eval_obj, qrels_dict=qrels)) + '\n'
        )

    tqdm.write('==========================='+ '\n')

#### Optionally we can save the run to be later evaluated using external evaluation tools

In [None]:
# run_dict is re-created for every ranker in the evaluation loop, so at this point
# it holds the run of the last ranker evaluated; that run is saved to 'run.txt'
write_run_dict(run_dict, 'run.txt')

In [None]:
# Peek at the first lines of the saved run file (shell command)
!head run.txt