In [1]:
# Name of the collection sub-directory (under COLLECTION_ROOT) used by this notebook
COLLECTION = 'wikipedia_dpr_nq_sample'

In [2]:
import os

# Root directory that holds all FlexNeuART collections. It can be overridden
# without editing the notebook by setting the FLEXNEUART_COLLECTION_ROOT
# environment variable; the original hard-coded location remains the default.
COLLECTION_ROOT = os.environ.get('FLEXNEUART_COLLECTION_ROOT',
                                 '/home/leo/flexneuart_collections')

In [3]:
from flexneuart import configure_classpath

In [4]:
# Add the FlexNeuART Java JAR to the class path. In this notebook the call is made
# before any Java-backed object (resource manager, candidate provider, rankers)
# is created.
configure_classpath()

In [None]:
from flexneuart.retrieval import create_featextr_resource_manager

# Create a resource manager rooted at the collection directory.
# The remaining directory arguments are relative paths (presumably resolved
# against resource_root_dir -- TODO confirm). They are plain string literals:
# no f-string prefix is needed because nothing is interpolated.
resource_manager = create_featextr_resource_manager(
    resource_root_dir=f'{COLLECTION_ROOT}/{COLLECTION}/',
    fwd_index_dir='forward_index',
    model1_root_dir='derived_data/giza',
    embed_root_dir='derived_data/embeddings')

In [6]:
# Display the installed FlexNeuART version (the outputs in this notebook were
# produced with the version shown below)
import flexneuart
flexneuart.__version__

'1.2.1'

### Retrieval

In [7]:
from flexneuart.config import QUESTION_FILE_JSON, QREL_FILE, DOCID_FIELD, TEXT_FIELD_NAME

In [8]:
from flexneuart.retrieval.cand_provider import *
# Create a candidate provider/generator of the Lucene type. The last argument is
# the provider URI (logged as "URI: lucene_index"); it is a plain literal, so no
# f-string prefix is needed.
cand_prov = create_cand_provider(resource_manager, PROVIDER_TYPE_LUCENE, 'lucene_index')

[main] INFO edu.cmu.lti.oaqa.flexneuart.resources.ResourceManager - Provider type: lucene
URI: lucene_index
Config file: none
# of threads: 1
[main] INFO edu.cmu.lti.oaqa.flexneuart.cand_providers.LuceneCandidateProvider - Lucene candidate provider k1=1.20000, b=0.750000 query field name: text index field name: text Exact field match?: false


In [9]:
# Three variants of the same demo question. They mirror the `text`,
# `text_bert_tok`, and `text_unlemm` fields of the first dev query (dev_0).
QUERY_TEXT = "vein carry blood heart away"
QUERY_TEXT_BERT_TOK = "do veins carry blood to the heart or away"
QUERY_TEXT_UNLEMM = "veins carry blood heart away"
# Backward-compatible alias: the constant was originally misspelled and other
# cells still refer to it by this name.
QUTE_TEXT_UNLEMM = QUERY_TEXT_UNLEMM

In [10]:
# An example of running a text query: ask the candidate provider for 20
# candidates matching QUERY_TEXT.
# As the output shows, the result is a tuple whose second element is the list of
# CandidateEntry(doc_id, score) objects (the first element is presumably the
# total number of matching documents -- TODO confirm).
query_res = run_text_query(cand_prov, 20, QUERY_TEXT)
query_res

(1329,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

In [11]:
# An example of using the generic query interface: the query is a dictionary of
# fields, and the query ID is supplied separately through default_query_id.
query_res = run_query(cand_prov, 20, {TEXT_FIELD_NAME : QUERY_TEXT}, default_query_id=FAKE_QUERY_ID)
query_res

(1329,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

In [12]:
# The same generic query interface, but with the query ID stored in the query
# dictionary itself (under DOCID_FIELD) instead of passing default_query_id.
query_res = run_query(cand_prov, 20, {DOCID_FIELD: FAKE_QUERY_ID, TEXT_FIELD_NAME : QUERY_TEXT})
query_res

(1329,
 [CandidateEntry(doc_id='639661', score=18.328275680541992),
  CandidateEntry(doc_id='472789', score=16.816619873046875),
  CandidateEntry(doc_id='1776205', score=16.630727767944336),
  CandidateEntry(doc_id='639669', score=15.6367826461792),
  CandidateEntry(doc_id='8448903', score=15.448601722717285),
  CandidateEntry(doc_id='8448902', score=15.369601249694824),
  CandidateEntry(doc_id='639670', score=15.27547550201416),
  CandidateEntry(doc_id='639663', score=14.904623985290527),
  CandidateEntry(doc_id='35722', score=14.59425163269043),
  CandidateEntry(doc_id='1302853', score=14.318553924560547),
  CandidateEntry(doc_id='639671', score=14.157160758972168),
  CandidateEntry(doc_id='1786523', score=14.077558517456055),
  CandidateEntry(doc_id='588394', score=13.997241973876953),
  CandidateEntry(doc_id='639690', score=13.810718536376953),
  CandidateEntry(doc_id='1450640', score=13.643953323364258),
  CandidateEntry(doc_id='3936360', score=13.642525672912598),
  CandidateEntr

### Forward index demo

In [13]:
from flexneuart.retrieval.fwd_index import get_forward_index

#### First let's play with a raw index that keeps only unparsed text

In [None]:
# Open the forward index of the 'text_raw' field
raw_indx = get_forward_index(resource_manager, 'text_raw')

In [15]:
# Field type of this forward index
raw_indx.indx_fld_type

'textRaw'

In [16]:
# Fetch the stored raw text of document '639661', the top-ranked candidate in
# the retrieval examples above
raw_indx.get_doc_text_raw('639661')

'vein "Vein Veins are blood vessels that carry blood toward the heart. Most veins carry deoxygenated blood from the tissues back to the heart; exceptions are the pulmonary and umbilical veins, both of which carry oxygenated blood to the heart. In contrast to veins, arteries carry blood away from the heart. Veins are less muscular than arteries and are often closer to the skin. There are valves in most veins to prevent backflow. Veins are present throughout the body as tubes that carry blood back to the heart. Veins are classified in a number of ways, including superficial vs. deep, pulmonary"'

#### A parsed index has more info

In [None]:
# Open the forward index of the 'text' field
parsed_indx = get_forward_index(resource_manager, 'text')

In [18]:
# Field type of this forward index
parsed_indx.indx_fld_type

'parsedText'

In [19]:
# Parsed entry of the same document. As the output shows, it carries word IDs,
# their quantities, the word ID sequence, and the document length.
parsed_indx.get_doc_parsed('639661')

DocEntryParsed(word_ids=[75, 144, 210, 246, 506, 587, 589, 591, 594, 867, 1268, 1282, 2311, 2516, 3125, 3352, 4121, 5121, 7795, 8410, 8455, 12461, 14717, 14722, 14724, 23655, 23669, 27261, 59794, 102036], word_qtys=[1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 9, 6, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1], word_id_seq=[7795, 7795, 102036, 8410, 5121, 506, 8410, 210, 7795, 506, 23655, 8410, 587, 2311, 210, 14724, 23669, 7795, 506, 14717, 8410, 210, 4121, 7795, 8455, 506, 8410, 3125, 210, 7795, 27261, 8455, 246, 589, 14722, 7795, 1282, 59794, 7795, 1268, 75, 2516, 506, 8410, 2311, 210, 7795, 3352, 144, 867, 591, 12461, 594, 14724], doc_len=54)

In [20]:
# Look up word ID 75 (the first entry of word_ids in the parsed document above):
# the word string itself and its WordEntry
parsed_indx.get_word_by_id(75), parsed_indx.get_word_entry_by_id(75)

('body', WordEntry(word_id=75, word_freq=17735))

### Ranker API demo

In [21]:
from flexneuart.ranker.neural import *
from flexneuart.ranker.classic import *
from flexneuart.io.queries import *
from flexneuart.io.qrels import *

Failed to load the NDRM models (which require additional libraries): No module named 'fasttext'


#### Paths to model files and feature-extractor configurations are relative to the collection (resource root) directory

In [22]:
# Trained model and feature-extractor configuration of the BM25+Model1 ranker
MODEL_BM25_MODEL1_FILE_NAME = 'exper_desc.best/models/bm25_model1.model'
FEAT_EXTR_BM25_MODEL1_FILE_NAME = (
    'exper_desc.best/extractors/'
    'bm25=text+model1=text_bert_tok+lambda=0.3+probSelfTran=0.35.json'
)

#### However, we load queries using a full path or a relative path that includes the collection directory

In [23]:
# Dev-set question file and relevance judgements (QRELs); both paths include
# the collection directory
QUERY_FILE_NAME=f'{COLLECTION_ROOT}/{COLLECTION}/input_data/dev/{QUESTION_FILE_JSON}'
QREL_FILE_NAME=f'{COLLECTION_ROOT}/{COLLECTION}/input_data/dev/{QREL_FILE}'

#### A toy example where we generate a list of candidates for merely one query (using the candidate provider) and re-rank them using the Java-layer re-ranker

In [None]:
# Java-layer re-ranker configured with the BM25+Model1 extractor and model
java_ranker_bm25_model1 = ClassicRanker(
    resource_manager,
    feat_extr_file_name=FEAT_EXTR_BM25_MODEL1_FILE_NAME,
    model_file_name=MODEL_BM25_MODEL1_FILE_NAME,
)

In [25]:
# Multi-field query: the query ID plus one entry per query field
query_dict = {
    DOCID_FIELD: FAKE_QUERY_ID,
    'text_bert_tok': QUERY_TEXT_BERT_TOK,
    'text_unlemm': QUTE_TEXT_UNLEMM,
    TEXT_FIELD_NAME: QUERY_TEXT,
}
# Re-rank the previously retrieved candidates (second element of query_res);
# the output is a list of (doc_id, score) pairs in descending score order
java_ranker_bm25_model1.rank_candidates(query_res[1], query_dict)

[('639661', 3.110925911937358),
 ('472789', 2.899738798467234),
 ('1776205', 2.805803609295945),
 ('639669', 2.7800462122473557),
 ('8448903', 2.7269659839801665),
 ('35722', 2.710448890456231),
 ('8448902', 2.7087988053702823),
 ('639670', 2.5747502244230036),
 ('47133', 2.5242492838996617),
 ('1302853', 2.5076294275554907),
 ('5622935', 2.4809257731039933),
 ('47129', 2.469478684784585),
 ('639671', 2.4472600960890074),
 ('2992576', 2.441949382525879),
 ('588394', 2.4208285769287112),
 ('1450640', 2.4162580827133944),
 ('639690', 2.336798310942606),
 ('3936360', 2.294797693585073),
 ('639663', 2.2836513715212163),
 ('1786523', 2.1695004812280785)]

#### There's a function (used only for evaluation) to score candidates without sorting them by score

In [26]:
# Score the same candidates; as the output shows, the result is a
# dictionary mapping document IDs to scores (not sorted by score)
java_ranker_bm25_model1.score_candidates(query_res[1], query_dict)

{'639661': 3.110925911937358,
 '472789': 2.899738798467234,
 '1776205': 2.805803609295945,
 '639669': 2.7800462122473557,
 '8448903': 2.7269659839801665,
 '8448902': 2.7087988053702823,
 '639670': 2.5747502244230036,
 '639663': 2.2836513715212163,
 '35722': 2.710448890456231,
 '1302853': 2.5076294275554907,
 '639671': 2.4472600960890074,
 '1786523': 2.1695004812280785,
 '588394': 2.4208285769287112,
 '639690': 2.336798310942606,
 '1450640': 2.4162580827133944,
 '3936360': 2.294797693585073,
 '5622935': 2.4809257731039933,
 '2992576': 2.441949382525879,
 '47133': 2.5242492838996617,
 '47129': 2.469478684784585}

#### An example of a classic BM25 reranker

In [27]:
# Model and feature-extractor configuration of the plain BM25 re-ranker
MODEL_BM25_FILE_NAME = 'exper_desc.best/models/one_feat.model'
FEAT_EXTR_BM25_FILE_NAME = 'exper_desc.best/extractors/bm25.json'

In [28]:
# Java-layer re-ranker configured with the BM25 extractor and model
java_ranker_bm25 = ClassicRanker(
    resource_manager,
    feat_extr_file_name=FEAT_EXTR_BM25_FILE_NAME,
    model_file_name=MODEL_BM25_FILE_NAME,
)

Model:		Coordinate Ascent


In [29]:
# Score the retrieved candidates with the BM25 re-ranker (document ID -> score)
java_ranker_bm25.score_candidates(query_res[1], query_dict)

{'639661': 1.3841341733932495,
 '472789': 1.304264783859253,
 '1776205': 1.3092241287231445,
 '639669': 1.1689906120300293,
 '8448903': 1.2439945936203003,
 '8448902': 1.240858793258667,
 '639670': 1.1532176733016968,
 '639663': 1.1385170221328735,
 '35722': 1.204897165298462,
 '1302853': 1.106221318244934,
 '639671': 1.0966399908065796,
 '1786523': 1.093814492225647,
 '588394': 1.1547898054122925,
 '639690': 1.1017565727233887,
 '1450640': 1.1577613353729248,
 '3936360': 1.1655044555664062,
 '5622935': 1.0610493421554565,
 '2992576': 1.066516399383545,
 '47133': 1.1253129243850708,
 '47129': 1.0485056638717651}

#### An example of a ranker that uses averaged embeddings (loading embeddings can take a couple of minutes)

In [None]:
# Feature-extractor configuration and model of the averaged-embedding ranker,
# kept in named constants like those of the other classic rankers in this notebook
FEAT_EXTR_AVG_EMBED_FILE_NAME = 'exper_desc.best/extractors/avgembed.json'
MODEL_AVG_EMBED_FILE_NAME = 'exper_desc.best/models/one_feat.model'

java_ranker_avg_embed = ClassicRanker(resource_manager,
                                      feat_extr_file_name=FEAT_EXTR_AVG_EMBED_FILE_NAME,
                                      model_file_name=MODEL_AVG_EMBED_FILE_NAME)

In [31]:
# Score the retrieved candidates with the averaged-embedding ranker (document ID -> score)
java_ranker_avg_embed.score_candidates(query_res[1], query_dict)

{'639661': 0.9558641314506531,
 '472789': 0.9116753935813904,
 '1776205': 0.9183375835418701,
 '639669': 0.9491961002349854,
 '8448903': 0.8954006433486938,
 '8448902': 0.8910415172576904,
 '639670': 0.8910543322563171,
 '639663': 0.8869085311889648,
 '35722': 0.8330867886543274,
 '1302853': 0.8790420889854431,
 '639671': 0.8644178509712219,
 '1786523': 0.8477801084518433,
 '588394': 0.9045305848121643,
 '639690': 0.7475028038024902,
 '1450640': 0.7521618604660034,
 '3936360': 0.7771363854408264,
 '5622935': 0.917460560798645,
 '2992576': 0.9202541708946228,
 '47133': 0.7589597105979919,
 '47129': 0.9198287725448608}

#### A toy example where we re-rank the list of candidates using a BERT re-ranker

In [32]:
# Re-ranking runs on the GPU here (device_name='cuda'). If no GPU is available,
# pass a CPU device name instead (e.g., 'cpu'), but note that re-ranking on CPU
# can be fairly slow.
neural_ranker = NeuralRanker(resource_manager,
                             keep_case=False,
                             query_field_name='text_raw',
                             index_field_name='text_raw',
                             device_name='cuda', batch_size=25,
                             model_path_rel='derived_data/ir_models/vanilla_bert/model.best')

Model type name: vanilla_bert, registered class: <class 'flexneuart.models.cedr.cedr_vanilla_bert.VanillaBertCEDRRanker'>


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model type: bert-base-uncased # of channels: 13 hidden layer size: 768 input window size: 512 no token type IDs: False
Dropout Dropout(p=0.05, inplace=False)


In [33]:
# The neural ranker reads the query from the 'text_raw' field.
# NOTE(review): QUERY_TEXT is the lemmatized query, whereas the `text_raw` field
# of the dev queries holds the original question -- confirm this is intended.
query_dict = {
    DOCID_FIELD: FAKE_QUERY_ID,
    'text_raw': QUERY_TEXT,
}
neural_ranker.rank_candidates(query_res[1], query_dict)

[('639661', 1.4452017545700073),
 ('1786523', 0.8732627630233765),
 ('472789', 0.6902247667312622),
 ('639663', 0.4931100010871887),
 ('8448903', 0.395744264125824),
 ('3936360', 0.12360292673110962),
 ('35722', 0.06624050438404083),
 ('639670', -0.0123380646109581),
 ('639669', -0.0417940728366375),
 ('1776205', -0.14387202262878418),
 ('47133', -0.1606408804655075),
 ('1450640', -0.17226377129554749),
 ('47129', -0.18515662848949432),
 ('8448902', -0.24556735157966614),
 ('639671', -0.28867408633232117),
 ('639690', -0.5904686450958252),
 ('1302853', -0.7343864440917969),
 ('2992576', -0.7898008823394775),
 ('588394', -0.844143271446228),
 ('5622935', -1.08793044090271)]

In [34]:
# Same query as for rank_candidates; score_candidates yields a dictionary
# (document ID -> score) instead of a list sorted by score
query_dict = {
    DOCID_FIELD: FAKE_QUERY_ID,
    'text_raw': QUERY_TEXT,
}
neural_ranker.score_candidates(query_res[1], query_dict)

{'639661': 1.4452017545700073,
 '472789': 0.6902247667312622,
 '1776205': -0.14387202262878418,
 '639669': -0.0417940728366375,
 '8448903': 0.395744264125824,
 '8448902': -0.24556735157966614,
 '639670': -0.0123380646109581,
 '639663': 0.4931100010871887,
 '35722': 0.06624050438404083,
 '1302853': -0.7343864440917969,
 '639671': -0.28867408633232117,
 '1786523': 0.8732627630233765,
 '588394': -0.844143271446228,
 '639690': -0.5904686450958252,
 '1450640': -0.17226377129554749,
 '3936360': 0.12360292673110962,
 '5622935': -1.08793044090271,
 '2992576': -0.7898008823394775,
 '47133': -0.1606408804655075,
 '47129': -0.18515662848949432}

In [35]:
# The original candidate list returned by the candidate provider, shown for
# comparison with the re-ranked scores above
query_res[1]

[CandidateEntry(doc_id='639661', score=18.328275680541992),
 CandidateEntry(doc_id='472789', score=16.816619873046875),
 CandidateEntry(doc_id='1776205', score=16.630727767944336),
 CandidateEntry(doc_id='639669', score=15.6367826461792),
 CandidateEntry(doc_id='8448903', score=15.448601722717285),
 CandidateEntry(doc_id='8448902', score=15.369601249694824),
 CandidateEntry(doc_id='639670', score=15.27547550201416),
 CandidateEntry(doc_id='639663', score=14.904623985290527),
 CandidateEntry(doc_id='35722', score=14.59425163269043),
 CandidateEntry(doc_id='1302853', score=14.318553924560547),
 CandidateEntry(doc_id='639671', score=14.157160758972168),
 CandidateEntry(doc_id='1786523', score=14.077558517456055),
 CandidateEntry(doc_id='588394', score=13.997241973876953),
 CandidateEntry(doc_id='639690', score=13.810718536376953),
 CandidateEntry(doc_id='1450640', score=13.643953323364258),
 CandidateEntry(doc_id='3936360', score=13.642525672912598),
 CandidateEntry(doc_id='5622935', scor

#### A comprehensive example where we evaluate **all** queries from `dev`

In [36]:
# Load the dev-set queries; each query is a dictionary of fields (see the sample below)
all_queries = read_queries(QUERY_FILE_NAME)

In [37]:
# Query sample: the first five queries
all_queries[0:5]

[{'DOCNO': 'dev_0',
  'text': 'vein carry blood heart away',
  'text_unlemm': 'veins carry blood heart away',
  'text_raw': 'do veins carry blood to the heart or away',
  'answer_list': ['to'],
  'text_bert_tok': 'do veins carry blood to the heart or away'},
 {'DOCNO': 'dev_1',
  'text': 'sister king country',
  'text_unlemm': 'sister king country',
  'text_raw': 'who is the sister of for king and country',
  'answer_list': ['Rebecca St. James'],
  'text_bert_tok': 'who is the sister of for king and country'},
 {'DOCNO': 'dev_2',
  'text': 'develop periodic table 8 column',
  'text_unlemm': 'developed periodic table 8 columns',
  'text_raw': 'who developed the first periodic table with 8 columns',
  'answer_list': ['Dmitri Mendeleev'],
  'text_bert_tok': 'who developed the first periodic table with 8 columns'},
 {'DOCNO': 'dev_3',
  'text': 'season 14 grey anatomy come',
  'text_unlemm': 'season 14 grey anatomy come',
  'text_raw': "when does season 14 of grey 's anatomy come out",
  '

### Queries have one extra field that cannot be "digested" by the ranking API and we need to delete it:

In [38]:
from tqdm import tqdm
# The loop variable is deliberately not called `query_dict`, so that the
# notebook-level `query_dict` used by the single-query demo cells is not rebound.
for dev_query in tqdm(all_queries):
    # The ranking API cannot use this field, so drop it. `pop` with a default
    # (rather than `del`) keeps the cell idempotent: re-running it once the field
    # is gone is a no-op instead of a KeyError.
    dev_query.pop('answer_list', None)

100%|████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [00:00<00:00, 976145.97it/s]


In [39]:
from flexneuart.eval import *
from flexneuart.utils import sync_out_streams
from tqdm import tqdm

TOP_K = 50             # number of candidates retrieved (and re-scored) per query
MAX_QUERIES_QTY = 500  # evaluate only the first MAX_QUERIES_QTY dev queries
qrels = read_qrels_dict(QREL_FILE_NAME)

# NOTE: `run_dict` is re-created for every ranker, so once the loop finishes it
# holds the run of the last ranker in the list only.
for ranker in [java_ranker_avg_embed, java_ranker_bm25, java_ranker_bm25_model1, neural_ranker]:
    run_dict = {}
    with tqdm(all_queries[0:MAX_QUERIES_QTY]) as pbar:
        # Loop-local names are deliberately distinct from the notebook-level
        # `query_dict`/`query_res`, which the single-query demo cells rely on.
        for eval_query in pbar:
            qid = eval_query[DOCID_FIELD]
            eval_query_res = run_query(cand_prov, TOP_K, eval_query)
            # The second element of the query result is the candidate list
            run_dict[qid] = ranker.score_candidates(eval_query_res[1], eval_query)
    tqdm.write('\n')

    # Let us compute various metrics using our Python code.
    # Note that results should generally match results obtained using `scripts/exper/run_experiments.sh`
    for eval_obj in [NormalizedDiscountedCumulativeGain(10),
                     NormalizedDiscountedCumulativeGain(20),
                     MeanAveragePrecision(),
                     MeanReciprocalRank()]:
        tqdm.write(str(internal_eval(run=run_dict, metric_func=eval_obj, qrels=qrels)[0]))

    tqdm.write('===========================' + '\n')

100%|█████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 224.42it/s]




0.23153725283035664
0.2611430435620929
0.1882523067559224
0.3165160815593989



100%|█████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 278.07it/s]




0.40148726937982154
0.43277089120056567
0.33493535080832154
0.5009406767385571



100%|█████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 184.23it/s]




0.4640738457712773
0.4924966820985511
0.3838658604949658
0.5713954663360119



100%|██████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:08<00:00,  2.65it/s]



0.5694050471419018
0.581268005015674
0.481329427613784
0.6842795695923616






#### Optionally we can save the run to be later evaluated using external evaluation tools

In [40]:
# `run_dict` is rebuilt for every ranker in the evaluation loop, so at this point
# it holds (and we save) the run of the last ranker evaluated
write_run_dict(run_dict, 'run.txt')

In [41]:
# Peek at the first lines of the saved run file
!head run.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dev_0 Q0 639661 1 2.028193950653076 fake_run
dev_0 Q0 35722 2 1.4752246141433716 fake_run
dev_0 Q0 472789 3 1.355614185333252 fake_run
dev_0 Q0 8448902 4 1.24833345413208 fake_run
dev_0 Q0 588391 5 1.1973053216934204 fake_run
dev_0 Q0 8448903 6 1.1692496538162231 fake_run
dev_0 Q0 588392 7 0.9428008794784546 fake_run
dev_0 Q0 639663 8 0.9341850280761719 fake_run
dev_0 Q0 2981475 9 0.9230215549468994 fake_run
dev_0 Q0 1786523 10 0.8120793104171753 fake_run
