In [1]:
import os
import sys
import ast
from collections import OrderedDict
import json

sys.path.insert(0, "../third_party/ColBERT")

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries
from colbert import Searcher 

index_name = "enw100.1bits"

In [2]:
def load_mqueries(queries_path):
    queries = OrderedDict()

    print("#> Loading the multilingual queries from", queries_path, "...")

    with open(queries_path) as f:
        for line in f:
            qid, translated_query, query, lang, answers = line.strip().split("\t")
            # qid = int(qid)

            assert qid not in queries, ("Query QID", qid, "is repeated!")
            queries[qid] = (translated_query, query, lang, answers)

    print("#> Got", len(queries), "queries. All QIDs are unique.\n")

    return queries

In [22]:
def get_query_result(qid, query, rankings, searcher):
    query_result = {}
    query_result['q_id'] = qid
    query_result['question'] = query[1]
    query_result['answers'] = ast.literal_eval(query[3])
    query_result['lang'] = query[2]
    ctxs = []
    for passage_id, passage_rank, passage_score in rankings[qid]:
        ctx = {}
        ctx['id'] = passage_id
        split_passage = searcher.collection[passage_id].split(' | ', 1)
        ctx['title'] = split_passage[0]
        ctx['text'] = split_passage[1]
        ctx['score'] = passage_score
        ctx['has_answer'] = None
        ctxs.append(ctx)
    query_result['ctxs'] = ctxs
    return query_result

In [4]:
queries_path = "../data/mkqa_queries_translated.tsv"
queries = load_mqueries(queries_path)

query_text = OrderedDict()
for qid in queries:
    query_text[qid] = queries[qid][0]

#> Loading the multilingual queries from ../data/mkqa_queries_translated.tsv ...
#> Got 19338 queries. All QIDs are unique.



In [8]:
with Run().context(RunConfig(index_root='/workspace/index', experiment='enwiki')):
    searcher = Searcher(index=index_name)
rankings = searcher.search_all(query_text, k=1).todict()

[Sep 12, 03:45:33] #> Loading collection...
0M 1M 2M 3M 4M 5M 6M 7M 8M 9M 10M 11M 12M 13M 14M 15M 16M 17M 18M 19M 20M 21M 22M 
[Sep 12, 03:48:35] #> Loading codec...
[Sep 12, 03:48:36] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Sep 12, 03:48:41] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Sep 12, 03:48:47] #> Loading IVF...
[Sep 12, 03:48:51] #> Loading doclens...


100%|██████████| 907/907 [00:01<00:00, 672.73it/s]


[Sep 12, 03:48:55] #> Loading codes and residuals...


100%|██████████| 907/907 [00:33<00:00, 26.80it/s]
100%|██████████| 19338/19338 [06:09<00:00, 52.37it/s]


In [9]:
rankings

{'563260143484355911_ar': [(9536752, 1, 24.5625)],
 '-4767536963343233422_ar': [(12445800, 1, 22.21875)],
 '4412615293667765975_ar': [(18620298, 1, 18.765625)],
 '7818112641371223126_ar': [(21243091, 1, 26.796875)],
 '3716943449734480322_ar': [(19449209, 1, 19.671875)],
 '3309140698422618645_ar': [(1611823, 1, 24.375)],
 '-3407656036949886841_ar': [(185588, 1, 23.890625)],
 '846466415522403319_ar': [(13688613, 1, 23.9375)],
 '-5553927247739165705_ar': [(16893126, 1, 24.28125)],
 '-4166868313835054376_ar': [(18508753, 1, 25.0625)],
 '-3156311054855292528_ar': [(17055278, 1, 17.28125)],
 '-6277239576741418047_ar': [(20170486, 1, 18.609375)],
 '-8596127252162838467_ar': [(19730174, 1, 23.984375)],
 '8259032885448966485_ar': [(12481327, 1, 25.359375)],
 '3709269594158692309_ar': [(2304770, 1, 27.359375)],
 '7741387357303652822_ar': [(11735595, 1, 25.953125)],
 '1446143713199658242_ar': [(7998879, 1, 26.3125)],
 '-425194203213713887_ar': [(22466476, 1, 18.734375)],
 '-675453680088961188_ar'

In [24]:
output = []
for qid in queries:
    query = queries[qid]
    output.append(get_query_result(qid, query, rankings, searcher))

In [None]:
qid = '563260143484355911_ar'

In [19]:
query = queries[qid]

In [23]:
get_query_result(qid, query, rankings, searcher)

{'q_id': '-8847524364195038466_zh_cn',
 'question': 'من يغني أسمعك يطرق لكنك لا تستطيع أن تأتي',
 'answers': ['ديف إدموندز'],
 'lang': 'ar',
 'ctxs': [{'id': 21976822,
   'title': "Where'd You Go, Bernadette (film)",
   'text': "at her daughter Bee's school. When she disappears, it's Bee's mission to find out where she's disappeared to and what really happened to her. Section::::Cast. Additionally, Laurence Fishburne has been cast in an undisclosed role. Section::::Production. In January 2013, Annapurna Pictures and Color Force acquired the rights to the film adaptation of the novel, with Scott Neustadter and Michael H. Weber writing the screenplay, Semple, Bryan Unkeless, and Ted Schipper executive producing. In February 2015, Richard Linklater was announced to direct the film. In November 2015, Cate Blanchett joined the cast of the film. In April 2016, It was announced",
   'score': 22.609375,
   'has_answer': None}]}

In [25]:
output

[{'q_id': '563260143484355911_ar',
  'question': 'من يغني أسمعك يطرق لكنك لا تستطيع أن تأتي',
  'answers': ['ديف إدموندز'],
  'lang': 'ar',
  'ctxs': [{'id': 9536752,
    'title': 'I Hear You Knocking',
    'text': '"His Brownies (1936); and ""Keep Knocking (But You Can\'t Come In)"", by Bob Wills and His Texas Playboys (1938). None of these early singles listed a songwriter or composer. However, when popular jump blues bandleader Louis Jordan and his Tympany Five recorded the song as ""Keep A-Knockin\'"" in 1939, the single\'s credits listed ""Mays-Bradford"" (Bert Mays and Perry Bradford). in 1957, Little Richard recorded it with ""R. Penniman"", Richard\'s legal name, listed as the writer, although Bert Mays and J. Mayo Williams were later credited as songwriters. Beginning with his signing by the Los Angeles–based Imperial Records in 1950, Smiley"',
    'score': 24.5625,
    'has_answer': None}]},
 {'q_id': '-4767536963343233422_ar',
  'question': 'من لديه اكثر عدد معجبين علي انستا

In [26]:
with open("../data/colbert_outputs.json", "w") as outfile:
        json.dump(output, outfile)