In [57]:
import json
import os
import numpy as np
import pandas as pd
import pytrec_eval
import torch
from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModel
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [2]:
root_dir = "/home/gaia_data/iida.h/BEIR/datasets"
dataset = "dbpedia-entity"

In [3]:
data_path = os.path.join(root_dir, dataset)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

  0%|          | 0/4635922 [00:00<?, ?it/s]

In [4]:
def get_info(corpus, tokenizer):
    sep = " "
    doc_lens = {}
    df = Counter()
    d_tf = {}

    for cid in tqdm(corpus.keys()):
        text = corpus[cid]["title"] + sep + corpus[cid]["text"]
        input_ids = tokenizer(text)
        doc_lens[cid] = (len(input_ids))
        df.update(list(set(input_ids)))
        tf_d = Counter(input_ids)
        doc_lens[cid] =len(input_ids)
        d_tf[cid] = tf_d
        
    idf = defaultdict(float)
    N = len(corpus)
    for w, v in df.items():
        idf[w] = np.log(N / v)

    doc_len_ave = np.mean(list(doc_lens.values()))
    
    del df
        
    return d_tf, idf, doc_lens, doc_len_ave

In [5]:
def get_d_bm25(d_tf, idf, doc_lens, doc_len_ave):
    k1 = 0.9
    b = 0.6
    token_bm25_score = {}

    for cid, tfs in d_tf.items():
        token_bm25_score[cid] = defaultdict(float)
        for tid, tf in tfs.items():
            token_bm25_score[cid][tid] = tf * (1 + k1) / (tf + k1 * (1 - b + b * doc_lens[cid] / doc_len_ave)) * idf[tid]
            
    return token_bm25_score

In [36]:
hf_model = AutoModel.from_pretrained("/home/iida.h/work/LSS_FUNC/models/microsoft/mpnet-base-v3-msmarco/")

In [6]:
hf_tokenizer = AutoTokenizer.from_pretrained("/home/iida.h/work/LSS_FUNC/models/microsoft/mpnet-base-v3-msmarco/")
hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave = get_info(corpus, hf_tokenizer.tokenize)
hf_token_bm25_score = get_d_bm25(hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave)

  0%|▏                                                                                                                 | 9526/4635922 [00:03<26:23, 2921.03it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4635922/4635922 [27:15<00:00, 2835.21it/s]


In [7]:
del hf_d_tf, hf_doc_lens, hf_doc_len_ave

In [8]:
lm_tokenizer = Analyzer(get_lucene_analyzer())
lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave = get_info(corpus, lm_tokenizer.analyze)
lm_token_bm25_score = get_d_bm25(lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4635922/4635922 [10:06<00:00, 7643.36it/s]


In [9]:
del lm_d_tf, lm_doc_lens, lm_doc_len_ave

In [10]:
# Check
with open(f"./analysis_data/{dataset}/bm25_result.json") as f:
    bm25_result = json.load(f)
    
with open(f"./analysis_data/{dataset}/dense_result.json") as f:
    dense_result = json.load(f)
    
with open(f"./analysis_data/{dataset}/weighted_dense_result.json") as f:
    weighted_dense_result = json.load(f)
    
with open(f"./analysis_data/{dataset}/cbm25_result.json") as f:
    cbm25_result = json.load(f)

In [11]:
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10"})

In [12]:
bm25_scores = evaluator.evaluate(bm25_result)
dense_scores = evaluator.evaluate(dense_result)
wdense_scores = evaluator.evaluate(weighted_dense_result)
cbm25_scores = evaluator.evaluate(cbm25_result["maxsim_bm25"])

In [13]:
print("bm25: ", np.average([i["ndcg_cut_10"] for i in bm25_scores.values()]))
print("dense: ", np.average([i["ndcg_cut_10"] for i in dense_scores.values()]))
print("wdense: ", np.average([i["ndcg_cut_10"] for i in wdense_scores.values()]))
print("cbm25: ", np.average([i["ndcg_cut_10"] for i in cbm25_scores.values()]))

bm25:  0.2846362144861774
dense:  0.29393880438765735
wdense:  0.2957766113065704
cbm25:  0.3600502654050265


In [14]:
def create_diff_scores(source_scores, target_scores):
    diff_scores = {}
    for k in cbm25_scores:
        diff_score = source_scores[k]["ndcg_cut_10"] - target_scores[k]["ndcg_cut_10"]
        if abs(diff_score) > 0.0:
            diff_scores[k] = diff_score
            
    return diff_scores

In [15]:
diff_scores_bm25 = create_diff_scores(cbm25_scores, bm25_scores)
diff_scores_dense = create_diff_scores(cbm25_scores, dense_scores)
diff_scores_wdense = create_diff_scores(cbm25_scores, wdense_scores)

In [16]:
common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense) & set(diff_scores_wdense)

In [17]:
all_diff = pd.concat([pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense), pd.Series(diff_scores_wdense)], axis=1).fillna(0).sort_index()
all_diff = all_diff.rename(columns={0: "bm25", 1: "dense", 2: "wdense"})
all_diff.describe()

Unnamed: 0,bm25,dense,wdense
count,350.0,350.0,350.0
mean,0.086187,0.075556,0.073456
std,0.199378,0.169966,0.166577
min,-0.553146,-0.370335,-0.36907
25%,-0.031854,-0.020791,-0.021832
50%,0.067683,0.046179,0.048355
75%,0.185215,0.181144,0.174247
max,0.874607,0.576551,0.576551


In [18]:
better_query = all_diff[(all_diff > 0.2).all(1)]
better_query

Unnamed: 0,bm25,dense,wdense
INEX_LD-2010014,0.205871,0.377732,0.377732
INEX_LD-20120332,0.498297,0.33404,0.33404
INEX_LD-20120411,0.288059,0.331421,0.331421
INEX_LD-2012311,0.404589,0.225022,0.217117
INEX_LD-2012349,0.268051,0.422281,0.422281
INEX_XER-106,0.318319,0.254698,0.248132
INEX_XER-115,0.238013,0.226357,0.23483
QALD2_te-28,0.306678,0.436698,0.436698
QALD2_te-77,0.368669,0.273339,0.273339
QALD2_te-98,0.493698,0.353522,0.356703


In [19]:
worse_query = all_diff[(all_diff < -0.01).all(1)]
worse_query

Unnamed: 0,bm25,dense,wdense
INEX_LD-20120121,-0.042368,-0.140743,-0.140743
INEX_LD-20120512,-0.039686,-0.160444,-0.165478
INEX_LD-20120531,-0.123915,-0.191059,-0.194992
INEX_XER-123,-0.045102,-0.094529,-0.094529
INEX_XER-128,-0.236168,-0.089087,-0.089087
INEX_XER-134,-0.025829,-0.101242,-0.101242
INEX_XER-139,-0.080513,-0.031168,-0.031168
INEX_XER-140,-0.041795,-0.096607,-0.064796
INEX_XER-141,-0.262685,-0.340985,-0.340985
INEX_XER-147,-0.043313,-0.055023,-0.055023


In [211]:
def show_result_top1(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result["maxsim_bm25"])
    
    correct = qrels[qid]
    if show_condition(cbm25_top1[0], bm25_top1[0], correct):
        print("------")
        print(f"qid: {qid}, query: {queries[qid]}")
        print(bm25_top1[0] in correct, f"bm25: {corpus[bm25_top1[0]]}")
        print(dense_top1[0] in correct, f"dense: {corpus[dense_top1[0]]}")
        print(wdense_top1[0] in correct, f"wdense: {corpus[wdense_top1[0]]}")
        print(cbm25_top1[0] in correct, f"cbm25: {corpus[cbm25_top1[0]]}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [238]:
def correct_condition(target, compare, correct):
    return target in correct and not compare in correct

for qid in better_query.index:
    show_result_top1(qid, correct_condition)

------
skip qid INEX_LD-2010014
------
skip qid INEX_LD-20120332
------
skip qid INEX_LD-20120411
------
skip qid INEX_LD-2012311
------
skip qid INEX_LD-2012349
------
skip qid INEX_XER-106
------
skip qid INEX_XER-115
------
qid: QALD2_te-28, query: Give me all movies directed by Francis Ford Coppola.
False bm25: {'text': '"Don\'t Box Me In" is a collaboration between Stewart Copeland and Stan Ridgway. It was recorded as part of the soundtrack for the Francis Ford Coppola\'s movie Rumble Fish and was subsequently released as a single. Copeland plays guitar, drums, bass and keyboards, and Ridgway sings and plays harmonica.', 'title': "Don't Box Me In"}
False dense: {'text': 'All of Me is a 1984 fantasy comedy film directed by Carl Reiner and starring Steve Martin and Lily Tomlin. This film is based on the novel Me Two by Edwin Davis.', 'title': 'All of Me (1984 film)'}
False wdense: {'text': 'All of Me is a 1984 fantasy comedy film directed by Carl Reiner and starring Steve Martin and

In [210]:
def false_condition(target, compare, correct):
    return not target in correct and compare in correct

for i in worse_query.index:
    show_result_top1(i, false_condition)

------
skip qid INEX_LD-20120121
------
skip qid INEX_LD-20120512
------
skip qid INEX_LD-20120531
------
skip qid INEX_XER-123
------
skip qid INEX_XER-128
------
skip qid INEX_XER-134
------
skip qid INEX_XER-139
------
skip qid INEX_XER-140
------
skip qid INEX_XER-141
------
skip qid INEX_XER-147
------
skip qid INEX_XER-62
------
skip qid INEX_XER-67
------
skip qid INEX_XER-88
------
skip qid QALD2_te-60
------
qid: QALD2_te-64, query: Give me all launch pads operated by NASA.
True bm25: {'text': 'The RM-90 Blue Scout II was an American sounding rocket and expendable launch system which was flown three times during 1961. It was used for two HETS test flights, and the launch of the Mercury-Scout 1 satellite for NASA. It was a member of the Scout family of rockets.The Blue Scout II was a military version of the NASA-operated Scout X-1. All three launches occurred from Launch Complex 18B at the Cape Canaveral Air Force Station, the same launch pad used for the Blue Scout I.', 'title

In [226]:
def preproc_rep(reps: np.ndarray, att_mask: np.ndarray, input_tok: np.ndarray):
    reps = rep_lave(reps, att_mask)
    reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]
    reps[np.isnan(reps)] = 0.0
    return reps, att_mask[:, 1:], input_tok[:, 1:]

def rep_lave(reps, att_masks, window_size=3):
    tg_reps = np.zeros_like(reps[:, 1:])  # 3D
    for b, (rep, att_mask) in enumerate(zip(reps, att_masks)):
        og_rep = rep[att_mask == 1, :]  # 2D
        og_rep = og_rep[1:-1, :]  # remove special token
        rep_len = og_rep.shape[0]
        for i in range(rep_len):
            start = i - window_size if i - window_size > 0 else 0
            end = i + window_size
            tg_reps[b, i, :] += np.mean(og_rep[start:end, :], axis=0)

    return tg_reps

def max_cos_sims(query, doc, model, tokenizer):
    special_tokens = {
        tokenizer.pad_token_id,
        tokenizer.bos_token_id,
        tokenizer.eos_token_id,
        tokenizer.sep_token_id,
        tokenizer.cls_token_id,
    }
    def tok2rep_indexing(inputs_ids, batch_reps, att_masks):
        tok2rep = defaultdict(list)
        for qi, (input_ids, reps, att_mask) in enumerate(zip(inputs_ids, batch_reps, att_masks)):
            for i, (qt, rep, am) in enumerate(zip(input_ids, reps, att_mask)):
                if qt in special_tokens:
                    continue
                if am == 0:
                    continue
                tok2rep[qt].append(rep)
            
        for tid in tok2rep:
            tok2rep[tid] = np.vstack(tok2rep[tid])
            
        return tok2rep
        
    doc = doc["title"] + " " + doc["text"]
    t_query = tokenizer(query, return_tensors="pt")
    with torch.no_grad():
        e_query = model(**t_query).last_hidden_state
    e_queries, q_att_masks, q_inputs_ids = preproc_rep(e_query.numpy(), t_query["attention_mask"].numpy(), t_query["input_ids"].numpy())
    q_tok2rep = tok2rep_indexing(q_inputs_ids, e_queries, q_att_masks)
    t_doc = tokenizer(doc, return_tensors="pt")
    with torch.no_grad():
        e_doc = model(**t_doc).last_hidden_state
    e_docs, d_att_masks, d_inputs_ids = preproc_rep(e_doc.numpy(), t_doc["attention_mask"].numpy(), t_doc["input_ids"].numpy())
    # print(e_docs.shape, d_att_masks.shape, d_inputs_ids.shape, tokenizer.convert_ids_to_tokens(list(d_inputs_ids[0])))
    d_tok2rep = tok2rep_indexing(d_inputs_ids, e_docs, d_att_masks)
    result = []
    for qt, q_reps in q_tok2rep.items():
        q_token = tokenizer.convert_ids_to_tokens(int(qt))
        if qt not in d_tok2rep:
            result.append(f"{q_token}: 0.0")
            continue
            
        score=np.max(np.dot(q_reps, d_tok2rep[qt].T))
        result.append(f"{q_token}: {round(float(score), 2)}")
        
    return ", ".join(result)

In [203]:
def tokenizer_doc(target, tokenizer):
    text = corpus[target]["title"] + " " + corpus[target]["text"]
    return tokenizer(text)
    
def get_bm25_val(cid, query, token_bm25_score, tokenizer):
    this_q_tok_bm25 = []
    t_query = tokenizer(query)
    for tok in t_query:
        this_q_tok_bm25.append(f"{tok}: {round(token_bm25_score[cid][(tok)], 2)}")
    return ", ".join(this_q_tok_bm25)

def get_idf_val(query, idf, tokenizer):
    this_q_tok_idf = []
    t_query = tokenizer(query)
    for tok in t_query:
        this_q_tok_idf.append(f"{tok}: {round(idf[tok], 2)}")
    return ", ".join(this_q_tok_idf)
        

def show_result_top1_analysis(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result["maxsim_bm25"])
    
    correct = qrels[qid]
    query = queries[qid]
    if show_condition(cbm25_top1[0], bm25_top1[0], correct):
        print("------")
        print(f"qid: {qid}, query: {query}")
        print(f"lm_idf: {get_idf_val(query, lm_idf, lm_tokenizer.analyze)}")
        print(f"hf_idf: {get_idf_val(query, hf_idf, hf_tokenizer.tokenize)}")
        print("---")
        cid_bm25_top1 = bm25_top1[0]
        cid_bm25_tok_score = get_bm25_val(cid_bm25_top1, query, lm_token_bm25_score, lm_tokenizer.analyze)
        t_doc = tokenizer_doc(cid_bm25_top1, lm_tokenizer.analyze)
        print(f"bm25: {cid_bm25_top1 in correct} q-bm25: {cid_bm25_tok_score}")
        print(f"t_doc: {t_doc}")
        cid_dense_top1 = dense_top1[0]
        cid_dense_tok_score = get_bm25_val(cid_dense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_dense_top1, hf_tokenizer.tokenize)
        print(f"dense: {cid_bm25_top1 in correct} q-bm25: {cid_dense_tok_score}")
        print(f"t_doc: {t_doc}")  
        cid_wdense_top1 = wdense_top1[0]
        cid_wdense_tok_score = get_bm25_val(cid_wdense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_wdense_top1, hf_tokenizer.tokenize)
        print(f"wdense: {cid_bm25_top1 in correct}, q-bm25: {cid_wdense_tok_score}")
        print(f"t_doc: {t_doc}")  
        cid_cbm25_top1 = cbm25_top1[0]
        cid_cbm25_tok_score = get_bm25_val(cid_cbm25_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_cbm25_top1, hf_tokenizer.tokenize)
        t_cos_sims = max_cos_sims(query, corpus[cid_cbm25_top1], hf_model, hf_tokenizer)
        print(f"cbm25: {cid_cbm25_top1 in correct} q-bm25: {cid_cbm25_tok_score}")
        print(f"cos-sims: {t_cos_sims}")
        print(f"t_doc: {t_doc}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [204]:
for qid in better_query.index:
    show_result_top1_analysis(qid, correct_condition)

------
skip qid INEX_LD-2010014
------
skip qid INEX_LD-20120332
------
skip qid INEX_LD-20120411
------
skip qid INEX_LD-2012311
------
skip qid INEX_LD-2012349
------
skip qid INEX_XER-106
------
skip qid INEX_XER-115
------
qid: QALD2_te-28, query: Give me all movies directed by Francis Ford Coppola.
lm_idf: give: 5.55, me: 5.71, all: 3.39, movi: 5.1, direct: 3.57, franci: 5.77, ford: 6.51, coppola: 9.72
hf_idf: give: 6.51, me: 4.48, all: 3.3, movies: 6.38, directed: 3.78, by: 1.22, francis: 5.71, ford: 6.43, cop: 6.87, ##pol: 6.01, ##a: 2.42, .: 0.01
---
bm25: False q-bm25: give: 0.0, me: 7.54, all: 0.0, movi: 5.16, direct: 0.0, franci: 5.85, ford: 6.59, coppola: 9.84
t_doc: ["don't", 'box', 'me', "don't", 'box', 'me', 'collabor', 'between', 'stewart', 'copeland', 'stan', 'ridgwai', 'record', 'part', 'soundtrack', 'franci', 'ford', 'coppola', 'movi', 'rumbl', 'fish', 'subsequ', 'releas', 'singl', 'copeland', 'plai', 'guitar', 'drum', 'bass', 'keyboard', 'ridgwai', 'sing', 'plai', '

  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


(1, 84, 768) (1, 84) (1, 84) ['brooklyn', 'bridge', 'the', 'brooklyn', 'bridge', 'is', 'a', 'hybrid', 'cable', '-', 'stayed', '/', 'suspension', 'bridge', 'in', 'new', 'york', 'city', 'and', 'is', 'one', 'of', 'the', 'oldest', 'bridges', 'of', 'either', 'type', 'in', 'the', 'united', 'states', '.', 'completed', 'in', '1883', ',', 'it', 'connects', 'the', 'boroughs', 'of', 'manhattan', 'and', 'brooklyn', 'by', 'spanning', 'the', 'east', 'river', '.', 'it', 'has', 'a', 'main', 'span', 'of', '1', ',', '59', '##5', '.', '5', 'feet', '(', '48', '##6', '.', '3', 'm', ')', ',', 'and', 'was', 'the', 'first', 'steel', '-', 'wire', 'suspension', 'bridge', 'constructed', '.', '</s>']
cbm25: True q-bm25: which: 0.0, river: 3.44, does: 0.0, the: 0.2, brooklyn: 8.96, bridge: 8.26, cross: 0.0, ?: 0.0
cos-sims: which: 0.0, river: 0.65, does: 0.0, the: 0.66, brooklyn: 0.62, bridge: 0.62, cross: 0.0, ?: 0.0
t_doc: ['brooklyn', 'bridge', 'the', 'brooklyn', 'bridge', 'is', 'a', 'hybrid', 'cable', '-', 'st

In [244]:
for qid in worse_query.index:
    show_result_top1_analysis(qid, false_condition)

------
qid: INEX_LD-20120121, query: vietnam food recipes
lm_idf: vietnam: 6.02, food: 5.62, recip: 8.24
hf_idf: vietnam: 5.96, food: 5.7, recipes: 8.86
---
bm25: True q-bm25: vietnam: 0.0, food: 8.47, recip: 12.86
t_doc: ['raw', 'food', 'made', 'easi', '1', '2', 'peopl', 'raw', 'food', 'made', 'easi', '1', '2', 'peopl', 'recip', 'book', 'raw', 'food', 'chef', 'jennif', 'cornbleet', 'best', 'sell', 'book', 'publish', '2005', 'promot', 'raw', 'food', 'diet', 'dietari', 'movement', 'encourag', 'consumpt', 'uncook', 'food', 'obtain', 'maximum', 'health', 'benefits.th', 'book', 'featur', '115', 'recip', 'includ', '21', 'breakfast', 'recip', '64', 'lunch', 'dinner', 'recip', '30', 'dessert', 'recip', 'each', 'recip', 'yield', 'serv', 'on', 'two', 'peopl']
dense: True q-bm25: vietnam: 5.98, food: 0.0, recipes: 0.0
t_doc: ['vietnamese', 'cuisine', 'vietnamese', 'cuisine', 'encompasses', 'the', 'foods', 'and', 'beverages', 'of', 'vietnam', ',', 'and', 'features', 'a', 'combination', 'of', 'fiv

  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


cbm25: False q-bm25: south: 3.81, korean: 7.63, girl: 7.13, groups: 8.16
cos-sims: south: 0.61, korean: 0.61, girl: 0.6, groups: 0.6
t_doc: ['list', 'of', 'south', 'korean', 'idol', 'groups', 'the', 'following', 'is', 'a', 'list', 'of', 'south', 'korean', 'teen', 'idol', 'musical', 'bands', '.', 'this', 'includes', 'a', 'list', 'of', 'boy', 'bands', 'and', 'girl', 'groups', ',', 'organized', 'by', 'year', 'of', 'debut', '.', 'idol', 'bands', 'in', 'south', 'korea', 'started', 'to', 'appear', 'after', 'the', 'success', 'of', 'seo', 'tai', '##ji', 'and', 'boys', ',', 'whose', 'debut', 'in', '1992', 'is', 'considered', 'a', 'turning', 'point', 'in', 'the', 'history', 'of', 'korean', 'popular', 'music', '.', '2012', 'was', 'a', 'record', 'year', 'in', 'k', '-', 'pop', 'in', 'terms', 'of', 'number', 'of', 'rookie', 'artists', ':', '33', 'male', 'groups', 'and', '38', 'girl', 'groups', 'debuted', '.']
------
skip qid INEX_LD-20120531
------
skip qid INEX_XER-123
------
skip qid INEX_XER-128


In [185]:
sorted(cbm25_result["maxsim_bm25"]["INEX_XER-141"].items(), key=lambda x: -x[1])[0]

('<dbpedia:CatalunyaCaixa>', 14.937976507696643)

In [186]:
qid = "QALD2_te-28"
cid = "<dbpedia:Don't_Box_Me_In>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cmb25")
cid = "<dbpedia:The_Godfather>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

Give me all movies directed by Francis Ford Coppola.
bm25
give: 0.0, me: 5.83, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 5.66, ford: 6.38, cop: 6.81, ##pol: 5.96, ##a: 2.4, .: 0.01
give: 0.0, me: 0.27, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 0.34, ford: 0.37, cop: 0.38, ##pol: 0.37, ##a: 0.35, .: 0.22
cmb25
give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 4.38, by: 1.91, francis: 6.6, ford: 7.44, cop: 9.88, ##pol: 8.64, ##a: 3.48, .: 0.01


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 0.48, by: 0.51, francis: 0.53, ford: 0.54, cop: 0.54, ##pol: 0.51, ##a: 0.51, .: 0.49


In [173]:
qid = "INEX_LD-20120512"
cid = '<dbpedia:GP_Basic>'
query = queries[qid]
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

south korean girl groups
south: 3.67, korean: 7.36, girl: 7.8, groups: 0.0
south: 0.61, korean: 0.64, girl: 0.64, groups: 0.0


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [187]:
qid = "INEX_XER-141"
cid = '<dbpedia:Open_University_of_Catalonia>'
query = queries[qid]
print('dense')
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cbm25")
cid = '<dbpedia:CatalunyaCaixa>'
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

dense
Universities in Catalunya
universities: 0.0, in: 0.32, cat: 4.66, ##al: 3.51, ##un: 4.44, ##ya: 4.02
universities: 0.0, in: 0.65, cat: 0.7, ##al: 0.73, ##un: 0.72, ##ya: 0.7
cbm25
universities: 0.0, in: 0.31, cat: 7.58, ##al: 6.0, ##un: 7.22, ##ya: 6.53


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


universities: 0.0, in: 0.46, cat: 0.57, ##al: 0.56, ##un: 0.55, ##ya: 0.58


In [192]:
qid = "INEX_LD-20120121"
cid = "<dbpedia:Raw_Food_Made_Easy_for_1_or_2_People>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print(tokenizer_doc(cid, hf_tokenizer.tokenize))
print("cbm25")
cid = "<dbpedia:Luke_Nguyen's_Vietnam>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

vietnam food recipes
bm25
vietnam: 0.0, food: 8.5, recipes: 13.2
vietnam: 0.0, food: 0.25, recipes: 0.27
['raw', 'food', 'made', 'easy', 'for', '1', 'or', '2', 'people', 'raw', 'food', 'made', 'easy', 'for', '1', 'or', '2', 'people', 'is', 'a', 'recipe', 'book', 'by', 'raw', 'food', 'chef', 'jennifer', 'corn', '##ble', '##et', '.', 'the', 'best', '-', 'selling', 'book', 'was', 'published', 'in', '2005', 'and', 'promotes', 'the', 'raw', 'food', 'diet', ',', 'a', 'dietary', 'movement', 'that', 'encourages', 'the', 'consumption', 'of', 'un', '##co', '##oked', 'foods', 'to', 'obtain', 'maximum', 'health', 'benefits', '.', 'the', 'book', 'features', '115', 'recipes', ',', 'including', '21', 'breakfast', 'recipes', ',', '64', 'lunch', 'and', 'dinner', 'recipes', 'and', '30', 'dessert', 'recipes', '.', 'each', 'recipe', 'yields', 'serving', '##s', 'for', 'one', 'or', 'two', 'people', '.']
cbm25
vietnam: 8.92, food: 7.71, recipes: 0.0


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


vietnam: 0.51, food: 0.49, recipes: 0.0


In [181]:
for i in sorted(cbm25_result["maxsim_bm25"]["INEX_LD-20120121"].items(), key=lambda x: -x[1]):
    print(i, corpus[i[0]])

("<dbpedia:Luke_Nguyen's_Vietnam>", 8.137327611967876) {'text': "Luke Nguyen's Vietnam is an Australian television series first screened on SBS One in 2010. The series follows chef, Luke Nguyen, as he tours Vietnam seeking culinary delights and adventure. It is regularly broadcast on Good Food, a UK food-orientated TV channel.", 'title': "Luke Nguyen's Vietnam"}
('<dbpedia:2007_Vietnam_food_scare>', 6.830203367097289) {'text': 'The 2007 Vietnam food scare was a food scandal, which exposed contaminated food.  Among the issues were formaldehyde in noodles of the national dish, Phở, banned pesticides in vegetables and fruit, and toxic soy sauce.', 'title': '2007 Vietnam food scare'}
('<dbpedia:Puppy_chow>', 5.573718912252438) {'text': "Puppy chow, also typically known as muddy buddies, monkey munch, or reindeer food, is the name for a homemade snack made in the United States. The recipe's name and ingredients can differ depending on the version, but most recipes will typically include cer

In [188]:
lm_tokenizer.analyze("Vietnamese")

['vietnames']

In [234]:
def show_result_top1_comp_wdense(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result["maxsim_bm25"])
    
    correct = qrels[qid]
    # if show_condition(cbm25_top1[0], wdense_top1[0], correct) and wdense_top1[0] != dense_top1[0]:
    if wdense_top1[0] != dense_top1[0]:
        print("------")
        print(f"qid: {qid}, query: {queries[qid]}")
        print(bm25_top1[0] in correct, f"bm25: {corpus[bm25_top1[0]]}")
        print(dense_top1[0] in correct, f"dense: {corpus[dense_top1[0]]}")
        print(wdense_top1[0] in correct, f"wdense: {corpus[wdense_top1[0]]}")
        print(cbm25_top1[0] in correct, f"cbm25: {corpus[cbm25_top1[0]]}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [235]:
target_qid = []
for qid, sup in diff_scores_wdense.items():
    if sup < 0:
        continue
    diff = wdense_scores[qid]["ndcg_cut_10"] - dense_scores[qid]["ndcg_cut_10"]
    if diff > 0:
        target_qid.append(qid)
        
for qid in target_qid:
    show_result_top1_comp_wdense(qid, correct_condition)

------
skip qid INEX_LD-2009062
------
qid: INEX_LD-20120221, query: guitar classical flamenco
True bm25: {'text': 'Picados are the flamenco scales of a guitar (see Flamenco guitar), or a guitar playing technique where the musician plays scale passages by alternating the index and middle fingers. Picado is normally executed apoyando (with rest strokes). It is often used rapidly to play a melody.This technique is similar to that of the rest stroke played in classical guitar. The difference is that technique is of central importance in flamenco while peripheral in classical.', 'title': 'Picados'}
False dense: {'text': 'Dos guitarras flamencas en stereo (Two Flamenco Guitars in Stereo) is the first of three collaborative albums between Paco de Lucía & Ricardo Modrego.The two had met while working for José Greco, a talent agent and flamenco dancer, and had composed most of the pieces while touring around the world with him. When they finally returned to Madrid, they proceeded to pitch thei

In [221]:
for i in pd.Series(pd.Series(diff_scores_wdense) > 0).index:
    show_result_top1_comp_wdense(i, correct_condition)

------
skip qid INEX_LD-2009039
------
skip qid INEX_LD-2009053
------
skip qid INEX_LD-2009061
------
skip qid INEX_LD-2009062
------
skip qid INEX_LD-2009063
------
skip qid INEX_LD-2009074
------
skip qid INEX_LD-2009115
------
skip qid INEX_LD-2010004
------
skip qid INEX_LD-2010014
------
skip qid INEX_LD-2010019
------
skip qid INEX_LD-2010020
------
skip qid INEX_LD-2010037
------
skip qid INEX_LD-2010043
------
skip qid INEX_LD-2010057
------
skip qid INEX_LD-2010069
------
skip qid INEX_LD-20120111
------
skip qid INEX_LD-20120121
------
skip qid INEX_LD-20120122
------
skip qid INEX_LD-20120131
------
skip qid INEX_LD-20120211
------
skip qid INEX_LD-20120221
------
skip qid INEX_LD-20120231
------
skip qid INEX_LD-20120232
------
skip qid INEX_LD-20120311
------
skip qid INEX_LD-20120312
------
skip qid INEX_LD-20120321
------
skip qid INEX_LD-20120331
------
skip qid INEX_LD-20120332
------
skip qid INEX_LD-20120411
------
skip qid INEX_LD-20120412
------
skip qid INEX_LD-2

In [242]:
sorted(cbm25_result['maxsim_bm25']["QALD2_te-28"].items(), key=lambda x: -x[1])[0]

('<dbpedia:The_Godfather>', 21.494827189395043)

In [237]:
hf_token_bm25_score['<dbpedia:All_of_Me_(1984_film)>']

defaultdict(float,
            {'all': 4.670821353398937,
             'of': 0.42791657269598216,
             'me': 6.918947044817926,
             '(': 0.566189680895757,
             '1984': 6.491366650030292,
             'film': 4.965431874273123,
             ')': 0.5646920139951528,
             'is': 0.4117070653822991,
             'a': 0.25428511561457706,
             'fantasy': 6.651040939404667,
             'comedy': 5.434588938246994,
             'directed': 4.257379071963996,
             'by': 1.727493530618558,
             'carl': 6.732983317577365,
             'rein': 7.593501695450209,
             '##er': 3.7600933286432174,
             'and': 0.5957678932683022,
             'starring': 5.307614822818902,
             'steve': 6.680169500001085,
             'martin': 5.918879714547183,
             'lily': 8.787146097331808,
             'tom': 6.018670031150756,
             '##lin': 6.361386071293406,
             '.': 0.010193656302814413,
             'th

In [243]:
hf_token_bm25_score['<dbpedia:The_Godfather>']

defaultdict(float,
            {'the': 0.17876180519135781,
             'godfather': 12.846328482533274,
             'is': 0.3369646483632921,
             'a': 0.3248607113449043,
             '1972': 5.501378141161872,
             'american': 2.772878204391211,
             'crime': 6.822489484500963,
             'film': 3.7174427551685896,
             'directed': 4.375885529093986,
             'by': 1.9122347831187063,
             'francis': 6.602282426445269,
             'ford': 7.444094199751813,
             'cop': 9.882200535101756,
             '##pol': 8.643266661228308,
             '##a': 3.4784497248878625,
             'and': 0.6060741133520219,
             'produced': 4.537223204082256,
             'albert': 6.740979651407461,
             's': 1.8737024856516133,
             '.': 0.010369996898710356,
             'rudd': 10.348669873534496,
             '##y': 3.7741361466320886,
             'from': 1.816354023264928,
             'screenplay': 7.62972499151

In [227]:
qid = "QALD2_te-64"
cid = "<dbpedia:Launch_Control_Center>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cmb25")
cid = "<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

Give me all launch pads operated by NASA.
bm25
give: 0.0, me: 0.0, all: 3.5, launch: 10.15, pads: 0.0, operated: 0.0, by: 0.0, nasa: 7.71, .: 0.01
give: 0.0, me: 0.0, all: 0.48, launch: 0.47, pads: 0.0, operated: 0.0, by: 0.0, nasa: 0.49, .: 0.56
cmb25
give: 0.0, me: 0.0, all: 0.0, launch: 10.23, pads: 12.35, operated: 0.0, by: 1.63, nasa: 0.0, .: 0.01


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


give: 0.0, me: 0.0, all: 0.0, launch: 0.49, pads: 0.47, operated: 0.0, by: 0.54, nasa: 0.0, .: 0.49


In [239]:
hf_token_bm25_score["<dbpedia:Launch_Control_Center>"]

defaultdict(float,
            {'launch': 10.145049269556322,
             'control': 7.21423193690452,
             'center': 6.432283478792204,
             'the': 0.18670420253094455,
             '(': 0.5325382413603182,
             'lc': 10.66449054879862,
             '##c': 5.201135998271228,
             ')': 0.5311295881751709,
             'is': 0.3083539803623241,
             'a': 0.23917169959588863,
             'four': 4.052094170804988,
             '-': 1.0240606968386188,
             'story': 4.691555579341044,
             'building': 4.681573392433325,
             'located': 2.6955873942313975,
             'at': 1.6698443265528524,
             'nasa': 7.707474672086065,
             "'": 1.7368524381656165,
             's': 1.7146119697474924,
             'kennedy': 7.3413683635005755,
             'space': 7.260534860063399,
             'on': 1.4762508338947171,
             'merritt': 9.509155484660358,
             'island': 4.358026861221938,
           

In [240]:
hf_token_bm25_score["<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"]

defaultdict(float,
            {'van': 5.927488364964662,
             '##den': 6.2254206122904385,
             '##berg': 6.4278530215504,
             'afb': 7.551145199829048,
             'space': 7.1199098945878925,
             'launch': 10.225218207167412,
             'complex': 7.157702198547805,
             '4': 5.012931701138498,
             '(': 0.4201627185053785,
             'sl': 7.206787957774043,
             '##c': 5.790866012045858,
             '-': 1.5155265412233054,
             ')': 0.41905131747211016,
             'is': 0.2432855269136024,
             'a': 0.18870200050810537,
             'site': 3.9140270402091177,
             'at': 1.3174759617882894,
             'air': 3.6884891635960386,
             'force': 4.134375127854424,
             'base': 4.455086008242526,
             'with': 1.4452232733884367,
             'two': 3.4108622168589724,
             'pads': 12.35292428802794,
             ',': 0.28448963735241206,
             'both': 4.48

In [258]:
names = ["Carl Reiner", "Steve Martin", "Lily Tomlin "]
for name in names:
    print(", ".join([f"{tn}: {round(hf_idf[tn], 2)}" for tn in hf_tokenizer.tokenize(name)]))

carl: 5.98, rein: 6.74, ##er: 3.34
steve: 5.93, martin: 5.26
lily: 7.8, tom: 5.35, ##lin: 5.65
