In [1]:
import json
import os
import numpy as np
import pandas as pd
import pytrec_eval
import torch
from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModel
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [2]:
# Notebook configuration.
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.
root_dir = "/home/gaia_data/iida.h/BEIR/datasets/"  # joined with `dataset` to locate the data to load
dataset = "dbpedia-entity"  # dataset name; also interpolated into the result-file paths
model_path="/home/gaia_data/iida.h/BEIR/C-BM25/model/distil/dense/"  # passed to AutoModel / AutoTokenizer.from_pretrained
func_name="maxsim_bm25_qtf"  # key selecting one scoring function inside cbm25_result

In [3]:
data_path = os.path.join(root_dir, dataset)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

  0%|          | 0/4635922 [00:00<?, ?it/s]

In [4]:
def get_info(corpus, tokenizer):
    """Collect the per-document term statistics needed for BM25 scoring.

    Args:
        corpus: Mapping of document id -> dict with "title" and "text" strings.
        tokenizer: Callable turning a string into a sequence of hashable tokens
            (it is called as ``tokenizer(title + " " + text)``).

    Returns:
        Tuple ``(d_tf, idf, doc_lens, doc_len_ave)``:
            d_tf: document id -> Counter of token frequencies in that document.
            idf: defaultdict(float) mapping token -> log(N / document frequency),
                where N is the number of documents; unseen tokens read as 0.0.
            doc_lens: document id -> number of tokens.
            doc_len_ave: mean of the document lengths (0.0 for an empty corpus,
                instead of the NaN + RuntimeWarning that ``np.mean([])`` gives).
    """
    sep = " "
    doc_lens = {}
    df = Counter()  # token -> number of documents that contain it
    d_tf = {}

    for cid in tqdm(corpus.keys()):
        doc = corpus[cid]
        tokens = tokenizer(doc["title"] + sep + doc["text"])
        doc_lens[cid] = len(tokens)
        df.update(set(tokens))  # a token counts once per document
        d_tf[cid] = Counter(tokens)

    n_docs = len(corpus)
    idf = defaultdict(float)
    for token, doc_freq in df.items():
        idf[token] = np.log(n_docs / doc_freq)

    doc_len_ave = np.mean(list(doc_lens.values())) if doc_lens else 0.0

    return d_tf, idf, doc_lens, doc_len_ave

In [5]:
def get_d_bm25(d_tf, idf, doc_lens, doc_len_ave, k1=0.9, b=0.6):
    """Precompute the BM25 contribution of every token in every document.

    For a token with frequency ``tf`` in document ``cid`` the stored value is
    ``tf * (1 + k1) / (tf + k1 * (1 - b + b * doc_lens[cid] / doc_len_ave)) * idf[token]``.

    Args:
        d_tf: document id -> mapping of token -> term frequency.
        idf: token -> inverse document frequency.
        doc_lens: document id -> document length in tokens.
        doc_len_ave: average document length.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        dict mapping document id -> defaultdict(float) of token -> BM25 score;
        tokens absent from a document read as 0.0.
    """
    token_bm25_score = {}

    for cid, tfs in d_tf.items():
        scores = defaultdict(float)
        if tfs:
            # The length normalisation depends only on the document, so it is
            # computed once per document rather than once per token.
            length_norm = k1 * (1 - b + b * doc_lens[cid] / doc_len_ave)
            for tid, tf in tfs.items():
                scores[tid] = tf * (1 + k1) / (tf + length_norm) * idf[tid]
        token_bm25_score[cid] = scores

    return token_bm25_score

In [6]:
# Check
# Load the retrieval result files that the rest of the notebook compares.
# The commented-out loads are alternative result files kept for reference.
# with open(f"./analysis_data/{dataset}/bm25_result.json") as f:
#     bm25_result = json.load(f)
    
# Dense results; the relative path is resolved against the notebook's working directory.
with open(f"./analysis_data/{dataset}/dense_result.json") as f:
    dense_result = json.load(f)
    
# with open(f"./analysis_data/{dataset}/weighted_dense_result.json") as f:
#     weighted_dense_result = json.load(f)
    
# with open(f"./analysis_data/{dataset}/cbm25_result.json") as f:
#     cbm25_result = json.load(f)

# BM25 results.
with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/bm25/analysis.json") as f:
    bm25_result = json.load(f)

# with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/cos_sim/mpnet-tod/analysis.json") as f:
#     dense_result = json.load(f)

# Results stored under dot/mpnet-v3-mse-beir-dot; referred to as "distil_dense" below.
with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/dot/mpnet-v3-mse-beir-dot/analysis.json") as f:
    distil_dense_result = json.load(f)

# Results stored under lss/mpnet-tod; later cells index this dict by `func_name` first.
with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/lss/mpnet-tod/analysis.json") as f:
    cbm25_result = json.load(f)

In [7]:
hf_model = AutoModel.from_pretrained(model_path)

In [8]:
# Term statistics and per-token BM25 scores computed with the model's own tokenization (hf_tokenizer.tokenize).
hf_tokenizer = AutoTokenizer.from_pretrained(model_path)
hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave = get_info(corpus, hf_tokenizer.tokenize)
# NOTE(review): k1/b differ from the get_d_bm25 defaults (0.9 / 0.6) that the Lucene-analyzer statistics use — confirm this is intentional.
hf_token_bm25_score = get_d_bm25(hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave, k1=0.82, b=0.65)

  0%|▏                                                                                                        | 9550/4635922 [00:02<17:01, 4531.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 4635922/4635922 [16:20<00:00, 4728.09it/s]


In [9]:
# hf_tokenizer = AutoTokenizer.from_pretrained(model_path)

# sep = " "
# doc_lens = {}
# df = Counter()
# d_tf = {}

# for cid in tqdm(corpus.keys()):
#     text = corpus[cid]["title"] + sep + corpus[cid]["text"]
#     input_ids = hf_tokenizer(text)["input_ids"]
#     doc_lens[cid] = (len(input_ids))
#     df.update(list(set(input_ids)))
#     tf_d = Counter(input_ids)
#     doc_lens[cid] =len(input_ids)
#     d_tf[cid] = tf_d
    
# idf = defaultdict(float)
# N = len(corpus)
# for w, v in df.items():
#     idf[w] = np.log(N / v)

# doc_len_ave = np.mean(list(doc_lens.values()))

In [10]:
# from lss_func.arguments import ModelArguments, DataArguments, LSSArguments
# from lss_func.search.coil.exact_search import LSSSearcher
# from lss_func.models import coil
# from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
# from beir.retrieval.evaluation import EvaluateRetrieval

# model_args = ModelArguments(model_name_or_path=model_path)

# base_model = coil.Coil(model_args.model_name_or_path, model_args)
# base_model.eval()

# score_functions = [func_name]
# qid = "TREC_Entity-4"
# qid2 = "INEX_LD-20120411"
# this_bm25_result = {}
# this_bm25_result[qid] = bm25_result[qid]
# this_bm25_result[qid2] = bm25_result[qid2]

# searcher = LSSSearcher(base_model, bm25_result, score_functions, pooler="ave", batch_size=128, idf=idf, doc_len_ave=doc_len_ave, doc_max_length=512, window_size=3, norm=True)
# dense_retriever = EvaluateRetrieval(searcher, score_function="cos_sim", k_values=[1, 3, 5, 10, 100])
# all_rerank_results = dense_retriever.rerank(corpus, {qid: queries[qid], qid2: queries[qid2]}, this_bm25_result, top_k=100)



In [11]:
# sorted(all_rerank_results[func_name][qid].items(), key= lambda x: -x[1])[:10]

In [12]:
del hf_d_tf, hf_doc_lens, hf_doc_len_ave

In [13]:
# Same statistics as the hf_* ones, but tokenized with pyserini's Lucene analyzer (lm_tokenizer.analyze).
lm_tokenizer = Analyzer(get_lucene_analyzer())
lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave = get_info(corpus, lm_tokenizer.analyze)
# Uses the get_d_bm25 default k1/b (0.9 / 0.6).
lm_token_bm25_score = get_d_bm25(lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4635922/4635922 [06:30<00:00, 11882.63it/s]


In [14]:
del lm_d_tf, lm_doc_lens, lm_doc_len_ave

In [15]:
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10"})

In [16]:
bm25_scores = evaluator.evaluate(bm25_result)
dense_scores = evaluator.evaluate(dense_result)
# wdense_scores = evaluator.evaluate(weighted_dense_result)
distil_dense_scores = evaluator.evaluate(distil_dense_result)
cbm25_scores = evaluator.evaluate(cbm25_result[func_name])

In [17]:
print("bm25: ", np.average([i["ndcg_cut_10"] for i in bm25_scores.values()]))
print("dense: ", np.average([i["ndcg_cut_10"] for i in dense_scores.values()]))
# print("wdense: ", np.average([i["ndcg_cut_10"] for i in wdense_scores.values()]))
print("distil_dense: ", np.average([i["ndcg_cut_10"] for i in distil_dense_scores.values()]))
print("cbm25: ", np.average([i["ndcg_cut_10"] for i in cbm25_scores.values()]))

bm25:  0.2846362144861774
dense:  0.29393880438765735
distil_dense:  0.3553918660659802
cbm25:  0.36305527785462854


In [18]:
def create_diff_scores(source_scores, target_scores):
    """Per-query nDCG@10 difference (source minus target), keeping non-zero entries.

    Args:
        source_scores: query id -> metric dict containing "ndcg_cut_10".
        target_scores: same structure; must contain every query id present in
            ``source_scores``.

    Returns:
        dict mapping query id -> ``source - target`` for the queries whose
        difference is non-zero.

    Raises:
        KeyError: if a query id of ``source_scores`` is missing from
            ``target_scores``.
    """
    diff_scores = {}
    # Iterate over the argument, not the notebook-global cbm25_scores, so the
    # function works for any pair of score dictionaries.
    for qid, source_metrics in source_scores.items():
        diff_score = source_metrics["ndcg_cut_10"] - target_scores[qid]["ndcg_cut_10"]
        if abs(diff_score) > 0.0:
            diff_scores[qid] = diff_score

    return diff_scores

In [19]:
diff_scores_bm25 = create_diff_scores(cbm25_scores, bm25_scores)
diff_scores_dense = create_diff_scores(cbm25_scores, dense_scores)
# diff_scores_wdense = create_diff_scores(cbm25_scores, wdense_scores)
diff_scores_distil_dense = create_diff_scores(cbm25_scores, distil_dense_scores)

In [20]:
# common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense) & set(diff_scores_wdense)
common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense) & set(diff_scores_distil_dense)

In [21]:
# all_diff = pd.concat([pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense), pd.Series(diff_scores_wdense)], axis=1).fillna(0).sort_index()
all_diff = pd.concat([pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense), pd.Series(diff_scores_distil_dense)], axis=1).fillna(0).sort_index()
# all_diff = all_diff.rename(columns={0: "bm25", 1: "dense", 2: "wdense"})
all_diff = all_diff.rename(columns={0: "bm25", 1: "dense", 2: "distil_dense"})
all_diff.describe()

Unnamed: 0,bm25,dense,distil_dense
count,368.0,368.0,368.0
mean,0.085238,0.075127,0.00833
std,0.187884,0.184559,0.227448
min,-0.451049,-0.413503,-1.0
25%,-0.020364,-0.026576,-0.113689
50%,0.062059,0.03728,0.0
75%,0.192284,0.168207,0.130685
max,0.840303,0.666667,0.613147


In [22]:
better_query = all_diff[(all_diff > 0.2).all(1)]
better_query

Unnamed: 0,bm25,dense,distil_dense
INEX_LD-2009074,0.241481,0.302276,0.45045
INEX_LD-2010014,0.205871,0.377732,0.388609
INEX_LD-20120411,0.218032,0.261394,0.239357
INEX_LD-2012329,0.234046,0.256363,0.391327
INEX_LD-2012349,0.322836,0.477066,0.284965
INEX_LD-2012383,0.386853,0.386853,0.386853
INEX_XER-106,0.438563,0.374942,0.354634
INEX_XER-95,0.225651,0.36648,0.262333
QALD2_te-13,0.229975,0.352983,0.36274
QALD2_te-28,0.394377,0.524397,0.45355


In [23]:
worse_query = all_diff[(all_diff < -0.01).all(1)]
worse_query

Unnamed: 0,bm25,dense,distil_dense
INEX_LD-20120121,-0.042368,-0.140743,-0.140743
INEX_LD-20120531,-0.092376,-0.15952,-0.048826
INEX_XER-141,-0.194254,-0.272553,-0.029472
INEX_XER-67,-0.172258,-0.067942,-0.143214
QALD2_te-60,-0.129898,-0.206872,-0.475622
QALD2_te-64,-0.174647,-0.296422,-0.098872
QALD2_tr-65,-0.168427,-0.314496,-0.266951
QALD2_tr-82,-0.075244,-0.242761,-0.221984
SemSearch_ES-25,-0.05989,-0.149015,-0.149015
SemSearch_ES-95,-0.050593,-0.185707,-0.259922


In [38]:
def show_result_top1(qid, show_condition):
    """Print the highest-scoring document of each system for one query.

    Reads the notebook-level result dictionaries (``bm25_result``,
    ``dense_result``, ``distil_dense_result``, ``cbm25_result[func_name]``)
    together with ``qrels``, ``queries`` and ``corpus``.

    Args:
        qid: Query id to inspect.
        show_condition: Callable ``(cbm25_doc_id, bm25_doc_id, qrels_of_query)``.
            The details are printed only when it returns a truthy value;
            otherwise a "skip" line is printed.
    """
    def best_doc_id(result):
        # Highest score first; ties keep the order of the result dictionary.
        ranking = sorted(result[qid].items(), key=lambda item: -item[1])
        return ranking[0][0]

    top_docs = [
        ("bm25", best_doc_id(bm25_result)),
        ("dense", best_doc_id(dense_result)),
        ("distil_dense", best_doc_id(distil_dense_result)),
        ("cbm25", best_doc_id(cbm25_result[func_name])),
    ]
    judged = qrels[qid]
    bm25_doc, cbm25_doc = top_docs[0][1], top_docs[-1][1]
    should_show = show_condition(cbm25_doc, bm25_doc, judged)

    print("------")
    if not should_show:
        print(f"skip qid {qid}")
        return

    print(f"qid: {qid}, query: {queries[qid]}")
    for label, doc_id in top_docs:
        # The leading flag is membership in the query's qrels entry.
        print(doc_id in judged, f"{label}: {corpus[doc_id]}")

In [39]:
def correct_condition(target, compare, correct):
    """Return True when `target` is in `correct` and `compare` is not.

    NOTE(review): callers pass ``qrels[qid]`` as `correct`; the notebook output
    shows qrels entries with grade 0, so membership means "judged", not
    necessarily "relevant" — confirm this is the intended test.
    """
    if target not in correct:
        return False
    return compare not in correct

for qid in better_query.index:
    show_result_top1(qid, correct_condition)

------
skip qid INEX_LD-2009074
------
skip qid INEX_LD-2010014
------
skip qid INEX_LD-20120411
------
skip qid INEX_LD-2012329
------
skip qid INEX_LD-2012349
------
skip qid INEX_LD-2012383
------
skip qid INEX_XER-106
------
skip qid INEX_XER-95
------
qid: QALD2_te-13, query: Who is the youngest player in the Premier League?
False bm25: {'text': 'Scott Robinson (born 12 March 1992) is a Scottish professional footballer who plays for Kilmarnock in the Scottish Premiership, as a midfielder. He previously played for Heart of Midlothian, where he made his debut aged 16, becoming the youngest ever player to appear in the Scottish Premier League (SPL).', 'title': 'Scott Robinson (footballer)'}
False dense: {'text': 'Reuben Courtney Noble-Lazarus (born 16 August 1993) is an English-born Grenadian professional footballer who plays as a striker or winger for Rochdale. On 30 September 2008, he became the youngest player to debut in the English Football League, at 15 years and 45 days old, b

In [27]:
def preproc_rep(reps: np.ndarray, att_mask: np.ndarray, input_tok: np.ndarray):
    """Window-average token vectors, L2-normalise them and align the metadata.

    Args:
        reps: 3-D array of token vectors, indexed (batch, position, dim).
        att_mask: 2-D attention mask aligned with the first two axes of `reps`.
        input_tok: 2-D array of token ids aligned with `att_mask`.

    Returns:
        Tuple ``(reps, att_mask[:, 1:], input_tok[:, 1:])``. The vectors come
        from ``rep_lave`` and are scaled to unit length along the last axis;
        positions whose vector has zero norm stay all-zero. The mask and ids
        drop their first position, matching the output of ``rep_lave``.
    """
    reps = rep_lave(reps, att_mask)
    norms = np.linalg.norm(reps, axis=2)[:, :, np.newaxis]
    # Divide only where the norm is positive: the slots rep_lave leaves as
    # zeros stay zero without the 0/0 "invalid value" RuntimeWarning.
    reps = np.divide(reps, norms, out=np.zeros_like(reps), where=norms > 0)
    # Any remaining NaN (e.g. from non-finite inputs) is zeroed as before.
    reps[np.isnan(reps)] = 0.0
    return reps, att_mask[:, 1:], input_tok[:, 1:]

def rep_lave(reps, att_masks, window_size=3):
    """Replace each token vector by the mean of a local window of token vectors.

    For every sequence the attended positions (mask == 1) are selected and the
    first and last of them are dropped. Output position ``i`` then holds the
    mean of the remaining vectors at indices ``[max(i - window_size, 0),
    i + window_size)``. The upper bound is exclusive, so the window reaches
    further back than forward.
    NOTE(review): confirm the asymmetric window is intended.

    Args:
        reps: 3-D array of token vectors, indexed (batch, position, dim).
        att_masks: 2-D attention masks aligned with `reps`.
        window_size: Half-width used to build the averaging window.

    Returns:
        Array shaped like ``reps[:, 1:]``; positions beyond the kept tokens of a
        sequence remain zero.
    """
    averaged = np.zeros_like(reps[:, 1:])  # one position fewer than the input
    for row, (seq_rep, seq_mask) in enumerate(zip(reps, att_masks)):
        # Keep attended positions, then strip the first and last of them.
        tokens = seq_rep[seq_mask == 1][1:-1]
        for pos in range(len(tokens)):
            lo = max(pos - window_size, 0)
            hi = pos + window_size
            averaged[row, pos] += tokens[lo:hi].mean(axis=0)

    return averaged

def max_cos_sims(query, doc, model, tokenizer):
    """Report, per query token, its best similarity to the same token in `doc`.

    Query and document are encoded separately with `model`, post-processed by
    ``preproc_rep`` (which L2-normalises the vectors, so the dot products below
    are cosine similarities) and grouped by token id. For every distinct
    non-special query token the maximum dot product between any of its query
    vectors and any vector of the same token id in the document is reported.

    Args:
        query: Query string.
        doc: dict with "title" and "text" strings.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Hugging Face style tokenizer (callable; exposes the
            ``*_token_id`` attributes and ``convert_ids_to_tokens``).

    Returns:
        A string such as ``"who: 0.64, is: 0.62"`` with one ``token: score``
        entry per distinct query token, rounded to two decimals; the score is
        ``0.0`` when the token does not occur in the document.
    """
    special_tokens = {
        tokenizer.pad_token_id,
        tokenizer.bos_token_id,
        tokenizer.eos_token_id,
        tokenizer.sep_token_id,
        tokenizer.cls_token_id,
    }

    def encode(text):
        # truncation=True caps the input at the tokenizer's maximum length;
        # without it, documents longer than the model supports fail inside the
        # model (the tokenizer warns about exactly this for this corpus).
        encoded = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        return preproc_rep(
            hidden.numpy(),
            encoded["attention_mask"].numpy(),
            encoded["input_ids"].numpy(),
        )

    def tok2rep_indexing(inputs_ids, batch_reps, att_masks):
        # Group the vectors of all attended, non-special positions by token id.
        grouped = defaultdict(list)
        for input_ids, reps, att_mask in zip(inputs_ids, batch_reps, att_masks):
            for tid, rep, am in zip(input_ids, reps, att_mask):
                if tid in special_tokens:
                    continue
                if am == 0:
                    continue
                grouped[tid].append(rep)

        return {tid: np.vstack(rows) for tid, rows in grouped.items()}

    doc = doc["title"] + " " + doc["text"]
    e_queries, q_att_masks, q_inputs_ids = encode(query)
    q_tok2rep = tok2rep_indexing(q_inputs_ids, e_queries, q_att_masks)
    e_docs, d_att_masks, d_inputs_ids = encode(doc)
    d_tok2rep = tok2rep_indexing(d_inputs_ids, e_docs, d_att_masks)

    result = []
    for qt, q_reps in q_tok2rep.items():
        q_token = tokenizer.convert_ids_to_tokens(int(qt))
        if qt not in d_tok2rep:
            result.append(f"{q_token}: 0.0")
            continue

        score = np.max(np.dot(q_reps, d_tok2rep[qt].T))
        result.append(f"{q_token}: {round(float(score), 2)}")

    return ", ".join(result)

In [33]:
def tokenizer_doc(target, tokenizer):
    """Tokenize the title and text of corpus document `target`.

    Looks `target` up in the notebook-level ``corpus``, joins its "title" and
    "text" with a single space and returns whatever `tokenizer` returns for
    that string.
    """
    doc = corpus[target]
    return tokenizer(" ".join((doc["title"], doc["text"])))
    
def get_bm25_val(cid, query, token_bm25_score, tokenizer):
    """Format the BM25 score of every query token for one document.

    Args:
        cid: Document id to look up in `token_bm25_score`.
        query: Query string.
        token_bm25_score: document id -> mapping of token -> BM25 score.
        tokenizer: Callable turning the query into an iterable of tokens.

    Returns:
        ``"tok: score, tok: score, ..."`` in query-token order, scores rounded
        to two decimals; tokens without a score for the document show ``0.0``.
    """
    this_q_tok_bm25 = []
    for tok in tokenizer(query):
        # .get() keeps the lookup read-only: indexing a defaultdict would insert
        # every missing query token into the (very large) score table, and a
        # plain dict would raise KeyError.
        score = token_bm25_score[cid].get(tok, 0.0)
        this_q_tok_bm25.append(f"{tok}: {round(score, 2)}")
    return ", ".join(this_q_tok_bm25)

def get_idf_val(query, idf, tokenizer):
    """Format the IDF of every query token.

    Args:
        query: Query string.
        idf: Mapping of token -> inverse document frequency.
        tokenizer: Callable turning the query into an iterable of tokens.

    Returns:
        ``"tok: idf, tok: idf, ..."`` in query-token order, values rounded to
        two decimals; tokens missing from `idf` show ``0.0``.
    """
    this_q_tok_idf = []
    for tok in tokenizer(query):
        # .get() avoids inserting unseen query tokens into a defaultdict and
        # also lets a plain dict be passed.
        this_q_tok_idf.append(f"{tok}: {round(idf.get(tok, 0.0), 2)}")
    return ", ".join(this_q_tok_idf)
        

def show_result_top1_analysis(qid, show_condition):
    """Print a token-level BM25 / similarity breakdown of each system's top-1 document.

    Relies on notebook-level state: the result dictionaries (``bm25_result``,
    ``dense_result``, ``distil_dense_result``, ``cbm25_result[func_name]``),
    ``qrels``, ``queries``, ``corpus``, the ``lm_*`` / ``hf_*`` statistics and
    tokenizers, and ``hf_model``.

    Args:
        qid: Query id to inspect.
        show_condition: Callable ``(cbm25_doc_id, bm25_doc_id, qrels_of_query)``.
            The analysis is printed only when it returns a truthy value;
            otherwise a "skip" line is printed.
    """
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]

    cid_bm25_top1 = extract_top1(bm25_result)[0]
    cid_dense_top1 = extract_top1(dense_result)[0]
    cid_distil_dense_top1 = extract_top1(distil_dense_result)[0]
    cid_cbm25_top1 = extract_top1(cbm25_result[func_name])[0]
    correct = qrels[qid]
    query = queries[qid]

    if not show_condition(cid_cbm25_top1, cid_bm25_top1, correct):
        print("------")
        print(f"skip qid {qid}")
        return

    def hf_bm25(cid):
        # Per-query-token BM25 scores of document `cid` under the HF tokenization.
        return get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize)

    print("------")
    print(f"qid: {qid}, query: {query}")
    print(f"lm_idf: {get_idf_val(query, lm_idf, lm_tokenizer.analyze)}")
    print(f"hf_idf: {get_idf_val(query, hf_idf, hf_tokenizer.tokenize)}")
    print("---")

    # BM25 top-1: scores under both tokenizations; the document is shown with
    # the Lucene analyzer.
    cid_bm25_tok_score_lm = get_bm25_val(cid_bm25_top1, query, lm_token_bm25_score, lm_tokenizer.analyze)
    cid_bm25_tok_score_hf = hf_bm25(cid_bm25_top1)
    t_cos_sims = max_cos_sims(query, corpus[cid_bm25_top1], hf_model, hf_tokenizer)
    t_doc = tokenizer_doc(cid_bm25_top1, lm_tokenizer.analyze)
    print(f"bm25: {cid_bm25_top1 in correct} q-bm25: {cid_bm25_tok_score_lm}")
    print(f"bm25: {cid_bm25_top1 in correct} q-bm25_hf: {cid_bm25_tok_score_hf}")
    print(f"cos-sims: {t_cos_sims}")
    print(f"t_doc: {t_doc}")

    # Dense top-1. The flag reports this document's own qrels membership
    # (it previously repeated the BM25 document's flag).
    cid_dense_tok_score = hf_bm25(cid_dense_top1)
    t_doc = tokenizer_doc(cid_dense_top1, hf_tokenizer.tokenize)
    print(f"dense: {cid_dense_top1 in correct} q-bm25: {cid_dense_tok_score}")
    print(f"t_doc: {t_doc}")

    # Distilled dense top-1. Previously labelled "wdense" and flagged with the
    # BM25 document's membership.
    cid_distil_dense_tok_score = hf_bm25(cid_distil_dense_top1)
    t_doc = tokenizer_doc(cid_distil_dense_top1, hf_tokenizer.tokenize)
    print(f"distil_dense: {cid_distil_dense_top1 in correct}, q-bm25: {cid_distil_dense_tok_score}")
    print(f"t_doc: {t_doc}")

    # C-BM25 top-1.
    cid_cbm25_tok_score = hf_bm25(cid_cbm25_top1)
    t_doc = tokenizer_doc(cid_cbm25_top1, hf_tokenizer.tokenize)
    t_cos_sims = max_cos_sims(query, corpus[cid_cbm25_top1], hf_model, hf_tokenizer)
    print(f"cbm25: {cid_cbm25_top1 in correct} q-bm25: {cid_cbm25_tok_score}")
    print(f"cos-sims: {t_cos_sims}")
    print(f"t_doc: {t_doc}")

In [34]:
for qid in better_query.index:
    show_result_top1_analysis(qid, correct_condition)

------
skip qid INEX_LD-2009074
------
skip qid INEX_LD-2010014
------
skip qid INEX_LD-20120411
------
skip qid INEX_LD-2012329
------
skip qid INEX_LD-2012349
------
skip qid INEX_LD-2012383
------
skip qid INEX_XER-106
------
skip qid INEX_XER-95
------
qid: QALD2_te-13, query: Who is the youngest player in the Premier League?
lm_idf: who: 2.24, youngest: 6.58, player: 3.49, premier: 4.71, leagu: 3.28
hf_idf: who: 2.24, is: 0.29, the: 0.12, youngest: 6.58, player: 3.58, in: 0.23, the: 0.12, premier: 5.28, league: 3.28, ?: 5.51
---
bm25: False q-bm25: who: 2.18, youngest: 6.4, player: 3.39, premier: 4.58, leagu: 3.19
bm25: False q-bm25_hf: who: 2.25, is: 0.29, the: 0.18, youngest: 6.63, player: 3.61, in: 0.3, the: 0.18, premier: 5.33, league: 3.3, ?: 0.0
cos-sims: who: 0.64, is: 0.62, the: 0.68, youngest: 0.69, player: 0.69, in: 0.64, premier: 0.65, league: 0.63, ?: 0.0
t_doc: ['scott', 'robinson', 'footbal', 'scott', 'robinson', 'born', '12', 'march', '1992', 'scottish', 'profession

  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


cbm25: True q-bm25: give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 4.4, by: 1.87, francis: 6.63, ford: 7.48, cop: 9.75, ##pol: 8.53, ##a: 3.43, .: 0.01
cos-sims: give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 0.65, by: 0.69, francis: 0.71, ford: 0.72, cop: 0.71, ##pol: 0.68, ##a: 0.66, .: 0.61
t_doc: ['the', 'godfather', 'the', 'godfather', 'is', 'a', '1972', 'american', 'crime', 'film', 'directed', 'by', 'francis', 'ford', 'cop', '##pol', '##a', 'and', 'produced', 'by', 'albert', 's', '.', 'rudd', '##y', 'from', 'a', 'screenplay', 'by', 'mario', 'pu', '##zo', 'and', 'cop', '##pol', '##a', '.']
------
skip qid QALD2_te-3
------
qid: QALD2_tr-4, query: Which river does the Brooklyn Bridge cross?
lm_idf: which: 2.34, river: 3.57, doe: 5.76, brooklyn: 6.28, bridg: 5.38, cross: 4.79
hf_idf: which: 2.34, river: 3.61, does: 5.79, the: 0.12, brooklyn: 6.28, bridge: 5.43, cross: 5.07, ?: 5.51
---
bm25: False q-bm25: which: 2.12, river: 3.25, doe: 5.24, brooklyn: 0.0, bridg: 8.04, c

In [35]:
qid = "QALD2_te-28"
for cid, v in sorted(distil_dense_result["QALD2_te-28"].items(), key=lambda x: -x[1])[:10]:
    print(cid, v, qrels[qid].get(cid))

<dbpedia:Francis_Ford_Coppola> 35.19206237792969 1
<dbpedia:All_of_Me_(1984_film)> 31.441036224365234 None
<dbpedia:Sofia_Coppola> 31.26166534423828 None
<dbpedia:They_All_Come_Out> 31.114974975585938 None
<dbpedia:Roman_Coppola> 30.876121520996094 None
<dbpedia:Zoetrope:_All-Story> 30.47029685974121 0
<dbpedia:Take_All_of_Me> 30.2182559967041 None
<dbpedia:Winners_Take_All_(film)> 30.182092666625977 None
<dbpedia:Barry_Malkin> 29.78139305114746 None
<dbpedia:The_Thrill_of_It_All> 29.769575119018555 None


In [36]:
qid = "QALD2_te-28"
for cid, v in sorted(cbm25_result[func_name]["QALD2_te-28"].items(), key=lambda x: -x[1])[:10]:
    print(cid, v, qrels[qid].get(cid))

<dbpedia:The_Godfather> 23.97888279858307 1
<dbpedia:Carmine_Coppola> 23.118121077566524 None
<dbpedia:The_Godfather_Part_II> 22.592693794830655 2
<dbpedia:The_Rain_People> 22.45578928081565 2
<dbpedia:The_Godfather_(film_series)> 21.28662429360636 1
<dbpedia:Zoetrope:_All-Story> 21.286480655691456 0
<dbpedia:One_from_the_Heart> 21.193362114230922 2
<dbpedia:Youth_Without_Youth_(film)> 20.75816019368333 1
<dbpedia:Apocalypse_Now> 20.58722364044759 2
<dbpedia:Francis_Ford_Coppola> 20.58597351172225 1


# Cases where C-BM25 is worse than the other methods

In [40]:
def false_condition(target, compare, correct):
    """Return True when `target` is not in `correct` while `compare` is.

    NOTE(review): callers pass ``qrels[qid]`` as `correct`; the notebook output
    shows qrels entries with grade 0, so membership means "judged", not
    necessarily "relevant" — confirm this is the intended test.
    """
    if target in correct:
        return False
    return compare in correct

for i in worse_query.index:
    show_result_top1(i, false_condition)

------
skip qid INEX_LD-20120121
------
skip qid INEX_LD-20120531
------
qid: INEX_XER-141, query: Universities in Catalunya
True bm25: {'text': "Plaça de Catalunya station, also known as Barcelona-Plaça Catalunya, Plaça Catalunya or simply Catalunya is a major station complex in Barcelona located under Plaça de Catalunya, the city's central square and a large transport hub.", 'title': 'Plaça de Catalunya station'}
True dense: {'text': 'The Open University of Catalonia (Catalan: Universitat Oberta de Catalunya, UOC; IPA: [uniβərsiˈtat uˈβɛrtə ðə kətəˈɫuɲə]) is a public Internet-centered open university based in Barcelona, Spain.The UOC offers graduate and postgraduate programs in Catalan, Spanish and English in fields such as Psychology, Computer Science, Sciences of Education, Information and Knowledge Society and Economics.', 'title': 'Open University of Catalonia'}
False distil_dense: {'text': "The Institut Nacional d'Educació Física de Catalunya (English: National Institute of Phys

In [37]:
for qid in worse_query.index:
    show_result_top1_analysis(qid, false_condition)

------
skip qid INEX_LD-20120121
------
skip qid INEX_LD-20120531
------
qid: INEX_XER-141, query: Universities in Catalunya
lm_idf: univers: 3.07, catalunya: 9.07
hf_idf: universities: 6.38, in: 0.23, cat: 5.3, ##al: 3.98, ##un: 5.05, ##ya: 4.56
---
bm25: True q-bm25: univers: 0.0, catalunya: 15.11
bm25: True q-bm25_hf: universities: 0.0, in: 0.23, cat: 8.5, ##al: 6.39, ##un: 8.09, ##ya: 7.32
cos-sims: universities: 0.0, in: 0.56, cat: 0.62, ##al: 0.64, ##un: 0.65, ##ya: 0.63
t_doc: ['plaça', 'de', 'catalunya', 'station', 'plaça', 'de', 'catalunya', 'station', 'also', 'known', 'barcelona', 'plaça', 'catalunya', 'plaça', 'catalunya', 'simpli', 'catalunya', 'major', 'station', 'complex', 'barcelona', 'locat', 'under', 'plaça', 'de', 'catalunya', 'citi', 'central', 'squar', 'larg', 'transport', 'hub']
dense: True q-bm25: universities: 0.0, in: 0.31, cat: 4.65, ##al: 3.49, ##un: 4.42, ##ya: 4.0
t_doc: ['open', 'university', 'of', 'catalonia', 'the', 'open', 'university', 'of', 'catalonia'

  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


cbm25: False q-bm25: give: 0.0, me: 0.0, all: 0.0, launch: 6.95, pads: 0.0, operated: 4.81, by: 1.61, nasa: 10.55, .: 0.01
cos-sims: give: 0.0, me: 0.0, all: 0.0, launch: 0.47, pads: 0.0, operated: 0.51, by: 0.53, nasa: 0.52, .: 0.53
t_doc: ['nasa', 'recovery', 'ship', 'the', 'nasa', 'recovery', 'ships', 'are', 'two', 'ships', ',', 'the', 'mv', 'liberty', 'star', 'and', 'the', 'mv', 'freedom', 'star', ',', 'that', 'are', 'tasked', 'with', 're', '##tri', '##eving', 'spent', 'solid', 'rocket', 'booster', '##s', '(', 'sr', '##bs', ')', 'following', 'the', 'launch', 'of', 'space', 'shuttle', 'missions', '.', 'although', 'owned', 'by', 'nasa', ',', 'the', 'ships', 'are', 'currently', 'operated', 'by', 'space', 'flight', 'operations', 'contractor', 'united', 'space', 'alliance', '.']
------
skip qid QALD2_tr-65
------
qid: QALD2_tr-82, query: In which programming language is GIMP written?
lm_idf: which: 2.34, program: 4.36, languag: 4.31, gimp: 11.06, written: 3.74
hf_idf: in: 0.23, which: 2

In [41]:
# bm25: True q-bm25_hf: give: 0.0, me: 0.0, all: 2.8, launch: 9.56, pads: 0.0, operated: 3.96, by: 0.0, nasa: 8.43, .: 0.01
# cos-sims: give: 0.0, me: 0.0, all: 0.37, launch: 0.42, pads: 0.0, operated: 0.42, by: 0.0, nasa: 0.42, .: 0.42
print(2.8+9.56+3.96+8.43+0.01)
print(2.8*0.37+9.56*0.42+3.96*0.42+8.43*0.42+0.01*0.42)

# cbm25: False q-bm25: give: 0.0, me: 0.0, all: 0.0, launch: 6.95, pads: 0.0, operated: 4.81, by: 1.61, nasa: 10.55, .: 0.01
# cos-sims: give: 0.0, me: 0.0, all: 0.0, launch: 0.47, pads: 0.0, operated: 0.51, by: 0.53, nasa: 0.52, .: 0.53
print(6.95+4.81+1.61+10.55+0.01)
print(6.95*0.47+4.81*0.51+1.61*0.53+10.55*0.52+0.01*0.53)

24.76
10.2592
23.930000000000003
12.064200000000001


In [35]:
qid = "TREC_Entity-4"
cid_cbm25_top2 = [(k, v, k in qrels[qid]) for k, v in  sorted(cbm25_result[func_name][qid].items(), key=lambda x: -x[1])][0][0]
cid_cbm25_tok_score = get_bm25_val(cid_cbm25_top2, queries[qid], hf_token_bm25_score, hf_tokenizer.tokenize)
t_doc = tokenizer_doc(cid_cbm25_top2, hf_tokenizer.tokenize)
t_cos_sims = max_cos_sims(queries[qid], corpus[cid_cbm25_top2], hf_model, hf_tokenizer)
print(f"cbm25: {cid_cbm25_top2 in qrels[qid]} q-bm25: {cid_cbm25_tok_score}")
print(f"cos-sims: {t_cos_sims}")
print(f"t_doc: {t_doc}")

cbm25: False q-bm25: professional: 3.82, sports: 7.04, teams: 5.16, in: 0.0, philadelphia: 8.38, .: 0.01
cos-sims: professional: 0.79, sports: 0.78, teams: 0.76, in: 0.0, philadelphia: 0.75, .: 0.75
t_doc: ['south', 'philadelphia', 'sports', 'complex', 'the', 'south', 'philadelphia', 'sports', 'complex', 'is', 'the', 'current', 'home', 'of', 'philadelphia', "'", 's', 'professional', 'sports', 'teams', '.', 'it', 'is', 'the', 'site', 'of', 'the', 'wells', 'fargo', 'center', ',', 'lincoln', 'financial', 'field', ',', 'citizens', 'bank', 'park', ',', 'and', 'a', 'retail', '/', 'entertainment', 'center', 'x', '##fin', '##ity', 'live', '!', '.']


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [34]:
qid = "TREC_Entity-4"
cid_cbm25_top2 = [(k, v, k in qrels[qid]) for k, v in  sorted(cbm25_result[func_name][qid].items(), key=lambda x: -x[1])][1][0]
cid_cbm25_tok_score = get_bm25_val(cid_cbm25_top2, queries[qid], hf_token_bm25_score, hf_tokenizer.tokenize)
t_doc = tokenizer_doc(cid_cbm25_top2, hf_tokenizer.tokenize)
t_cos_sims = max_cos_sims(queries[qid], corpus[cid_cbm25_top2], hf_model, hf_tokenizer)
print(f"cbm25: {cid_cbm25_top2 in qrels[qid]} q-bm25: {cid_cbm25_tok_score}")
print(f"cos-sims: {t_cos_sims}")
print(f"t_doc: {t_doc}")

cbm25: True q-bm25: professional: 4.53, sports: 6.77, teams: 6.11, in: 0.36, philadelphia: 8.5, .: 0.01
cos-sims: professional: 0.78, sports: 0.85, teams: 0.79, in: 0.86, philadelphia: 0.85, .: 0.82
t_doc: ['sports', 'in', 'philadelphia', 'philadelphia', ',', 'pennsylvania', ',', 'has', 'been', 'home', 'to', 'many', 'teams', 'and', 'events', 'in', 'professional', ',', 'semi', '-', 'professional', ',', 'amateur', ',', 'college', ',', 'and', 'high', '-', 'school', 'sports', '.', 'philadelphia', 'is', 'one', 'of', 'twelve', 'cities', 'that', 'hosts', 'teams', 'in', 'all', 'four', 'major', 'sports', 'leagues', 'in', 'north', 'america', ',', 'and', 'philadelphia', 'is', 'one', 'of', 'just', 'three', 'cities', 'in', 'which', 'one', 'team', 'from', 'every', 'league', 'plays', 'within', 'city', 'limits', '.']


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [50]:
# professional: 4.6, sports: 6.92, teams: 6.2, in: 0.37, philadelphia: 8.73, .: 0.01
# cos-sims: professional: 0.78, sports: 0.85, teams: 0.79, in: 0.86, philadelphia: 0.85, .: 0.82
print(4.6+6.92+6.2+0.37+8.73+0.01)
print(4.6*0.78+6.92*0.85+6.2*0.79+0.37*0.86+8.73*0.85+0.01*0.82)
# professional: 3.81, sports: 7.2, teams: 5.15, in: 0.0, philadelphia: 8.56, .: 0.01
# professional: 0.79, sports: 0.78, teams: 0.76, in: 0.0, philadelphia: 0.75, .: 0.75
print(3.81+7.2+5.15+0.0+8.56+0.01)
print(3.81*0.79+7.2*0.78+5.15*0.76+0.0+8.56*0.75+0.01*0.75)

26.830000000000002
22.114899999999995
24.73
18.9674


In [53]:
[(k, v, k in qrels[qid]) for k, v in  sorted(cbm25_result[func_name][qid].items(), key=lambda x: -x[1])][:10]

[('<dbpedia:South_Philadelphia_Sports_Complex>', 17.859652392290908, False),
 ('<dbpedia:Sports_in_Philadelphia>', 17.736056922069046, True),
 ('<dbpedia:Sports_in_Allentown,_Pennsylvania>', 16.71823069463957, False),
 ('<dbpedia:List_of_professional_sports_teams_in_Pennsylvania>',
  13.806071164176721,
  True),
 ('<dbpedia:SportsChannel_Philadelphia>', 12.749770900996921, True),
 ('<dbpedia:Sports_teams_in_the_Central_Pennsylvania_Area>',
  11.86070957058163,
  False),
 ('<dbpedia:Philadelphia_Phillies>', 11.785149620344132, True),
 ('<dbpedia:List_of_Philadelphia_Flyers_records>', 10.32809509868458, False),
 ('<dbpedia:List_of_professional_sports_teams_in_New_York>',
  9.511927393654405,
  False),
 ('<dbpedia:Philadelphia_Quakers_(AFL)>', 9.346098095411772, False)]

In [54]:
from lss_func import LSS_FUNC

ModuleNotFoundError: No module named 'lss_func'

In [30]:
sorted(cbm25_result[func_name]["INEX_XER-141"].items(), key=lambda x: -x[1])[0]

('<dbpedia:CatalunyaCaixa>', 14.438369397728444)

In [37]:
qid = "QALD2_te-28"
cid = "<dbpedia:Don't_Box_Me_In>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cmb25")
cid = "<dbpedia:The_Godfather>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

Give me all movies directed by Francis Ford Coppola.
bm25
give: 0.0, me: 5.83, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 5.66, ford: 6.38, cop: 6.81, ##pol: 5.96, ##a: 2.4, .: 0.01
give: 0.0, me: 0.49, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 0.6, ford: 0.61, cop: 0.61, ##pol: 0.61, ##a: 0.58, .: 0.46
cmb25
give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 4.38, by: 1.91, francis: 6.6, ford: 7.44, cop: 9.88, ##pol: 8.64, ##a: 3.48, .: 0.01
give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 0.65, by: 0.69, francis: 0.71, ford: 0.72, cop: 0.71, ##pol: 0.68, ##a: 0.66, .: 0.61


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [38]:
qid = "INEX_LD-20120512"
cid = '<dbpedia:GP_Basic>'
query = queries[qid]
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

south korean girl groups
south: 3.67, korean: 7.36, girl: 7.8, groups: 0.0
south: 0.66, korean: 0.69, girl: 0.69, groups: 0.0


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [39]:
qid = "INEX_XER-141"
cid = '<dbpedia:Open_University_of_Catalonia>'
query = queries[qid]
print('dense')
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cbm25")
cid = '<dbpedia:CatalunyaCaixa>'
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

dense
Universities in Catalunya
universities: 0.0, in: 0.32, cat: 4.66, ##al: 3.51, ##un: 4.44, ##ya: 4.02
universities: 0.0, in: 0.63, cat: 0.65, ##al: 0.68, ##un: 0.67, ##ya: 0.66
cbm25
universities: 0.0, in: 0.31, cat: 7.58, ##al: 6.0, ##un: 7.22, ##ya: 6.53
universities: 0.0, in: 0.6, cat: 0.68, ##al: 0.68, ##un: 0.67, ##ya: 0.66


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [40]:
qid = "INEX_LD-20120121"
cid = "<dbpedia:Raw_Food_Made_Easy_for_1_or_2_People>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print(tokenizer_doc(cid, hf_tokenizer.tokenize))
print("cbm25")
cid = "<dbpedia:Luke_Nguyen's_Vietnam>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

vietnam food recipes
bm25
vietnam: 0.0, food: 8.5, recipes: 13.2
vietnam: 0.0, food: 0.4, recipes: 0.4
['raw', 'food', 'made', 'easy', 'for', '1', 'or', '2', 'people', 'raw', 'food', 'made', 'easy', 'for', '1', 'or', '2', 'people', 'is', 'a', 'recipe', 'book', 'by', 'raw', 'food', 'chef', 'jennifer', 'corn', '##ble', '##et', '.', 'the', 'best', '-', 'selling', 'book', 'was', 'published', 'in', '2005', 'and', 'promotes', 'the', 'raw', 'food', 'diet', ',', 'a', 'dietary', 'movement', 'that', 'encourages', 'the', 'consumption', 'of', 'un', '##co', '##oked', 'foods', 'to', 'obtain', 'maximum', 'health', 'benefits', '.', 'the', 'book', 'features', '115', 'recipes', ',', 'including', '21', 'breakfast', 'recipes', ',', '64', 'lunch', 'and', 'dinner', 'recipes', 'and', '30', 'dessert', 'recipes', '.', 'each', 'recipe', 'yields', 'serving', '##s', 'for', 'one', 'or', 'two', 'people', '.']
cbm25
vietnam: 8.92, food: 7.71, recipes: 0.0
vietnam: 0.56, food: 0.54, recipes: 0.0


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [41]:
for i in sorted(cbm25_result[func_name]["INEX_LD-20120121"].items(), key=lambda x: -x[1]):
    print(i, corpus[i[0]])

('<dbpedia:2007_Vietnam_food_scare>', 8.387637734113273) {'text': 'The 2007 Vietnam food scare was a food scandal, which exposed contaminated food.  Among the issues were formaldehyde in noodles of the national dish, Phở, banned pesticides in vegetables and fruit, and toxic soy sauce.', 'title': '2007 Vietnam food scare'}
("<dbpedia:Luke_Nguyen's_Vietnam>", 8.362236920817843) {'text': "Luke Nguyen's Vietnam is an Australian television series first screened on SBS One in 2010. The series follows chef, Luke Nguyen, as he tours Vietnam seeking culinary delights and adventure. It is regularly broadcast on Good Food, a UK food-orientated TV channel.", 'title': "Luke Nguyen's Vietnam"}
('<dbpedia:Raw_Food_Made_Easy_for_1_or_2_People>', 6.43570945916866) {'text': 'Raw Food Made Easy for 1 or 2 People is a recipe book by raw food chef Jennifer Cornbleet.  The best-selling book was published in 2005 and promotes the raw food diet, a dietary movement that encourages the consumption of uncooked f

In [None]:
lm_tokenizer.analyze("Vietnamese")

In [None]:
def show_result_top1_comp_wdense(qid, show_condition):
    """Print the top-1 document of each retriever for one query.

    The top-ranked document is taken from ``bm25_result``, ``dense_result``,
    ``weighted_dense_result`` and ``cbm25_result[func_name]`` (notebook
    globals). Details are printed only when the weighted-dense and dense
    top-1 documents differ; otherwise a "skip" line is printed.

    Args:
        qid: Query id used to index the result dicts, ``queries`` and ``qrels``.
        show_condition: Currently unused. The condition that consumed it is
            disabled; the parameter is kept so existing calls keep working.

    Returns:
        None. Output is written with ``print``.
    """
    def extract_top1(result):
        # (doc_id, score) pair with the highest score for this query.
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]

    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result[func_name])

    # Only documents judged with a positive relevance count as correct.
    # A plain ``doc_id in qrels[qid]`` would also accept documents that were
    # judged but given relevance 0.
    # NOTE(review): assumes qrels[qid] maps doc_id -> numeric relevance, as
    # loaded by GenericDataLoader — confirm.
    correct = {cid for cid, rel in qrels[qid].items() if rel > 0}

    # Disabled gate that used show_condition:
    # if show_condition(cbm25_top1[0], wdense_top1[0], correct) and wdense_top1[0] != dense_top1[0]:
    if wdense_top1[0] != dense_top1[0]:
        print("------")
        print(f"qid: {qid}, query: {queries[qid]}")
        print(bm25_top1[0] in correct, f"bm25: {corpus[bm25_top1[0]]}")
        print(dense_top1[0] in correct, f"dense: {corpus[dense_top1[0]]}")
        print(wdense_top1[0] in correct, f"wdense: {corpus[wdense_top1[0]]}")
        print(cbm25_top1[0] in correct, f"cbm25: {corpus[cbm25_top1[0]]}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [None]:
# Queries to inspect: the diff_scores_wdense entry is not negative and the
# weighted-dense run has a strictly higher "ndcg_cut_10" than the dense run.
# The second condition is only evaluated when the first one holds.
target_qid = [
    qid
    for qid, sup in diff_scores_wdense.items()
    if not sup < 0
    and wdense_scores[qid]["ndcg_cut_10"] - dense_scores[qid]["ndcg_cut_10"] > 0
]

for qid in target_qid:
    show_result_top1_comp_wdense(qid, correct_condition)

In [None]:
# Show the top-1 comparison only for queries whose diff_scores_wdense value
# is strictly positive. Taking `.index` of the boolean mask itself would
# yield every query id, so the mask is applied via boolean indexing first.
diff_series = pd.Series(diff_scores_wdense)
for i in diff_series[diff_series > 0].index:
    show_result_top1_comp_wdense(i, correct_condition)

In [None]:
# Highest-scoring (doc_id, score) pair for query QALD2_te-28 under the
# selected scoring function.
ranked_qald2_te_28 = sorted(
    cbm25_result[func_name]["QALD2_te-28"].items(),
    key=lambda hit: hit[1],
    reverse=True,
)
ranked_qald2_te_28[0]

In [None]:
# Display the stored entry for this document.
# NOTE(review): hf_token_bm25_score is defined outside this chunk; presumably
# the per-token BM25 weights from get_d_bm25 — confirm.
hf_token_bm25_score['<dbpedia:All_of_Me_(1984_film)>']

In [None]:
# Display the stored entry for this document.
# NOTE(review): presumably per-token BM25 weights keyed by token — confirm.
hf_token_bm25_score['<dbpedia:The_Godfather>']

In [None]:
# Compare two candidate documents for query QALD2_te-64: for each one, print
# the BM25 value and the max-cosine-similarity output against the query.
# NOTE(review): the labels presumably name the system that ranked each
# document first — confirm. The second label is spelled "cbm25", matching
# the label printed elsewhere in this notebook.
qid = "QALD2_te-64"
query = queries[qid]
print(query)

labelled_docs = [
    ("bm25", "<dbpedia:Launch_Control_Center>"),
    ("cbm25", "<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"),
]
for label, cid in labelled_docs:
    print(label)
    print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
    print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Display the stored entry for the first document compared for QALD2_te-64.
# NOTE(review): presumably per-token BM25 weights keyed by token — confirm.
hf_token_bm25_score["<dbpedia:Launch_Control_Center>"]

In [None]:
# Display the stored entry for the second document compared for QALD2_te-64.
# NOTE(review): presumably per-token BM25 weights keyed by token — confirm.
hf_token_bm25_score["<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"]

In [None]:
# For each person name, print every token produced by hf_tokenizer together
# with its hf_idf value rounded to two decimals, as "token: idf" pairs.
names = ["Carl Reiner", "Steve Martin", "Lily Tomlin "]
for name in names:
    pieces = hf_tokenizer.tokenize(name)
    print(", ".join(f"{piece}: {round(hf_idf[piece], 2)}" for piece in pieces))