In [None]:
import json
import os
import numpy as np
import pandas as pd
import pytrec_eval
import torch
from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModel
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [None]:
# Notebook configuration: adjust these values before running the cells below.
root_dir = "/path/to/BEIR/datasets"  # parent directory of the dataset folders
dataset = "dbpedia-entity"  # dataset folder name; also used in the analysis_data paths
model_path = "/path/to/dense/model"  # handed to AutoModel / AutoTokenizer.from_pretrained
func_name = "maxsim_bm25_qtf"  # key used to select one run inside cbm25_result

In [None]:
# Load the test split of the selected dataset: corpus, queries and relevance judgements.
data_path = os.path.join(root_dir, dataset)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

In [None]:
def get_info(corpus, tokenizer):
    """Collect the per-document term statistics needed for BM25-style scoring.

    Every document is tokenized as ``title + " " + text``.

    Args:
        corpus: Mapping of document id to a dict with a ``"text"`` entry and an
            optional ``"title"`` entry (a missing title is treated as empty).
        tokenizer: Callable turning a string into a sequence of hashable tokens.

    Returns:
        A tuple ``(d_tf, idf, doc_lens, doc_len_ave)`` where

        * ``d_tf`` maps document id -> ``Counter`` of token frequencies,
        * ``idf`` is a ``defaultdict(float)`` mapping token -> ``log(N / df)``
          (natural log, ``N`` = number of documents),
        * ``doc_lens`` maps document id -> number of tokens,
        * ``doc_len_ave`` is the mean document length (``0.0`` for an empty corpus).
    """
    sep = " "
    doc_lens = {}
    df = Counter()
    d_tf = {}

    for cid, doc in tqdm(corpus.items()):
        text = doc.get("title", "") + sep + doc["text"]
        tokens = tokenizer(text)
        tf_d = Counter(tokens)
        doc_lens[cid] = len(tokens)
        d_tf[cid] = tf_d
        # The keys of tf_d are the distinct tokens, so each document counts once per token.
        df.update(tf_d.keys())

    N = len(corpus)
    idf = defaultdict(float)
    for w, v in df.items():
        idf[w] = np.log(N / v)

    # np.mean([]) would return NaN and emit a RuntimeWarning; report 0.0 instead.
    doc_len_ave = np.mean(list(doc_lens.values())) if doc_lens else 0.0

    return d_tf, idf, doc_lens, doc_len_ave

In [None]:
def get_d_bm25(d_tf, idf, doc_lens, doc_len_ave, k1=0.9, b=0.6):
    """Pre-compute the BM25 weight of every token in every document.

    The weight of token ``t`` in document ``d`` is
    ``tf * (1 + k1) / (tf + k1 * (1 - b + b * len(d) / doc_len_ave)) * idf[t]``.

    Args:
        d_tf: Mapping of document id -> mapping of token -> term frequency.
        idf: Mapping of token -> inverse document frequency.
        doc_lens: Mapping of document id -> document length in tokens.
        doc_len_ave: Average document length.
        k1: Term-frequency saturation parameter (default 0.9).
        b: Length-normalisation parameter (default 0.6).

    Returns:
        Dict mapping document id -> ``defaultdict(float)`` of token -> BM25 weight.
    """
    token_bm25_score = {}

    for cid, tfs in d_tf.items():
        doc_scores = defaultdict(float)
        token_bm25_score[cid] = doc_scores
        if not tfs:
            # Nothing to score; also avoids dividing by a zero average length.
            continue

        # The length normalisation depends only on the document, not on the token.
        length_norm = k1 * (1 - b + b * doc_lens[cid] / doc_len_ave)
        for tid, tf in tfs.items():
            doc_scores[tid] = tf * (1 + k1) / (tf + length_norm) * idf[tid]

    return token_bm25_score

In [None]:
# Load the dense encoder from model_path; max_cos_sims uses it to embed queries and documents.
hf_model = AutoModel.from_pretrained(model_path)

In [None]:
# Corpus statistics and per-token BM25 weights over the model tokenizer's tokens
# (the output of hf_tokenizer.tokenize).
hf_tokenizer = AutoTokenizer.from_pretrained(model_path)
hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave = get_info(corpus, hf_tokenizer.tokenize)
hf_token_bm25_score = get_d_bm25(hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave)

In [None]:
# Drop the intermediate statistics; hf_idf and hf_token_bm25_score are kept for the cells below.
del hf_d_tf, hf_doc_lens, hf_doc_len_ave

In [None]:
# Same statistics, this time over the tokens produced by the Lucene analyzer
# (get_lucene_analyzer() called with its default arguments).
lm_tokenizer = Analyzer(get_lucene_analyzer())
lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave = get_info(corpus, lm_tokenizer.analyze)
lm_token_bm25_score = get_d_bm25(lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave)

In [None]:
# Drop the intermediate statistics; lm_idf and lm_token_bm25_score are kept for the cells below.
del lm_d_tf, lm_doc_lens, lm_doc_len_ave

In [None]:
# Check
# Load the stored retrieval runs that are compared below. JSON text is UTF-8, so the
# encoding is given explicitly instead of depending on the platform's locale default.
with open(f"./analysis_data/{dataset}/bm25_result.json", encoding="utf-8") as f:
    bm25_result = json.load(f)

with open(f"./analysis_data/{dataset}/dense_result.json", encoding="utf-8") as f:
    dense_result = json.load(f)

with open(f"./analysis_data/{dataset}/weighted_dense_result.json", encoding="utf-8") as f:
    weighted_dense_result = json.load(f)

with open(f"./analysis_data/{dataset}/cbm25_result.json", encoding="utf-8") as f:
    cbm25_result = json.load(f)

In [None]:
# Evaluator over the qrels, restricted to the single measure ndcg_cut.10.
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10"})

In [None]:
# Score every run with the evaluator. cbm25_result is indexed by func_name first,
# so only the run selected in the configuration cell is evaluated.
bm25_scores = evaluator.evaluate(bm25_result)
dense_scores = evaluator.evaluate(dense_result)
wdense_scores = evaluator.evaluate(weighted_dense_result)
cbm25_scores = evaluator.evaluate(cbm25_result[func_name])

In [None]:
# Mean ndcg_cut_10 over all evaluated queries, one line per system.
print("bm25: ", np.mean([per_query["ndcg_cut_10"] for per_query in bm25_scores.values()]))
print("dense: ", np.mean([per_query["ndcg_cut_10"] for per_query in dense_scores.values()]))
print("wdense: ", np.mean([per_query["ndcg_cut_10"] for per_query in wdense_scores.values()]))
print("cbm25: ", np.mean([per_query["ndcg_cut_10"] for per_query in cbm25_scores.values()]))

In [None]:
def create_diff_scores(source_scores, target_scores):
    """Return the per-query ``ndcg_cut_10`` difference ``source - target``.

    Args:
        source_scores: Mapping of query id -> dict containing ``"ndcg_cut_10"``.
        target_scores: Mapping with the same layout; it must contain every query
            id present in ``source_scores``.

    Returns:
        Dict mapping query id -> difference, holding only the queries whose
        difference is non-zero.

    Raises:
        KeyError: If a query of ``source_scores`` is missing from ``target_scores``.
    """
    diff_scores = {}
    # Iterate over the queries of the source run itself rather than a notebook global.
    for k, source_entry in source_scores.items():
        diff_score = source_entry["ndcg_cut_10"] - target_scores[k]["ndcg_cut_10"]
        if abs(diff_score) > 0.0:
            diff_scores[k] = diff_score

    return diff_scores

In [None]:
# Per-query ndcg_cut_10 gain of cbm25 over each baseline (queries with a zero difference are omitted).
diff_scores_bm25 = create_diff_scores(cbm25_scores, bm25_scores)
diff_scores_dense = create_diff_scores(cbm25_scores, dense_scores)
diff_scores_wdense = create_diff_scores(cbm25_scores, wdense_scores)

In [None]:
# Queries on which cbm25 differs from all three baselines at once.
common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense) & set(diff_scores_wdense)

In [None]:
# One row per query, one column per baseline; a query absent from a baseline's diff gets 0.
all_diff = (
    pd.concat(
        [pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense), pd.Series(diff_scores_wdense)],
        axis=1,
        keys=["bm25", "dense", "wdense"],
    )
    .fillna(0)
    .sort_index()
)
all_diff.describe()

In [None]:
# Queries where cbm25 beats every baseline by more than 0.2 ndcg_cut_10.
better_query = all_diff.loc[(all_diff > 0.2).all(axis=1)]
better_query

In [None]:
# Queries where cbm25 is below every baseline by more than 0.01 ndcg_cut_10.
worse_query = all_diff.loc[(all_diff < -0.01).all(axis=1)]
worse_query

In [None]:
def show_result_top1(qid, show_condition):
    """Print the top-1 document of every system for one query.

    Args:
        qid: Query id, looked up in the loaded runs, ``qrels`` and ``queries``.
        show_condition: Callable ``(cbm25_top1_id, bm25_top1_id, correct)``; the
            details are printed only when it returns a truthy value, otherwise a
            "skip" line is printed.
    """
    def best_hit(run):
        # Highest-scoring (doc id, score) pair of this run for qid.
        ranked = sorted(run[qid].items(), key=lambda hit: hit[1], reverse=True)
        return ranked[0]

    systems = (
        ("bm25", bm25_result),
        ("dense", dense_result),
        ("wdense", weighted_dense_result),
        ("cbm25", cbm25_result[func_name]),
    )
    top_cid = {label: best_hit(run)[0] for label, run in systems}

    relevant = qrels[qid]
    shown = show_condition(top_cid["cbm25"], top_cid["bm25"], relevant)

    print("------")
    if not shown:
        print(f"skip qid {qid}")
        return

    print(f"qid: {qid}, query: {queries[qid]}")
    for label, _ in systems:
        cid = top_cid[label]
        print(cid in relevant, f"{label}: {corpus[cid]}")

In [None]:
def correct_condition(target, compare, correct):
    """Return True when ``target`` is in ``correct`` while ``compare`` is not."""
    if target not in correct:
        return False
    return compare not in correct

# For the queries where cbm25 wins, show those whose cbm25 top-1 is relevant but whose BM25 top-1 is not.
for qid in better_query.index:
    show_result_top1(qid, correct_condition)

In [None]:
def false_condition(target, compare, correct):
    """Return True when ``target`` is not in ``correct`` while ``compare`` is."""
    if target in correct:
        return False
    return compare in correct

# For the queries where cbm25 loses, show those whose cbm25 top-1 is not relevant but whose BM25 top-1 is.
for i in worse_query.index:
    show_result_top1(i, false_condition)

In [None]:
def preproc_rep(reps: np.ndarray, att_mask: np.ndarray, input_tok: np.ndarray):
    """Smooth and L2-normalise token representations, dropping the first position.

    Args:
        reps: Token representations indexed as (batch, position, dim).
        att_mask: Attention mask indexed as (batch, position).
        input_tok: Token ids indexed as (batch, position).

    Returns:
        Tuple ``(reps, att_mask[:, 1:], input_tok[:, 1:])`` where ``reps`` is the
        output of ``rep_lave`` with every row scaled to unit L2 norm; rows whose
        norm is zero are returned as all zeros.
    """
    reps = rep_lave(reps, att_mask)
    # rep_lave leaves the rows after the content tokens at zero, so 0/0 is expected
    # here. The resulting NaNs are replaced just below; silence the warning only.
    with np.errstate(divide="ignore", invalid="ignore"):
        reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]
    reps[np.isnan(reps)] = 0.0
    return reps, att_mask[:, 1:], input_tok[:, 1:]

def rep_lave(reps, att_masks, window_size=3):
    """Replace each content-token representation by a local average of its neighbours.

    For every sequence, the positions with attention mask 1 are selected and the
    first and last of them (the special tokens) are dropped. Content token ``i``
    is then replaced by the mean of content tokens ``max(i - window_size, 0)`` up
    to, but not including, ``i + window_size``.

    Args:
        reps: Array indexed as (batch, position, dim).
        att_masks: Array indexed as (batch, position); 1 marks a real token.
        window_size: Half-width of the averaging window.

    Returns:
        Array shaped like ``reps[:, 1:]``; row ``i`` holds the average for content
        token ``i`` and every remaining row stays zero.
    """
    smoothed = np.zeros_like(reps[:, 1:])  # 3D
    for row, (token_reps, mask) in enumerate(zip(reps, att_masks)):
        # Keep attended positions, then strip the leading/trailing special token.
        content = token_reps[mask == 1, :][1:-1, :]  # 2D
        for pos in range(content.shape[0]):
            lo = max(pos - window_size, 0)
            hi = pos + window_size
            smoothed[row, pos, :] += np.mean(content[lo:hi, :], axis=0)

    return smoothed

def max_cos_sims(query, doc, model, tokenizer):
    """Report, per query token, its best similarity to the same token in a document.

    Query and document are encoded with ``model`` and post-processed with
    ``preproc_rep``. For every non-special query token id, the score is the
    maximum dot product between any of its query representations and any
    representation of the same token id in the document (``0.0`` when the token
    does not occur in the document).

    Args:
        query: Query text.
        doc: Dict with ``"title"`` and ``"text"``; encoded as ``title + " " + text``.
        model: Encoder called as ``model(**inputs)``; must expose ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A string such as ``"tok1: 0.83, tok2: 0.0"`` with scores rounded to 2 decimals.
    """
    special_tokens = {
        tokenizer.pad_token_id,
        tokenizer.bos_token_id,
        tokenizer.eos_token_id,
        tokenizer.sep_token_id,
        tokenizer.cls_token_id,
    }

    def tok2rep_indexing(inputs_ids, batch_reps, att_masks):
        # Group the representations of attended, non-special positions by token id.
        tok2rep = defaultdict(list)
        for input_ids, reps, att_mask in zip(inputs_ids, batch_reps, att_masks):
            for tok_id, rep, am in zip(input_ids, reps, att_mask):
                if am == 0 or tok_id in special_tokens:
                    continue
                tok2rep[tok_id].append(rep)

        return {tid: np.vstack(tok_reps) for tid, tok_reps in tok2rep.items()}

    def encode(text):
        # truncation=True keeps over-long inputs within the tokenizer's maximum
        # length instead of sending a sequence the encoder may not accept.
        tokens = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            hidden = model(**tokens).last_hidden_state
        reps, att_masks, inputs_ids = preproc_rep(
            hidden.numpy(), tokens["attention_mask"].numpy(), tokens["input_ids"].numpy()
        )
        return tok2rep_indexing(inputs_ids, reps, att_masks)

    q_tok2rep = encode(query)
    d_tok2rep = encode(doc["title"] + " " + doc["text"])

    result = []
    for qt, q_reps in q_tok2rep.items():
        q_token = tokenizer.convert_ids_to_tokens(int(qt))
        if qt not in d_tok2rep:
            result.append(f"{q_token}: 0.0")
            continue

        score = np.max(np.dot(q_reps, d_tok2rep[qt].T))
        result.append(f"{q_token}: {round(float(score), 2)}")

    return ", ".join(result)

In [None]:
def tokenizer_doc(target, tokenizer):
    """Apply ``tokenizer`` to corpus document ``target`` (title, a space, then text)."""
    entry = corpus[target]
    return tokenizer(" ".join((entry["title"], entry["text"])))
    
def get_bm25_val(cid, query, token_bm25_score, tokenizer):
    """Format the BM25 weight of every query token in document ``cid``.

    Args:
        cid: Document id, a key of ``token_bm25_score``.
        query: Query text.
        token_bm25_score: Mapping of document id -> mapping of token -> BM25 weight.
        tokenizer: Callable turning the query into tokens.

    Returns:
        A string such as ``"tok1: 1.23, tok2: 0.0"`` (2 decimals). Tokens absent
        from the document are reported as ``0.0``.
    """
    # .get() instead of [] so that looking up an absent token does not insert a
    # 0.0 entry into the shared (defaultdict-backed) score table.
    return ", ".join(
        f"{tok}: {round(token_bm25_score[cid].get(tok, 0.0), 2)}" for tok in tokenizer(query)
    )

def get_idf_val(query, idf, tokenizer):
    """Format the IDF of every query token.

    Args:
        query: Query text.
        idf: Mapping of token -> IDF value.
        tokenizer: Callable turning the query into tokens.

    Returns:
        A string such as ``"tok1: 3.21, tok2: 0.0"`` (2 decimals). Tokens without
        an IDF entry are reported as ``0.0``.
    """
    # .get() instead of [] so that looking up an unseen token does not insert a
    # 0.0 entry into the shared (defaultdict-backed) idf table.
    return ", ".join(f"{tok}: {round(idf.get(tok, 0.0), 2)}" for tok in tokenizer(query))
        

def show_result_top1_analysis(qid, show_condition):
    """Print a token-level breakdown of every system's top-1 document for one query.

    For the BM25 top-1 the Lucene-analyzer statistics are used; for the dense,
    weighted-dense and cbm25 top-1 the model-tokenizer statistics are used. The
    cbm25 entry additionally shows the per-token maximum cosine similarities.

    Args:
        qid: Query id, looked up in the loaded runs, ``qrels`` and ``queries``.
        show_condition: Callable ``(cbm25_top1_id, bm25_top1_id, correct)``; the
            breakdown is printed only when it returns a truthy value, otherwise
            a "skip" line is printed.
    """
    def extract_top1(result):
        # (doc id, score) pair with the highest score for this query.
        return max(result[qid].items(), key=lambda x: x[1])

    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result[func_name])
    correct = qrels[qid]
    query = queries[qid]

    if not show_condition(cbm25_top1[0], bm25_top1[0], correct):
        print("------")
        print(f"skip qid {qid}")
        return

    print("------")
    print(f"qid: {qid}, query: {query}")
    print(f"lm_idf: {get_idf_val(query, lm_idf, lm_tokenizer.analyze)}")
    print(f"hf_idf: {get_idf_val(query, hf_idf, hf_tokenizer.tokenize)}")
    print("---")

    cid_bm25_top1 = bm25_top1[0]
    cid_bm25_tok_score = get_bm25_val(cid_bm25_top1, query, lm_token_bm25_score, lm_tokenizer.analyze)
    t_doc = tokenizer_doc(cid_bm25_top1, lm_tokenizer.analyze)
    print(f"bm25: {cid_bm25_top1 in correct} q-bm25: {cid_bm25_tok_score}")
    print(f"t_doc: {t_doc}")

    # Each system's relevance flag refers to that system's own top-1 document.
    cid_dense_top1 = dense_top1[0]
    cid_dense_tok_score = get_bm25_val(cid_dense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
    t_doc = tokenizer_doc(cid_dense_top1, hf_tokenizer.tokenize)
    print(f"dense: {cid_dense_top1 in correct} q-bm25: {cid_dense_tok_score}")
    print(f"t_doc: {t_doc}")

    cid_wdense_top1 = wdense_top1[0]
    cid_wdense_tok_score = get_bm25_val(cid_wdense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
    t_doc = tokenizer_doc(cid_wdense_top1, hf_tokenizer.tokenize)
    print(f"wdense: {cid_wdense_top1 in correct}, q-bm25: {cid_wdense_tok_score}")
    print(f"t_doc: {t_doc}")

    cid_cbm25_top1 = cbm25_top1[0]
    cid_cbm25_tok_score = get_bm25_val(cid_cbm25_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
    t_doc = tokenizer_doc(cid_cbm25_top1, hf_tokenizer.tokenize)
    t_cos_sims = max_cos_sims(query, corpus[cid_cbm25_top1], hf_model, hf_tokenizer)
    print(f"cbm25: {cid_cbm25_top1 in correct} q-bm25: {cid_cbm25_tok_score}")
    print(f"cos-sims: {t_cos_sims}")
    print(f"t_doc: {t_doc}")

In [None]:
# Token-level breakdown for the queries where cbm25 beats every baseline.
for qid in better_query.index:
    show_result_top1_analysis(qid, correct_condition)

In [None]:
# Token-level breakdown for the queries where cbm25 is below every baseline.
for qid in worse_query.index:
    show_result_top1_analysis(qid, false_condition)

In [None]:
# Highest-scoring (doc id, score) pair of the cbm25 run for this query.
sorted(cbm25_result[func_name]["INEX_XER-141"].items(), key=lambda hit: hit[1], reverse=True)[0]

In [None]:
# Compare two documents for one query: per-token BM25 weights (model-tokenizer table)
# followed by the per-token maximum cosine similarities.
qid = "QALD2_te-28"
cid = "<dbpedia:Don't_Box_Me_In>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
# Label spelled "cbm25", as in the other comparison cells of this notebook.
print("cbm25")
cid = "<dbpedia:The_Godfather>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Token-level diagnostics for one (query, document) pair: per-token BM25 weights
# (model-tokenizer table), then the per-token maximum cosine similarities.
qid = "INEX_LD-20120512"
cid = '<dbpedia:GP_Basic>'
query = queries[qid]
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Compare two documents for one query, printed under the labels "dense" and "cbm25":
# per-token BM25 weights (model-tokenizer table), then per-token maximum cosine similarities.
qid = "INEX_XER-141"
cid = '<dbpedia:Open_University_of_Catalonia>'
query = queries[qid]
print('dense')
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cbm25")
cid = '<dbpedia:CatalunyaCaixa>'
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Compare two documents for one query, printed under the labels "bm25" and "cbm25":
# per-token BM25 weights (model-tokenizer table), then per-token maximum cosine similarities.
qid = "INEX_LD-20120121"
cid = "<dbpedia:Raw_Food_Made_Easy_for_1_or_2_People>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
# Also show the tokens of the first document as produced by the model tokenizer.
print(tokenizer_doc(cid, hf_tokenizer.tokenize))
print("cbm25")
cid = "<dbpedia:Luke_Nguyen's_Vietnam>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Print the whole cbm25 ranking for this query, best score first, with each document's corpus entry.
for i in sorted(cbm25_result[func_name]["INEX_LD-20120121"].items(), key=lambda x: -x[1]):
    print(i, corpus[i[0]])

In [None]:
# Show what the Lucene analyzer produces for this word.
lm_tokenizer.analyze("Vietnamese")

In [None]:
def show_result_top1_comp_wdense(qid, show_condition):
    """Print every system's top-1 document when dense and weighted-dense disagree.

    Args:
        qid: Query id, looked up in the loaded runs, ``qrels`` and ``queries``.
        show_condition: Accepted for signature parity with the other
            ``show_result_*`` helpers; it is not consulted here.
    """
    def best_hit(run):
        # Highest-scoring (doc id, score) pair of this run for qid.
        ranked = sorted(run[qid].items(), key=lambda hit: hit[1], reverse=True)
        return ranked[0]

    bm25_cid = best_hit(bm25_result)[0]
    dense_cid = best_hit(dense_result)[0]
    wdense_cid = best_hit(weighted_dense_result)[0]
    cbm25_cid = best_hit(cbm25_result[func_name])[0]

    relevant = qrels[qid]

    print("------")
    if wdense_cid == dense_cid:
        # Both dense variants agree on the top-1 document: nothing to compare.
        print(f"skip qid {qid}")
        return

    print(f"qid: {qid}, query: {queries[qid]}")
    print(bm25_cid in relevant, f"bm25: {corpus[bm25_cid]}")
    print(dense_cid in relevant, f"dense: {corpus[dense_cid]}")
    print(wdense_cid in relevant, f"wdense: {corpus[wdense_cid]}")
    print(cbm25_cid in relevant, f"cbm25: {corpus[cbm25_cid]}")

In [None]:
# Collect the queries where cbm25 is not worse than weighted-dense and
# weighted-dense itself improves on plain dense, then show their top-1 documents.
target_qid = []
for qid, sup in diff_scores_wdense.items():
    if not sup < 0:
        diff = wdense_scores[qid]["ndcg_cut_10"] - dense_scores[qid]["ndcg_cut_10"]
        if diff > 0:
            target_qid.append(qid)

for qid in target_qid:
    show_result_top1_comp_wdense(qid, correct_condition)

In [None]:
# Show only the queries where cbm25 scores higher than weighted-dense. Taking
# `.index` of a boolean Series returns every query, so the sign test has to be
# applied to the values themselves.
for i, score_diff in diff_scores_wdense.items():
    if score_diff > 0:
        show_result_top1_comp_wdense(i, correct_condition)

In [None]:
# Highest-scoring (doc id, score) pair of the cbm25 run for this query.
sorted(cbm25_result[func_name]["QALD2_te-28"].items(), key=lambda hit: hit[1], reverse=True)[0]

In [None]:
# Inspect the per-token BM25 weights (model-tokenizer table) stored for this document.
hf_token_bm25_score['<dbpedia:All_of_Me_(1984_film)>']

In [None]:
# Inspect the per-token BM25 weights (model-tokenizer table) stored for this document.
hf_token_bm25_score['<dbpedia:The_Godfather>']

In [None]:
# Compare two documents for one query: per-token BM25 weights (model-tokenizer table)
# followed by the per-token maximum cosine similarities.
qid = "QALD2_te-64"
cid = "<dbpedia:Launch_Control_Center>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
# Label spelled "cbm25", as in the other comparison cells of this notebook.
print("cbm25")
cid = "<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
# Inspect the per-token BM25 weights (model-tokenizer table) stored for this document.
hf_token_bm25_score["<dbpedia:Launch_Control_Center>"]

In [None]:
# Inspect the per-token BM25 weights (model-tokenizer table) stored for this document.
hf_token_bm25_score["<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"]

In [None]:
# Print, for a few person names, the IDF of every token produced by the model tokenizer.
names = ["Carl Reiner", "Steve Martin", "Lily Tomlin "]
for name in names:
    print(", ".join([f"{tn}: {round(hf_idf[tn], 2)}" for tn in hf_tokenizer.tokenize(name)]))