In [2]:
import json
import os
import numpy as np
import pandas as pd
import pytrec_eval
import torch
from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModel
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from pyserini.analysis import Analyzer, get_lucene_analyzer

In [3]:
root_dir = "/home/gaia_data/iida.h/BEIR/datasets"
dataset =  "scifact"
model_path="/home/gaia_data/iida.h/BEIR/C-BM25/model/distil/dense/"
func_name="maxsim_bm25_qtf"

In [4]:
data_path = os.path.join(root_dir, dataset)
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

  0%|          | 0/5183 [00:00<?, ?it/s]

In [5]:
def get_info(corpus, tokenizer):
    sep = " "
    doc_lens = {}
    df = Counter()
    d_tf = {}

    for cid in tqdm(corpus.keys()):
        text = corpus[cid]["title"] + sep + corpus[cid]["text"]
        input_ids = tokenizer(text)
        doc_lens[cid] = (len(input_ids))
        df.update(list(set(input_ids)))
        tf_d = Counter(input_ids)
        doc_lens[cid] =len(input_ids)
        d_tf[cid] = tf_d
        
    idf = defaultdict(float)
    N = len(corpus)
    for w, v in df.items():
        idf[w] = np.log(N / v)

    doc_len_ave = np.mean(list(doc_lens.values()))
    
    del df
        
    return d_tf, idf, doc_lens, doc_len_ave

In [6]:
def get_d_bm25(d_tf, idf, doc_lens, doc_len_ave):
    k1 = 0.9
    b = 0.6
    token_bm25_score = {}

    for cid, tfs in d_tf.items():
        token_bm25_score[cid] = defaultdict(float)
        for tid, tf in tfs.items():
            token_bm25_score[cid][tid] = tf * (1 + k1) / (tf + k1 * (1 - b + b * doc_lens[cid] / doc_len_ave)) * idf[tid]
            
    return token_bm25_score

In [7]:
hf_model = AutoModel.from_pretrained(model_path)

In [8]:
hf_tokenizer = AutoTokenizer.from_pretrained(model_path)
hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave = get_info(corpus, hf_tokenizer.tokenize)
hf_token_bm25_score = get_d_bm25(hf_d_tf, hf_idf, hf_doc_lens, hf_doc_len_ave)

  0%|                                                                                                                         | 0/5183 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:03<00:00, 1513.25it/s]


In [9]:
del hf_d_tf, hf_doc_lens, hf_doc_len_ave

In [10]:
lm_tokenizer = Analyzer(get_lucene_analyzer())
lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave = get_info(corpus, lm_tokenizer.analyze)
lm_token_bm25_score = get_d_bm25(lm_d_tf, lm_idf, lm_doc_lens, lm_doc_len_ave)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:01<00:00, 2820.66it/s]


In [11]:
del lm_d_tf, lm_doc_lens, lm_doc_len_ave

In [12]:
# Check
# with open(f"./analysis_data/{dataset}/bm25_result.json") as f:
#     bm25_result = json.load(f)
    
# with open(f"./analysis_data/{dataset}/dense_result.json") as f:
#     dense_result = json.load(f)
   
# with open(f"./analysis_data/{dataset}/weighted_dense_result.json") as f:
#     weighted_dense_result = json.load(f)
    
# with open(f"./analysis_data/{dataset}/cbm25_result.json") as f:
#     cbm25_result = json.load(f)

with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/bm25/analysis.json") as f:
    bm25_result = json.load(f)

with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/cos_sim/mpnet-tod/analysis.json") as f:
    dense_result = json.load(f)

with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/dot/mpnet-v3-mse-beir-dot/analysis.json") as f:
    distil_dense_result = json.load(f)

with open(f"/home/gaia_data/iida.h/BEIR/C-BM25/results/{dataset}/result/lss/mpnet-tod/analysis.json") as f:
    cbm25_result = json.load(f)


In [13]:
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut.10"})

In [14]:
bm25_scores = evaluator.evaluate(bm25_result)
dense_scores = evaluator.evaluate(dense_result)
distil_dense_scores = evaluator.evaluate(distil_dense_result)
cbm25_scores = evaluator.evaluate(cbm25_result[func_name])

In [15]:
print("bm25: ", np.average([i["ndcg_cut_10"] for i in bm25_scores.values()]))
print("dense: ", np.average([i["ndcg_cut_10"] for i in dense_scores.values()]))
print("distil_dense: ", np.average([i["ndcg_cut_10"] for i in distil_dense_scores.values()]))
print("cbm25: ", np.average([i["ndcg_cut_10"] for i in cbm25_scores.values()]))

bm25:  0.6639400192301421
dense:  0.46129964831747294
distil_dense:  0.5761708026486931
cbm25:  0.7110404910234447


In [16]:
def create_diff_scores(source_scores, target_scores):
    diff_scores = {}
    for k in cbm25_scores:
        try:
            diff_score = source_scores[k]["ndcg_cut_10"] - target_scores[k]["ndcg_cut_10"]
        except KeyError:
            diff_score = source_scores[k]["ndcg_cut_10"]
        if abs(diff_score) > 0.0:
            diff_scores[k] = diff_score
            
    return diff_scores

In [17]:
diff_scores_bm25 = create_diff_scores(cbm25_scores, bm25_scores)
diff_scores_dense = create_diff_scores(cbm25_scores, dense_scores)
diff_scores_distil_dense = create_diff_scores(cbm25_scores, distil_dense_scores)

In [18]:
common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense) & set(diff_scores_distil_dense)
# common_diff_query = set(diff_scores_bm25) & set(diff_scores_dense)

In [19]:
all_diff = pd.concat([pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense), pd.Series(diff_scores_distil_dense)], axis=1).fillna(0).sort_index()
all_diff = all_diff.rename(columns={0: "bm25", 1: "dense", 2: "distil_dense"})
all_diff.describe()

Unnamed: 0,bm25,dense,distil_dense
count,188.0,188.0,188.0
mean,0.07516,0.398523,0.215218
std,0.262094,0.405287,0.39087
min,-0.569323,-1.0,-1.0
25%,0.0,0.13093,0.0
50%,0.0,0.36907,0.13093
75%,0.302416,0.63093,0.430677
max,1.0,1.0,1.0


In [20]:
# all_diff = pd.concat([pd.Series(diff_scores_bm25), pd.Series(diff_scores_dense)], axis=1).fillna(0).sort_index()
# all_diff = all_diff.rename(columns={0: "bm25", 1: "dense"})
# all_diff.describe()

In [21]:
better_query = all_diff[(all_diff > 0.2).all(1)]
better_query

Unnamed: 0,bm25,dense,distil_dense
1099,0.430677,0.430677,0.430677
1100,0.430677,0.430677,0.430677
127,0.36907,1.0,0.613147
1280,0.356207,0.356207,0.356207
1344,0.63093,0.63093,0.63093
183,0.63093,0.63093,0.63093
198,0.36907,1.0,1.0
294,0.5,0.666667,0.69897
3,0.5,1.0,1.0
343,0.419721,1.0,0.386853


In [22]:
worse_query = all_diff[(all_diff < -0.01).all(1)]
worse_query

Unnamed: 0,bm25,dense,distil_dense
237,-0.36907,-0.36907,-0.36907
513,-0.13093,-0.13093,-0.13093


In [32]:
def show_result_top1(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    distil_dense_top1 = extract_top1(distil_dense_result)
    cbm25_top1 = extract_top1(cbm25_result[func_name])
    
    correct = qrels[qid]
    if show_condition(cbm25_top1[0], distil_dense_top1[0], correct):
        print("------")
        print(f"qid: {qid}, query: {queries[qid]}")
        print(bm25_top1[0] in correct, f"bm25: {corpus[bm25_top1[0]]}")
        print(dense_top1[0] in correct, f"dense: {corpus[dense_top1[0]]}")
        print(distil_dense_top1[0] in correct, f"distil_dense: {corpus[distil_dense_top1[0]]}")
        print(cbm25_top1[0] in correct, f"cbm25: {corpus[cbm25_top1[0]]}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [24]:
def correct_condition(target, compare, correct):
    return target in correct and not compare in correct

for qid in better_query.index:
    show_result_top1(qid, correct_condition)

------
skip qid 1099
------
skip qid 1100
------
qid: 127, query: Arginine 90 in p150n is important for interaction with EB1.
False bm25: {'text': 'CLIP-170 is a "cytoplasmic linker protein" implicated in endosome-microtubule interactions and in control of microtubule dynamics. CLIP-170 localizes dynamically to growing microtubule plus ends, colocalizing with the dynein activator dynactin and the APC-binding protein EB1. This shared "plus-end tracking" behavior suggests that CLIP-170 might interact with dynactin and/or EB1. We have used site-specific mutagenesis of CLIP-170 and a transfection/colocalization assay to address this question in mammalian tissue culture cells. Our results indicate that CLIP-170 interacts, directly or indirectly, with both dynactin and EB1. We find that the CLIP-170/dynactin interaction is mediated by the second metal binding motif of the CLIP-170 tail. In contrast, the CLIP-170/EB1 interaction requires neither metal binding motif. In addition, our experimen

In [33]:
def false_condition(target, compare, correct):
    return not target in correct and compare in correct

for i in worse_query.index:
    show_result_top1(i, false_condition)

------
qid: 237, query: Cells lacking clpC have a defect in sporulation efficiency in Bacillus subtilis.
True bm25: {'text': 'The differentiation of the bacterium Bacillus subtilis into a dormant spore is among the most well-characterized developmental pathways in biology. Classical genetic screens performed over the past half century identified scores of factors involved in every step of this morphological process. More recently, transcriptional profiling uncovered additional sporulation-induced genes required for successful spore development. Here, we used transposon-sequencing (Tn-seq) to assess whether there were any sporulation genes left to be discovered. Our screen identified 133 out of the 148 genes with known sporulation defects. Surprisingly, we discovered 24 additional genes that had not been previously implicated in spore formation. To investigate their functions, we used fluorescence microscopy to survey early, middle, and late stages of differentiation of null mutants fro

In [26]:
def preproc_rep(reps: np.ndarray, att_mask: np.ndarray, input_tok: np.ndarray):
    reps = rep_lave(reps, att_mask)
    reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]
    reps[np.isnan(reps)] = 0.0
    return reps, att_mask[:, 1:], input_tok[:, 1:]

def rep_lave(reps, att_masks, window_size=3):
    tg_reps = np.zeros_like(reps[:, 1:])  # 3D
    for b, (rep, att_mask) in enumerate(zip(reps, att_masks)):
        og_rep = rep[att_mask == 1, :]  # 2D
        og_rep = og_rep[1:-1, :]  # remove special token
        rep_len = og_rep.shape[0]
        for i in range(rep_len):
            start = i - window_size if i - window_size > 0 else 0
            end = i + window_size
            tg_reps[b, i, :] += np.mean(og_rep[start:end, :], axis=0)

    return tg_reps

def max_cos_sims(query, doc, model, tokenizer):
    special_tokens = {
        tokenizer.pad_token_id,
        tokenizer.bos_token_id,
        tokenizer.eos_token_id,
        tokenizer.sep_token_id,
        tokenizer.cls_token_id,
    }
    def tok2rep_indexing(inputs_ids, batch_reps, att_masks):
        tok2rep = defaultdict(list)
        for qi, (input_ids, reps, att_mask) in enumerate(zip(inputs_ids, batch_reps, att_masks)):
            for i, (qt, rep, am) in enumerate(zip(input_ids, reps, att_mask)):
                if qt in special_tokens:
                    continue
                if am == 0:
                    continue
                tok2rep[qt].append(rep)
            
        for tid in tok2rep:
            tok2rep[tid] = np.vstack(tok2rep[tid])
            
        return tok2rep
        
    doc = doc["title"] + " " + doc["text"]
    t_query = tokenizer(query, return_tensors="pt")
    with torch.no_grad():
        e_query = model(**t_query).last_hidden_state
    e_queries, q_att_masks, q_inputs_ids = preproc_rep(e_query.numpy(), t_query["attention_mask"].numpy(), t_query["input_ids"].numpy())
    q_tok2rep = tok2rep_indexing(q_inputs_ids, e_queries, q_att_masks)
    t_doc = tokenizer(doc, return_tensors="pt")
    with torch.no_grad():
        e_doc = model(**t_doc).last_hidden_state
    e_docs, d_att_masks, d_inputs_ids = preproc_rep(e_doc.numpy(), t_doc["attention_mask"].numpy(), t_doc["input_ids"].numpy())
    # print(e_docs.shape, d_att_masks.shape, d_inputs_ids.shape, tokenizer.convert_ids_to_tokens(list(d_inputs_ids[0])))
    d_tok2rep = tok2rep_indexing(d_inputs_ids, e_docs, d_att_masks)
    result = []
    for qt, q_reps in q_tok2rep.items():
        q_token = tokenizer.convert_ids_to_tokens(int(qt))
        if qt not in d_tok2rep:
            result.append(f"{q_token}: 0.0")
            continue
            
        score=np.max(np.dot(q_reps, d_tok2rep[qt].T))
        result.append(f"{q_token}: {round(float(score), 2)}")
        
    return ", ".join(result)

In [27]:
def tokenizer_doc(target, tokenizer):
    text = corpus[target]["title"] + " " + corpus[target]["text"]
    return tokenizer(text)
    
def get_bm25_val(cid, query, token_bm25_score, tokenizer):
    this_q_tok_bm25 = []
    t_query = tokenizer(query)
    for tok in t_query:
        this_q_tok_bm25.append(f"{tok}: {round(token_bm25_score[cid][(tok)], 2)}")
    return ", ".join(this_q_tok_bm25)


def search_bm25(query, token_bm25_score, tokenizer):
    t_query = tokenizer(query)
    search_result = defaultdict(float)
    for cid in token_bm25_score:
        for tok in t_query:
            search_result[cid] += token_bm25_score[cid][(tok)]
    return search_result

    

def get_idf_val(query, idf, tokenizer):
    this_q_tok_idf = []
    t_query = tokenizer(query)
    for tok in t_query:
        this_q_tok_idf.append(f"{tok}: {round(idf[tok], 2)}")
    return ", ".join(this_q_tok_idf)
        

def show_result_top1_analysis(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    distil_dense_top1 = extract_top1(distil_dense_result)
    cbm25_top1 = extract_top1(cbm25_result[func_name])
    correct = qrels[qid]
    query = queries[qid]
    if show_condition(cbm25_top1[0], bm25_top1[0], correct):
        print("------")
        print(f"qid: {qid}, query: {query}")
        print(f"lm_idf: {get_idf_val(query, lm_idf, lm_tokenizer.analyze)}")
        print(f"hf_idf: {get_idf_val(query, hf_idf, hf_tokenizer.tokenize)}")
        print("---")
        cid_bm25_top1 = bm25_top1[0]
        cid_bm25_tok_score_lm = get_bm25_val(cid_bm25_top1, query, lm_token_bm25_score, lm_tokenizer.analyze)
        cid_bm25_tok_score_hf = get_bm25_val(cid_bm25_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_bm25_top1, lm_tokenizer.analyze)
        t_cos_sims = max_cos_sims(query, corpus[cid_bm25_top1], hf_model, hf_tokenizer)
        print(f"bm25: {cid_bm25_top1 in correct} q-bm25: {cid_bm25_tok_score_lm}")
        print(f"hf-q-bm25: {cid_bm25_tok_score_hf}")
        print(f"cos-sims: {t_cos_sims}")
        print(f"t_doc: {t_doc}")
        cid_dense_top1 = dense_top1[0]
        cid_dense_tok_score = get_bm25_val(cid_dense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_dense_top1, hf_tokenizer.tokenize)
        print(f"dense: {cid_bm25_top1 in correct} q-bm25: {cid_dense_tok_score}")
        print(f"t_doc: {t_doc}")  
        cid_distil_dense_top1 = distil_dense_top1[0]
        cid_distil_dense_tok_score = get_bm25_val(cid_distil_dense_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_distil_dense_top1, hf_tokenizer.tokenize)
        print(f"wdense: {cid_bm25_top1 in correct}, q-bm25: {cid_distil_dense_tok_score}")
        print(f"t_doc: {t_doc}")  
        cid_cbm25_top1 = cbm25_top1[0]
        cid_cbm25_tok_score = get_bm25_val(cid_cbm25_top1, query, hf_token_bm25_score, hf_tokenizer.tokenize)
        t_doc = tokenizer_doc(cid_cbm25_top1, hf_tokenizer.tokenize)
        t_cos_sims = max_cos_sims(query, corpus[cid_cbm25_top1], hf_model, hf_tokenizer)
        print(f"cbm25: {cid_cbm25_top1 in correct} q-bm25: {cid_cbm25_tok_score}")
        print(f"cos-sims: {t_cos_sims}")
        print(f"t_doc: {t_doc}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [28]:
for qid in better_query.index:
    show_result_top1_analysis(qid, correct_condition)

------
skip qid 1099
------
skip qid 1100
------
qid: 127, query: Arginine 90 in p150n is important for interaction with EB1.
lm_idf: arginin: 5.03, 90: 3.76, p150n: 0.0, import: 1.76, interact: 2.11, eb1: 6.76
hf_idf: ar: 3.61, ##gin: 4.79, ##ine: 2.01, 90: 3.33, in: 0.02, p: 1.56, ##15: 4.48, ##0: 2.99, ##n: 1.82, is: 0.31, important: 1.99, for: 0.26, interaction: 2.87, with: 0.26, e: 2.05, ##b: 2.17, ##1: 1.37, .: 0.0
---


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


bm25: False q-bm25: arginin: 0.0, 90: 0.0, p150n: 0.0, import: 0.0, interact: 3.57, eb1: 11.44
hf-q-bm25: ar: 0.0, ##gin: 0.0, ##ine: 0.0, 90: 0.0, in: 0.03, p: 1.96, ##15: 5.62, ##0: 3.76, ##n: 0.0, is: 0.39, important: 0.0, for: 0.25, interaction: 4.34, with: 0.4, e: 3.5, ##b: 3.65, ##1: 2.31, .: 0.0
cos-sims: ar: 0.0, ##gin: 0.0, ##ine: 0.0, 90: 0.0, in: 0.43, p: 0.45, ##15: 0.46, ##0: 0.47, ##n: 0.0, is: 0.46, important: 0.0, for: 0.45, interaction: 0.49, with: 0.44, e: 0.5, ##b: 0.49, ##1: 0.47, .: 0.46
t_doc: ['clip', '170', 'interact', 'dynactin', 'complex', 'apc', 'bind', 'protein', 'eb1', 'differ', 'mechan', 'clip', '170', 'cytoplasm', 'linker', 'protein', 'implic', 'endosom', 'microtubul', 'interact', 'control', 'microtubul', 'dynam', 'clip', '170', 'local', 'dynam', 'grow', 'microtubul', 'plu', 'end', 'coloc', 'dynein', 'activ', 'dynactin', 'apc', 'bind', 'protein', 'eb1', 'share', 'plu', 'end', 'track', 'behavior', 'suggest', 'clip', '170', 'might', 'interact', 'dynactin', 

IndexError: index out of range in self

In [29]:
for qid in worse_query.index:
    show_result_top1_analysis(qid, false_condition)

------
qid: 237, query: Cells lacking clpC have a defect in sporulation efficiency in Bacillus subtilis.
lm_idf: cell: 0.72, lack: 2.7, clpc: 7.45, have: 0.98, defect: 2.91, sporul: 6.94, effici: 2.82, bacillu: 5.56, subtili: 5.91
hf_idf: cells: 0.97, lacking: 3.54, cl: 3.11, ##pc: 4.86, have: 1.01, a: 0.06, defect: 4.26, in: 0.02, sp: 3.02, ##or: 2.13, ##ulation: 3.63, efficiency: 3.93, in: 0.02, ba: 3.83, ##ci: 2.67, ##llus: 5.42, sub: 2.39, ##ti: 2.84, ##lis: 5.15, .: 0.0
---


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


bm25: True q-bm25: cell: 1.0, lack: 0.0, clpc: 0.0, have: 0.0, defect: 3.61, sporul: 11.92, effici: 0.0, bacillu: 6.88, subtili: 8.29
hf-q-bm25: cells: 0.0, lacking: 0.0, cl: 0.0, ##pc: 0.0, have: 0.0, a: 0.09, defect: 0.0, in: 0.03, sp: 5.42, ##or: 3.68, ##ulation: 6.26, efficiency: 0.0, in: 0.03, ba: 4.84, ##ci: 3.37, ##llus: 6.84, sub: 3.39, ##ti: 4.04, ##lis: 7.32, .: 0.0
cos-sims: cells: 0.0, lacking: 0.0, cl: 0.0, ##pc: 0.0, have: 0.0, a: 0.53, defect: 0.0, in: 0.61, sp: 0.59, ##or: 0.59, ##ulation: 0.59, efficiency: 0.0, ba: 0.61, ##ci: 0.61, ##llus: 0.63, sub: 0.64, ##ti: 0.63, ##lis: 0.61, .: 0.56
t_doc: ['high', 'throughput', 'genet', 'screen', 'identifi', 'larg', 'divers', 'collect', 'new', 'sporul', 'gene', 'bacillu', 'subtili', 'differenti', 'bacterium', 'bacillu', 'subtili', 'dormant', 'spore', 'among', 'most', 'well', 'character', 'development', 'pathwai', 'biologi', 'classic', 'genet', 'screen', 'perform', 'over', 'past', 'half', 'centuri', 'identifi', 'score', 'factor'

# BM25-HF-tokenizer

In [49]:
qid = "127"
cid_cbm25_top1, score = sorted(cbm25_result[func_name][qid].items(), key=lambda x: -x[1])[0]
print(f"cid: {cid_cbm25_top1}, score: {round(score,3)}")

search_result = search_bm25(queries[qid], hf_token_bm25_score, hf_tokenizer.tokenize)
for cid, score in sorted(search_result.items(), key=lambda x: -x[1])[:3]:
    print(f"cid: {cid}, score: {round(score,3)}")
    bm25_val = get_bm25_val(cid, queries[qid], hf_token_bm25_score, hf_tokenizer.tokenize)
    print(corpus[cid])

cid: 21598000, score: 14.453
cid: 35231675, score: 26.209
{'text': 'CLIP-170 is a "cytoplasmic linker protein" implicated in endosome-microtubule interactions and in control of microtubule dynamics. CLIP-170 localizes dynamically to growing microtubule plus ends, colocalizing with the dynein activator dynactin and the APC-binding protein EB1. This shared "plus-end tracking" behavior suggests that CLIP-170 might interact with dynactin and/or EB1. We have used site-specific mutagenesis of CLIP-170 and a transfection/colocalization assay to address this question in mammalian tissue culture cells. Our results indicate that CLIP-170 interacts, directly or indirectly, with both dynactin and EB1. We find that the CLIP-170/dynactin interaction is mediated by the second metal binding motif of the CLIP-170 tail. In contrast, the CLIP-170/EB1 interaction requires neither metal binding motif. In addition, our experiments suggest that the CLIP-170/dynactin interaction occurs via the shoulder/sidear

In [37]:
sorted(cbm25_result[func_name]["INEX_XER-141"].items(), key=lambda x: -x[1])[0]

('<dbpedia:CatalunyaCaixa>', 14.438369397728444)

In [38]:
qid = "QALD2_te-28"
cid = "<dbpedia:Don't_Box_Me_In>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cmb25")
cid = "<dbpedia:The_Godfather>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

Give me all movies directed by Francis Ford Coppola.
bm25
give: 0.0, me: 5.83, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 5.66, ford: 6.38, cop: 6.81, ##pol: 5.96, ##a: 2.4, .: 0.01
give: 0.0, me: 0.49, all: 0.0, movies: 0.0, directed: 0.0, by: 0.0, francis: 0.6, ford: 0.61, cop: 0.61, ##pol: 0.61, ##a: 0.58, .: 0.46
cmb25
give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 4.38, by: 1.91, francis: 6.6, ford: 7.44, cop: 9.88, ##pol: 8.64, ##a: 3.48, .: 0.01
give: 0.0, me: 0.0, all: 0.0, movies: 0.0, directed: 0.65, by: 0.69, francis: 0.71, ford: 0.72, cop: 0.71, ##pol: 0.68, ##a: 0.66, .: 0.61


  reps /= np.linalg.norm(reps, axis=2)[:, :, np.newaxis]


In [None]:
qid = "INEX_LD-20120512"
cid = '<dbpedia:GP_Basic>'
query = queries[qid]
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
qid = "INEX_XER-141"
cid = '<dbpedia:Open_University_of_Catalonia>'
query = queries[qid]
print('dense')
print(query)
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cbm25")
cid = '<dbpedia:CatalunyaCaixa>'
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
qid = "INEX_LD-20120121"
cid = "<dbpedia:Raw_Food_Made_Easy_for_1_or_2_People>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print(tokenizer_doc(cid, hf_tokenizer.tokenize))
print("cbm25")
cid = "<dbpedia:Luke_Nguyen's_Vietnam>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
for i in sorted(cbm25_result[func_name]["INEX_LD-20120121"].items(), key=lambda x: -x[1]):
    print(i, corpus[i[0]])

In [None]:
lm_tokenizer.analyze("Vietnamese")

In [None]:
def show_result_top1_comp_wdense(qid, show_condition):
    def extract_top1(result):
        return sorted(result[qid].items(), key=lambda x: -x[1])[0]
    
    bm25_top1 = extract_top1(bm25_result)
    dense_top1 = extract_top1(dense_result)
    wdense_top1 = extract_top1(weighted_dense_result)
    cbm25_top1 = extract_top1(cbm25_result[func_name])
    
    correct = qrels[qid]
    # if show_condition(cbm25_top1[0], wdense_top1[0], correct) and wdense_top1[0] != dense_top1[0]:
    if wdense_top1[0] != dense_top1[0]:
        print("------")
        print(f"qid: {qid}, query: {queries[qid]}")
        print(bm25_top1[0] in correct, f"bm25: {corpus[bm25_top1[0]]}")
        print(dense_top1[0] in correct, f"dense: {corpus[dense_top1[0]]}")
        print(wdense_top1[0] in correct, f"wdense: {corpus[wdense_top1[0]]}")
        print(cbm25_top1[0] in correct, f"cbm25: {corpus[cbm25_top1[0]]}")
    else:
        print("------")
        print(f"skip qid {qid}")

In [None]:
target_qid = []
for qid, sup in diff_scores_wdense.items():
    if sup < 0:
        continue
    diff = wdense_scores[qid]["ndcg_cut_10"] - dense_scores[qid]["ndcg_cut_10"]
    if diff > 0:
        target_qid.append(qid)
        
for qid in target_qid:
    show_result_top1_comp_wdense(qid, correct_condition)

In [None]:
for i in pd.Series(pd.Series(diff_scores_wdense) > 0).index:
    show_result_top1_comp_wdense(i, correct_condition)

In [None]:
sorted(cbm25_result[func_name]["QALD2_te-28"].items(), key=lambda x: -x[1])[0]

In [None]:
hf_token_bm25_score['<dbpedia:All_of_Me_(1984_film)>']

In [None]:
hf_token_bm25_score['<dbpedia:The_Godfather>']

In [None]:
qid = "QALD2_te-64"
cid = "<dbpedia:Launch_Control_Center>"
query = queries[qid]
print(query)
print("bm25")
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))
print("cmb25")
cid = "<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"
print(get_bm25_val(cid, query, hf_token_bm25_score, hf_tokenizer.tokenize))
print(max_cos_sims(query, corpus[cid], hf_model, hf_tokenizer))

In [None]:
hf_token_bm25_score["<dbpedia:Launch_Control_Center>"]

In [None]:
hf_token_bm25_score["<dbpedia:Vandenberg_AFB_Space_Launch_Complex_4>"]

In [None]:
names = ["Carl Reiner", "Steve Martin", "Lily Tomlin "]
for name in names:
    print(", ".join([f"{tn}: {round(hf_idf[tn], 2)}" for tn in hf_tokenizer.tokenize(name)]))