# Re-rank results

In [None]:
import numpy as np
import pickle
import pandas as pd
from src.JinaAI import JinaAIReranker
from src.wikidataLangDB import create_wikidatalang_db
from src.wikidataEmbed import WikidataTextifier
from tqdm import tqdm
import os

# Textifier configured for English; used below to split an entity into text
# chunks sized with the reranker's tokenizer.
textifier = WikidataTextifier(language='en', langvar_filename='en')

# Entity lookup bound to the `sqlite_enwiki.db` database file.
# NOTE: `db_filname` (sic) is the keyword name this notebook passes to the
# factory; do not "fix" the spelling here without changing the callee too.
WikidataLang = create_wikidatalang_db(db_filname="sqlite_enwiki.db")

# Reranker instantiated on CPU.
reranker = JinaAIReranker(device='cpu')

In [None]:
def rerank_qids(query, qids):
    """Score every QID in *qids* against *query* with the reranker.

    Each QID is looked up through ``WikidataLang.get_entity``. When an entity
    is found, it is split into chunks with ``textifier.chunk_text`` (using the
    reranker's tokenizer), the chunks are ranked against the query, and the
    maximum of the returned scores is kept. When the lookup returns a falsy
    value, the QID gets a score of ``0``.

    Args:
        query: The question text passed to ``reranker.rank``.
        qids: Iterable of entity identifiers to score.

    Returns:
        A list with one score per input QID, in the same order as *qids*.
    """

    def best_chunk_score(qid):
        # Guard clause: unknown entities contribute a neutral score.
        found = WikidataLang.get_entity(qid)
        if not found:
            return 0

        pieces = textifier.chunk_text(found, tokenizer=reranker.tokenizer)
        # presumably `rank` yields one score per chunk; keep the best one.
        return np.max(reranker.rank(query, pieces))

    return [best_chunk_score(qid) for qid in qids]

In [None]:
# Dataset whose retrieval results are being re-ranked.
dataset = "LC_QuAD"
# Output file; doubles as the checkpoint that a later run resumes from.
save_path = f'/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_{dataset}-wikidata_prototype-DB(en)-Query(en)_reranked.pkl'
# Input retrieval results: [0] keyword search, [1] vector (prototype) search.
files = [
    f'/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_{dataset}-wikidatav10_v3_sorted-DB(en)-Query(en)_wikidata_keywordsearch_bm25.pkl',
    f'/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_{dataset}-wikidata_prototype-DB(en)-Query(en)_prototype.pkl',
]
names = [
    'Keyword Search',
    'Sorted & Filtered + Names'
]

# Resume if partial save exists
# NOTE: pickle executes arbitrary code on load; only point these paths at
# files produced by this project.
if os.path.exists(save_path):
    print("Resuming from saved file...")
    # Context managers make sure the file handles are closed after loading.
    with open(save_path, "rb") as f:
        prep_vector = pickle.load(f)
else:
    with open(files[0], "rb") as f:
        prep_keyword = pickle.load(f)
    with open(files[1], "rb") as f:
        prep_vector = pickle.load(f)

    # Carry the keyword-search results alongside the vector-search ones so
    # both can be re-ranked row by row ("KS" = keyword search).
    prep_vector['Retrieval QIDs KS'] = prep_keyword['Retrieval QIDs']
    prep_vector['Retrieval Score KS'] = prep_keyword['Retrieval Score']

    # Placeholders; filled in by the re-ranking loop.
    prep_vector['Retrieval Reranks'] = None
    prep_vector['Retrieval Reranks KS'] = None

def is_empty(x):
    """Return True when *x* holds no rerank result yet.

    A value counts as empty when it is ``None``, a scalar missing-value
    marker (e.g. ``float('nan')``, which has no ``len``), or a sized
    container of length zero.
    """
    if x is None:
        return True
    # Scalar NA markers (NaN, pd.NA, NaT) would make `len` raise TypeError.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return True
    return len(x) == 0
# Per-row flags: True where the corresponding rerank column is still unfilled.
missing_qids = prep_vector['Retrieval Reranks'].map(is_empty)
missing_scores = prep_vector['Retrieval Reranks KS'].map(is_empty)
# Index labels of every row that lacks at least one of the two rerank results.
row_to_process = prep_vector.index[(missing_qids | missing_scores).to_numpy()]

def _save_checkpoint(frame, path):
    """Pickle *frame* to *path* without ever leaving a half-written file.

    The data is written to a sibling temporary file first and then moved over
    *path* with ``os.replace``, so an interruption during the dump cannot
    corrupt the checkpoint that a later run resumes from.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(frame, f)
    os.replace(tmp_path, path)


# Progressive processing
# `row_to_process` already contains only rows with a missing rerank result, so
# no per-row re-check is needed (calling `pd.isna` on a list-valued cell would
# return an array whose truth value is ambiguous).
for processed, idx in enumerate(tqdm(row_to_process), start=1):
    question = prep_vector.loc[idx, 'Question']

    # Keep only the part of each retrieved ID before the first underscore.
    qids = [qid.split("_")[0] for qid in prep_vector.loc[idx, 'Retrieval QIDs']]
    qids_ks = [qid.split("_")[0] for qid in prep_vector.loc[idx, 'Retrieval QIDs KS']]

    # `.at` stores the whole score list in a single cell.
    prep_vector.at[idx, 'Retrieval Reranks'] = rerank_qids(question, qids)
    prep_vector.at[idx, 'Retrieval Reranks KS'] = rerank_qids(question, qids_ks)

    # Save every 100 processed rows (counted by work done, not by index
    # label, so this also works for non-integer or sparse indexes).
    if processed % 100 == 0:
        _save_checkpoint(prep_vector, save_path)

# Final save
_save_checkpoint(prep_vector, save_path)
print("Finished and saved.")