# Calculate the evaluation metrics

In [2]:
import os
import numpy as np
import pickle
import pandas as pd
import requests

In [3]:
def calculate_mrr_score(prep, pred_col, true_cols):
    """Return the Mean Reciprocal Rank of the predictions in ``prep``.

    Parameters:
    - prep (pd.DataFrame): One row per query.
    - pred_col (str): Column holding the ranked list of retrieved QIDs.
    - true_cols (str): Column holding the collection of relevant QIDs.

    Returns:
    - The mean over rows of 1 / (rank of the first relevant QID), where a
      row without any relevant QID contributes 0.

    Side effect: ``prep[pred_col]`` is overwritten in place with the
    de-duplicated lists (first occurrence kept).
    """
    # dict.fromkeys drops repeated QIDs while preserving their order
    prep[pred_col] = prep[pred_col].apply(
        lambda qids: list(dict.fromkeys(qids))
    )

    def hit_ranks(row):
        # 1-based positions of every retrieved QID that is relevant
        relevant = row[true_cols]
        return [
            rank
            for rank, qid in enumerate(row[pred_col], start=1)
            if qid in relevant
        ]

    def reciprocal_of_first(ranks):
        return 1/ranks[0] if len(ranks) > 0 else 0

    return prep.apply(hit_ranks, axis=1).apply(reciprocal_of_first).mean()

def calculate_ndcg_score(prep, pred_col, true_cols):
    """Return the mean binary-relevance NDCG of the predictions in ``prep``.

    Parameters:
    - prep (pd.DataFrame): One row per query.
    - pred_col (str): Column holding the ranked list of retrieved QIDs.
    - true_cols (str): Column holding the collection of relevant QIDs.

    Returns:
    - The mean over rows of DCG / IDCG. The ideal DCG assumes
      min(#relevant, #retrieved) hits at the top of the ranking. Rows whose
      IDCG is 0 produce NaN and are therefore skipped by the pandas mean.

    Side effect: ``prep[pred_col]`` is overwritten in place with the
    de-duplicated lists (first occurrence kept).
    """
    # dict.fromkeys drops repeated QIDs while preserving their order
    prep[pred_col] = prep[pred_col].apply(
        lambda qids: list(dict.fromkeys(qids))
    )

    def gain_at(rank):
        # Logarithmic position discount for a hit at the 1-based ``rank``
        return 1/np.log2(rank+1)

    def hit_ranks(row):
        relevant = row[true_cols]
        return [
            rank
            for rank, qid in enumerate(row[pred_col], start=1)
            if qid in relevant
        ]

    def row_dcg(ranks):
        return sum([gain_at(rank) for rank in ranks]) if len(ranks) > 0 else 0

    def row_idcg(row):
        best_hits = min(len(row[true_cols]), len(row[pred_col]))
        return sum([gain_at(rank) for rank in range(1, best_hits + 1)])

    dcg = prep.apply(hit_ranks, axis=1).apply(row_dcg)
    idcg = prep.apply(row_idcg, axis=1)
    return (dcg/idcg).mean()

def calculate_accuracy_score(df):
    """Return the fraction of rows whose top-scored QID is the correct one.

    Parameters:
    - df (pd.DataFrame): Must provide the columns 'Retrieval QIDs' (list of
      QIDs), 'Retrieval Score' (scores parallel to the QIDs) and
      'Correct QID'.

    Returns:
    - The mean of the per-row hits (NaN when ``df`` has no rows).

    Rows are paired positionally rather than looked up by index label, so
    frames with repeated index labels are handled as well.
    """
    top_qids = [
        qids[int(np.argmax(scores))]
        for qids, scores in zip(df['Retrieval QIDs'], df['Retrieval Score'])
    ]
    hits = [
        top_qid == correct_qid
        for top_qid, correct_qid in zip(top_qids, df['Correct QID'])
    ]
    return pd.Series(hits, dtype=float).mean()

def calculate_log_odds_ratio_score(df):
    """Return the mean log-odds gap between the correct and the wrong QID.

    Parameters:
    - df (pd.DataFrame): Must provide the columns 'Retrieval QIDs',
      'Retrieval Score' (parallel to the QIDs), 'Correct QID' and
      'Wrong QID'.

    Returns:
    - The mean over rows of logit(best correct score) - logit(best wrong
      score). A correct QID that was not retrieved counts as 0.999 and a
      wrong QID that was not retrieved counts as 0.001.
    """
    def best_score(row, target_qid, fallback):
        # Highest score among the retrieved entries matching ``target_qid``
        matching = [
            score
            for qid, score in zip(row['Retrieval QIDs'], row['Retrieval Score'])
            if qid == target_qid
        ]
        return max(matching, default=fallback)

    def logit(probability):
        return np.log(probability / (1 - probability))

    def log_odds_ratio(row):
        correct_best = best_score(row, row['Correct QID'], 0.999)
        wrong_best = best_score(row, row['Wrong QID'], 0.001)
        return logit(correct_best) - logit(wrong_best)

    per_row = df.apply(log_odds_ratio, axis=1)
    return per_row.mean()

def clean_results(arr_ids, arr_scores):
    """Collapse retrieved IDs to their base QID and rank them by best score.

    Parameters:
    - arr_ids (iterable of str): Retrieved IDs; everything from the first
      '_' onwards is dropped to obtain the base QID.
    - arr_scores (iterable): Scores parallel to ``arr_ids``.

    Returns:
    - (list, list): The unique base QIDs sorted by descending score and
      their scores, where each QID keeps the highest score seen for it.
      Two empty lists are returned for empty input.
    """
    best_by_qid = {}
    for raw_id, score in zip(arr_ids, arr_scores):
        qid = raw_id.split('_')[0]
        # Compare against the stored score only: a default of 0 would
        # silently clamp negative scores to 0.
        if qid not in best_by_qid or score > best_by_qid[qid]:
            best_by_qid[qid] = score

    # sorted() is stable, so equal scores keep their first-seen order
    ranked = sorted(best_by_qid.items(), key=lambda item: item[1], reverse=True)
    return [qid for qid, _ in ranked], [score for _, score in ranked]

In [None]:
# Evaluate every pickled "property" result file found in `directory`.
directory = '../data/Evaluation Data'
for file in os.listdir(directory):
    if ('property' in file) and ('.pkl' in file):
        print(file)
        filename = f"{directory}/{file}"
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files that were produced by a trusted source.
        # The context manager closes the file handle after loading.
        with open(filename, "rb") as pickle_file:
            prep = pickle.load(pickle_file)

        # Rows without retrieved QIDs are reported, then dropped
        if prep['Retrieval QIDs'].apply(
            lambda x: (x is None) or (len(x) == 0)
        ).sum() != 0:
            print("Evaluation not complete")

        # .copy() gives an independent frame so the column assignments
        # below do not trigger SettingWithCopyWarning.
        prep = prep[prep['Retrieval QIDs'].apply(
            lambda x: (x is not None) and (len(x) != 0)
        )].copy()

        if len(prep) != 0:

            # Collapse IDs to base QIDs and re-rank them by best score
            prep[['Retrieval QIDs', 'Retrieval Score']] = prep.apply(
                lambda row: pd.Series(clean_results(row['Retrieval QIDs'], row['Retrieval Score'])),
                axis=1
            )

            if 'Wikidata-Disamb' in filename:
                # Disambiguation data: accuracy / log odds on the first 50000 rows
                prep = prep[:50000]
                print("Size Data:", len(prep))
                print(f"Accuracy: {calculate_accuracy_score(prep)}")
                print(f"Log Odds: {calculate_log_odds_ratio_score(prep)}")
                print()

                # Same metrics on the rows flagged in both
                # '... in Wikipedia' columns (first 15000 rows)
                prep = prep[prep['Correct in Wikipedia']]
                prep = prep[prep['Wrong in Wikipedia']]
                prep = prep[:15000]
                print("Size Data:", len(prep))
                print(f"Accuracy: {calculate_accuracy_score(prep)}")
                print(f"Log Odds: {calculate_log_odds_ratio_score(prep)}")
                print()

            else:
                if 'REDFM' in filename:
                    prep = prep[prep['Correct in Wikipedia']].copy()
                    # Wrap the single correct QID in a list for the ranking metrics
                    prep['Correct QIDs'] = prep['Correct QID'].apply(lambda x: [x])

                else:
                    prep = prep[prep.apply(
                        lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']),
                        axis=1
                    )].copy()
                    # Alternative ground truth (question + answer entities):
                    # prep['Correct QIDs'] = prep.apply(
                    #     lambda x: x['Question QIDs'] + x['Answer QIDs'],
                    #     axis=1
                    # )
                    prep['Correct QIDs'] = prep['Property QIDs']

                    # The metric functions overwrite the prediction column
                    # in place, hence the independent copy.
                    prep = prep[prep['Correct QIDs'].apply(
                        lambda x: len(x) > 0)
                    ].copy()

                print("Size Data:", len(prep))
                print(f"MRR: {calculate_mrr_score(prep, 'Retrieval QIDs', 'Correct QIDs')}")
                print(f"NDCG: {calculate_ndcg_score(prep, 'Retrieval QIDs', 'Correct QIDs')}")
                print()

In [None]:
# Load the two RuBQ property-test result sets that are merged below.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# that were produced by a trusted source.
file = "retrieval_results_RuBQ-wikidata_prototype-DB(en)-Query(en)_propertytest.pkl"
filename = f"{directory}/{file}"
# Context managers close the file handles after loading
with open(filename, "rb") as pickle_file:
    prep1 = pickle.load(pickle_file)

file = "retrieval_results_RuBQ-wikidatav11_v3_sorted_512dim-DB(en)-Query(en)_propertytest.pkl"
filename = f"{directory}/{file}"
with open(filename, "rb") as pickle_file:
    prep2 = pickle.load(pickle_file)

# Define a function to merge QIDs and scores, and sort
def merge_and_sort_qids(group):
    """Merge the retrieval results of all rows in ``group`` into one row.

    Parameters:
    - group (pd.DataFrame): Rows to merge; must provide the columns
      'Retrieval QIDs' and 'Retrieval Score'.

    Returns:
    - pd.Series: A copy of the first row of ``group`` whose
      'Retrieval QIDs' / 'Retrieval Score' hold the concatenated results of
      all rows, sorted by descending score. Rows where either column is not
      a list are ignored.
    """
    usable = [
        (row_qids, row_scores)
        for row_qids, row_scores in zip(group["Retrieval QIDs"], group["Retrieval Score"])
        if isinstance(row_qids, list) and isinstance(row_scores, list)
    ]
    all_qids = [qid for row_qids, _ in usable for qid in row_qids]
    all_scores = [score for _, row_scores in usable for score in row_scores]

    # Stable sort on the negated score: ties keep their concatenation order
    ranked = sorted(zip(all_qids, all_scores), key=lambda pair: -pair[1])

    # Every other column is taken from the first row of the group
    merged = group.iloc[0].copy()
    merged["Retrieval QIDs"] = [qid for qid, _ in ranked]
    merged["Retrieval Score"] = [score for _, score in ranked]
    return merged

combined = pd.concat([prep1, prep2], ignore_index=True)
# Group by Question and apply merge function
merged_df = combined.groupby("Question", as_index=False).apply(merge_and_sort_qids).reset_index(drop=True)


# Rows without retrieved QIDs are reported, then dropped
if merged_df['Retrieval QIDs'].apply(
    lambda x: (x is None) or (len(x) == 0)
).sum() != 0:
    print("Evaluation not complete")

# .copy() gives an independent frame so the column assignments below do
# not trigger SettingWithCopyWarning.
merged_df = merged_df[merged_df['Retrieval QIDs'].apply(
    lambda x: (x is not None) and (len(x) != 0)
)].copy()


# Collapse IDs to base QIDs and re-rank them by best score
merged_df[['Retrieval QIDs', 'Retrieval Score']] = merged_df.apply(
    lambda row: pd.Series(clean_results(row['Retrieval QIDs'], row['Retrieval Score'])),
    axis=1
)

merged_df = merged_df[merged_df.apply(
    lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']),
    axis=1
)].copy()
# Alternative ground truth (question + answer entities):
# prep['Correct QIDs'] = prep.apply(
#     lambda x: x['Question QIDs'] + x['Answer QIDs'],
#     axis=1
# )
merged_df['Correct QIDs'] = merged_df['Property QIDs']

# The metric functions overwrite the prediction column in place, hence
# the independent copy.
merged_df = merged_df[merged_df['Correct QIDs'].apply(
    lambda x: len(x) > 0)
].copy()

print("Size Data:", len(merged_df))
print(f"MRR: {calculate_mrr_score(merged_df, 'Retrieval QIDs', 'Correct QIDs')}")
print(f"NDCG: {calculate_ndcg_score(merged_df, 'Retrieval QIDs', 'Correct QIDs')}")
print()

In [None]:
from tqdm import tqdm
tqdm.pandas()

def reciprocal_rank_fusion(listKS, listVS, K=50):
    """Fuse two rankings with Reciprocal Rank Fusion.

    Parameters:
    - listKS (list): First ranked list of items.
    - listVS (list): Second ranked list of items.
    - K (number): Smoothing constant added to every rank.

    Returns:
    - list: All items of both lists, sorted by descending fused score.
      Each occurrence at 0-based position ``rank`` adds 1 / (K + rank + 1)
      to its item's score; ties keep their first-seen order.
    """
    fused = {}
    # listKS is processed first, so its items win ties in the stable sort
    for ranking in (listKS, listVS):
        for rank, item in enumerate(ranking):
            fused[item] = fused.get(item, 0) + 1 / (K + rank + 1)

    return sorted(fused, key=fused.get, reverse=True)

def weighted_scores(listKS, scoresKS, listVS, scoresVS, K=50):
    """Fuse two scored rankings by averaging their scores per item.

    Parameters:
    - listKS (list): Items of the first ranking.
    - scoresKS (list): Scores parallel to ``listKS``; each is squashed with
      a logistic function centred at 50 with scale 10 before averaging.
    - listVS (list): Items of the second ranking.
    - scoresVS (list): Scores parallel to ``listVS``, used as they are.
    - K: Unused.

    Returns:
    - list: All items sorted by descending combined score, where every
      occurrence contributes half of its (squashed) score.
    """
    combined = {}

    for item, raw_score in zip(listKS, scoresKS):
        z = (raw_score - 50)/10
        squashed = 1 / (1 + np.exp(-z))
        combined[item] = combined.get(item, 0) + squashed/2

    for item, raw_score in zip(listVS, scoresVS):
        combined[item] = combined.get(item, 0) + raw_score/2

    # Stable sort: equal scores keep their first-seen order
    return sorted(combined, key=combined.get, reverse=True)


def vectordb_query_similarity(text, QIDs):
    """
    Query the Wikidata vector database and get the similarity scores between the text and a list of QIDs.

    Parameters:
    - text (str): The query text to compare the QIDs to.
    - QIDs (list): List of QID strings to compare the query to.

    Returns:
    - The decoded JSON body of the response (presumably a list of dicts
      with QIDs and similarity scores; verify against the API).

    Raises:
    - Exception: If the API rejects the key (401) or the query (422).
    - requests.HTTPError: For any other HTTP error status.
    - requests.Timeout: If the service does not answer within 60 seconds.
    """

    # NOTE(review): this secret is committed in source; rotate it and drop
    # the fallback once WD_VECTORDB_API_SECRET is set in the environment.
    api_secret = os.environ.get(
        'WD_VECTORDB_API_SECRET',
        '453d3575-8f01-4d37-bbc9-973cffbe7429'
    )

    headers = {
        'x-api-secret': api_secret
    }

    params = {
        'query': text,
        'qid': ','.join(QIDs)
    }

    # Without a timeout, requests waits indefinitely on a stalled server
    response = requests.get(
        "https://wd-vectordb.toolforge.org/similarity-score",
        headers=headers,
        params=params,
        timeout=60
    )

    if response.status_code == 401:
        raise Exception('Invalid API key')

    if response.status_code == 422:
        raise Exception('Query is missing')

    # Surface other HTTP errors instead of decoding an error page as JSON
    response.raise_for_status()

    return response.json()

def keyword_reorder(listKS, listVS, K=50):
    """Reorder the items of ``listKS`` by their position in ``listVS``.

    Parameters:
    - listKS (list): Items to reorder.
    - listVS (list): Reference ranking.
    - K: Unused.

    Returns:
    - list: The unique items of ``listKS`` sorted by descending
      1 / (1-based position in ``listVS``); items missing from ``listVS``
      score 0 and keep their original relative order at the end.
    """
    relevance = {
        qid: (1/(listVS.index(qid)+1)) if qid in listVS else 0
        for qid in listKS
    }

    # Stable sort: equal scores keep the order they have in listKS
    return sorted(relevance, key=relevance.get, reverse=True)

def keyword_reorder_perfect(listKS, correct, K=50):
    """Build the ranking that puts the retrieved correct items first.

    Parameters:
    - listKS (list): Retrieved items.
    - correct (list): Correct items.
    - K: Unused.

    Returns:
    - list: The items of ``correct`` that occur in ``listKS`` (in the order
      of ``correct``), padded with the placeholder 'Q0' up to
      ``len(listKS)``.
    """
    found = [qid for qid in correct if qid in listKS]
    padding = ['Q0']*(len(listKS) - len(found))
    return found + padding

files = [
    '/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_REDFM-wikidatav10_v3_sorted-DB(en)-Query(en)_wikidata_keywordsearch_bm25.pkl',
    '/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_REDFM-wikidatav10_v3_sorted-DB(en)-Query(en)_removeprops_sorted.pkl',
]
names = [
    'Keyword Search',
    'Sorted & Filtered + Names'
]

# NOTE(review): pickle.load can execute arbitrary code; only load files
# that were produced by a trusted source.
# Context managers close the file handles after loading.
with open(files[0], "rb") as pickle_file:
    prep_keyword = pickle.load(pickle_file)
with open(files[1], "rb") as pickle_file:
    prep_vector = pickle.load(pickle_file)

# Column assignment aligns the keyword results with the vector results on
# the index — assumes both frames share it; TODO confirm.
prep_vector['Retrieval QIDs KS'] = prep_keyword['Retrieval QIDs']
prep_vector['Retrieval Score KS'] = prep_keyword['Retrieval Score']

# Collapse IDs to base QIDs and re-rank them by best score
prep_vector[['Retrieval QIDs', 'Retrieval Score']] = prep_vector.apply(
    lambda row: pd.Series(clean_results(row['Retrieval QIDs'], row['Retrieval Score'])),
    axis=1
)
prep_vector[['Retrieval QIDs KS', 'Retrieval Score KS']] = prep_vector.apply(
    lambda row: pd.Series(clean_results(row['Retrieval QIDs KS'], row['Retrieval Score KS'])),
    axis=1
)

# Alternative ground truth (question + answer entities):
# prep_vector = prep_vector[prep_vector.apply(
#     lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']),
#     axis=1
# )]
# prep_vector['Correct QIDs'] = prep_vector.apply(
#     lambda x: x['Question QIDs'] + x['Answer QIDs'],
#     axis=1
# )
# .copy() gives an independent frame so the column assignments below do
# not trigger SettingWithCopyWarning.
prep_vector = prep_vector[prep_vector['Correct in Wikipedia']].copy()
prep_vector['Correct QIDs'] = prep_vector['Correct QID'].apply(lambda x: [x])

prep_vector = prep_vector[prep_vector['Correct QIDs'].apply(
    lambda x: len(x) > 0)
].copy()

with open('/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/Sample IDs (EN).pkl', "rb") as pickle_file:
    sample = pickle.load(pickle_file)
sample = set(sample['QID'].unique())

print("On Full")
print('Vector:', calculate_mrr_score(prep_vector, 'Retrieval QIDs', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs', 'Correct QIDs'))
print('Keyword:', calculate_mrr_score(prep_vector, 'Retrieval QIDs KS', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs KS', 'Correct QIDs'))

prep_vector['Retrieval QIDs Fusion'] = prep_vector.apply(lambda x: reciprocal_rank_fusion(x['Retrieval QIDs KS'], x['Retrieval QIDs']), axis=1)

# Alternative fusion strategies:
# prep_vector['Retrieval QIDs Fusion'] = prep_vector.apply(lambda x: weighted_scores(x['Retrieval QIDs KS'], x['Retrieval Score KS'], x['Retrieval QIDs'], x['Retrieval Score']), axis=1)

# prep_vector['Retrieval QIDs Fusion'] = prep_vector.apply(lambda x: keyword_reorder(x['Retrieval QIDs KS'], x['Retrieval QIDs']), axis=1)

# prep_vector['Retrieval QIDs Fusion'] = prep_vector.progress_apply(lambda x: keyword_reorder(x['Retrieval QIDs'], x['Question']), axis=1)

print('RRF:', calculate_mrr_score(prep_vector, 'Retrieval QIDs Fusion', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs Fusion', 'Correct QIDs'))


print()
print("On Sample")

# Keep only keyword hits that belong to the sample, then pad with the
# placeholder 'Q0' up to the length of the vector result list.
prep_vector['Retrieval QIDs KS'] = prep_vector['Retrieval QIDs KS'].apply(lambda x: [i for i in x if i in sample])
prep_vector['Retrieval QIDs KS'] = prep_vector.apply(lambda x: x['Retrieval QIDs KS']+['Q0']*(len(x['Retrieval QIDs']) - len(x['Retrieval QIDs KS'])), axis=1)
print('Vector:', calculate_mrr_score(prep_vector, 'Retrieval QIDs', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs', 'Correct QIDs'))
# NOTE(review): the two metric calls below de-duplicate the
# 'Retrieval QIDs KS' column in place, which collapses the repeated 'Q0'
# padding to a single entry before the fusion further down. Without that
# side effect 'Q0' would accumulate a fused score at every padded rank.
print('Keyword:', calculate_mrr_score(prep_vector, 'Retrieval QIDs KS', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs KS', 'Correct QIDs'))

# NOTE(review): the argument order is swapped compared to the "On Full"
# fusion above; this only changes which list wins ties — confirm intent.
prep_vector['Retrieval QIDs Fusion'] = prep_vector.apply(lambda x: reciprocal_rank_fusion(x['Retrieval QIDs'], x['Retrieval QIDs KS']), axis=1)

print('RRF:', calculate_mrr_score(prep_vector, 'Retrieval QIDs Fusion', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep_vector, 'Retrieval QIDs Fusion', 'Correct QIDs'))

In [None]:
import pickle
import pandas as pd

def rrf(row, K=20):
    """Fuse the vector and keyword results of ``row`` with Reciprocal Rank Fusion.

    Parameters:
    - row: Mapping/Series providing 'Retrieval QIDs', 'Retrieval Score',
      'Retrieval QIDs KS' and 'Retrieval Score KS'.
    - K (number): Smoothing constant added to every rank.

    Returns:
    - list: The QIDs of both result lists sorted by descending fused score.
    """
    # (QID column, score column, rank offset). The keyword offset is 1.01
    # instead of 1, so at equal rank a keyword hit scores slightly less
    # than a vector hit.
    sources = (
        ('Retrieval QIDs', 'Retrieval Score', 1),
        ('Retrieval QIDs KS', 'Retrieval Score KS', 1.01),
    )

    fused = {}
    for qid_col, score_col, offset in sources:
        # Rank each source by its own scores before fusing
        ranked_qids, _ = clean_results(row[qid_col], row[score_col])
        for rank, qid in enumerate(ranked_qids):
            fused[qid] = fused.get(qid, 0) + 1 / (K + rank + offset)

    # Stable sort: equal scores keep their first-seen order
    return sorted(fused, key=fused.get, reverse=True)

def rerank_all(row):
    """Return the vector and keyword QIDs of ``row`` ranked together.

    The two QID lists are concatenated and ordered by ``clean_results``
    using the values of 'Retrieval Reranks' and 'Retrieval Reranks KS'
    (presumably reranker scores; verify against the producer of the data).
    """
    pooled_qids = row['Retrieval QIDs'] + row['Retrieval QIDs KS']
    pooled_scores = row['Retrieval Reranks'] + row['Retrieval Reranks KS']
    ranked_qids, _ = clean_results(pooled_qids, pooled_scores)
    return ranked_qids

def rerank_vector(row):
    """Return the 'Retrieval QIDs' of ``row`` ordered by ``clean_results``
    using the values of 'Retrieval Reranks' (presumably reranker scores;
    verify against the producer of the data)."""
    ranked_qids, _ = clean_results(row['Retrieval QIDs'], row['Retrieval Reranks'])
    return ranked_qids

def rerank_keyword(row):
    """Return the 'Retrieval QIDs KS' of ``row`` ordered by ``clean_results``
    using the values of 'Retrieval Reranks KS' (presumably reranker scores;
    verify against the producer of the data)."""
    ranked_qids, _ = clean_results(row['Retrieval QIDs KS'], row['Retrieval Reranks KS'])
    return ranked_qids

def no_rerank_vector(row):
    """Return the 'Retrieval QIDs' of ``row`` ordered by ``clean_results``
    using the original 'Retrieval Score' values."""
    ranked_qids, _ = clean_results(row['Retrieval QIDs'], row['Retrieval Score'])
    return ranked_qids

def no_rerank_keyword(row):
    """Return the 'Retrieval QIDs KS' of ``row`` ordered by ``clean_results``
    using the original 'Retrieval Score KS' values."""
    ranked_qids, _ = clean_results(row['Retrieval QIDs KS'], row['Retrieval Score KS'])
    return ranked_qids

# Plain string: the path has no placeholders, so no f-string is needed
save_path = '/home/philippe.saade/GitHub/WikidataTextEmbedding/data/Evaluation Data/retrieval_results_Mintaka-wikidata_prototype-DB(en)-Query(en)_reranked.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files
# that were produced by a trusted source.
# The context manager closes the file handle after loading.
with open(save_path, "rb") as pickle_file:
    prep = pickle.load(pickle_file)

# .copy() gives an independent frame so the column assignments below do
# not trigger SettingWithCopyWarning.
prep = prep[prep.apply(
    lambda x: all(x['Question in Wikipedia'] + x['Answer in Wikipedia']),
    axis=1
)].copy()
# Ground truth: the question QIDs followed by the answer QIDs
prep['Correct QIDs'] = prep.apply(
    lambda x: x['Question QIDs'] + x['Answer QIDs'],
    axis=1
)
# Alternative ground truth (single correct QID):
# prep_vector = prep_vector[prep_vector['Correct in Wikipedia']]
# prep['Correct QIDs'] = prep['Correct QID'].apply(lambda x: [x])

# prep = prep[prep['Correct QIDs'].apply(
#     lambda x: len(x) > 0)
# ]

# Each strategy below writes its ranking into 'Reranked QIDs' and is
# scored right away, before the next strategy overwrites the column.
prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([no_rerank_vector(row)]), axis=1)
print('Vector:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))

prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([no_rerank_keyword(row)]), axis=1)
print('Keyword:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))

prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([rrf(row)]), axis=1)
print('RRF:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))

# Reranker-based strategies (currently disabled):
# prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([rerank_vector(row)]), axis=1)
# print('Rerank Vector:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
# print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))

# prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([rerank_keyword(row)]), axis=1)
# print('Rerank Keyword:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
# print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))

# prep[['Reranked QIDs']] = prep.apply(lambda row: pd.Series([rerank_all(row)]), axis=1)
# print('Rerank All:', calculate_mrr_score(prep, 'Reranked QIDs', 'Correct QIDs'))
# print('\t', calculate_ndcg_score(prep, 'Reranked QIDs', 'Correct QIDs'))