In [None]:
import json
import math
from collections import defaultdict, Counter
from pathlib import Path
from typing import Dict
# --- Configuration ------------------------------------------------------------
# Evaluation cut-off K: only the top `limit` results of each query are scored.
limit = 10

# Root of the thesis working tree. This is the only machine-specific value;
# change it here instead of editing every path below.
THESIS_ROOT = "/Users/acer/Desktop/thesis"

# Dataset sub-directory, shared by the datasets/ and results/ trees.
DATASET = "./quora"

# Per-query retrieval results of the two runs being compared (JSONL).
RESULTS_FILE = f"{THESIS_ROOT}/results/{DATASET}/bm42.jsonl"
RESULTS_FILE_BASELINE = f"{THESIS_ROOT}/results/{DATASET}/bm25.jsonl"

# Ground truth: query texts (JSONL) and relevance judgements (TSV).
QUERIES = f"{THESIS_ROOT}/datasets/{DATASET}/queries.jsonl"
QRELS = f"{THESIS_ROOT}/datasets/{DATASET}/test.tsv"

In [None]:
# Load annotated queries together with their relevance judgements (qrels).
def load_queries(_queries, qrels):
    """Load queries and attach the ids of their relevant documents.

    Args:
        _queries: Path to a JSONL file with one query object per line; every
            object must contain an "_id" field.
        qrels: Path to a tab-separated file made of one header row followed
            by `query_id<TAB>doc_id<TAB>score` rows.

    Returns:
        Dict mapping query id (as str) to the query row extended with a
        "doc_ids" list that holds every doc id judged with a non-zero score.
        Queries that end up with no relevant document are dropped.

    Raises:
        KeyError: if the qrels file references a query id that is not
            present in the queries file.
    """
    queries = {}
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(_queries, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            row = json.loads(line)
            queries[str(row["_id"])] = {**row, "doc_ids": []}

    with open(qrels, "r", encoding="utf-8") as file:
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            if not line.strip():
                continue  # a blank line cannot be unpacked into three fields
            query_id, doc_id, score = line.strip().split("\t")
            if int(score):
                queries[query_id]["doc_ids"].append(doc_id)

    return {qid: q for qid, q in queries.items() if len(q["doc_ids"]) > 0}

# NDCG@K for binary relevance: a retrieved document contributes a gain of 1
# when it is in `relevant` and 0 otherwise.
def ndcg_at_k(retrieved, relevant, k=limit):
    """Return the normalised discounted cumulative gain of the top-k results.

    Args:
        retrieved: Ranked sequence of doc ids, best first.
        relevant: Container of relevant doc ids (membership is tested with `in`).
        k: Rank cut-off; defaults to the notebook-level `limit`.

    Returns:
        DCG of `retrieved[:k]` divided by the ideal DCG, or 0 when the ideal
        DCG is not positive (e.g. `relevant` is empty).
    """
    # DCG: each hit is discounted by log2 of (1-based rank + 1).
    achieved = 0.0
    position = 0
    for candidate in retrieved[:k]:
        position += 1
        if candidate not in relevant:
            continue
        achieved += 1 / math.log2(position + 1)

    # Ideal DCG: every relevant document (at most k of them) sits at the top.
    best_case_hits = min(len(relevant), k)
    ideal = sum(1 / math.log2(slot + 2) for slot in range(best_case_hits))

    if ideal > 0:
        return achieved / ideal
    return 0



def get_ndcg2id(_loaded_queries, res_file):
    """Compute NDCG@`limit` for every annotated query found in a results file.

    Args:
        _loaded_queries: Dict mapping query id (str) to a query record whose
            "doc_ids" entry lists the relevant doc ids (as produced by
            `load_queries`).
        res_file: Path to a JSONL file; each line is an object with a
            "query_id" field and a ranked "results" list whose items carry a
            "doc_id" field.

    Returns:
        Dict mapping query id (str) to its NDCG@`limit` score. Result lines
        whose query id is not in `_loaded_queries` are ignored.

    Side effects:
        Prints the mean NDCG@`limit` over all evaluated result lines.
    """
    ndcgs = []      # one score per evaluated result line, used for the average
    ndcg2id = {}    # query id -> score

    with open(res_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            entry = json.loads(line)
            query_id = str(entry["query_id"])

            # Look the query up in the argument, not in a notebook-global
            # `queries`, so the function evaluates exactly what it was given.
            if query_id not in _loaded_queries:
                continue

            found_ids = [str(doc["doc_id"]) for doc in entry["results"][:limit]]
            relevant = set(_loaded_queries[query_id]["doc_ids"])

            ndcg = ndcg_at_k(found_ids, relevant, k=limit)
            ndcg2id[query_id] = ndcg
            ndcgs.append(ndcg)

    if not ndcgs:
        # Nothing matched: report it instead of dividing by zero.
        print(f"Average NDCG@{limit}: n/a (no evaluated queries) for {res_file}")
        return ndcg2id

    # The label reflects the real cut-off used above.
    print(f"Average NDCG@{limit}: {sum(ndcgs) / len(ndcgs):.4f} for {res_file}")
    return ndcg2id


In [None]:
# Ground truth: annotated queries with the ids of their relevant documents.
queries = load_queries(QUERIES, QRELS)

# Score each run against the same ground truth; each call prints its own average.
print("BM42:")
scores_bm42 = get_ndcg2id(queries, RESULTS_FILE)

print("BM25:")
scores_baseline = get_ndcg2id(queries, RESULTS_FILE_BASELINE)

In [None]:
# Split the BM25 (baseline) per-query scores into parallel id / score lists.
# keys() and values() iterate in the same order, so index i of both lists
# refers to the same query.
id_baseline = list(scores_baseline.keys())
score_baseline = list(scores_baseline.values())

# Same split for the BM42 per-query scores.
id_bm42 = list(scores_bm42.keys())
score_bm42 = list(scores_bm42.values())

In [None]:
# Per-query NDCG gap between the BM42 run and the BM25 baseline.

gap_score = []      # gaps, in the iteration order of scores_bm42
scores_dict = {}    # query id -> both scores, their gap and their sum

# Pair the two runs by query id rather than by list position: the result files
# are not guaranteed to list the same queries in the same order, and positional
# pairing would then silently compare scores of different queries.
for query_id, bm42_score in scores_bm42.items():
    if query_id not in scores_baseline:
        continue  # no baseline score for this query, so there is no gap to compute
    baseline_score = scores_baseline[query_id]

    gap = bm42_score - baseline_score           # > 0 means BM42 beat the baseline
    # Deliberately not named `sum`: rebinding that name at notebook level would
    # shadow the builtin that ndcg_at_k and get_ndcg2id call.
    score_sum = bm42_score + baseline_score

    scores_dict[query_id] = {
        "baseline": baseline_score,
        "bm42": bm42_score,
        "score_gap": gap,
        "sum": score_sum,
    }
    gap_score.append(gap)

In [None]:
# Two views of scores_dict ordered by "score_gap":
#   worse_bm42  - ascending  (smallest gap first)
#   better_bm42 - descending (largest gap first)
worse_bm42 = dict(
    sorted(scores_dict.items(), key=lambda pair: pair[1]["score_gap"])
)
better_bm42 = dict(
    sorted(scores_dict.items(), key=lambda pair: pair[1]["score_gap"], reverse=True)
)

In [None]:
# Print every query with its scores, largest gap first.
for query_id, score_row in better_bm42.items():
    print(query_id, score_row)