## Compute the ideal (best-achievable) precision, recall and F1 values for the given dataset

In [2]:
# Directory holding the merged relevance-judgement (qrels) files; the cells
# below read every ``*.json`` file found directly inside it.
QRELS_DIR = '../data/annotation/results/qrels_merged'

In [3]:
def count_non_zero(lst):
    """Return how many elements of *lst* are not equal to zero.

    *lst* may be any iterable (a list, ``dict.values()``, ...); an empty
    iterable yields 0.
    """
    return sum(1 for value in lst if value != 0)


In [7]:
# Statistics for relevance judgements
import os
import json
import statistics

def count_judgements(directory):
    """Count the relevance judgements stored in each JSON file of a directory.

    Args:
        directory: Path of the directory to scan (not recursive). Only
            entries whose name ends in ``.json`` are read.

    Returns:
        A dict mapping a query id to the length of that file's top-level
        ``"judgements"`` list (0 when the key is absent). The query id is
        the part of the file name before its first ``"."``.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    judgements_count = {}

    for filename in os.listdir(directory):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(filepath, encoding="utf-8") as file:
            data = json.load(file)
        qid = filename.split(".")[0]
        judgements_count[qid] = len(data.get('judgements', []))

    return judgements_count

# Count the judgements per query in the merged qrels, show the raw mapping,
# then print the mean, minimum and maximum of those counts (one per line).
directory_path = QRELS_DIR
result = count_judgements(directory_path)
print(result)


judgements = list(result.values())
for summarise in (statistics.mean, min, max):
    print(summarise(judgements))

{'EQ_10': 8, 'EQ_38': 5, 'EQ_23': 6, 'EQ_22': 4, 'EQ_2': 4, 'EQ_28': 19, 'EQ_21': 5, 'EQ_25': 5, 'EQ_32': 3, 'EQ_14': 3, 'EQ_44': 5, 'EQ_16': 17, 'EQ_5': 2, 'EQ_4': 6, 'EQ_35': 8, 'EQ_24': 11, 'EQ_34': 11, 'EQ_19': 5, 'EQ_1': 5, 'EQ_29': 6, 'EQ_9': 6, 'EQ_27': 4, 'EQ_37': 2, 'EQ_42': 8, 'EQ_12': 9, 'EQ_48': 4, 'EQ_26': 4, 'EQ_45': 1, 'EQ_49': 4, 'EQ_33': 6, 'EQ_47': 5, 'EQ_13': 3, 'EQ_18': 3, 'EQ_40': 7, 'EQ_3': 7, 'EQ_17': 2, 'EQ_46': 9, 'EQ_6': 1, 'EQ_20': 10, 'EQ_8': 2, 'EQ_11': 2, 'EQ_15': 3, 'EQ_43': 2, 'EQ_39': 1, 'EQ_36': 3, 'EQ_7': 6, 'EQ_30': 2}
5.404255319148936
1
19


In [22]:
import os
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Cut-off rank: every metric below is reported @k
k = 10

# Load the qrels. For each query keep only its relevance labels, ordered from
# highest to lowest, and tally the dataset-wide number of judgements and of
# non-zero (relevant) judgements along the way.
judgements = {}
judgement_count = 0
relevant_count = 0
for json_file in os.listdir(QRELS_DIR):
    if json_file.endswith(".json"):
        with open(os.path.join(QRELS_DIR, json_file)) as f:
            data = json.load(f)
        qid = data['qid']
        labels = sorted((item['relevance'] for item in data['judgements']), reverse=True)
        judgements[qid] = labels
        judgement_count += len(labels)
        relevant_count += count_non_zero(labels)

# Per-query metrics@k of a ranking that places every relevant document first.
# ideal_ndcgs is never filled in this cell (the NDCG code below is disabled).
ideal_precisions, ideal_recalls, ideal_f1_scores, ideal_ndcgs = [], [], [], []
for qid, relevance_scores in judgements.items():
    # At most k of the query's relevant documents fit into the top k.
    n_relevant = count_non_zero(relevance_scores)
    n_in_top_k = min(k, n_relevant)

    precision = n_in_top_k / k if k > 0 else 0
    recall = n_in_top_k / n_relevant if n_relevant > 0 else 0
    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    # Disabled NDCG computation, kept for reference:
    # # Check if all relevance scores are zero
    # if sum(relevance_scores) == 0:
    #     ndcg = 0
    # else:
    #     # Calculate NDCG@k
    #     dcg = relevance_scores[0]
    #     for i in range(1, min(len(relevance_scores), k)):
    #         dcg += relevance_scores[i] / np.log2(i+1)
    #     idcg = sorted(relevance_scores, reverse=True)
    #     idcg = idcg[0] + sum([idcg[i] / np.log2(i+1) for i in range(1, min(len(idcg), k))])
    #     ndcg = dcg / idcg if idcg > 0 else 0

    ideal_precisions.append(precision)
    ideal_recalls.append(recall)
    ideal_f1_scores.append(f1)
    # all_ndcgs.append(ndcg)

# Macro-average over queries
avg_precision = np.mean(ideal_precisions)
avg_recall = np.mean(ideal_recalls)
avg_f1_score = np.mean(ideal_f1_scores)
# avg_ndcg = np.mean(all_ndcgs)

# Report the averaged metrics@k, then the dataset totals
print(f"Metrics@{k}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1_score:.4f}")
# print(f"Average NDCG: {avg_ndcg:.4f}")

print(f"# queries: {len(judgements)}")
print(f"# judgements: {judgement_count}")
print(f"# relevant: {relevant_count}")

Metrics@10
Average Precision: 0.4191
Average Recall: 0.9856
Average F1 Score: 0.5356
# queries: 47
# judgements: 254
# relevant: 208


In [23]:
# Sanity check: print one 'zero' line for every query whose ideal precision
# is exactly 0.
for item in ideal_precisions:
    if item != 0:
        continue
    print('zero')

# Compute metrics

In [30]:
# Directory of the merged relevance-judgement (qrels) JSON files.
QRELS_DIR = '../data/annotation/results/qrels_merged'

# Directory with the ranking results of the run to evaluate. Keep exactly one
# RESULTS_DIR assignment active; the commented-out lines are the other
# available runs.
# NOTE(review): the path layout looks like <indexed field>/<retrieval model>
# — confirm against the code that writes these directories.
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text/bm25'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text/model1'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text/model2'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code/bm25'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code/model1'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code/model2'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code/model3'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code/model4'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/code_comments/bm25'
RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text_code/bm25'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text_code/model1'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/text_code/model2'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/fusion/bm25_model1'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/fusion/bm25_model2'
# RESULTS_DIR = '../data/evaluation/results/evaluation_queries/fusion/model1_model2'


In [31]:
# (short name, model identifier) pairs. The short names match the "modelN"
# components of the RESULTS_DIR paths.
# NOTE(review): EMBEDDING_MODELS is not read by any cell visible in this
# notebook section — presumably consumed elsewhere; confirm before removing.

# Text (disabled alternative)
# EMBEDDING_MODELS = [("model1", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
#                      ("model2", "sentence-transformers/all-mpnet-base-v2")]

# Code (active)
EMBEDDING_MODELS = [("model1", "microsoft/codebert-base"), 
                    ("model2", "flax-sentence-embeddings/st-codesearch-distilroberta-base"), 
                    ("model3", "sentence-transformers/multi-qa-mpnet-base-dot-v1"),
                    ("model4", "sentence-transformers/all-mpnet-base-v2")]

In [32]:
def divide_lists(list1, list2):
    """Divide two equally long lists element by element.

    Args:
        list1: Numerators.
        list2: Denominators, same length as *list1*.

    Returns:
        A new list whose i-th entry is ``list1[i] / list2[i]``, or
        ``float('inf')`` wherever ``list2[i]`` equals zero.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length.")

    # A zero denominator maps to +inf instead of raising ZeroDivisionError.
    return [
        numerator / denominator if denominator != 0 else float('inf')
        for numerator, denominator in zip(list1, list2)
    ]


In [33]:
# Display the currently selected results directory as the cell output.
RESULTS_DIR

'../data/evaluation/results/evaluation_queries/text_code/bm25'

In [34]:
import os
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the run stored in RESULTS_DIR against the qrels in QRELS_DIR and
# report macro-averaged precision, recall, F1 and NDCG at several cut-offs.
# A document counts as relevant when its judged relevance label is non-zero;
# documents without a judgement are treated as label 0.


def _json_paths(directory):
    """Return the paths of the ``.json`` files directly inside *directory*."""
    return [
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json")
    ]


def _read_json(path):
    """Parse the JSON document at *path*, decoding it as UTF-8."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _f1(precision, recall):
    """Return the harmonic mean of *precision* and *recall* (0 when both are 0)."""
    return _safe_ratio(2 * (precision * recall), precision + recall)


def _dcg(gains):
    """Return the discounted cumulative gain of *gains*, given in rank order.

    The first gain is taken as is; the gain at 0-based index i >= 1 is divided
    by log2(i + 1). An empty sequence yields 0.
    """
    return sum(
        gain if i == 0 else gain / np.log2(i + 1)
        for i, gain in enumerate(gains)
    )


# The input files do not depend on k, so read them once instead of once per
# cut-off.

# Relevance judgements: qid -> {docid: relevance label}
judgements = {}
for path in _json_paths(QRELS_DIR):
    data = _read_json(path)
    judgements[data['qid']] = {item['docid']: item['relevance'] for item in data['judgements']}

# Ranking results: qid -> [(docid, score), ...]
results = {}
for path in _json_paths(RESULTS_DIR):
    data = _read_json(path)
    results[data['qid']] = [(item['docid'], item['score']) for item in data['docs']]

# Set the value of k for computing metrics@k
for k in [5, 10]:

    # Per-query metrics@k
    all_precisions = []
    all_recalls = []
    all_f1_scores = []
    all_ndcgs = []

    # Per-query metrics@k divided by the best value reachable for that query
    norm_precisions = []
    norm_recalls = []
    norm_f1_scores = []

    for qid, qrels in judgements.items():
        # Top-k documents by descending score. A judged query without a
        # results entry raises KeyError on purpose: an incomplete run should
        # not be scored silently.
        ranked_docs = sorted(results[qid], key=lambda x: x[1], reverse=True)[:k]
        # Relevance label of each retrieved document, in rank order
        relevance_scores = [qrels.get(docid, 0) for docid, _ in ranked_docs]

        print(qid, relevance_scores)

        # Precision, recall and F1 of the run
        tp = count_non_zero(relevance_scores)
        total_relevant = count_non_zero(qrels.values())
        precision = _safe_ratio(tp, k)
        recall = _safe_ratio(tp, total_relevant)
        f1 = _f1(precision, recall)

        # The same metrics for a ranking that lists every relevant document
        # first: at most k of the query's relevant documents fit in the top k.
        ideal_scores = sorted(qrels.values(), reverse=True)
        ideal_tp = min(k, total_relevant)
        ideal_precision = _safe_ratio(ideal_tp, k)
        ideal_recall = _safe_ratio(ideal_tp, total_relevant)
        ideal_f1 = _f1(ideal_precision, ideal_recall)

        # A query without any relevant judgement has ideal values of 0;
        # report 0 for it instead of raising ZeroDivisionError.
        norm_precision = _safe_ratio(precision, ideal_precision)
        norm_recall = _safe_ratio(recall, ideal_recall)
        norm_f1 = _safe_ratio(f1, ideal_f1)

        # NDCG@k: DCG of the retrieved labels over the DCG of the k highest
        # judged labels for the query (the ideal ranking). The ideal DCG is a
        # sum of discounted gains, not a count of non-zero terms.
        ndcg = _safe_ratio(_dcg(relevance_scores), _dcg(ideal_scores[:k]))

        # Append the evaluation metrics to the lists
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1_scores.append(f1)
        all_ndcgs.append(ndcg)

        norm_precisions.append(norm_precision)
        norm_recalls.append(norm_recall)
        norm_f1_scores.append(norm_f1)

    # Compute the average evaluation metrics@k across all queries
    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_f1_score = np.mean(all_f1_scores)
    avg_ndcg = np.mean(all_ndcgs)

    norm_avg_precision = np.mean(norm_precisions)
    norm_avg_recall = np.mean(norm_recalls)
    norm_avg_f1_score = np.mean(norm_f1_scores)

    # Print the evaluation metrics@k
    print(f"Metrics@{k}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1 Score: {avg_f1_score:.4f}")
    print(f"Average NDCG: {avg_ndcg:.4f}\n")

# print(f"Metrics@{k}")
# print(f"Average Precision: {norm_avg_precision:.4f}")
# print(f"Average Recall: {norm_avg_recall:.4f}")
# print(f"Average F1 Score: {norm_avg_f1_score:.4f}")


EQ_10 [0, 0, 0, 0, 0]
EQ_38 [0, 0, 0, 2, 0]
EQ_23 [2, 0, 0, 0, 0]
EQ_22 [2, 0, 1, 1, 0]
EQ_2 [0, 0, 2, 1, 2]
EQ_28 [0, 0, 0, 0, 0]
EQ_21 [0, 0, 0, 0, 0]
EQ_25 [0, 0, 1, 2, 0]
EQ_32 [0, 0, 0, 0, 1]
EQ_14 [0, 0, 1, 2, 0]
EQ_44 [0, 0, 1, 0, 0]
EQ_16 [0, 0, 0, 0, 0]
EQ_5 [0, 0, 0, 0, 2]
EQ_4 [0, 0, 2, 0, 0]
EQ_35 [0, 0, 0, 1, 0]
EQ_24 [2, 2, 0, 2, 0]
EQ_34 [0, 3, 0, 0, 3]
EQ_19 [1, 0, 0, 0, 0]
EQ_1 [2, 0, 2, 2, 0]
EQ_29 [1, 0, 0, 3, 2]
EQ_9 [3, 0, 2, 2, 3]
EQ_27 [0, 0, 0, 0, 0]
EQ_37 [0, 0, 0, 0, 0]
EQ_42 [0, 0, 0, 1, 0]
EQ_12 [0, 0, 0, 0, 0]
EQ_48 [2, 0, 0, 0, 0]
EQ_26 [0, 3, 0, 2, 0]
EQ_45 [0, 0, 0, 0, 0]
EQ_49 [0, 0, 1, 0, 0]
EQ_33 [3, 2, 2, 2, 0]
EQ_47 [2, 2, 2, 2, 0]
EQ_13 [0, 0, 0, 0, 0]
EQ_18 [0, 0, 0, 0, 0]
EQ_40 [0, 3, 0, 3, 3]
EQ_3 [0, 0, 0, 0, 0]
EQ_17 [0, 0, 0, 0, 0]
EQ_46 [2, 0, 0, 0, 0]
EQ_6 [0, 0, 0, 0, 0]
EQ_20 [0, 2, 3, 0, 2]
EQ_8 [1, 0, 0, 0, 1]
EQ_11 [0, 0, 0, 0, 0]
EQ_15 [0, 0, 0, 0, 2]
EQ_43 [0, 0, 2, 0, 0]
EQ_39 [0, 0, 0, 0, 0]
EQ_36 [0, 0, 2, 0, 0]
EQ_7 [2, 2, 0, 0, 

In [81]:
len(judgements.keys())

47