In [1]:
import json
import statistics

import numpy as np

from config.storage import get_gradient_similarity_bm25_selected_file_path

In [49]:
def get_gradient_similarities_from_file(sample_size: int):
    """Load the stored gradient similarities for the given sample size.

    The file location is obtained from
    ``get_gradient_similarity_bm25_selected_file_path(sample_size)`` and the
    file content is parsed as JSON.

    Args:
        sample_size: Forwarded to the path helper to select the file.

    Returns:
        The deserialized JSON content of the file.
        NOTE(review): the other helpers in this notebook treat it as a dict
        of ``{id: {candidate_id: similarity}}`` - confirm against the writer.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The ``with`` statement closes the file on exit, so no explicit close()
    # call is required.
    with open(get_gradient_similarity_bm25_selected_file_path(sample_size)) as f:
        return json.load(f)

In [50]:
def calculate_score(gradient_similarities: dict):
    """Return the fraction of entries whose best-scoring candidate is itself.

    For every ``(original_id, similarities)`` pair, the key of
    ``similarities`` with the highest value is looked up; the entry counts as
    correct when that key equals ``original_id``.

    Args:
        gradient_similarities: Mapping of an id to a mapping of candidate ids
            to similarity values.

    Returns:
        Number of correct entries divided by the total number of entries, or
        ``0.0`` when ``gradient_similarities`` is empty (previously this case
        raised ``ZeroDivisionError``).

    Raises:
        ValueError: If any inner mapping is empty (``max`` of an empty
            iterable).
    """
    # Guard against division by zero for an empty input.
    if not gradient_similarities:
        return 0.0

    # Booleans sum as 0/1, so this counts the correct decisions.
    correct_counter = sum(
        original_id == max(similarities, key=similarities.get)
        for original_id, similarities in gradient_similarities.items()
    )

    return correct_counter / len(gradient_similarities)

In [51]:
def get_wrong_decisions(gradient_similarities: dict):
    """Collect the ids whose best-scoring candidate is not the id itself.

    Args:
        gradient_similarities: Mapping of an id to a mapping of candidate ids
            to similarity values.

    Returns:
        List of ids, in the iteration order of ``gradient_similarities``, for
        which the highest-valued key of the inner mapping differs from the id.
    """
    def best_candidate(scores):
        # Key with the largest value; ties resolve to the first key seen.
        return max(scores, key=scores.get)

    return [
        sample_id
        for sample_id, scores in gradient_similarities.items()
        if best_candidate(scores) != sample_id
    ]

In [52]:
def get_mean_similarity_score_of_most_similar_samples(gradient_similarities: dict, mean_function = np.mean):
    """Aggregate the highest similarity value of every entry.

    Args:
        gradient_similarities: Mapping of an id to a mapping of candidate ids
            to similarity values.
        mean_function: Callable applied to the list of per-entry maxima;
            defaults to ``np.mean``.

    Returns:
        Whatever ``mean_function`` returns for the list of per-entry maxima.
    """
    # Only the inner mappings matter here; the outer ids are not used.
    peak_scores = [max(scores.values()) for scores in gradient_similarities.values()]
    return mean_function(peak_scores)

In [53]:
# Load the stored gradient similarities for sample size 5.
gradient_similarities_subset_5 = get_gradient_similarities_from_file(5)

In [54]:
# Share of entries in the 5-sample subset whose highest-similarity key equals their own id.
calculate_score(gradient_similarities_subset_5)

1.0

In [55]:
# Load the stored gradient similarities for sample size 100.
gradient_similarities_subset_100 = get_gradient_similarities_from_file(100)

In [56]:
# Share of entries in the 100-sample subset whose highest-similarity key equals their own id.
calculate_score(gradient_similarities_subset_100)

0.99

In [57]:
# Ids in the 100-sample subset whose highest-similarity key differs from their own id.
get_wrong_decisions(gradient_similarities_subset_100)

['lima_70']

In [58]:
# bm25 made a mistake at lima_70

In [59]:
# Mean (np.mean, the default aggregator) of each entry's highest similarity value in the 100-sample subset.
get_mean_similarity_score_of_most_similar_samples(gradient_similarities_subset_100)

np.float64(0.7384314449690282)

In [62]:
# Median (statistics.median) of each entry's highest similarity value in the 100-sample subset.
get_mean_similarity_score_of_most_similar_samples(gradient_similarities_subset_100, statistics.median)

0.7357294261455536