In [23]:
import json

In [24]:
small = "swtor"
big = "starwars"

In [25]:
mappings_file_small = "./_input/mappings/" + small + ".json"
mappings_file_big = "./_input/mappings/" + big + ".json"

gold_pairs_file = "./_input/gold_pairs/" + small + "-" + big + ".txt"
exact_match_file = "./_input/exact_match_all/" + small + "-" + big + ".json"
found_pairs_file = "./_input/found_pairs_em/" + small + "-" + big + ".txt"

In [26]:
with open(mappings_file_small) as file:
    mappings_small = {str(v): k for k, v in json.load(file).items()}
    mappings_small_reversed = {v: k for k, v in mappings_small.items()}

with open(mappings_file_big) as file:
    mappings_big = {str(v): k for k, v in json.load(file).items()}
    mappings_big_reversed = {v: k for k, v in mappings_big.items()}

with open(gold_pairs_file) as gpf:
    gold_pairs = []
    for line in gpf:
        numbers_list = [int(num) for num in line.strip().split(";")]
        gold_pairs.append(numbers_list)

with open(found_pairs_file) as fpf:
    found_pairs = []
    found_pairs_score = []
    for line in fpf:
        num_1 = int(mappings_small_reversed[line.strip().split("###")[0]])
        num_2 = int(mappings_big_reversed[line.strip().split("###")[1]])
        num_3 = float(line.strip().split("###")[2])
        numbers_list = [num_1, num_2]
        found_pairs.append(numbers_list)
        numbers_list_score = [num_1, num_2, num_3]
        found_pairs_score.append(numbers_list_score)

gold_pairs_left = {pair[0] for pair in gold_pairs}
gold_pairs_right = {pair[1] for pair in gold_pairs}

with open(exact_match_file) as file:
    exact_match = json.load(file)

exact_match_left = {pair[0] for pair in exact_match}

In [27]:
def compute_precision_recall_f1(gold_input, found_input):

    gold_set = set(tuple(pair) for pair in gold_input)
    found_set = set(tuple(pair) for pair in found_input)

    true_positives = gold_set & found_set
    tp = len(true_positives)
    fn = len(gold_set - found_set)
    fp = len(found_set - gold_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

In [28]:
def print_found_but_not_gold(gold_input, found_input):

    found_but_not_gold = [[mappings_small[str(pair[0])], mappings_big[str(pair[1])]]
                          for pair in found_input if pair not in gold_input]

    for item in found_but_not_gold:
        print(item[0])
        print(item[1])
        print()

    print(len(found_but_not_gold))

In [29]:
def print_gold_not_found(gold_input, found_input):

    gold_not_found = [[mappings_small[str(pair[0])], mappings_big[str(pair[1])]]
                          for pair in gold_input if pair not in found_input]

    for item in gold_not_found:
        print(item[0])
        print(item[1])
        print()

    print(len(gold_not_found))

In [30]:
def clean_found_pairs(found_input_score, threshold):

    temp_dict = dict()

    for pair in found_input_score:
        if str(pair[1]) not in temp_dict:
            temp_dict[str(pair[1])] = pair[2]
        elif pair[2] > temp_dict[str(pair[1])]:
            temp_dict[str(pair[1])] = pair[2]

    output = []

    for pair in found_input_score:
        if ((pair[0] in gold_pairs_left or pair[1] in gold_pairs_right)
                and pair[2] > threshold
                and pair[2] == temp_dict[str(pair[1])]):
            output.append([pair[0], pair[1]])

    return output

In [31]:
def add_exact_match(found_input):

    output = []

    for em in exact_match:
        if em[0] in gold_pairs_left or em[1] in gold_pairs_right:
            output.append(em)

    for pair in found_input:
        if (pair[0] in gold_pairs_left or pair[1] in gold_pairs_right) and pair[0] not in exact_match_left:
            output.append(pair)

    return output

In [32]:
def calculate_threshold(found_input_score):

    temp_dict = dict()

    for pair in found_input_score:
        if str(pair[1]) not in temp_dict:
            temp_dict[str(pair[1])] = pair[2]
        elif pair[2] > temp_dict[str(pair[1])]:
            temp_dict[str(pair[1])] = pair[2]

    count = 0
    sum_score = 0.0
    for em in exact_match:
        if mappings_big_reversed[str(em[1])] in temp_dict:
            count += 1
            sum_score += temp_dict[mappings_big_reversed[str(em[1])]]

    return sum_score / count

In [33]:
len(found_pairs)

5385

In [34]:
precision, recall, f1 = compute_precision_recall_f1(gold_pairs, found_pairs)

print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")

Precision: 0.18087
Recall: 0.68160
F1 Score: 0.28588


In [35]:
reranker_threshold = calculate_threshold(found_pairs_score)
print(f"Threshold: {reranker_threshold:.5f}")

Threshold: 4.27462


In [36]:
found_pairs_cleaned = clean_found_pairs(found_pairs_score, reranker_threshold)

In [37]:
len(found_pairs_cleaned)

645

In [38]:
precision, recall, f1 = compute_precision_recall_f1(gold_pairs, found_pairs_cleaned)

print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")

Precision: 0.83411
Recall: 0.37649
F1 Score: 0.51880


In [39]:
found_pairs_cleaned_em = add_exact_match(found_pairs_cleaned)

In [40]:
len(found_pairs_cleaned_em)

645

In [41]:
precision, recall, f1 = compute_precision_recall_f1(gold_pairs, found_pairs_cleaned_em)

print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")

Precision: 0.83411
Recall: 0.37649
F1 Score: 0.51880


In [42]:
print_found_but_not_gold(gold_pairs, found_pairs_cleaned_em)

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Jolee_Bindo
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Unidentified_Jedi_Padawan

http://dbkwik.webdatacommons.org/swtor.wikia.com/property/leaders
http://dbkwik.webdatacommons.org/starwars.wikia.com/property/leader

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/HK-51
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Darth_Scabrous%27_HK_droid

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Shian
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Unidentified_Jedi_Master_(Holocron)

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Exar_Kun
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Holocron_Dark_Jedi

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Jedi_Purge
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Jedi_Purge

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Arcona
http://dbkwik.webdatacommons.org/star

In [43]:
print_gold_not_found(gold_pairs, found_pairs_cleaned_em)

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/2V-R8
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/2V-R8

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/26,347_BTC
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/30,000_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/ATC_0
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/3653_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/14_BTC
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/3667_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/18_BTC
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/3671_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/BTC_103
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/3756_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/298_BTC
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/3951_BBY

http://dbkwik.webdatacommons.org/swtor.wikia.

In [44]:
for item in found_pairs:
        print(mappings_small[str(item[0])])
        print(mappings_big[str(item[1])])
        print()

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Category:Scoundrels/Icons
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Category:Characters

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Fuliginous
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Fuliginous

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Champion
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Champion_(Zakuul)

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Havoc_Rounds
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Havoc

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Taith
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Unidentified_Captain_of_the_Royal_Guard

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Enkindle
http://dbkwik.webdatacommons.org/starwars.wikia.com/resource/Enkindle

http://dbkwik.webdatacommons.org/swtor.wikia.com/resource/Gorlan_Palladane
http://dbkwik.webdatacommon