In [1]:
import papermill as pm
import os
import pandas as pd

In [5]:
# Root directory of the experiment outputs; the results and prediction paths used
# further down are all built from it.
working_path = os.path.abspath('../../Contrastive_loss_nn_train_dev_test_25_75')

In [24]:
# Tab-separated file that is read into `profile_dict` in the next cell.
profiles_file = os.path.abspath('../../all_t2dv2_entity_profile_labels.tsv')

In [25]:
# The file must provide (at least) the columns `node1` and `node2`.
df = pd.read_csv(profiles_file, delimiter = '\t')
# Map every `node1` value to the list of all `node2` values found on its lines
# (repeated values are kept, the list is not de-duplicated).
# NOTE(review): presumably node1 is an entity id and node2 a profile label — confirm.
profile_dict = df.groupby("node1")["node2"].apply(list).to_dict()

In [None]:
# Execute the training notebook once per (fold, feature combination) pair.
# NOTE(review): `feature_combs` is not defined in any cell visible here; it has to
# exist in the kernel before this cell is run.
for fold_dir_name in ["fold_1", "fold_2", "fold_3", "fold_4"]:
    print(fold_dir_name)
    for features_to_use in feature_combs:
        print(features_to_use)
        # Each run is saved as its own executed notebook, named after the
        # feature combination and the fold.
        executed_notebook = f"{working_path}/pm_notebooks/{'-'.join(features_to_use)}.{fold_dir_name}.ipynb"
        run_parameters = {
            "fold_dir_name": fold_dir_name,
            "features_to_use": features_to_use,
        }
        pm.execute_notebook(
            "contrastive_loss_nn.ipynb",
            executed_notebook,
            parameters=run_parameters,
        )

In [20]:
def collect_results(test_set = True, micro = True):
    """Build a fold-by-feature-combination table of top-1 accuracies.

    For every fold and every entry of the module-level ``feature_combs`` a
    results CSV is read from below the module-level ``working_path`` and the
    "top1 acc in attainable cells" value of its summary row is collected.

    Args:
        test_set: read ``test_set_results.csv`` when true, otherwise
            ``best_dev_set_results.csv``.
        micro: report the "Micro Avg" row when true, otherwise the "Macro Avg" row.

    Returns:
        A DataFrame with one row per fold. The first column (headed "") holds
        the fold name; one column per feature combination follows, headed by
        the feature names joined with " + ".
    """
    metric_column = "top1 acc in attainable cells"
    results_name = "test_set_results.csv" if test_set else "best_dev_set_results.csv"

    table_rows = []
    for fold_dir_name in ["fold_1", "fold_2", "fold_3", "fold_4"]:
        fold_row = [fold_dir_name]
        for features_to_use in feature_combs:
            combo_dir = '-'.join(features_to_use)
            results_df = pd.read_csv(f"{working_path}/{combo_dir}/{fold_dir_name}/{results_name}")

            # Both summary rows are looked up (.item() needs exactly one match
            # each); only the requested one is reported.
            is_macro = results_df["table"] == "Macro Avg"
            is_micro = results_df["table"] == "Micro Avg"
            macro_acc = results_df.loc[is_macro, metric_column].item()
            micro_acc = results_df.loc[is_micro, metric_column].item()
            fold_row.append(micro_acc if micro else macro_acc)

        table_rows.append(fold_row)

    column_names = [""]
    column_names.extend(' + '.join(features) for features in feature_combs)
    return pd.DataFrame(data=table_rows, columns=column_names)

In [22]:
# Lift pandas' row and column display limits for this one display call so the
# whole table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Micro-averaged test-set accuracy, one row per fold.
    display(collect_results(test_set = True, micro = True))

Unnamed: 0,Unnamed: 1,String_Sim4,String_Sim4 + Profile_ComplEx,String_Sim4 + Profile_TransE,String_Sim4 + ComplEx,String_Sim4 + TransE,String_Sim4 + Profile,String_Sim_w_Context6,String_Sim_w_Context6 + Profile_ComplEx,String_Sim_w_Context6 + Profile_TransE,String_Sim_w_Context6 + ComplEx,String_Sim_w_Context6 + TransE,String_Sim_w_Context6 + Profile,Standard7,Standard7 + Profile_ComplEx,Standard7 + Profile_TransE,Standard7 + ComplEx,Standard7 + TransE,Standard7 + Profile,Pagerank,Pagerank + Profile_ComplEx,Pagerank + Profile_TransE,Pagerank + ComplEx,Pagerank + TransE,Pagerank + Profile
0,fold_1,0.5774,0.4143,0.7734,0.5763,0.555,0.6871,0.6076,0.6887,0.0152,0.6184,0.4243,0.8603,0.5937,0.7202,0.6586,0.614,0.6393,0.8401,0.0894,0.5429,0.5202,0.0152,0.581,0.0152
1,fold_2,0.4201,0.7127,0.6805,0.57,0.6029,0.8499,0.5856,0.7459,0.6682,0.3796,0.6311,0.8631,0.6217,0.7004,0.6753,0.5648,0.63,0.8535,0.0155,0.5436,0.0155,0.0155,0.3342,0.4722
2,fold_3,0.6062,0.5316,0.7078,0.5737,0.6024,0.6541,0.6145,0.7346,0.6446,0.5755,0.6139,0.8366,0.6572,0.6375,0.718,0.6225,0.6288,0.8102,0.0149,0.4749,0.0149,0.0149,0.5686,0.6367
3,fold_4,0.4561,0.673,0.6889,0.4703,0.5754,0.8657,0.5931,0.4918,0.671,0.6248,0.583,0.8152,0.5735,0.684,0.596,0.5869,0.5613,0.8089,0.0159,0.0159,0.0159,0.0159,0.3172,0.0159


In [160]:
def get_test_predictions(pred_file):
    """Pick the top-scoring candidate for every row of a prediction file.

    The CSV must provide the columns ``row``, ``kg_id``, ``GT_kg_id`` and
    ``siamese_pred``. A candidate's score is the maximum ``siamese_pred`` over
    all of its lines within the row.

    Args:
        pred_file: path of the predictions CSV.

    Returns:
        ``[row_num_to_choices, row_num_to_gt]``, two dicts keyed by row number.
        The first maps to the single best-scoring ``kg_id``, or to ``""`` when
        the best score is tied or the row has no scored candidate. The second
        maps to the first ``GT_kg_id`` value listed for the row.
    """
    df = pd.read_csv(pred_file)

    row_num_to_gt = {}
    row_num_to_choices = {}

    for _, row_df in df.groupby("row"):
        # .tolist() hands back plain Python scalars for the key and the GT value.
        row_num = row_df["row"].tolist()[0]
        row_num_to_gt[row_num] = row_df["GT_kg_id"].tolist()[0]

        # Best score per candidate. groupby drops missing kg_ids and max()
        # skips missing scores, so a NaN can neither win nor hide the real
        # maximum (the builtin max() is order-dependent when NaN is present).
        best_per_cand = row_df.groupby("kg_id", sort=False)["siamese_pred"].max()
        top_score = best_per_cand.max()
        chosen_cands = best_per_cand[best_per_cand == top_score].index

        # Only an unambiguous top-1 counts as a choice; ties leave the row empty.
        row_num_to_choices[row_num] = chosen_cands[0] if len(chosen_cands) == 1 else ""

    return [row_num_to_choices, row_num_to_gt]

def get_candidates_switched_correct(pred_file_1, pred_file_2):
    """Find rows the first prediction file gets wrong and the second gets right.

    Args:
        pred_file_1: first predictions CSV.
        pred_file_2: second predictions CSV, compared against the first.

    Returns:
        Dict mapping each such row number to the second file's choice (which
        equals the ground truth for that row).

    Raises:
        AssertionError: if the two files do not yield the same ground truth.
    """
    choices_first, gt_first = get_test_predictions(pred_file_1)
    choices_second, gt_second = get_test_predictions(pred_file_2)

    assert gt_first == gt_second

    fixed_by_second = {}
    for row, first_choice in choices_first.items():
        if first_choice == gt_first[row]:
            # Already right in the first file: nothing was switched.
            continue
        second_choice = choices_second[row]
        if second_choice == gt_first[row]:
            fixed_by_second[row] = second_choice

    return fixed_by_second

def get_candidates_switched_incorrect(pred_file_1, pred_file_2):
    """Find rows the first prediction file gets right and the second gets wrong.

    Args:
        pred_file_1: first predictions CSV.
        pred_file_2: second predictions CSV, compared against the first.

    Returns:
        ``[switched, row_num_to_gt]``: ``switched`` maps each such row number to
        the second file's (wrong, possibly empty) choice; ``row_num_to_gt`` is
        the ground truth per row.

    Raises:
        AssertionError: if the two files do not yield the same ground truth.
    """
    choices_first, gt_first = get_test_predictions(pred_file_1)
    choices_second, gt_second = get_test_predictions(pred_file_2)

    assert gt_first == gt_second
    row_num_to_gt = gt_first

    broken_by_second = {
        row: choices_second[row]
        for row, first_choice in choices_first.items()
        if first_choice == row_num_to_gt[row] and choices_second[row] != row_num_to_gt[row]
    }

    return [broken_by_second, row_num_to_gt]

def get_centroid_profile_labels(centroid_file, profile_dict):
    """Count, per profile label, how many rows' entities carry that label.

    Args:
        centroid_file: CSV with (at least) the columns ``row`` and ``GT_kg_id``.
        profile_dict: mapping from entity id to its list of profile labels.

    Returns:
        Dict mapping each label to the number of rows whose entity has it.
        Entities missing from ``profile_dict`` are skipped.
    """
    centroid_df = pd.read_csv(centroid_file)
    # One entity per row number: the first GT_kg_id listed for that row.
    gt_ids_by_row = centroid_df.groupby("row")["GT_kg_id"].apply(list)

    centroid_label_counts = {}
    for gt_ids in gt_ids_by_row:
        ent = gt_ids[0]
        if ent in profile_dict:
            # set(): a label repeated inside one profile is counted only once.
            for label in set(profile_dict[ent]):
                centroid_label_counts[label] = centroid_label_counts.get(label, 0) + 1
    return centroid_label_counts

def get_label_contributions(ent, centroid_label_counts, profile_dict):
    """Return the centroid label counts restricted to the labels of one entity.

    Args:
        ent: entity id to look up in ``profile_dict``.
        centroid_label_counts: mapping from label to count.
        profile_dict: mapping from entity id to its list of profile labels.

    Returns:
        Dict with the ``label: count`` pairs of ``centroid_label_counts`` whose
        label appears in the entity's profile, in the order of
        ``centroid_label_counts``. When the entity has no profile a message is
        printed and an empty dict is returned.
    """
    if ent not in profile_dict:
        print("entity doesn't have a profile")
        return {}
    # Build the membership set once instead of once per centroid label.
    ent_labels = set(profile_dict[ent])
    return {label : count for label, count in centroid_label_counts.items() if label in ent_labels}
            

In [75]:
!ls {working_path}/Standard7/fold_1/test_predictions

24036779_0_5608105867560183058.csv  58891288_0_1117541047012405958.csv
25404227_0_2240631045609013057.csv  80588006_0_6965325215443683359.csv
29414811_13_8724394428539174350.csv 8468806_0_4382447409703007384.csv
53822652_0_5767892317858575530.csv  99070098_0_2074872741302696997.csv


In [267]:
# Load the fold-1 Standard7 test predictions of one table for inspection.
# NOTE: this rebinds `df`, which previously held the profile-label table.
df = pd.read_csv(f"{working_path}/Standard7/fold_1/test_predictions/53822652_0_5767892317858575530.csv")
display(df)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,H_5x8-embedding-score,A-embedding-score,S-embedding-score,Profile-ComplEx-embedding-score,profile-score,monge_elkan_aliases,Profile-TransE-embedding-score,ComplEx-embedding-score,TransE-embedding-score,siamese_pred
0,1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance,Blue Valentine,Q1754804,Blue Valentine,,fuzzy-augmented,album by Tom Waits,...,0.013099,0.010483,0.013309,0.011388,0.0,0.0,0.016672,0.025896,0.025728,0.999779
1,1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance,Blue Valentine,Q676047,Blue Valentine,,fuzzy-augmented,2010 film by Derek Cianfrance,...,0.034709,0.044936,0.012804,0.020181,1.0,0.0,0.035213,0.078246,0.098407,0.999684
2,1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance,Blue Valentine,Q3935837,Rima Valentienė,,fuzzy-augmented,basketball player,...,0.008466,0.018913,0.012371,0.008408,0.0,0.0,0.007236,-0.001852,-0.022608,0.000505
3,1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance,Blue Valentine,Q29638659,Blue Valentine,,fuzzy-augmented,episode of Holby City (S13 E18),...,0.000000,0.041006,0.013867,0.012734,0.0,0.0,0.020839,0.043248,0.038489,0.999451
4,1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance,Blue Valentine,Q100753535,_ Blue,,fuzzy-augmented,college basketball player (1979–1979) Texas So...,...,0.008191,0.000000,0.012884,0.010622,0.0,0.0,0.011821,-0.021119,-0.004310,0.000174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36655,1,424,Dukes of Hazzard,2005-12-04|2005|Jay Chandrasekhar,Dukes of Hazzard,Q104447107,Michael Dukes,,fuzzy-augmented,"holocaust victim, birth date unknown",...,0.000000,0.000000,0.002266,0.009435,0.0,0.0,0.006782,0.002407,0.014432,0.000177
36656,1,424,Dukes of Hazzard,2005-12-04|2005|Jay Chandrasekhar,Dukes of Hazzard,Q104447108,Sigmund Dukes,,fuzzy-augmented,"holocaust victim, b. 1863-04-27",...,0.012214,0.016643,0.001351,0.010639,0.0,0.0,0.007285,-0.004586,0.001884,0.000185
36657,1,424,Dukes of Hazzard,2005-12-04|2005|Jay Chandrasekhar,Dukes of Hazzard,Q104447109,Toni Dukes,,fuzzy-augmented,"holocaust victim, b. 1873-08-07",...,0.008973,0.015779,0.000711,0.009366,0.0,0.0,0.008465,-0.005169,0.000465,0.000161
36658,1,424,Dukes of Hazzard,2005-12-04|2005|Jay Chandrasekhar,Dukes of Hazzard,Q104448295,Alfred Dukes,,fuzzy-augmented,"holocaust victim, b. 1880-01-21",...,0.007963,0.015005,-0.003804,0.008761,0.0,0.0,0.008898,-0.006550,-0.002285,0.000141


In [269]:
df.loc[df.loc[:,"row"] == 386]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,H_5x8-embedding-score,A-embedding-score,S-embedding-score,Profile-ComplEx-embedding-score,profile-score,monge_elkan_aliases,Profile-TransE-embedding-score,ComplEx-embedding-score,TransE-embedding-score,siamese_pred
33437,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q23001035,Morgan Spurlock Inside Man,Inside Man|MSIM,fuzzy-augmented,American investigative documentary TV series,...,0.000000,0.008923,-0.002343,0.011627,0.000000,1.0,0.013328,0.014396,0.019061,0.952919
33438,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q10671423,The Inside Man,Inside Man,fuzzy-augmented,1984 film by Tom Clegg,...,0.021102,0.006474,0.015987,0.012104,0.006465,1.0,0.013742,0.021054,0.020897,0.999922
33439,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q1336260,The Man Inside,Man Inside,fuzzy-augmented,1990 film by Bobby Roth,...,0.022932,0.011598,0.019489,0.014026,0.070869,1.0,0.018100,0.021802,0.025271,0.027929
33440,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q30612670,The Man Inside,Man Inside,fuzzy-augmented,2012 film by Dan Turner,...,0.017596,0.022069,0.001956,0.012062,0.005339,1.0,0.013831,0.021633,0.019510,0.027929
33441,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q7749880,The Man Inside,Man Inside,fuzzy-augmented,1958 film by John Gilling,...,0.023741,0.009928,0.017941,0.013823,0.068777,1.0,0.018377,0.022044,0.019684,0.027929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33550,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q16797658,Inside Man,,exact-match,Wikimedia disambiguation page,...,0.000000,0.000000,-0.001036,0.000000,0.000000,0.0,0.000000,-0.000643,-0.001169,0.999345
33551,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q23001035,Morgan Spurlock Inside Man,Inside Man|MSIM,exact-match,American investigative documentary TV series,...,0.000000,0.008923,-0.002343,0.011627,0.000000,1.0,0.013328,0.014396,0.019061,0.884851
33552,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q6037663,Inside Man,,exact-match,episode of Star Trek: Voyager (S7 E6),...,0.019318,0.018169,0.016292,0.012742,0.000000,0.0,0.012885,0.019046,0.020181,0.999345
33553,1,386,Inside Man,2006-03-25|2006|Spike Lee,Inside Man,Q81224,Inside Man,,exact-match,2006 film by Spike Lee,...,0.028024,0.029521,0.018094,0.016328,0.091314,0.0,0.019826,0.025504,0.032564,0.999345


In [291]:
# Prediction file (one per table) used to build the paths in the following cell;
# uncomment one of the alternatives to analyse a different table.
file_name = "53822652_0_5767892317858575530.csv" # big movie table
# file_name = "25404227_0_2240631045609013057.csv" # smaller movie table
# file_name = "99070098_0_2074872741302696997.csv" # mountains?
# file_name = "8468806_0_4382447409703007384.csv" # lakes

In [299]:
# Fold-1 test predictions for the selected table, one file per feature combination:
# Standard7 alone, then Standard7 combined with Profile_ComplEx, Profile and ComplEx.
pred_file_1 = f"{working_path}/Standard7/fold_1/test_predictions/{file_name}"
pred_file_2 = f"{working_path}/Standard7-Profile_ComplEx/fold_1/test_predictions/{file_name}"
pred_file_3 = f"{working_path}/Standard7-Profile/fold_1/test_predictions/{file_name}"
pred_file_4 = f"{working_path}/Standard7-ComplEx/fold_1/test_predictions/{file_name}"

# Fold-1 centroid rows file for the same table.
centroid_file = f"{working_path}/test/fold_1/centroid_rows/{file_name}"

In [293]:
eval_gt_label_counts = get_centroid_profile_labels(pred_file_1, profile_dict)

In [294]:
centroid_label_counts = get_centroid_profile_labels(centroid_file, profile_dict)

In [278]:
# Add the evaluation-set label counts on top of a copy of the centroid label
# counts, giving one combined tally per label.
all_gt_label_counts = dict(centroid_label_counts)
for l, c in eval_gt_label_counts.items():
    all_gt_label_counts[l] = all_gt_label_counts.get(l, 0) + c

In [288]:
# Show the label and context of every (column, row) cell of the loaded table.
# The option name is spelled out in full because the bare 'max_rows' pattern can
# match more than one option on newer pandas, and the columns are selected with
# a list because tuple-style selection on a groupby is deprecated.
with pd.option_context('display.max_rows', None):
    display(df.groupby(["column","row"])[["label", "context"]].first())

  display(df.groupby(["column","row"])["label", "context"].first())


Unnamed: 0_level_0,Unnamed: 1_level_0,label,context
column,row,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,Blue Valentine,2012-01-23|2010|Derek Cianfrance
1,2,Dogtooth,2012-01-21|2009|Giorgos Lanthimos
1,3,Serendipity,2012-01-20|2001|Peter Chelsom
1,5,Transit,2012-01-15|2012|Antonio Negret
1,6,The Tree of Life,2012-01-14|2011|Terrence Malick
1,7,The Girl Who Played with Fire,2012-01-12|2009|Daniel Alfredson
1,8,The Girl with the Dragon Tattoo,2012-01-11|2009|Niels Arden Oplev
1,9,Mission: Impossible - Ghost Protocol,2012-01-10|2011|Brad Bird
1,10,30 Days of Night,2012-01-08|2007|David Slade
1,11,The Muppets,2012-01-07|2011|James Bobin


In [279]:
# Combined label counts, largest count first.
dict(sorted(all_gt_label_counts.items(), key=lambda item: item[1], reverse=True))

{'Q11424_P495_Q6256_P1081_0.856-0.957__': 394,
 'Q11424_P495_Q3624078_P1081_0.853-0.957__': 394,
 'Q11424_P750_Q368290_P571_1990-2000__': 387,
 'Q11424_P495_Q3624078_P2219_1.0-2.3__Q11229': 384,
 'Q11424_P495_Q6256_P2219_1.0-2.3__Q11229': 384,
 'Q11424_P437_Q723685': 383,
 'Q11424_P750_Q4830453_P571_1990-2000__': 382,
 'Q11424_P462_Q22006653': 382,
 'Q11424_P750_Q1194970_P571_1990-2000__': 381,
 'Q11424_P750_Q15265344_P1833_1500000.0-117580000.0__': 381,
 'Q11424_P750_Q1194970_P2139_4532930.0-11692713000.0__Q4917': 381,
 'Q11424_P750_Q841645_P3362_838679000.0-838679000.0__Q4917': 381,
 'Q11424_P750_Q10689397_P1833_117580000.0-117580000.0__': 381,
 'Q11424_P750_Q4830453_P3362_156180000.0-922000000.0__Q4917': 381,
 'Q11424_P750_Q4830453_P2139_5350000000.0-27810000000.0__Q4917': 381,
 'Q11424_P750_Q368290_P2295_558929000.0-558929000.0__Q4917': 381,
 'Q11424_P750_Q4830453_P1661_3.0-1142.0__': 381,
 'Q11424_P750_Q841645_P1833_1500000.0-117580000.0__': 381,
 'Q11424_P750_Q15265344_P2295_5589

In [295]:
switched_correct_cands = get_candidates_switched_correct(pred_file_1, pred_file_2)

In [296]:
switched_correct_cands_3 = get_candidates_switched_correct(pred_file_1, pred_file_3)

In [300]:
switched_correct_cands_4 = get_candidates_switched_correct(pred_file_1, pred_file_4)

In [221]:
[row_num_to_choices_3, row_num_to_gt_1] = get_test_predictions(pred_file_3)

In [222]:
row_num_to_choices_3[88]

'Q30623660'

In [297]:
switched_correct_cands

{1: 'Q676047',
 19: 'Q732960',
 33: 'Q859448',
 34: 'Q738152',
 61: 'Q1338368',
 64: 'Q323472',
 76: 'Q725539',
 78: 'Q429934',
 102: 'Q314942',
 108: 'Q1144479',
 111: 'Q210364',
 114: 'Q63366',
 115: 'Q244296',
 118: 'Q168821',
 123: 'Q604083',
 156: 'Q213081',
 157: 'Q514565',
 158: 'Q108586',
 162: 'Q201819',
 187: 'Q4898550',
 189: 'Q640450',
 225: 'Q247182',
 232: 'Q470073',
 243: 'Q1540008',
 256: 'Q79503',
 264: 'Q1570133',
 265: 'Q2006346',
 273: 'Q270351',
 284: 'Q821753',
 288: 'Q382882',
 289: 'Q729788',
 298: 'Q387601',
 301: 'Q209667',
 327: 'Q151792',
 338: 'Q470771',
 342: 'Q841476',
 365: 'Q3476148',
 367: 'Q565009',
 370: 'Q4156493',
 372: 'Q208424',
 377: 'Q375855',
 378: 'Q582021',
 380: 'Q270385',
 385: 'Q2092936',
 386: 'Q81224',
 388: 'Q633307',
 389: 'Q188000',
 393: 'Q2005056',
 409: 'Q330113'}

In [298]:
switched_correct_cands_3

{1: 'Q676047',
 2: 'Q1075918',
 3: 'Q971468',
 5: 'Q5236871',
 11: 'Q550558',
 17: 'Q1551916',
 21: 'Q629596',
 25: 'Q165699',
 26: 'Q260509',
 30: 'Q170268',
 32: 'Q221820',
 33: 'Q859448',
 34: 'Q738152',
 48: 'Q379877',
 53: 'Q837264',
 61: 'Q1338368',
 64: 'Q323472',
 76: 'Q725539',
 78: 'Q429934',
 79: 'Q269887',
 86: 'Q380667',
 91: 'Q1345077',
 92: 'Q478212',
 93: 'Q25188',
 94: 'Q381028',
 97: 'Q629974',
 102: 'Q314942',
 108: 'Q1144479',
 109: 'Q1165314',
 111: 'Q210364',
 114: 'Q63366',
 115: 'Q244296',
 118: 'Q168821',
 123: 'Q604083',
 125: 'Q184605',
 147: 'Q26751',
 153: 'Q733570',
 156: 'Q213081',
 157: 'Q514565',
 158: 'Q108586',
 162: 'Q201819',
 163: 'Q379994',
 182: 'Q312078',
 186: 'Q275553',
 189: 'Q640450',
 197: 'Q162182',
 198: 'Q958626',
 217: 'Q1048360',
 218: 'Q220955',
 223: 'Q212041',
 224: 'Q2032325',
 225: 'Q247182',
 232: 'Q470073',
 237: 'Q93512',
 243: 'Q1540008',
 252: 'Q123742',
 254: 'Q1626186',
 256: 'Q79503',
 265: 'Q2006346',
 269: 'Q679611',
 27

In [301]:
switched_correct_cands_4

{111: 'Q210364',
 115: 'Q244296',
 158: 'Q108586',
 243: 'Q1540008',
 365: 'Q3476148',
 367: 'Q565009',
 370: 'Q4156493',
 385: 'Q2092936'}

In [None]:
get_label_contributions("Q1383", centroid_label_counts, profile_dict)

In [248]:
[row_num_to_choices_1, _] = get_test_predictions(pred_file_1)

In [250]:
{row: row_num_to_choices_1[row] for row in switched_correct_cands}

{3: 'Q24705146',
 12: 'Q1904',
 14: 'Q19317422',
 17: 'Q19394525',
 55: 'Q6961443',
 88: 'Q664319'}

In [43]:
row_num_to_choices_1[3]

'Q24705146'

In [215]:
# For every row fixed by the second feature set, report it when the original
# (wrong) choice shares at least one profile label with the centroid counts.
for row, new_choice in switched_correct_cands.items():
    original_choice = row_num_to_choices_1[row]
    label_contributions = get_label_contributions(original_choice, centroid_label_counts, profile_dict)
    if label_contributions:
        print(f"row:{row}, old(wrong):{original_choice}, new(correct):{new_choice}")

entity doesn't have a profile


In [150]:
# Same report for the rows fixed by the third prediction file: print a row when
# its original (wrong) choice shares at least one label with the centroid counts.
for row, new_choice in switched_correct_cands_3.items():
    original_choice = row_num_to_choices_1[row]
    label_contributions = get_label_contributions(original_choice, centroid_label_counts, profile_dict)
    if label_contributions:
        print(f"row:{row}, old(wrong):{original_choice}, new(correct):{new_choice}")

entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
row:198, old(wrong):Q56274908, new(correct):Q958626
entity doesn't have a profile
entity doesn't have a profile
row:224, old(wrong):Q3472777, new(correct):Q2032325
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
row:312, old(wrong):Q1164767, new(correct):Q645735
entity doesn't have a profile
entity doesn't have a profile
entity doesn't have a profile
row:3

In [228]:
[switched_incorrect_choices_2, row_num_to_gt] = get_candidates_switched_incorrect(pred_file_1, pred_file_2)

In [217]:
[switched_incorrect_choices_3, row_num_to_gt] = get_candidates_switched_incorrect(pred_file_1, pred_file_3)

In [229]:
switched_incorrect_choices_2

{}

In [219]:
switched_incorrect_choices_3

{0: 'Q1746745',
 22: 'Q22595071',
 31: 'Q22702235',
 50: 'Q22632492',
 74: '',
 86: 'Q22590810'}

In [204]:
{row : gt for row, gt in row_num_to_gt.items() if row in switched_incorrect_choices_2}

{156: 'Q4797719', 164: 'Q7993675', 167: 'Q6813053'}

In [239]:
old_cand_label_contributions = get_label_contributions("Q10671423", centroid_label_counts, profile_dict)

In [240]:
new_cand_label_contributions = get_label_contributions("Q81224", centroid_label_counts, profile_dict)

In [182]:
[l for l in profile_dict["Q81224"] if l == 'Q11424_P161_Q5_P2031_1990-2000__']

['Q11424_P161_Q5_P2031_1990-2000__',
 'Q11424_P161_Q5_P2031_1990-2000__',
 'Q11424_P161_Q5_P2031_1990-2000__',
 'Q11424_P161_Q5_P2031_1990-2000__']

In [241]:
sum(old_cand_label_contributions.values())

1694

In [242]:
sum(new_cand_label_contributions.values())

23926

In [135]:
new_cand_label_contributions

{'Q11424_P462_Q22006653': 93,
 'Q11424_P364_Q1860': 92,
 'Q11424_P136_Q130232': 56,
 'Q11424_P437_Q723685': 95,
 'Q11424_P750_Q907311': 93,
 'Q11424_P2758_Q23817729': 51,
 'Q11424_P1981_Q20644796': 37,
 'Q11424_P1552_Q45172088': 40,
 'Q11424_P495_Q6256_P1081_0.856-0.957__': 134,
 'Q11424_P495_Q3624078_P1081_0.853-0.957__': 134,
 'Q11424_P495_Q6256_P2219_1.0-2.3__Q11229': 123,
 'Q11424_P495_Q3624078_P2219_1.0-2.3__Q11229': 123,
 'Q11424_P495_Q6256_P7295_1580-1590__': 100,
 'Q11424_P495_Q3624078_P7295_1580-1590__': 100,
 'Q11424_P495_Q3624078_P3001_65.25-67.0__Q24564698': 94,
 'Q11424_P495_Q43702_P3001_65.25-67.0__Q24564698': 92,
 'Q11424_P495_Q43702_P3529_31112.0-46555.0__Q4917': 92,
 'Q11424_P495_Q3624078_P7295_1750-1760__': 100,
 'Q11424_P495_Q6256_P7295_1750-1760__': 100,
 'Q11424_P495_Q43702_P2884_120.0-230.0__Q25250': 92,
 'Q11424_P495_Q3624078_P3864_5.36-13.1__': 92,
 'Q11424_P495_Q6256_P3864_5.36-13.1__': 92,
 'Q11424_P495_Q43702_P3864_12.1-12.6__': 92,
 'Q11424_P161_Q5_P569_1950

In [289]:
# Labels that contribute for the new candidate but not for the old one,
# shown with the largest contribution first.
new_choice_labels = {label : contribution for label, contribution in new_cand_label_contributions.items() if label not in old_cand_label_contributions}
dict(sorted(new_choice_labels.items(), key=lambda item: item[1], reverse=True))


{'Q11424_P437_Q723685': 95,
 'Q11424_P750_Q368290_P571_1990-2000__': 95,
 'Q11424_P495_Q3624078_P2219_1.0-2.3__Q11229': 94,
 'Q11424_P495_Q6256_P2219_1.0-2.3__Q11229': 94,
 'Q11424_P750_Q1194970_P571_1990-2000__': 93,
 'Q11424_P750_Q15265344_P1833_1500000.0-117580000.0__': 93,
 'Q11424_P750_Q1194970_P2139_4532930.0-11692713000.0__Q4917': 93,
 'Q11424_P750_Q841645_P3362_838679000.0-838679000.0__Q4917': 93,
 'Q11424_P750_Q10689397_P1833_117580000.0-117580000.0__': 93,
 'Q11424_P750_Q4830453_P3362_156180000.0-922000000.0__Q4917': 93,
 'Q11424_P750_Q4830453_P2139_5350000000.0-27810000000.0__Q4917': 93,
 'Q11424_P750_Q368290_P2295_558929000.0-558929000.0__Q4917': 93,
 'Q11424_P750_Q4830453_P1661_3.0-1142.0__': 93,
 'Q11424_P750_Q841645_P1833_1500000.0-117580000.0__': 93,
 'Q11424_P750_Q4830453_P571_1990-2000__': 93,
 'Q11424_P750_Q15265344_P2295_558929000.0-558929000.0__Q4917': 93,
 'Q11424_P750_Q723685_P2295_558929000.0-558929000.0__Q4917': 93,
 'Q11424_P750_Q1194970_P1128_1000.0-6700.0__'

In [271]:
# Labels that contribute for the old candidate but not for the new one,
# shown with the largest contribution first.
old_choice_labels = {
    label: contribution
    for label, contribution in old_cand_label_contributions.items()
    if label not in new_cand_label_contributions
}
dict(sorted(old_choice_labels.items(), key=lambda item: item[1], reverse=True))


{'Q11424_P161_Q5_P570_2010-2020__': 41,
 'Q11424_P495_Q6256_P2046_214970.0-643801.0__Q712226': 31,
 'Q11424_P495_Q3624078_P2997_+18__Q24564698': 30,
 'Q11424_P495_Q6256_P2997_16.0-18.0__Q24564698': 30,
 'Q11424_P495_Q3624078_P2997_16.0-18.0__Q24564698': 30,
 'Q11424_P495_Q6256_P2997_+18__Q24564698': 30,
 'Q11424_P161_Q5_P2031_1950-1960__': 30,
 'Q11424_P495_Q3624078_P3270_+6__Q24564698': 26,
 'Q11424_P495_Q6256_P3270_6.0-6.0__Q24564698': 26,
 'Q11424_P495_Q6256_P3270_+6__Q24564698': 26,
 'Q11424_P495_Q3624078_P3270_6.0-6.0__Q24564698': 26,
 'Q11424_P495_Q3624078_P2884_+230__Q25250': 25,
 'Q11424_P495_Q6256_P2884_+230__Q25250': 25,
 'Q11424_P495_Q6256_P2884_220.0-230.0__Q25250': 25,
 'Q11424_P495_Q3624078_P2884_220.0-230.0__Q25250': 25,
 'Q11424_P495_Q3624078_P2046_268021.0-801590.0__Q712226': 24,
 'Q11424_P495_Q6256_P3271_15.0-16.0__Q24564698': 16,
 'Q11424_P495_Q3624078_P3271_+16__Q24564698': 16,
 'Q11424_P495_Q3624078_P3271_15.0-16.0__Q24564698': 16,
 'Q11424_P495_Q6256_P3271_+16__Q2

In [290]:
profile_dict["Q10671423"]

['Q11424_P462_Q22006653',
 'Q11424_P136_Q130232',
 'Q11424_P2747_Q23830578',
 'Q11424_P495_Q34',
 'Q11424_P364_Q9027',
 'Q11424_P577_1980-1990',
 'Q11424_P495_Q6256_P2997_+18__Q24564698',
 'Q11424_P495_Q3624078_P2997_+18__Q24564698',
 'Q11424_P495_Q6256_P3270_+6__Q24564698',
 'Q11424_P495_Q3624078_P3270_+6__Q24564698',
 'Q11424_P495_Q6256_P2884_+230__Q25250',
 'Q11424_P495_Q3624078_P2884_+230__Q25250',
 'Q11424_P495_Q6256_P3000_+18__Q24564698',
 'Q11424_P495_Q3624078_P3000_+18__Q24564698',
 'Q11424_P495_Q6256_P3271_+16__Q24564698',
 'Q11424_P495_Q3624078_P3271_+16__Q24564698',
 'Q11424_P495_Q6256_P1081_0.856-0.957__',
 'Q11424_P495_Q3624078_P1081_0.853-0.957__',
 'Q11424_P495_Q6256_P2997_16.0-18.0__Q24564698',
 'Q11424_P495_Q3624078_P2997_16.0-18.0__Q24564698',
 'Q11424_P495_Q6256_P3270_6.0-6.0__Q24564698',
 'Q11424_P495_Q3624078_P3270_6.0-6.0__Q24564698',
 'Q11424_P495_Q6256_P2884_220.0-230.0__Q25250',
 'Q11424_P495_Q3624078_P2884_220.0-230.0__Q25250',
 'Q11424_P495_Q3624078_P1198_6.0

In [232]:
# Centroid label counts, largest count first.
dict(sorted(centroid_label_counts.items(), key=lambda item: item[1], reverse=True))

{'Q23397_P2046_24.0-117600.0__Q712226': 17,
 'Q23397_P17_Q6256_P2884_110.0-220.0__Q25250': 15,
 'Q23397_P17_Q3624078_P2884_110.0-220.0__Q25250': 15,
 'Q23397_P17_Q6256_P3270_6.0-6.0__Q24564698': 14,
 'Q23397_P17_Q6256_P3270_+6__Q24564698': 14,
 'Q23397_P17_Q3624078_P3270_+6__Q24564698': 14,
 'Q23397_P2043_20.95-1209.0__Q828224': 14,
 'Q23397_P17_Q3624078_P3270_6.0-6.0__Q24564698': 14,
 'Q23397_P17_Q3624078_P1081_0.853-0.957__': 14,
 'Q23397_P17_Q6256_P1081_0.856-0.957__': 14,
 'Q23397_P17_Q6256_P2046_652230.0-17125200.0__Q712226': 13,
 'Q23397_P17_Q3624078_P2046_825615.0-647500000.0__Q712226': 13,
 'Q23397_P17_Q3624078_P2219_1.0-2.3__Q11229': 13,
 'Q23397_P205_Q6256_P1081_0.856-0.957__': 13,
 'Q23397_P17_Q6256_P2219_1.0-2.3__Q11229': 13,
 'Q23397_P17_Q6256_P2219_+1.4__Q11229': 13,
 'Q23397_P205_Q3624078_P1081_0.853-0.957__': 13,
 'Q23397_P17_Q3624078_P2219_+1.4__Q11229': 13,
 'Q23397_P2049_5.0-790.0__Q828224': 12,
 'Q23397_P17_Q6256_P1198_6.0-8.0__Q11229': 12,
 'Q23397_P17_Q3624078_P11