# Evaluation of Profiles on a Table Linking Application

The tl_file we are currently using can be downloaded from here: https://drive.google.com/file/d/1a7BFHZpaI12foxutPTpED6LuylQSG5oz/view?usp=sharing

In [52]:
import os
import pandas as pd
import random
import numpy as np

## Params
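*tl_file*: Path to the CSV file that contains the table-linking data. The notebook currently reads the columns `row`, `label`, `kg_id`, `kg_labels`, `GT_kg_id` and `GT_kg_label`. **TODO**: make this more specific once we decide on an appropriate input format.  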
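*filename_out*: Name of the file the results are saved to. It is written to `output/tl_evaluation/` relative to the working directory, i.e. entity_profiling/output/tl_evaluation/ when the notebook is run from the entity_profiling directory.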

In [111]:
# Name (not path) of the CSV file the scored table is written to.
filename_out = "russian_politicians.csv"
# CSV file holding the table-linking candidates to evaluate.
tl_file = "./data/chief_names_final_output_new_sentence_no_empty_sentences.csv"

Parameter setup: make the input path absolute and create the output directory if needed.

In [112]:
# Ensure the input path is absolute.
tl_file = os.path.abspath(tl_file)

# Set up the output directory if it has not been created yet.
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
output_dir = os.path.abspath("./output/tl_evaluation")
os.makedirs(output_dir, exist_ok=True)

# Full path of the results file, joined with the platform's path separator.
out_file = os.path.join(output_dir, filename_out)

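Data setup: load the table-linking file, then collect the candidate list and the ground-truth entity for each cell (one cell per `row` value).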

In [38]:
# Columns used by the evaluation; any other column in the file is dropped.
tl_columns = ["row", "label", "kg_id", "kg_labels", "GT_kg_id", "GT_kg_label"]

# .copy() yields an independent frame, so later cells that add score columns
# modify it directly rather than a selection of the frame that was loaded.
df = pd.read_csv(tl_file)[tl_columns].copy()
display(df)

# One group per table cell, keyed by "row". Group the frame once and reuse it.
records_by_cell = df.groupby("row")
# Candidate kg_ids of every cell, in file order within the cell.
cell_candidates = records_by_cell["kg_id"].apply(list).to_list()
# Ground-truth kg_id of every cell, taken from the cell's first record.
cell_gt = [gt_ids[0] for gt_ids in records_by_cell["GT_kg_id"].apply(list).to_list()]

methods = ["random"]
# Give every method its own list. dict.fromkeys(methods, []) would bind all
# keys to one shared list object, so appending scores for one method would
# make them appear under every other method as well.
scores = {method: [] for method in methods}

Unnamed: 0,row,label,kg_id,kg_labels,GT_kg_id,GT_kg_label
0,0,Vladimir Vladimirovich PUTIN,Q7747,Vladimir Putin|V. Putin|V Putin|Vladimir Vladi...,Q7747,Vladimir Putin
1,0,Vladimir Vladimirovich PUTIN,Q1833348,I Putin|Igor Putin|I. Putin|Igor Alexandrovich...,Q7747,Vladimir Putin
2,0,Vladimir Vladimirovich PUTIN,Q19300851,Vladimir Putin|V. Putin|V Putin|Vladimir Spiri...,Q7747,Vladimir Putin
3,0,Vladimir Vladimirovich PUTIN,Q12554172,Vladimir Vladimirovich Mikhailov|V. V. Mikhail...,Q7747,Vladimir Putin
4,0,Vladimir Vladimirovich PUTIN,Q17052997,political career of Vladimir Putin,Q7747,Vladimir Putin
...,...,...,...,...,...,...
815,40,Vasiliy Alekseyevich NEBENZYA,Q43448782,Vasiliy Bukanov,Q1000053,Vasily Nebenzya
816,40,Vasiliy Alekseyevich NEBENZYA,Q64456113,Category:Mikhail Alekseyevich Lukin,Q1000053,Vasily Nebenzya
817,40,Vasiliy Alekseyevich NEBENZYA,Q4441783,Vasiliy Stepanov,Q1000053,Vasily Nebenzya
818,40,Vasiliy Alekseyevich NEBENZYA,Q16447616,V. Makarov|V Makarov|Vasiliy Makarov|Vasiliy I...,Q1000053,Vasily Nebenzya


## Use several methods to score the candidates
Random selection strategy:

In [80]:
# Fixed seed so that a fresh "Restart & Run All" reproduces the same baseline.
RANDOM_SEED = 0
# A dedicated generator keeps this cell idempotent (re-running it yields the
# same picks) and leaves the state of the global `random` module untouched.
rng = random.Random(RANDOM_SEED)

# For every cell, pick one candidate uniformly at random: the pick scores 1,
# every other candidate of that cell scores 0.
scores["random"] = []
for cell in cell_candidates:
    choice = rng.choice(cell)
    scores["random"].append({candidate: int(candidate == choice) for candidate in cell})

Other strategies go here (**TODO**).

...

## Calculate accuracy of the different methods

In [81]:
# Accuracy of a method = fraction of cells whose best-scored candidate is the
# ground-truth entity.
for method in methods:
    # Highest-scoring candidate of each cell (first one wins on ties).
    best_per_cell = [
        max(cell_scores, key=cell_scores.get)
        for cell_scores in scores[method]
    ]
    is_hit = np.array(best_per_cell) == cell_gt
    n_hits = sum(is_hit)
    accuracy = n_hits / len(cell_gt)
    print("{}: {:.2f}".format(method, accuracy))

random: 0.02


Put each method's score for every candidate into the table for viewing.

In [98]:
# Locate every record of df inside cell_candidates:
#   cell_index     - which cell (group of "row") the record belongs to; ngroup()
#                    numbers groups in the order groupby("row") iterates them,
#                    which is the order cell_candidates was built in.
#   pos_in_cell    - position of the record within its cell, in file order.
# Both are aligned with df's own row order, so each score ends up next to its
# candidate even when the file is not sorted by "row". Flattening
# cell_candidates group by group would silently misalign the column then.
cells = df.groupby("row")
cell_index = cells.ngroup()
pos_in_cell = cells.cumcount()

for method in methods:
    score_col = [
        scores[method][cell_idx][cell_candidates[cell_idx][pos]]
        for cell_idx, pos in zip(cell_index, pos_in_cell)
    ]
    if method not in df.columns:
        # First run: place the score column just before the ground-truth columns.
        df.insert(df.columns.get_loc("GT_kg_id"), method, score_col)
    else:
        # Re-run: overwrite the existing column instead of inserting it twice.
        df.loc[:, method] = score_col

In [109]:
# Temporarily lift pandas' row and column display limits so the complete
# table is rendered; the previous settings are restored on leaving the block.
no_truncation = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*no_truncation):
    display(df)

Unnamed: 0,row,label,kg_id,kg_labels,random,GT_kg_id,GT_kg_label
0,0,Vladimir Vladimirovich PUTIN,Q7747,Vladimir Putin|V. Putin|V Putin|Vladimir Vladi...,0,Q7747,Vladimir Putin
1,0,Vladimir Vladimirovich PUTIN,Q1833348,I Putin|Igor Putin|I. Putin|Igor Alexandrovich...,0,Q7747,Vladimir Putin
2,0,Vladimir Vladimirovich PUTIN,Q19300851,Vladimir Putin|V. Putin|V Putin|Vladimir Spiri...,0,Q7747,Vladimir Putin
3,0,Vladimir Vladimirovich PUTIN,Q12554172,Vladimir Vladimirovich Mikhailov|V. V. Mikhail...,0,Q7747,Vladimir Putin
4,0,Vladimir Vladimirovich PUTIN,Q17052997,political career of Vladimir Putin,0,Q7747,Vladimir Putin
5,0,Vladimir Vladimirovich PUTIN,Q30524893,Putin|Путин,0,Q7747,Vladimir Putin
6,0,Vladimir Vladimirovich PUTIN,Q4384355,Putin,0,Q7747,Vladimir Putin
7,0,Vladimir Vladimirovich PUTIN,Q45023984,"Vladimir Putin presidential campaign, 2018",0,Q7747,Vladimir Putin
8,0,Vladimir Vladimirovich PUTIN,Q56313518,"Vladimir Putin presidential campaign, 2000",0,Q7747,Vladimir Putin
9,0,Vladimir Vladimirovich PUTIN,Q30746096,Category:Opposition to Vladimir Putin,0,Q7747,Vladimir Putin


Also send table to file for viewing:

In [None]:
# Write the scored table to out_file, leaving out the DataFrame index.
df.to_csv(out_file, index=False)