# Evaluation of Profiles on a Table Linking Application

The tl_file we are currently using can be downloaded from here: https://drive.google.com/file/d/1R3_2hbsuwbjLT9H4tD6LdkdXeU8yJte5/view?usp=sharing

## Prereqs
- We use tabulate. You can install it with `pip install tabulate`

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import random
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tabulate import tabulate
from gensim.models import KeyedVectors
import json
from evaluate_tl_task import *



## Params
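*tl_file*: Path to the CSV file that contains the table-linking data. The notebook reads the columns `row`, `label`, `GT_kg_id`, `kg_id`, and `kg_labels` from it. **TODO** make this more specific once we decide on an appropriate input format.  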
*filename_out*: Name of the file we should save results to. This will be saved under entity_profiling/output/tl_evaluation/  
*profiles_file*: profiles_dict.json file containing dict of profiles to use in evaluation task.  
*type_mapping_file*: type_mapping.tsv file created by the label creation notebook.  
*profiles_type*: Entity type of the profiles you want to use. This should be a Q-node.  
*H_embedding_file*: .kv H-embedding file to use in embedding-based methods.  
*A_embedding_file*: .kv A-embedding file to use in embedding-based methods.  
*S_embedding_file*: .kv S-embedding file to use in embedding-based methods.  
*HAS_embedding_file*: .kv HAS-embedding file to use in embedding-based methods.

In [3]:
# Input table (read with pd.read_csv further down) and the name of the
# results file (joined onto the tl_evaluation output directory during setup).
tl_file = "./data/2021-04-22-chiefs.csv"
filename_out = "russian_politicians.csv"
# JSON file that is loaded into profile_dict.
profiles_file = "./output/wikidata_humans_v3/final_label_sets/Q5/final/profiles_dict.json"
# Made absolute during setup; its only consumer in the cells shown here is the
# commented-out type-mapping load.
type_mapping_file = "./output/wikidata_humans_v3/label_creation/type_mapping.tsv"
# NOTE(review): profiles_type is not referenced by any cell visible here.
profiles_type = "Q5"
# .kv files loaded with gensim KeyedVectors.load, one per embedding variant.
H_embedding_file = "./output/wikidata_humans_v3/HAS_embeddings/H_embeddings.kv"
A_embedding_file = "./output/wikidata_humans_v3/A_walks_analysis_4/Q5/A_embeddings.kv"
S_embedding_file = "./output/wikidata_humans_v3/S_walks_analysis/Q5/S_embeddings.kv"
HAS_embedding_file = "./output/wikidata_humans_v3/HAS_embeddings/HAS_embeddings.kv"

# Two additional H-embedding files, loaded as the "H-all-types" models.
# NOTE(review): "3x6"/"5x8" and "min_count" presumably describe the training
# configuration of each file — confirm against the embedding-creation step.
H_all_types_3x6_file = "./output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_3x6,min_count=8.kv"
H_all_types_5x8_file = "./output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_5x8,min_count=21.kv"

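Parameter setup: convert all configured paths to absolute paths and create the output directory if it does not exist yet.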

In [4]:
# Resolve every configured path to an absolute path (relative to the current
# working directory at the time this cell runs).
tl_file = os.path.abspath(tl_file)
profiles_file = os.path.abspath(profiles_file)
type_mapping_file = os.path.abspath(type_mapping_file)
H_embedding_file = os.path.abspath(H_embedding_file)
A_embedding_file = os.path.abspath(A_embedding_file)
S_embedding_file = os.path.abspath(S_embedding_file)
HAS_embedding_file = os.path.abspath(HAS_embedding_file)

H_all_types_3x6_file = os.path.abspath(H_all_types_3x6_file)
H_all_types_5x8_file = os.path.abspath(H_all_types_5x8_file)

# Create the output directory if it is missing. exist_ok=True makes this a
# single idempotent call instead of a check-then-create sequence, which could
# fail if the directory appeared between the check and the creation.
output_dir = os.path.abspath("./output/tl_evaluation")
os.makedirs(output_dir, exist_ok=True)

# Full path of the results CSV that the final cell writes.
out_file = os.path.join(output_dir, filename_out)

## Loading profiles and embeddings

In [6]:
# Load the profiles dictionary. The encoding is given explicitly so the JSON
# is decoded as UTF-8 regardless of the platform's default locale encoding.
print("now loading profiles...")
with open(profiles_file, 'r', encoding='utf-8') as f:
    profile_dict = json.load(f)
print("profiles loaded")

# Type-mapping load is currently disabled.
# print("now loading entity-type mapping")
# type_mapping = load_type_mapping(type_mapping_file)
# print("type mapping loaded")

# Load each embedding model from its .kv file.
# NOTE: KeyedVectors.load unpickles the file contents, so only point these
# paths at .kv files from a trusted source.
print("now loading H-entity embeddings")
H_entity_embeddings = KeyedVectors.load(H_embedding_file)
print("H-entity embeddings loaded")

print("now loading H-all-types-3x6 embeddings")
H_all_types_3x6_embeddings = KeyedVectors.load(H_all_types_3x6_file)
print("H-all-types-3x6 embeddings loaded")

print("now loading H-all-types-5x8 entity embeddings")
H_all_types_5x8_embeddings = KeyedVectors.load(H_all_types_5x8_file)
print("H-all-types-5x8 embeddings loaded")

print("now loading A-entity embeddings")
A_entity_embeddings = KeyedVectors.load(A_embedding_file)
print("A-entity embeddings loaded")

print("now loading S-entity embeddings")
S_entity_embeddings = KeyedVectors.load(S_embedding_file)
print("S-entity embeddings loaded")

print("now loading HAS-entity embeddings")
HAS_entity_embeddings = KeyedVectors.load(HAS_embedding_file)
print("HAS-entity embeddings loaded")

Unnamed: 0,row,label,GT_kg_id,kg_id,kg_labels
0,0,Vladimir Vladimirovich PUTIN,Q7747,Q7747,Vladimir Putin|V. Putin|V Putin|Vladimir Vladi...
1,0,Vladimir Vladimirovich PUTIN,Q7747,Q1833348,I Putin|Igor Putin|I. Putin|Igor Alexandrovich...
2,0,Vladimir Vladimirovich PUTIN,Q7747,Q19300851,Vladimir Putin|V. Putin|V Putin|Vladimir Spiri...
3,0,Vladimir Vladimirovich PUTIN,Q7747,Q12554172,Vladimir Vladimirovich Mikhailov|V. V. Mikhail...
4,0,Vladimir Vladimirovich PUTIN,Q7747,Q17052997,political career of Vladimir Putin
...,...,...,...,...,...
828,40,Vasiliy Alekseyevich NEBENZYA,Q1000053,Q43448782,Vasiliy Bukanov
829,40,Vasiliy Alekseyevich NEBENZYA,Q1000053,Q64456113,Category:Mikhail Alekseyevich Lukin
830,40,Vasiliy Alekseyevich NEBENZYA,Q1000053,Q4441783,Vasiliy Stepanov
831,40,Vasiliy Alekseyevich NEBENZYA,Q1000053,Q16447616,V. Makarov|V Makarov|Vasiliy Makarov|Vasiliy I...


now loading profiles...
profiles loaded
now loading H-entity embeddings
H-entity embeddings loaded
now loading H-all-types-3x6 embeddings
H-all-types-3x6 embeddings loaded
now loading H-all-types-5x8 entity embeddings
H-all-types-5x8 embeddings loaded
now loading A-entity embeddings
A-entity embeddings loaded
now loading S-entity embeddings
S-entity embeddings loaded
now loading HAS-entity embeddings
HAS-entity embeddings loaded


## Analysis of different candidate selection methods on a single table - Russian Politicians

Loading the table

In [55]:
# Read the table-linking CSV, keeping only the columns used in this analysis,
# and preview the first rows.
df = pd.read_csv(tl_file).loc[:, ["row", "label", "GT_kg_id", "kg_id", "kg_labels"]]
display(df.head())

# Derive the per-cell candidates and per-cell ground truths from the table.
# NOTE(review): both helpers presumably come from the evaluate_tl_task star
# import — confirm their return shapes there.
cells_gt = get_cell_ground_truths(df)
cells = get_candidates_by_cell(df)

Unnamed: 0,row,label,GT_kg_id,kg_id,kg_labels
0,0,Vladimir Vladimirovich PUTIN,Q7747,Q7747,Vladimir Putin|V. Putin|V Putin|Vladimir Vladi...
1,0,Vladimir Vladimirovich PUTIN,Q7747,Q1833348,I Putin|Igor Putin|I. Putin|Igor Alexandrovich...
2,0,Vladimir Vladimirovich PUTIN,Q7747,Q19300851,Vladimir Putin|V. Putin|V Putin|Vladimir Spiri...
3,0,Vladimir Vladimirovich PUTIN,Q7747,Q12554172,Vladimir Vladimirovich Mikhailov|V. V. Mikhail...
4,0,Vladimir Vladimirovich PUTIN,Q7747,Q17052997,political career of Vladimir Putin


### Russian Politicians - Preliminary data analysis

In [52]:
# Basic table statistics: number of cells, how many have their ground truth
# among the candidates, and the total candidate count.
gt_cands = get_gt_cands(cells, cells_gt)
num_cands = sum(len(cell) for cell in cells)
print("Table info:")
print("# of cells: {}".format(len(cells_gt)))
print("# of cells w/ gt in candidates: {}".format(len(gt_cands)))
print("# of candidates: {}".format(num_cands))

print("\nCoverage of the profiles and embeddings:")
header = [
    "",
    "# of candidates represented (out of {})".format(num_cands),
    "# of gt candidates represented (out of {})".format(len(gt_cands)),
]
embedding_models = [
    ("H-Q5", H_entity_embeddings),
    ("H-all-types-3x6", H_all_types_3x6_embeddings),
    ("H-all-types-5x8", H_all_types_5x8_embeddings),
    ("A-Q5", A_entity_embeddings),
    ("S-Q5", S_entity_embeddings),
    ("HAS-Q5", HAS_entity_embeddings),
]
# The same two counting helpers are applied to the profile dictionary first
# and then to every embedding model, producing one table row per source.
coverage_sources = [("Profiles", profile_dict)] + embedding_models
rows = [
    [
        source_name,
        get_num_cands_w_profiles(cells, source),
        get_num_gt_cands_w_profiles(gt_cands, source),
    ]
    for source_name, source in coverage_sources
]
print(tabulate(rows, headers=header, tablefmt="fancy_grid"))

Table info:
# of cells: 41
# of cells w/ gt in candidates: 41
# of candidates: 833

Coverage of the profiles and embeddings:
╒═════════════════╤════════════════════════════════════════════╤══════════════════════════════════════════════╕
│                 │   # of candidates represented (out of 833) │   # of gt candidates represented (out of 41) │
╞═════════════════╪════════════════════════════════════════════╪══════════════════════════════════════════════╡
│ Profiles        │                                        720 │                                           41 │
├─────────────────┼────────────────────────────────────────────┼──────────────────────────────────────────────┤
│ H-Q5            │                                        794 │                                           41 │
├─────────────────┼────────────────────────────────────────────┼──────────────────────────────────────────────┤
│ H-all-types-3x6 │                                        765 │                           

## Use several methods to score the candidates

In [59]:
# Seed the global random number generators so the random baselines below give
# the same scores on every Restart & Run All.
# NOTE(review): this assumes the random_* helpers draw from the `random`
# and/or `np.random` global state — confirm in evaluate_tl_task.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Maps a method name to the scores that method produced for the table's cells.
scores_by_method = {}

# random baseline
scores_by_method["random"] = random_scoring(cells)

# profile-based methods
scores_by_method["random-profile-cands"] = random_amongst_profile_ents(cells, profile_dict)
for profile_size in [50]:  # other sizes tried: [10,50,100,250,500]
    scores_by_method["max-cell-{}profile-intersect".format(profile_size)] = (
        max_cell_profile_intersect_size_candidate_scoring(
            cells, profile_dict, max_labels_in_profile=profile_size
        )
    )
    scores_by_method["{}profile-intersect-gt-neighs".format(profile_size)] = (
        profile_intersect_size_w_gt_neighbors_candidate_scoring(
            cells, cells_gt, profile_dict, max_labels_in_profile=profile_size
        )
    )

# embedding-based methods: for each model, a random baseline plus the
# max-cell and ground-truth-neighbour similarity scorers.
embedding_models = [
    ("H", H_entity_embeddings),
    ("H-3x6", H_all_types_3x6_embeddings),
    ("H-5x8", H_all_types_5x8_embeddings),
    ("A", A_entity_embeddings),
    ("S", S_entity_embeddings),
    ("HAS", HAS_entity_embeddings),
]
for name, model in embedding_models:
    scores_by_method["random-{}-embed-cands".format(name)] = random_amongst_embedding_ents(cells, model)
    # scores_by_method["avg-cell-embed-sim-{}".format(name)] = avg_cell_embedding_sim_candidate_scoring(cells, model)
    scores_by_method["max-cell-embed-sim-{}".format(name)] = max_cell_embedding_sim_candidate_scoring(cells, model)
    scores_by_method["embed-sim-gt-neighs-{}".format(name)] = embedding_sim_w_gt_neighbors_candidate_scoring(
        cells, cells_gt, model
    )

## Calculate accuracy of the different methods

In [61]:
# Print one summary row per scoring method; per the rendered output this is
# the method's F1 plus margin statistics (avg, min, quartiles, max).
print_f1_and_margin_stats_for_method_scores(scores_by_method)

Method                         F1      Margin Avg      Min      25%      50%      75%     Max
-----------------------------  ----  ------------  -------  -------  -------  -------  ------
random                         0.04        0        0        0        0        0       0

random-profile-cands           0.03        0        0        0        0        0       0
max-cell-50profile-intersect   0.49       -0.0045  -0.1762  -0.0038  -0.0002   0.0032  0.0453
50profile-intersect-gt-neighs  0.75        0.0029  -0.2054   0        0.0045   0.0084  0.0836

random-H-embed-cands           0.06        0        0        0        0        0       0
max-cell-embed-sim-H           0.61        0.0013  -0.0212  -0.0017   0.0008   0.0026  0.0319
embed-sim-gt-neighs-H          0.76        0.0084  -0.0273   0.0003   0.0037   0.0101  0.0777

random-H-3x6-embed-cands       0.04        0        0        0        0        0       0
max-cell-embed-sim-H-3x6       0.68        0.001   -0.0139  -0.0011   0.0011 

## Now looking at how much the embedding features agree

In [74]:
# Collect the ground-truth-neighbour embedding-similarity scores of the four
# Q5 embedding models under short names.
method_scores = {
    model_name: scores_by_method["embed-sim-gt-neighs-{}".format(model_name)]
    for model_name in ("H", "A", "S", "HAS")
}

# Model subsets to evaluate together: each model alone, the H/A/S
# combinations, then combinations that include HAS.
combs = [
    ["H"], ["A"], ["S"], ["HAS"],
    ["H", "A"], ["H", "S"], ["A", "S"], ["H", "A", "S"],
    ["H", "HAS"], ["A", "HAS"], ["S", "HAS"], ["H", "A", "S", "HAS"],
]

print_f1_of_method_combs(method_scores, combs)

╒══════════════╤════════════════════╤═════════════════════╕
│ Methods      │   F1 of logical OR │   F1 of logical AND │
╞══════════════╪════════════════════╪═════════════════════╡
│ H            │               0.76 │                0.76 │
├──────────────┼────────────────────┼─────────────────────┤
│ A            │               0.29 │                0.29 │
├──────────────┼────────────────────┼─────────────────────┤
│ S            │               0.2  │                0.2  │
├──────────────┼────────────────────┼─────────────────────┤
│ HAS          │               0.73 │                0.73 │
├──────────────┼────────────────────┼─────────────────────┤
│ H, A         │               0.8  │                0.24 │
├──────────────┼────────────────────┼─────────────────────┤
│ H, S         │               0.76 │                0.2  │
├──────────────┼────────────────────┼─────────────────────┤
│ A, S         │               0.44 │                0.05 │
├──────────────┼────────────────────┼───

In [62]:
# Gather the ground-truth-neighbour embedding-similarity scores of the four
# Q5 embedding models, then print how often each pair of models agrees.
method_scores = {
    model_name: scores_by_method["embed-sim-gt-neighs-{}".format(model_name)]
    for model_name in ("H", "A", "S", "HAS")
}

print_cell_agreement_of_methods(method_scores)

# cells agreed upon: 'correct, incorrect'
╒═════╤═══════╤═══════╤══════╤═══════╕
│     │ H     │ A     │ S    │ HAS   │
╞═════╪═══════╪═══════╪══════╪═══════╡
│ H   │ 31.0, │ 10.0, │ 8.0, │ 26.0, │
│     │ 10.0  │ 1.0   │ 2.0  │ 5.0   │
├─────┼───────┼───────┼──────┼───────┤
│ A   │ 10.0, │ 12.0, │ 2.0, │ 11.0, │
│     │ 1.0   │ 29.0  │ 1.0  │ 1.0   │
├─────┼───────┼───────┼──────┼───────┤
│ S   │ 8.0,  │ 2.0,  │ 8.0, │ 7.0,  │
│     │ 2.0   │ 1.0   │ 33.0 │ 0.0   │
├─────┼───────┼───────┼──────┼───────┤
│ HAS │ 26.0, │ 11.0, │ 7.0, │ 30.0, │
│     │ 5.0   │ 1.0   │ 0.0  │ 11.0  │
╘═════╧═══════╧═══════╧══════╧═══════╛


### Put scores for each candidate from each method into table for viewing

In [95]:
# Re-read the candidate table and append, for every non-random method, one
# column with each candidate's score and one "<method> correct?" column.
# The "correct?" flag is per cell and repeated on every candidate row of that
# cell: 1 when the ground-truth candidate's score equals the cell's top score,
# otherwise 0.
# NOTE(review): the columns are filled in `cells` order, so this assumes the
# candidates in `cells` follow the row order of the CSV — confirm against
# get_candidates_by_cell.
df = pd.read_csv(tl_file)
df = df[["row", "label", "GT_kg_id", "kg_id", "kg_labels"]]
for method, scores in scores_by_method.items():
    if "random" in method:
        continue
    score_col = []
    correct_col = []
    for cell_idx, cell in enumerate(cells):
        cell_scores = scores[cell_idx]
        score_col.extend(cell_scores[cand] for cand in cell)

        gt = cells_gt[cell_idx]
        # default=None keeps a cell without any scored candidate from raising.
        max_score = max(cell_scores.values(), default=None)
        # A ground truth that is missing from the scored candidates counts as
        # incorrect instead of raising KeyError.
        is_correct = int(gt in cell_scores and cell_scores[gt] == max_score)
        correct_col.extend([is_correct] * len(cell))

    # Plain column assignment adds the column if it is new and overwrites it
    # if it already exists.
    df[method] = score_col
    df["{} correct?".format(method)] = correct_col

In [96]:
# Temporarily lift pandas' row and column display limits (inside this `with`
# block only) so that the whole table is rendered, not a truncated preview.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,row,label,GT_kg_id,kg_id,kg_labels,max-cell-50profile-intersect-size,max-cell-50profile-intersect-size correct?,50profile-intersect-size-gt-neighbors,50profile-intersect-size-gt-neighbors correct?,max-cell-embedding-similarity-H,max-cell-embedding-similarity-H correct?,embedding-sim-gt-neighbors-H,embedding-sim-gt-neighbors-H correct?,max-cell-embedding-similarity-A,max-cell-embedding-similarity-A correct?,embedding-sim-gt-neighbors-A,embedding-sim-gt-neighbors-A correct?,max-cell-embedding-similarity-S,max-cell-embedding-similarity-S correct?,embedding-sim-gt-neighbors-S,embedding-sim-gt-neighbors-S correct?,max-cell-embedding-similarity-HAS,max-cell-embedding-similarity-HAS correct?,embedding-sim-gt-neighbors-HAS,embedding-sim-gt-neighbors-HAS correct?
0,0,Vladimir Vladimirovich PUTIN,Q7747,Q7747,Vladimir Putin|V. Putin|V Putin|Vladimir Vladi...,0.327354,1,0.336113,1,0.087598,0,0.089327,0,0.249513,0,0.226482,0,0.266603,1,0.242612,0,0.082799,0,0.084569,0
1,0,Vladimir Vladimirovich PUTIN,Q7747,Q1833348,I Putin|Igor Putin|I. Putin|Igor Alexandrovich...,0.292601,1,0.299416,1,0.089501,0,0.090523,0,0.295675,0,0.351394,0,0.246679,1,0.284154,0,0.094302,0,0.095686,0
2,0,Vladimir Vladimirovich PUTIN,Q7747,Q19300851,Vladimir Putin|V. Putin|V Putin|Vladimir Spiri...,0.302317,1,0.306922,1,0.097226,0,0.097465,0,0.234521,0,0.203295,0,0.243797,1,0.239388,0,0.100392,0,0.099552,0
3,0,Vladimir Vladimirovich PUTIN,Q7747,Q12554172,Vladimir Vladimirovich Mikhailov|V. V. Mikhail...,0.077728,1,0.057548,1,0.094155,0,0.087594,0,0.220291,0,0.21883,0,0.242921,1,0.233846,0,0.094768,0,0.088608,0
4,0,Vladimir Vladimirovich PUTIN,Q7747,Q17052997,political career of Vladimir Putin,0.0,1,0.0,1,0.081904,0,0.083329,0,0.0,0,0.0,0,0.0,1,0.0,0,0.077403,0,0.078411,0
5,0,Vladimir Vladimirovich PUTIN,Q7747,Q30524893,Putin|Путин,0.0,1,0.0,1,0.081553,0,0.080623,0,0.0,0,0.0,0,0.0,1,0.0,0,0.082202,0,0.080286,0
6,0,Vladimir Vladimirovich PUTIN,Q7747,Q4384355,Putin,0.0,1,0.0,1,0.076578,0,0.075774,0,0.0,0,0.0,0,0.0,1,0.0,0,0.079367,0,0.078884,0
7,0,Vladimir Vladimirovich PUTIN,Q7747,Q45023984,"Vladimir Putin presidential campaign, 2018",0.0,1,0.0,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.0,0,0.0,0,0.0,0
8,0,Vladimir Vladimirovich PUTIN,Q7747,Q56313518,"Vladimir Putin presidential campaign, 2000",0.0,1,0.0,1,0.073948,0,0.074362,0,0.0,0,0.0,0,0.0,1,0.0,0,0.072924,0,0.073362,0
9,0,Vladimir Vladimirovich PUTIN,Q7747,Q30746096,Category:Opposition to Vladimir Putin,0.0,1,0.0,1,0.077797,0,0.079221,0,0.0,0,0.0,0,0.0,1,0.0,0,0.076145,0,0.077301,0


Also write the table to a CSV file for viewing:

In [97]:
# Save the per-candidate score table as CSV, leaving out the DataFrame index.
df.to_csv(out_file, index=False)