In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes and edits to local
# .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from fuzzywuzzy import fuzz
from tqdm import tqdm, tqdm_notebook
import random 

from knowledge_graph import KG
from task_utils import *

In [3]:
# Build the knowledge graph from the "uniform" construction data directory;
# the constructor prints the load path and the number of triples it read.
kg = KG("data/uniform/construction")
# Alternative dataset ("popular" variant) -- swap with the line above to use it:
#kg = KG("data/popular/construction")

Loading data from data/uniform/construction...
Loaded 718268 triples.


In [5]:
# Populate kg.tasks (read by the later cells as
# kg.tasks['entity_matching'][split]['X' / 'Y']); prints per-split sample counts.
kg.load_tasks()

Task: entity_matching
entity_matching (train): 3159 samples
entity_matching (valid): 395 samples
entity_matching (test): 790 samples



## Entity Matching Task

Our baseline matches entities by fuzzy string similarity between their name and title features.

In [6]:
def generate_candidates(triples, test_x):
    """Score every non-musicbrainz entity as a match candidate for each test entity.

    The entity matching task consists of mapping musicbrainz entities to wikidata entities.
    In order to make comparisons feasible, we first recommend performing a candidate generation step, where
    we generate a list of likely wikidata candidates for every musicbrainz entity we seek to match.

    Scoring: for each musicbrainz arc in ``mbz2wd_arcs`` we take the best
    ``fuzz.ratio`` between any value of that arc on the musicbrainz entity and
    any value of the corresponding arcs on the candidate; the candidate's score
    is the sum of those per-arc maxima.

    Args:
        triples (list): list of all triples in the KG; each item is unpacked as
            ``(head, arc, tail, tail_type, source)``.
        test_x: all unknown musicbrainz entities in the test split.

    Returns:
        defaultdict: ``scores[mbz_entity][candidate_entity] -> score``. Test
        entities that have no saved name/title feature get no entry at all;
        candidates are every entity with at least one saved feature whose
        source is not ``'musicbrainz'``.
    """
    # musicbrainz arc -> arcs on the candidate side that may hold the same string
    mbz2wd_arcs = {
        'name': ['name', 'short name', 'official name', 'birth name', 'family name'],
        'title': ['name'],
    }

    # every arc whose tail we need to keep, from either side of the mapping
    arcs2save = set(mbz2wd_arcs)
    for wd_arcs in mbz2wd_arcs.values():
        arcs2save.update(wd_arcs)

    entity2source = {}
    entity2features = defaultdict(lambda: defaultdict(list))

    # collect features for each entity
    print("Collecting Features")
    for head, arc, tail, _tail_type, source in tqdm_notebook(triples):
        entity2source[head] = source
        if arc in arcs2save:
            entity2features[head][arc].append(tail)

    # The candidate pool does not depend on the entity being matched, so build
    # it once instead of re-filtering inside the scoring loop. Iteration order
    # of entity2features is preserved so downstream tie-breaking is unchanged.
    wd_candidates = [
        (entity, features)
        for entity, features in entity2features.items()
        if entity2source[entity] != 'musicbrainz'
    ]

    # generate candidates
    candidate_scores = defaultdict(lambda: defaultdict(int))
    print("Scoring candidates")

    for mbz_entity in tqdm_notebook(test_x):
        # Membership test (not indexing) so the defaultdict is never mutated;
        # entities without any saved feature cannot be scored and are skipped.
        if mbz_entity not in entity2features:
            continue
        mbz_features = entity2features[mbz_entity]

        for candidate_entity, candidate_features in wd_candidates:
            score = 0
            for mbz_arc, wd_arcs in mbz2wd_arcs.items():
                mbz_vals = mbz_features.get(mbz_arc, [])
                max_score = 0
                for wd_arc in wd_arcs:
                    for candidate_val in candidate_features.get(wd_arc, []):
                        for mbz_val in mbz_vals:
                            max_score = max(fuzz.ratio(mbz_val, candidate_val), max_score)
                score += max_score
            candidate_scores[mbz_entity][candidate_entity] = score
    return candidate_scores
    

In [7]:
# Score every non-musicbrainz entity against each entity of the test split.
# The scoring loop is (test entities) x (entities with name features) fuzzy
# comparisons, so expect this cell to dominate the notebook's runtime.
candidates = generate_candidates(kg.triples, kg.tasks['entity_matching']['test']['X'])

Collecting Features



Scoring candidates





In [8]:
# Predict the top-scoring candidate for every test entity.
unfiltered_predictions = {}
all_scores = []
for mbz_entity in kg.tasks['entity_matching']['test']['X']:
    max_score = 0
    max_candidate = ""
    # Use .get() rather than indexing: `candidates` is a defaultdict, and
    # indexing would insert an empty entry for every entity that was never
    # scored, mutating another cell's result each time this cell is run.
    for candidate, score in candidates.get(mbz_entity, {}).items():
        # strict '>' keeps the first candidate on ties and ignores zero scores
        if score > max_score:
            max_score = score
            max_candidate = candidate
    all_scores.append(max_score)
    unfiltered_predictions[mbz_entity] = (max_candidate, max_score)

# Use the median best score as the acceptance threshold
# (falls back to 0 when the split is empty instead of raising IndexError).
threshold = sorted(all_scores)[len(all_scores) // 2] if all_scores else 0

# Abstain ("None") below the threshold, and also when no candidate scored
# above zero -- otherwise a threshold of 0 would emit "" as a prediction.
filtered_predictions = {}
for mbz_entity, candidate_tuple in unfiltered_predictions.items():
    candidate, score = candidate_tuple
    if score < threshold or not candidate:
        filtered_predictions[mbz_entity] = "None"
    else:
        filtered_predictions[mbz_entity] = candidate
        
    

In [9]:
# Evaluate the thresholded predictions on the test split.
# X holds the test entities that were scored above; Y is presumably the
# matching gold labels -- TODO confirm against the task loader.
X = kg.tasks['entity_matching']['test']['X']
Y = kg.tasks['entity_matching']['test']['Y']
# evaluate_entity_matching is not defined in this notebook; it presumably comes
# from the `task_utils` star import. It returns (precision, recall, f1).
precision, recall, f1 = evaluate_entity_matching(filtered_predictions, X, Y)
print("Precision: %f. Recall: %f. F1: %f" % (precision, recall, f1))

Precision: 0.558559. Recall: 0.822281. F1: 0.665236
