In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
from scipy import stats
import sys
import os


%matplotlib inline

In [19]:
# Load results
results = pd.read_csv('../all_trials.csv')
# Assign through a single .loc on the frame itself: `results.dataset.loc[...] = ...`
# is chained assignment, which pandas does not guarantee will write back to `results`.
results.loc[results['dataset'] == 'wnrr', 'dataset'] = 'wn18rr'

# Combine split columns: every column named '<model>.<suffix>' is merged into a
# single '<suffix>' column and the per-model columns are dropped.
model_names = set(results['model'].unique())

# Group the per-model columns by suffix from a snapshot of the column names.
# Iterating `results.columns` while adding/dropping columns revisits names that were
# already merged and dropped, which resets the merged column back to None.
split_columns = {}
for column in list(results.columns):
    prefix, dot, suffix = column.partition('.')
    if dot and prefix in model_names:
        split_columns.setdefault(suffix, []).append(column)

for actual_column, model_columns in split_columns.items():
    merged = pd.Series(None, index=results.index, dtype=object)
    for model_column in model_columns:
        values = results[model_column]
        # Take the non-missing values of each per-model column; if several columns
        # hold a value for the same row, the later column wins.
        merged = merged.mask(values.notna(), values)
    results = results.drop(columns=model_columns)
    results[actual_column] = merged

results.head()

Unnamed: 0,job_id,dataset,model,reciprocal,job,job_type,split,epoch,avg_loss,avg_penalty,...,entity_embedder.regularize_weight,relation_embedder.regularize_weight,entity_embedder.dropout,relation_embedder.dropout,l_norm,feature_map_dropout,projection_dropout,convolution_bias,entity_embedder.normalize.p,relation_embedder.normalize.p
0,0846d846,umls,distmult,1,search,negative_sampling,valid,30,,,...,,,,,,,,,,
1,0def3a9b,umls,relational_tucker3,0,search,negative_sampling,valid,30,,,...,,,,,,,,,,
2,93f0291e,umls,complex,1,search,KvsAll,valid,115,,,...,,,,,,,,,,
3,b5f61ac0,umls,transh,0,search,negative_sampling,valid,150,,,...,,,,,,,,,,
4,a89e6474,wn18rr,rescal,0,search,negative_sampling,valid,400,,,...,,,,,,,,,,


Median performance on UMLS is substantially higher than on the other two datasets. Since FB15K-237 and WN18RR were specifically constructed to avoid test leakage through inverse relations [(Dettmers et al.)](https://arxiv.org/pdf/1707.01476.pdf), it is worth investigating whether such inverse-relation leakage is inflating the performance on UMLS.

In [21]:
def inverse_relation_proportion(predicate1, predicate2, train_list):
    """Measure how often triples of one predicate appear reversed under another.

    Parameters
    ----------
    predicate1 : value compared against ``train_list.p``
        Predicate whose triples are checked.
    predicate2 : value compared against ``train_list.p``
        Candidate inverse predicate.
    train_list : pandas.DataFrame
        Triples with columns ``'s'``, ``'p'`` and ``'o'``.

    Returns
    -------
    list
        ``[predicate1, predicate2, inverse_proportion]``, where
        ``inverse_proportion`` is the fraction of ``predicate1`` rows ``(s, o)``
        for which a ``predicate2`` row ``(o, s)`` exists. It is ``0.0`` when
        ``train_list`` contains no ``predicate1`` rows.
    """
    pred1_list = train_list.loc[train_list.p == predicate1]
    pred2_list = train_list.loc[train_list.p == predicate2]

    # Nothing to check (and nothing to divide by) if predicate1 never occurs.
    if len(pred1_list) == 0:
        return [predicate1, predicate2, 0.0]

    # Store predicate2's triples already reversed, so each predicate1 triple is a
    # single set lookup instead of a scan over every predicate2 row.
    reversed_pairs = set(zip(pred2_list['o'], pred2_list['s']))
    inverse_count = sum(
        pair in reversed_pairs
        for pair in zip(pred1_list['s'], pred1_list['o'])
    )
    inverse_proportion = inverse_count/len(pred1_list)

    return [predicate1, predicate2, inverse_proportion]


def parallel_inverse_check(train, valid, test):
    """Find predicate pairs in ``train`` whose inverse proportion reaches the threshold.

    Parameters
    ----------
    train : pandas.DataFrame
        Training triples with a ``p`` (predicate) column; passed unchanged to
        ``inverse_relation_proportion`` for every ordered pair of distinct predicates.
    valid, test : sized collections
        Validation and test splits; only their lengths are used, to compute the
        threshold.

    Returns
    -------
    pandas.DataFrame
        Columns ``'predicate'``, ``'inverse'`` and ``'inverse_proportion'``, restricted
        to rows with ``inverse_proportion >= inverse_threshold`` and sorted by
        ``inverse_proportion`` in descending order. Empty (but with the same
        columns) when ``train`` has fewer than two distinct predicates.
    """
    # Calculate inverse relation threshold (using Dettmers et al.'s definition)
    total_edges = len(train) + len(test) + len(valid)
    inverse_threshold = 0.99 - len(test)/total_edges - len(valid)/total_edges

    # Calculate all inverse relation proportions
    predicates = train.p.unique()  # computed once rather than once per outer predicate
    args = [
        [predicate1, predicate2, train]
        for predicate1 in predicates
        for predicate2 in predicates
        if predicate1 != predicate2
    ]
    if args:
        with mp.Pool(mp.cpu_count()) as pool:
            inverse_proportions = pool.starmap(inverse_relation_proportion, args)
    else:
        # No predicate pairs to compare, so do not start worker processes.
        inverse_proportions = []

    # Passing the names to the constructor keeps the three columns even when there
    # are no rows; assigning `.columns` on an empty frame raises a ValueError.
    inverse_check_df = pd.DataFrame(
        inverse_proportions,
        columns=['predicate', 'inverse', 'inverse_proportion'],
    )

    # Return inverse relations with proportions over the threshold
    leaking_rels = inverse_check_df.loc[inverse_check_df.inverse_proportion >= inverse_threshold]
    return leaking_rels.sort_values('inverse_proportion', ascending=False)


In [6]:
# Check for inverse relations in the datasets

run = False  # This cell took approx 4 hours on 4 cores, don't run unless output file is lost
if run:
    per_dataset_results = []

    for dataset in ['umls', 'wnrr', 'fb15k-237']:
        print(f'Processing {dataset}')
        # Read in edge splits
        train_set = pd.read_csv(f'../../kge/data/{dataset}/train.txt', sep='\t', header=None)
        train_set.columns = ['s', 'p', 'o']
        test_set = pd.read_csv(f'../../kge/data/{dataset}/test.txt', sep='\t', header=None)
        valid_set = pd.read_csv(f'../../kge/data/{dataset}/valid.txt', sep='\t', header=None)

        # Check for inverse relations
        inverse_results = parallel_inverse_check(train_set, valid_set, test_set)
        inverse_results['dataset'] = dataset

        # Store
        per_dataset_results.append(inverse_results)

    # DataFrame.append was removed in pandas 2.0; collect the per-dataset frames
    # and concatenate them once instead of growing a frame inside the loop.
    detmers_check = pd.concat(per_dataset_results)

else:
    detmers_check = pd.read_csv('graph_attributes/detmers_inverse_check.csv')

detmers_check

Unnamed: 0,predicate,inverse,inverse_proportion,dataset
0,derivative_of,contains,1.0,umls
1,derivative_of,surrounds,1.0,umls
2,precedes,affects,0.842105,umls
3,degree_of,affects,0.814815,umls
4,precedes,result_of,0.789474,umls
5,/location/administrative_division/first_level_...,/location/location/contains,0.879433,fb15k-237


In [29]:
# Name the four columns explicitly, then write the table to CSV without the index.
detmers_columns = ['predicate', 'inverse', 'inverse_proportion', 'dataset']
detmers_check.columns = detmers_columns
detmers_check.to_csv(
    'detmers_inverse_check_umls_fb15k_wnrr.csv',
    index=False,
)

In [6]:
# Check how frequent these relations are in the UMLS testing set
umls_test = pd.read_csv('../../kge/data/umls/test.txt', sep='\t', header=None)
umls_inverse = detmers_check[detmers_check['dataset'] == 'umls']
umls_test_size = len(umls_test)
for leak_rel in umls_inverse['predicate'].unique():
    # Column 1 of the headerless test file is compared against the relation name.
    leaking_edge_count = int((umls_test[1] == leak_rel).sum())
    test_proportion = leaking_edge_count/umls_test_size
    test_percentage = test_proportion * 100
    print(f'Leaking relation "{leak_rel}" composes {round(test_percentage, 3)}% of the test set edges')


Leaking relation "degree_of" composes 0.605% of the test set edges
Leaking relation "precedes" composes 1.362% of the test set edges
Leaking relation "derivative_of" composes 0.0% of the test set edges


In [34]:
# Check how frequent these relations are in the FB15k-237 testing set
fb_test = pd.read_csv('../../kge/data/fb15k-237/test.txt', sep='\t', header=None)
fb_inverse = detmers_check[detmers_check['dataset'] == 'fb15k-237']
fb_test_size = len(fb_test)
for leak_rel in fb_inverse['predicate'].unique():
    # Column 1 of the headerless test file is compared against the relation name.
    leaking_edge_count = int((fb_test[1] == leak_rel).sum())
    test_proportion = leaking_edge_count/fb_test_size
    test_percentage = test_proportion * 100
    print(f'Leaking relation "{leak_rel}" composes {round(test_percentage, 3)}% of the test set edges')

Leaking relation "/location/administrative_division/first_level_division_of" composes 0.0% of the test set edges


The table above shows that test leakage is indeed occurring in the UMLS dataset. Three relations ('derivative_of', 'precedes' and 'degree_of') can be predicted unreasonably well using nothing but the inverse of another relation. This is potentially problematic because a link predictor could forgo proper modeling of the graph and instead exploit these inverse relations to inflate its perceived performance. However, in this case, the leaking edges compose less than 2% of the test set and therefore cannot be the sole reason for the models' higher MRR on UMLS.