### Experiment4: Creating Benchmark

In [1]:
import pandas as pd
import json
import random
import pickle
random.seed(1234)

In [2]:
# Read entity mapping
e2id_path = open("/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb14/entity2id.txt", "r")
e2id = {}
for line in e2id_path:
    line = line.strip()
    e, ids = line.split("\t")
    e2id[e] = int(ids)
rev_e2id = {v: k for k, v in e2id.items()}

In [3]:
# Read relation mapping
r2id_path = open("/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb14/relation2id.txt", "r")
r2id = {}
for line in r2id_path:
    line = line.strip()
    r, ids = line.split("\t")
    r2id[r] = int(ids)
rev_r2id = {v: k for k, v in r2id.items()}

In [4]:
path = "/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb13_resplit/entity_names_to_types.csv"
data = pd.read_csv(path)
e2type = {}
for row in data.iterrows():
    e2type[row[1][0]] = row[1][1]

In [5]:
tuning_path = "../../../../akbc2021/akbc_2021/OpenKE/benchmarks/fb13_resplit/commonsense_benchmark_for_tuning_v2.json"
analysis_path = "../../../../akbc2021/akbc_2021/OpenKE/benchmarks/fb13_resplit/commonsense_benchmark_for_analysis_v2.json"

In [6]:
data = {}
with open(tuning_path) as f:
    data["tuning"] = json.load(f)

with open(analysis_path) as f:
    data["analysis"] = json.load(f)

In [7]:
train_path = "/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb14/train.txt"
test_path = "/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb14/test.txt"
valid_path = "/Users/niraj/Desktop/LinkLogic/akbc2021/akbc_2021/OpenKE/benchmarks/fb14/valid.txt"

train_path = open(train_path)
test_path = open(test_path)
valid_path = open(valid_path)
graph = {}
for t in train_path:
    t = t.strip()
    e1, r, e2 = t.split("\t")
    graph[e1, r, e2] = "train"

for t in test_path:
    t = t.strip()
    e1, r, e2 = t.split("\t")
    graph[e1, r, e2] = "test"
    
for t in valid_path:
    t = t.strip()
    e1, r, e2 = t.split("\t")
    graph[e1, r, e2] = "valid"

In [8]:
n_instance = 100

In [9]:
#1. Sample head entity
#2. Sample relation
#3. Sample tail entity

### 1. Non sensical benchmark

In [10]:
triple = []

while len(triple) < n_instance:
    head_idx = random.randint(0, len(e2id)-1)
    rel_idx = random.randint(0, len(r2id)-1)
    tail_idx = random.randint(0, len(e2id)-1)
    
    # Ramdomly sample triples with head entity of Person type
    if e2type[rev_e2id[head_idx]] == "Person":
        triple.append([rev_e2id[head_idx], rev_r2id[rel_idx], rev_e2id[tail_idx]])

with open(f'data/data_4a.pickle', 'wb') as handle:
    pickle.dump(triple, handle, protocol=pickle.HIGHEST_PROTOCOL) 

### 2. Different Family

In [11]:
triple = []

while len(triple) < n_instance:
    head_idx = random.randint(0, len(e2id)-1)
    rel_idx = random.randint(0, len(r2id)-1)
    tail_idx = random.randint(0, len(e2id)-1)
    
    if e2type[rev_e2id[head_idx]] == "Person" and \
        rev_r2id[rel_idx] in ["children", "parents", "spouse", "sibling"] and \
        e2type[rev_e2id[tail_idx]] == "Person":
        
        # If randomly sampled triple is not in graph, then append it to the triple list
        if (rev_e2id[head_idx], rev_r2id[rel_idx], rev_e2id[tail_idx]) not in graph:
            triple.append([rev_e2id[head_idx], rev_r2id[rel_idx], rev_e2id[tail_idx]])
        else:
            print('in graph', [rev_e2id[rand_idx], rev_r2id[rand_idx], rev_e2id[rand_idx]])

for t in triple:
    assert (t[0], t[1], t[2]) not in graph

# Store the triples 
with open(f'data/data_4b.pickle', 'wb') as handle:
    pickle.dump(triple, handle, protocol=pickle.HIGHEST_PROTOCOL)
      

### 3. Distance Family

In [12]:
triple = []
family_rel = ["children", "parents", "spouse", "sibling"]

while len(triple) < n_instance:
    
    # Sample random triple from the graph
    idx = random.randint(0, len(e2id)-1)
    t = list(graph.keys())[idx]
    
    # Swap the relation for the true triple in the graph
    if t[1] in family_rel:
        swap_rel = random.randint(0,3)
        
        # Make sure the randomly selected relation doesn't match with the query relation
        if family_rel[swap_rel] != t[1]:
            
            # If the perturbed triple is not in the graph, add to the list
            if (t[0], family_rel[swap_rel], t[2]) not in graph:
                triple.append([t[0], family_rel[swap_rel], t[2]])
                print(t)
                print([t[0], family_rel[swap_rel], t[2]])

# Verify all the triples are not in the graph
for t in triple:
    assert (t[0], t[1], t[2]) not in graph

# Store the triples 
with open(f'data/data_4c.pickle', 'wb') as handle:
    pickle.dump(triple, handle, protocol=pickle.HIGHEST_PROTOCOL)

('muriel_humphrey', 'spouse', 'hubert_humphrey')
['muriel_humphrey', 'sibling', 'hubert_humphrey']
('stilicho', 'spouse', 'honorius')
['stilicho', 'parents', 'honorius']
('elisabeth_of_parma', 'children', 'philip_duke_of_parma')
['elisabeth_of_parma', 'spouse', 'philip_duke_of_parma']
('stokely_carmichael', 'spouse', 'miriam_makeba')
['stokely_carmichael', 'sibling', 'miriam_makeba']
('charles_duke_of_lower_lorraine', 'children', 'gerberga_of_lower_lorraine')
['charles_duke_of_lower_lorraine', 'spouse', 'gerberga_of_lower_lorraine']
('anna_radziwill', 'children', 'janusz_iii_mazowiecki')
['anna_radziwill', 'sibling', 'janusz_iii_mazowiecki']
('dobroslav_ii', 'parents', 'mihailo_i_vojislav')
['dobroslav_ii', 'sibling', 'mihailo_i_vojislav']
('hugh_de_stafford_2nd_earl_of_stafford', 'children', 'katherine_de_stafford')
['hugh_de_stafford_2nd_earl_of_stafford', 'parents', 'katherine_de_stafford']
('hamengkubuwana_vi', 'children', 'hamengkubuwana_vii')
['hamengkubuwana_vi', 'parents', 'ham

### 4. True links from Experiment 3

In [None]:
tuning_path = "../../../../akbc2021/akbc_2021/OpenKE/benchmarks/fb13_resplit/commonsense_benchmark_for_tuning_v2.json"
analysis_path = "../../../../akbc2021/akbc_2021/OpenKE/benchmarks/fb13_resplit/commonsense_benchmark_for_analysis_v2.json"

In [None]:
data = {}
with open(tuning_path) as f:
    data["tuning"] = json.load(f)

with open(analysis_path) as f:
    data["analysis"] = json.load(f)

In [None]:
triples = []
category = "parents"
for d in data["analysis"]:
    if d['category'] == category:
        triples.append(d["query_triple"])   

In [None]:
with open(f'data/data_4d.pickle', 'wb') as handle:
    pickle.dump(triples, handle, protocol=pickle.HIGHEST_PROTOCOL)