In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

In [2]:
data_dir = '../data/fb13/'
benchmarks = dict()

### Load data and construct networkx graph object

In [3]:
def read_triples(file, **addl_cols):
    triples = pd.read_csv(file, sep='\t', names=['e1', 'rel', 'e2'])
    for key, value in addl_cols.items():
        triples[key] = value
    return triples

def construct_networkx_object(df_triples, df_entities):
    G = nx.MultiDiGraph()

    # add nodes
    print('adding nodes')
    e_tuples = [(row['name'], {'id': row['id']}) for i, row in df_entities.iterrows()]
    G.add_nodes_from(e_tuples)
    
    # add edges
    print('adding edges')
    for i, row in tqdm(df_triples.iterrows(), total=df_triples.shape[0]):
        G.add_edge(row.e1, row.e2, relation=row.rel)
        
    assert nx.number_of_nodes(G) == df_entities.shape[0]
        
    return G

In [4]:
# Read in all triples
splits = ['train', 'valid', 'test']
g_list = [read_triples(f'{data_dir}/{split}.txt', split=split) for split in splits]
g = pd.concat(g_list, axis=0)

# Read in entity info
entities = pd.read_csv(f'{data_dir}/entity2id.txt', sep='\t', names=['name', 'id'])

In [5]:
G = { split: construct_networkx_object(g[g.split == split], entities) for split in splits}
G['full'] = construct_networkx_object(g, entities)

adding nodes


  0%|          | 890/316232 [00:00<00:35, 8890.05it/s]

adding edges


100%|██████████| 316232/316232 [00:36<00:00, 8732.33it/s]


adding nodes


 32%|███▏      | 1869/5908 [00:00<00:00, 9353.75it/s]

adding edges


100%|██████████| 5908/5908 [00:00<00:00, 9295.36it/s]


adding nodes


  3%|▎         | 797/23733 [00:00<00:02, 7949.10it/s]

adding edges


100%|██████████| 23733/23733 [00:02<00:00, 8716.77it/s]


adding nodes


  0%|          | 866/345873 [00:00<00:39, 8658.96it/s]

adding edges


100%|██████████| 345873/345873 [00:40<00:00, 8538.09it/s]


### Helper functions for benchmark construction

In [7]:
def check_for_edge(G, u, v, rel):
    return (u, v, {'relation': rel}) in G.edges(data=True)

def construct_symmetric_edge_bmk(triples_df, G, predict_relation, explain_relation):
    bmk = []

    for _, row in triples_df.iterrows():

        explain_triple = (row.e2, row.e1, explain_relation)

        # if the reverse 'children' edge is in the graph at all, add it to the benchmark
        if check_for_edge(G['full'], *explain_triple):
            out = {'triple': (row.e1, predict_relation, row.e2),
                   'explanation': (row.e2, explain_relation, row.e1)}

            # if that edge is in training, set explanation_in_train to 1
            if check_for_edge(G['train'], *explain_triple):
                out['explanation_in_train'] = 1
            else:
                out['explanation_in_train'] = 0

            # if that edge is in test, set explanation_in_test to 1. However we know
            # this is not true since we only have parent edges in the test set.
            out['explanation_in_test'] = 0

            bmk.append(out)
            
    return bmk

def construct_explanation_benchmark(triples_to_explain, G, explanation_metapaths):
    bmk = []

    for _, row in triples_to_explain.iterrows():
        
        for metapath in explanation_metapaths:
            
            # if (any examples of the) metapath are in the graph, add them as explanations
            
            # for each path, compute the fraction of its edges that are in train, valid and test
            # (and assert this adds up to 1)

            explain_triple = (row.e2, row.e1, explain_relation)

            # if the reverse 'children' edge is in the graph at all, add it to the benchmark
            if check_for_edge(G['full'], *explain_triple):
                out = {'triple': (row.e1, predict_relation, row.e2),
                       'explanation': (row.e2, explain_relation, row.e1)}

                # if that edge is in training, set explanation_in_train to 1
                if check_for_edge(G['train'], *explain_triple):
                    out['explanation_in_train'] = 1
                else:
                    out['explanation_in_train'] = 0

                # if that edge is in test, set explanation_in_test to 1. However we know
                # this is not true since we only have parent edges in the test set.
                out['explanation_in_test'] = 0

                bmk.append(out)
            
    return bmk

# TO DO: change interface to check_for_edge to be edges and not G

# a path could be represented as a list of edges
#metapath = (entity1, relation1, entity2, relation2, entity3)
#or 
#metapath = (entity1, relation1, x, relation2, entity3)

In [15]:
full_edge = ('charles_studd', 'missionary', {'relation': 'profession'}) 
partial_edge = ('charles_studd', None, {'relation': 'profession'})
edges = G['test'].edges(data=True)

In [63]:
class Edge:
    
    def __init__(self, u, v, rel):
        self.u = u
        self.v = v
        self.rel = rel
        
    def __str__(self):
        return f'{self.u}--[{self.rel}]-->{self.v}'
    
    def __eq__(self, other):
        assert type(self) == type(other)
        
        if self.u == other.u and self.v == other.v and self.rel == other.rel:
            return True
        else:
            return False
        
class MetaEdge:
    
    def __init__(self, u=None, v=None, rel=None):
        self.u = u
        self.u_str = self._stringify(u)
        
        self.v = v
        self.v_str = self._stringify(v)
        
        self.rel = rel
        self.rel_str = self._stringify(rel)
        
    def __str__(self):
        e = Edge(self.u_str, self.v_str, self.rel_str)
        return e.__str__()
    
    def _stringify(self, elt):
        if elt is None:
            s = 'X'
        else:
            s = elt
        return s
            
class Path:
    pass


In [65]:
e1 = Edge(u='charles_studd', v='missionary', rel='profession')
e2 = MetaEdge(u='charles_studd', v='missionary')
#e1 == e2
print(e1)
print(e2)

charles_studd--[profession]-->missionary
charles_studd--[X]-->missionary


In [48]:
type(e)

__main__.Edge

In [40]:
def find_edges(edges, constraints, verbose=False):
    assert len(constraints) == 3
    u, rel, v = constraints
    matches = []
    for edge in edges:
        match = True
        if u is not None and edge[0] != u:
            match = False
        if v is not None and edge[1] != v:
            match = False
        if rel is not None and edge[2]['relation'] != rel:
            match = False
        if match:
            matches.append(edge)
    found_edge = (len(matches) > 0)
    if verbose: print(f'Found {len(matches)} matching edges')
    return found_edge, matches

def find_two_hop_paths(edges, metapath):
    assert len(metapath) == 5
    hop1 = metapath[:3]
    hop2 = metapath[2:]
    
    found1, edges1 = find_edges(edges, constraints=hop1)
    found2, edges2 = find_edges(edges, constraints=hop2)
    
    paths = []
    if found1 and found2:
        for e1 in edges1:
            for e2 in edges2:
                if e2[1] == e1
                    path = ()
                    paths.append()
                
        
    

In [36]:
u = None
v = 'female'
rel = 'gender'
found, edge_subset = find_edges(edges, u=u, v=v, verbose=True)

Found 865 matching edges


In [39]:
metapath = ('A', 'rel1', 'x', 'rel2', 'B')
metapath[:3]
metapath[2:]

('A', 'rel1', 'x')

('x', 'rel2', 'B')

### Children benchmark

In [8]:
benchmarks = dict()
parents_test = read_triples(f'{data_dir}/test_parents.txt')
benchmarks['children'] = construct_symmetric_edge_bmk(parents_test, G, 'parents', 'children')

In [9]:
benchmarks

{'children': [{'triple': ('anna_e_roosevelt', 'parents', 'eleanor_roosevelt'),
   'explanation': ('eleanor_roosevelt', 'children', 'anna_e_roosevelt'),
   'explanation_in_train': 1,
   'explanation_in_test': 0},
  {'triple': ('ethel_lilian_voynich', 'parents', 'george_boole'),
   'explanation': ('george_boole', 'children', 'ethel_lilian_voynich'),
   'explanation_in_train': 1,
   'explanation_in_test': 0},
  {'triple': ('prince_sigismund_of_prussia_kiel',
    'parents',
    'princess_irene_of_hesse_and_by_rhine'),
   'explanation': ('princess_irene_of_hesse_and_by_rhine',
    'children',
    'prince_sigismund_of_prussia_kiel'),
   'explanation_in_train': 1,
   'explanation_in_test': 0},
  {'triple': ('amun-her-khepeshef', 'parents', 'nefertari'),
   'explanation': ('nefertari', 'children', 'amun-her-khepeshef'),
   'explanation_in_train': 1,
   'explanation_in_test': 0},
  {'triple': ('anna_of_bohemia', 'parents', 'constance_of_hungary'),
   'explanation': ('constance_of_hungary', 'chi

### Spouse by symmetry

In [None]:
spouse_test = read_triples(f'{data_dir}/test_spouse.txt')
benchmarks['spouse_by_symmetry'] = construct_symmetric_edge_bmk(spouse_test, G, 'spouse', 'spouse')

### Parent-child by co-parent

### Nationality by family nationality

### Location by various factors

In [None]:
G = nx.MultiDiGraph()

In [None]:
G.add_node(0)
G.add_node(1)

In [None]:
nx.number_of_nodes(G)

In [None]:
G.add_edge(0, 1, relation='red')