In [1]:
import json
import math
import os

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Base directory that the file paths in this notebook are built from.
# The value already ends with '/'; cells below join it both as
# f'{data_dir}/...' and as f'{data_dir}...'.
data_dir = '../data/fb13/'

In [3]:
# setup database connection
# Connection settings are read from the environment when present so that real
# credentials do not have to be stored in the notebook. The fallbacks are the
# values this notebook originally hard-coded, so behavior is unchanged when
# none of the variables are set.
neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
neo4j_auth = (
    os.environ.get('NEO4J_USER', 'neo4j'),
    os.environ.get('NEO4J_PASSWORD', 'fb13'),
)
driver = GraphDatabase.driver(neo4j_uri, auth=neo4j_auth)
# NOTE(review): neither `sess` nor `driver` is closed in the cells below.
sess = driver.session()

In [4]:
def res_to_df(res):
    """Turn a query result into a pandas DataFrame.

    Parameters
    ----------
    res : iterable of records that also exposes ``keys()``
        In this notebook it is the object returned by ``sess.run(...)``.
        It is iterated to exhaustion before ``keys()`` is called.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns named after ``res.keys()``.
    """
    records = list(res)
    column_names = res.keys()
    return pd.DataFrame(records, columns=column_names)


def split_triples(triples, fracs):
    """Split ``triples`` into train, validation and test partitions.

    Parameters
    ----------
    triples : pandas.DataFrame
        Rows to partition; passed through to ``split_triples_into_two``.
    fracs : sequence of three numbers
        Fractions for (train, valid, test). They must sum to 1 up to
        floating-point rounding.

    Returns
    -------
    tuple
        ``(train, valid, test)`` as returned by ``split_triples_into_two``.

    Raises
    ------
    AssertionError
        If ``fracs`` does not hold three values or they do not sum to 1.
    """
    assert len(fracs) == 3, 'fracs must contain exactly three fractions'
    # Compare with a tolerance: exact equality rejects valid inputs such as
    # [0.7, 0.2, 0.1], whose naive float sum is 0.9999999999999999.
    assert math.isclose(sum(fracs), 1.0, rel_tol=1e-9, abs_tol=1e-9), \
        'fracs must sum to 1'

    train, remainder = split_triples_into_two(triples, fracs[0])

    # The second split works on what is left after removing train, so the
    # validation fraction has to be rescaled relative to valid + test.
    held_out = fracs[1] + fracs[2]
    # When nothing is held out (e.g. fracs == [1, 0, 0]) the remainder is empty
    # and any share works; 0.0 avoids the division by zero.
    valid_share = fracs[1] / held_out if held_out > 0 else 0.0
    valid, test = split_triples_into_two(remainder, valid_share)

    return train, valid, test

def split_triples_into_two(triples, frac, random_state=42):
    """Randomly partition ``triples`` into two disjoint groups.

    Parameters
    ----------
    triples : pandas.DataFrame
        Rows to partition. Rows are removed from the second group by index
        label, so the index labels must be unique.
    frac : float
        Fraction of rows to place in the first group.
    random_state : optional
        Seed passed to ``DataFrame.sample``. Defaults to 42, the value that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(group1, group2)``: the sampled rows and all remaining rows.

    Raises
    ------
    AssertionError
        If the two groups together do not contain every row of ``triples``.
    """
    group1 = triples.sample(frac=frac, random_state=random_state)
    # Everything that was not sampled, selected by index label.
    group2 = triples.drop(group1.index)
    assert group1.shape[0] + group2.shape[0] == triples.shape[0], \
        'partition lost rows; check that `triples` has unique index labels'
    return group1, group2

# An undirected Graph is used here on purpose instead of a MultiDiGraph:
# networkx does not support computing connected components on a directed graph.
# NOTE(review): `nx` and `tqdm` are not imported in any cell shown in this
# notebook; confirm they are imported before this function is called.
def construct_networkx_object(df_triples, df_entities):
    """Build an undirected networkx graph from entity and triple tables.

    Parameters
    ----------
    df_triples : pandas.DataFrame
        Must provide the columns ``e1``, ``e2`` and ``rel``; each row becomes
        an edge ``e1 -- e2`` carrying ``relation=rel``.
    df_entities : pandas.DataFrame
        Must provide the columns ``name`` and ``id``; each row becomes a node
        keyed by ``name`` with the attribute ``id``.

    Returns
    -------
    The populated graph.

    Raises
    ------
    AssertionError
        If the final node count differs from the number of entity rows.
    """
    graph = nx.Graph()

    # add nodes
    print('adding nodes')
    node_specs = []
    for _, entity in df_entities.iterrows():
        node_specs.append((entity['name'], {'id': entity['id']}))
    graph.add_nodes_from(node_specs)

    # add edges
    print('adding edges')
    n_triples = df_triples.shape[0]
    for _, triple in tqdm(df_triples.iterrows(), total=n_triples):
        graph.add_edge(triple.e1, triple.e2, relation=triple.rel)

    # Sanity check: the graph must hold exactly one node per entity row.
    assert nx.number_of_nodes(graph) == df_entities.shape[0]

    return graph

In [5]:
# Collect (child, parent) name pairs between Person nodes. A link is picked up
# whether it is stored as child -[:parents]-> parent or as
# parent -[:children]-> child; UNION combines the two result sets (Cypher's
# UNION, unlike UNION ALL, drops duplicate rows).
q = '''
MATCH (ch:Person)-[:parents]->(pa:Person)
RETURN ch.name as child, pa.name as parent

UNION

MATCH (ch:Person)<-[:children]-(pa:Person)
RETURN ch.name as child, pa.name as parent
'''
query_result = sess.run(q)
parents = res_to_df(query_result)
parents.shape

(6666, 2)

In [6]:
# One row per child: the set of its distinct parent names plus the set's size.
parents_grouped = parents.groupby('child', as_index=False).agg(set)
parents_per_child = parents_grouped.assign(
    num_parents=parents_grouped['parent'].apply(len)
)
parents_per_child.head()

Unnamed: 0,child,parent,num_parents
0,a_k_faezul_huq,{a_k_fazlul_huq},1
1,a_starker_leopold,{aldo_leopold},1
2,aage_niels_bohr,{niels_bohr},1
3,aaron_burr,{aaron_burr_sr},1
4,abaqa_khan,{hulagu_khan},1


In [7]:
# Restrict to children with exactly two known parents and expose them as a
# child -> {parent, parent} lookup.
has_two_parents = parents_per_child['num_parents'].eq(2)
two_parents_df = parents_per_child.loc[has_two_parents]
two_parents = dict(zip(two_parents_df['child'], two_parents_df['parent']))

In [8]:
# Two children count as siblings when they have the very same set of parents.
# Bucket the children by their parent set once, instead of comparing every
# child against every other child (which is quadratic in the number of children).
children_by_parents = dict()
for child, parent_set in two_parents.items():
    children_by_parents.setdefault(frozenset(parent_set), []).append(child)

# child -> list of its siblings; children without any sibling are left out.
siblings = dict()
for child, parent_set in two_parents.items():
    # Sort so the list order is the same on every run. Converting a set of
    # strings to a list gives an order that can change between interpreter
    # sessions, which would make the seeded split further down irreproducible.
    # Sorting assumes the child names are mutually comparable (e.g. all strings).
    others = sorted(
        other for other in children_by_parents[frozenset(parent_set)]
        if other != child
    )
    if others:
        siblings[child] = others

In [9]:
# Persist the child -> siblings lookup as JSON indented by four spaces.
with open(f'{data_dir}/resplit_with_sibs/siblings_via_two_shared_parents.json', 'w') as json_file:
    serialized_siblings = json.dumps(siblings, indent=4)
    json_file.write(serialized_siblings)

216384

In [10]:
# Flatten the child -> siblings lookup into (child, 'sibling', sibling) rows.
# NOTE(review): when the lookup lists a under b and b under a, both
# (a, sibling, b) and (b, sibling, a) are emitted, and a random row-wise split
# can put the two directions in different partitions. Confirm this is intended.
triples_list = [
    [child, 'sibling', sib]
    for child, sibs in siblings.items()
    for sib in sibs
]
sib_triples = pd.DataFrame(
    triples_list, columns=['e1', 'rel', 'e2']
).drop_duplicates(ignore_index=True)

# Write in the same layout as the other triples files in this notebook
# (tab-separated, no header row, no index column). The pandas defaults would
# produce a comma-separated file with a header and an extra index column.
sib_triples.to_csv(
    f'{data_dir}/resplit_with_sibs/siblings.txt',
    sep='\t', index=False, header=False,
)

In [11]:
# Load the existing train / valid / test triples: tab-separated files without
# a header row, so the column names are supplied here.
triple_columns = ['e1', 'rel', 'e2']
read_options = dict(sep='\t', names=triple_columns)
train = pd.read_csv(f'{data_dir}/resplit/train.txt', **read_options)
valid = pd.read_csv(f'{data_dir}/resplit/valid.txt', **read_options)
test = pd.read_csv(f'{data_dir}/resplit/test.txt', **read_options)

In [12]:
# Split the sibling triples 80% / 10% / 10% into train / valid / test.
split_fractions = [0.8, 0.1, 0.1]
sib_train, sib_valid, sib_test = split_triples(sib_triples, fracs=split_fractions)

In [13]:
# Append each sibling partition to the matching existing partition.
# NOTE: train / valid / test are overwritten with the combined frames, so
# running this cell a second time appends the sibling triples again; reload
# the splits first if it has to be re-run.
train, valid, test = (
    pd.concat([existing, extra], axis=0, ignore_index=True)
    for existing, extra in (
        (train, sib_train),
        (valid, sib_valid),
        (test, sib_test),
    )
)

In [14]:
# Write the combined partitions as tab-separated files without an index
# column or header row.
split_outputs = [
    (train, f'{data_dir}resplit_with_sibs/train.txt'),
    (valid, f'{data_dir}resplit_with_sibs/valid.txt'),
    (test, f'{data_dir}resplit_with_sibs/test.txt'),
]
for split_frame, split_path in split_outputs:
    split_frame.to_csv(split_path, sep='\t', index=False, header=None)