In [1]:
import pandas as pd
import os
import sys
sys.path.append('../py')
import permute as pm

In [2]:
edges = pd.read_csv('data/hetnet_baseedges.csv')

In [3]:
edges.head(2)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,Q71969,Q221668,medical-condition-treated_CmD
1,Q71969,Q246084,medical-condition-treated_CmD


In [4]:
pair_list = pm.get_pair_lists(edges)

In [5]:
def invert_pairs(pair_list):
     return [(b, a) for (a,b) in pair_list]
    

In [6]:
def calc_overlap(pair_list, invert_list, cutoff=.9):
    count_a, count_b = 0, 0
    
    # if there is a large difference in size, don't bother checking
    if (len(pair_list)/len(invert_list) < cutoff or 
        len(invert_list)/len(pair_list) < cutoff):
        return 0
    
    # Check both dierections
    for pair in pair_list:
        if pair in invert_list:
            count_a += 1
            
    for pair in invert_list:
        if pair in pair_list:
            count_b += 1        
    
    # Return the minimum overlap
    return min(count_a/len(pair_list), count_b/len(invert_list))

In [7]:
def find_reciprocal_relations(pair_list, cutoff=0.9):


    invert_list = {key: invert_pairs(value) for key, value in pair_list.items()}
    kinds = list(pair_list.keys())

    reciprocal_types = []

    # Only compare 2 different relationship types once
    for i, kind in enumerate(kinds):
        for idx in range(i+1, len(kinds)):
            overlap = calc_overlap(pair_list[kind], invert_list[kinds[idx]])
            if overlap > cutoff:
                reciprocal_types.append([kind, kinds[idx]])
                
    return reciprocal_types

In [8]:
%%time
reciprocal_types = find_reciprocal_relations(pair_list)

CPU times: user 26.4 s, sys: 4 ms, total: 26.4 s
Wall time: 26.4 s


In [9]:
def remove_reciprocals(edges, reciprocal_types):

    edge_copy = edges.copy()

    for types in reciprocal_types:
        orig = edge_copy.loc[edges[':TYPE'] == types[1]].copy()
        # Swap
        tmp = orig[':START_ID'].copy()
        orig[':START_ID'] = orig[':END_ID']
        orig[':END_ID'] = tmp
        # Change Type
        orig[':TYPE'] = types[0]

        edge_copy.loc[edges[':TYPE'] == types[1]] = orig 

    return edge_copy.drop_duplicates()
        

In [10]:
new_edges = remove_reciprocals(edges, reciprocal_types)

In [11]:
new_edges.to_csv('data/hetnet_baseedges_bidircetional.csv')

In [12]:
len(edges)

86486

In [13]:
len(new_edges)

50981