In [1]:
import pandas as pd
import os
import sys
sys.path.append('../py')
import processing as pt
import permute as pm

In [2]:
# Load each edge table and label its rows via a 'type' column right after
# reading it, so every source is handled in one place.
comp = pd.read_hdf('data/compound_edges.h5')
comp['type'] = 'Compound'

gene = pd.read_hdf('data/h_genes_edges.h5')
gene['type'] = 'Gene'

prot = pd.read_hdf('data/h_protein_edges.h5')
prot['type'] = 'Protein'

dis = pd.read_hdf('data/disease_edges.h5')
dis['type'] = 'Disease'

# Stack the four labelled tables into one frame (row indices are kept as-is).
base = pd.concat([comp, gene, prot, dis])

In [3]:
# Build the node-type lookup from the 'type' column of the combined table.
node_types = pt.get_node_type_dict(base, 'type')
# Take an explicit copy of the filtered frame: adding the 'e_type' column to
# the object returned by filter_untyped_nodes otherwise makes pandas emit a
# SettingWithCopyWarning, because that object is a slice of `base`.
base_filt = pt.filter_untyped_nodes(base).copy()
base_filt['e_type'] = pt.get_edge_types(base_filt, node_types)
edges = pt.format_edges_neo(base_filt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [4]:
# Pass base_filt through pt.remove_low_count_edges (presumably drops rarely
# occurring edge types -- confirm in ../py/processing.py) and rebuild the
# edge table from the result.
# NOTE: this rebinds `edges`, replacing the value built from base_filt earlier.
base_filt1 = pt.remove_low_count_edges(base_filt)
edges = pt.format_edges_neo(base_filt1)

In [5]:
# NOTE(review): filt_2 is not referenced again in the visible part of this
# notebook -- confirm whether this cell is still needed.
filt_2 = pt.prep_for_export(base)

In [7]:
edges.head(2)

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,Q71969,Q221668,medical-condition-treated_CmctD
1,Q71969,Q246084,medical-condition-treated_CmctD


In [8]:
pair_list = pm.get_pair_lists(edges)

In [9]:
def invert_pairs(pair_list):
    """Return a new list in which the two members of every pair are swapped.

    Every item of ``pair_list`` must unpack into exactly two values. The
    order of the items is preserved and the input is left untouched.
    """
    flipped = []
    for first, second in pair_list:
        flipped.append((second, first))
    return flipped
    

In [10]:
def calc_overlap(pair_list, invert_list, cutoff=.5):
    """Score how strongly two collections of pairs coincide.

    The number of distinct pairs found in both inputs is divided by the
    length of each input, and the smaller of the two fractions is returned.

    Parameters
    ----------
    pair_list : sequence of hashable pairs
        First collection of pairs.
    invert_list : sequence of hashable pairs
        Second collection of pairs (callers in this notebook pass the
        start/end-swapped pairs of an edge type).
    cutoff : float, optional
        Currently unused. It belonged to a size-ratio pre-check that is
        disabled; the parameter is kept so existing calls remain valid.

    Returns
    -------
    float or int
        The minimum overlap fraction, or 0 when either input is empty.
    """
    # Nothing can overlap with an empty collection; returning early also
    # avoids a ZeroDivisionError in the ratios below.
    if len(pair_list) == 0 or len(invert_list) == 0:
        return 0

    # Set intersection is symmetric, so one count serves both directions.
    shared = len(set(pair_list) & set(invert_list))

    # Return the minimum overlap
    return min(shared / len(pair_list), shared / len(invert_list))

In [11]:
def find_reciprocal_relations(pair_list, cutoff=0.5):
    """Find pairs of relationship types whose edges mirror each other.

    Parameters
    ----------
    pair_list : dict
        Maps a relationship type to its collection of pairs.
    cutoff : float, optional
        A combination is reported only when its overlap score is strictly
        greater than this value.

    Returns
    -------
    list of [str, str]
        ``[kind, other]`` entries, in key order, for which the pairs of
        ``kind`` overlap the inverted pairs of ``other``.
    """
    kinds = list(pair_list.keys())
    inverted = {kind: invert_pairs(pairs) for kind, pairs in pair_list.items()}

    reciprocal_types = []

    # Visit every unordered combination once. The slice starts at the current
    # position, so each type is also checked against its own inverted pairs.
    for position, kind in enumerate(kinds):
        for other in kinds[position:]:
            if calc_overlap(pair_list[kind], inverted[other]) > cutoff:
                reciprocal_types.append([kind, other])

    return reciprocal_types

In [12]:
%%time
reciprocal_types = find_reciprocal_relations(pair_list)

CPU times: user 264 ms, sys: 0 ns, total: 264 ms
Wall time: 264 ms


In [13]:
reciprocal_types

[['encodes_GeP', 'encoded-by_PebG'],
 ['physically-interacts-with_CpiwP', 'physically-interacts-with_PpiwC'],
 ['drug-used-for-treatment_DduftC', 'medical-condition-treated_CmctD'],
 ['genetic-association_DgaG', 'genetic-association_GgaD'],
 ['significant-drug-interaction_CsdiC', 'significant-drug-interaction_CsdiC']]

In [15]:
# Exploratory variant of find_reciprocal_relations: instead of applying a
# cutoff, record the overlap score of every compared combination of edge types.
invert_list = {key: invert_pairs(value) for key, value in pair_list.items()}
kinds = list(pair_list.keys())

overlaps = []

# Each unordered combination of edge types is compared once; range(i, ...)
# starts at i, so every type is also compared against its own inverted pairs.
for i, kind in enumerate(kinds):
    for idx in range(i, len(kinds)):
        overlap = calc_overlap(pair_list[kind], invert_list[kinds[idx]])
        # The text after the last '_' of each edge name goes in the nodes1/nodes2 columns.
        overlaps.append([kind, kind.split('_')[-1], kinds[idx], kinds[idx].split('_')[-1], overlap])

# Display the scores, highest overlap first.
(pd.DataFrame(overlaps, columns=['edge1','nodes1', 'edge2','nodes2', 'overlap'])
   .sort_values('overlap', ascending=False)
   .reset_index(drop=True))

Unnamed: 0,edge1,nodes1,edge2,nodes2,overlap
0,encodes_GeP,GeP,encoded-by_PebG,PebG,0.993594
1,genetic-association_DgaG,DgaG,genetic-association_GgaD,GgaD,0.980918
2,physically-interacts-with_CpiwP,CpiwP,physically-interacts-with_PpiwC,PpiwC,0.979765
3,drug-used-for-treatment_DduftC,DduftC,medical-condition-treated_CmctD,CmctD,0.949519
4,significant-drug-interaction_CsdiC,CsdiC,significant-drug-interaction_CsdiC,CsdiC,0.780868
5,subclass-of_DsoD,DsoD,subclass-of_DsoD,DsoD,0.001174
6,has-part_ChpC,ChpC,instance-of_CioC,CioC,0.000421
7,physically-interacts-with_PpiwC,PpiwC,medical-condition-treated_CmctD,CmctD,0.000000
8,medical-condition-treated_CmctD,CmctD,significant-drug-interaction_CsdiC,CsdiC,0.000000
9,drug-used-for-treatment_DduftC,DduftC,instance-of_CioC,CioC,0.000000


In [16]:
def remove_reciprocals(edges, reciprocal_types):
    """Collapse reciprocal edge types into one direction and drop duplicates.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge table with ':START_ID', ':END_ID' and ':TYPE' columns. It is
        not modified; a copy is edited and returned.
    reciprocal_types : iterable of [keep_type, drop_type]
        For every entry, rows of ``drop_type`` have their start and end
        swapped and are relabelled as ``keep_type``. When both names are the
        same (a type that is its own reciprocal), only rows whose start
        sorts after their end are swapped, so ``(a, b)`` and ``(b, a)`` end
        up as the same row. Swapping every row of such a type would merely
        mirror it and leave nothing to deduplicate.

    Returns
    -------
    pandas.DataFrame
        The rewritten edge table with exact duplicate rows removed.
    """
    start_col, end_col, type_col = ':START_ID', ':END_ID', ':TYPE'
    merged = edges.copy()

    for pair in reciprocal_types:
        keep_type, drop_type = pair[0], pair[1]

        # Select rows from the working copy (not the untouched input) so that
        # relabelling done by earlier entries is taken into account.
        mask = (merged[type_col] == drop_type).values

        if keep_type == drop_type:
            # Self-reciprocal type: flip only the rows that are not yet in
            # canonical (start <= end) orientation.
            same_type = merged.loc[mask]
            needs_flip = (same_type[start_col] > same_type[end_col]).values
            selector = mask.copy()
            selector[mask] = needs_flip
            mask = selector

        if not mask.any():
            continue

        # Capture both columns as plain arrays before writing either back;
        # positional arrays also avoid any index alignment on assignment.
        starts = merged.loc[mask, start_col].values
        ends = merged.loc[mask, end_col].values

        # Swap
        merged.loc[mask, start_col] = ends
        merged.loc[mask, end_col] = starts
        # Change Type
        merged.loc[mask, type_col] = keep_type

    return merged.drop_duplicates()
        

In [10]:
new_edges = remove_reciprocals(edges, reciprocal_types)

In [11]:
# NOTE(review): the output filename spells "bidirectional" as "bidircetional";
# left unchanged here because other code may read this exact path.
# With to_csv defaults the DataFrame index is written as the first column.
new_edges.to_csv('data/hetnet_baseedges_bidircetional.csv')

## Edge types after filtering

In [20]:
base_filt['e_type'].nunique()

76

In [27]:
list(base_filt['e_type'].unique())

['medical-condition-treated_CmctD',
 'physically-interacts-with_CpiwP',
 'has-part_ChpC',
 'instance-of_CioC',
 'significant-drug-interaction_CsdiC',
 'different-from_CdfC',
 'subclass-of_CsoC',
 'part-of_CpoC',
 'material-used_CmuC',
 'cause-of_CcoD',
 'physically-interacts-with_CpiwG',
 'stereoisomer-of_CsoC',
 'decays-to_CdtC',
 'drug-used-for-treatment_CduftD',
 'follows_CfC',
 'vaccine-for_CvfD',
 'said-to-be-the-same-as_CstbtsaC',
 'medical-condition_CmcD',
 'followed-by_CfbC',
 'named-after_CnaC',
 'side-effect_CseD',
 'encodes_GeP',
 'genetic-association_GgaD',
 'decreased-expression-in_GdeiD',
 'increased-expression-in_GieiD',
 'subclass-of_GsoC',
 'gene-inversion-association-with_GgiawD',
 'genetic-association_PgaD',
 'encoded-by_PebP',
 'encodes_PeP',
 'deletion-association-with_GdawD',
 'cause-of_GcoD',
 'different-from_GdfP',
 'posttranslational-modification-association-with_GpmawD',
 'gene-duplication-association-with_GgdawD',
 'gene-insertion-association-with_GgiawD',
 '

In [28]:
len(base_filt), len(base_filt1)

(86490, 86024)

In [25]:
base_filt['e_type'].value_counts()

encodes_GeP                                               27164
encoded-by_PebG                                           27004
subclass-of_DsoD                                          10218
physically-interacts-with_PpiwC                            3657
physically-interacts-with_CpiwP                            3596
drug-used-for-treatment_DduftC                             2912
medical-condition-treated_CmctD                            2822
has-part_ChpC                                              2378
genetic-association_GgaD                                   2201
genetic-association_DgaG                                   2181
significant-drug-interaction_CsdiC                         1798
instance-of_CioC                                             93
symptoms_DsD                                                 81
has-cause_DhcD                                               55
subclass-of_CsoC                                             40
subclass-of_GsoC                        

In [21]:
base_filt1['e_type'].nunique()

12

In [22]:
# Uses remove_reciprocals from the processing module (pt), not the function
# of the same name defined in this notebook.
base_filt2 = pt.remove_reciprocals(base_filt1, reciprocal_types)

In [23]:
base_filt2['e_type'].nunique()

8

In [24]:
base_filt2['e_type'].value_counts()

encodes_GeP                           27178
subclass-of_DsoD                      10218
physically-interacts-with_CpiwP        3670
drug-used-for-treatment_DduftC         2969
has-part_ChpC                          2378
genetic-association_DgaG               2223
significant-drug-interaction_CsdiC     1096
instance-of_CioC                         93
Name: e_type, dtype: int64