In [1]:
import sys
sys.path.append('../../hetnet-ml/src')

import pandas as pd
import graph_tools as gt
from hetio.hetnet import MetaGraph

In [2]:
def num_metapaths(nodes, edges, source='Chemicals & Drugs', target='Disorders', max_length=4):
    """Count the metapaths the metagraph implied by `nodes` and `edges` yields.

    The node and edge tables are converted back to their colon-style column
    layout with `gt.add_colons`, reduced to an abbreviation dict and a list of
    metaedge tuples, and used to build a hetio `MetaGraph`. The metapaths from
    `source` to `target` are then extracted and counted.

    Parameters
    ----------
    nodes, edges : pandas.DataFrame
        Node and edge tables in the colon-free layout used by this notebook.
    source : str
        Metanode the metapaths start from (default 'Chemicals & Drugs').
    target : str
        Metanode the metapaths end at (default 'Disorders').
    max_length : int
        Length limit handed to `MetaGraph.extract_metapaths` (default 4).

    Returns
    -------
    int
        Number of extracted metapaths.
    """
    abbrevs, edge_tuples = gt.get_abbrev_dict_and_edge_tuples(
        gt.add_colons(nodes), gt.add_colons(edges)
    )
    metagraph = MetaGraph.from_edge_tuples(edge_tuples, abbrevs)
    return len(metagraph.extract_metapaths(source, target, max_length))

In [3]:
# Read the condensed, filtered node table and convert it to the colon-free
# column layout the rest of the notebook works with.
nodes = pd.read_csv('../data/nodes_VER31_R_condensed_filtered_001.csv')
nodes = gt.remove_colons(nodes)

# Same for the edge table. The 'pmids' column is parsed back into Python
# objects on load.
# NOTE(review): `eval` executes whatever expression is stored in the CSV cell.
# Only load files from a trusted source, or switch to a safer parser once the
# serialised format of 'pmids' has been confirmed.
edges = pd.read_csv(
    '../data/edges_VER31_R_condensed_filtered_001.csv',
    converters={'pmids': eval},
)
edges = gt.remove_colons(edges)

In [4]:
# Baseline: number of 'Chemicals & Drugs' -> 'Disorders' metapaths before any pruning.
num_metapaths(nodes, edges)

9946

Right now we have 23,231 potential metapaths of length 4 or fewer. With a very modest 4,000 Compounds and 2,000 Diseases, this will produce over 185 billion Degree-Weighted-Path-Count values upon feature extraction.  

Removal of some edges is necessary. Compound-Compound, Disease-Disease, and Compound-Disease edges have the greatest impact on computational complexity and memory use during metapath feature extraction, because they balloon the potential number of metapaths.  These will be targeted for removal first.

A couple of other edge types which are not very semantically meaningful, but highly populated, will also be removed.

Finally, directed edges produce twice as many metapaths as undirected edges, as you can traverse them in the forward or reverse direction.  Some of these directed edges may be repurposed into undirected edges.

- REMOVE EDGES:
    - Compound AW Disease
    - Compound AG Disease
    - ASSOCIATED_WITH_CDawCD
    - ASSOCIATED_WITH_CDawDO
    - AUGMENTS_DOag>DO
    - PART_OF_Gpo>G
    - NEG_LOCATION_OF_AnloCD 
    - NEG_LOCATION_OF_AnloG 
    - compared_with_PRcfpwPR

Some nodes really have little to do with bio-medical knowledge and these will also be removed.

- REMOVE NODES:
    - Organizations
    - Activities & Behaviors
    - Concepts & Ideas

Other nodes seem like they may or may not have much to do with biomedicine.  These will be examined for possible removal as well.
   
- LOOK AT NODES (for potential removal):
    - Devices
    - Phenomena
    - Living Beings
    - Procedures

Finally, targeting edges that produce large numbers of extra metapaths, some edges will be further condensed in semantic type.

- COMPRESS EDGES:
    - Self-referential: compound, gene, and disease edges   
    - Any 2 metanodes connected by 4 or more edge types

### Removing 'Organizations', 'Activities & Behaviors', and 'Concepts & Ideas'

In [5]:
# Node types with little bearing on biomedical knowledge.
remove_types = ['Organizations', 'Activities & Behaviors', 'Concepts & Ideas']

# `nodes` is already in the colon-free layout (it is read through
# gt.remove_colons), so the 'label' column can be tested directly without
# converting the whole frame a second time.
idx = nodes.index[nodes['label'].isin(remove_types)]

# Rebind instead of dropping in place, so the frame is never mutated behind
# any other reference to it.
nodes = nodes.drop(idx)

In [6]:
# Keep only edges whose two endpoints both survive in the node table.
ok_ids = nodes['id'].unique()
start_kept = edges['start_id'].isin(ok_ids)
end_kept = edges['end_id'].isin(ok_ids)
edges = edges[start_kept & end_kept]

In [7]:
# Metapath count after removing 'Organizations', 'Activities & Behaviors' and 'Concepts & Ideas'.
num_metapaths(nodes, edges)

9764

In [8]:
# Second pass: drop two of the node types that were flagged for inspection.
remove_types = ['Procedures', 'Devices']

# `nodes` is already in the colon-free layout, so 'label' can be tested
# directly. Rebind rather than drop in place to avoid mutating the frame.
idx = nodes.index[nodes['label'].isin(remove_types)]
nodes = nodes.drop(idx)

# Discard every edge that touches a node which no longer exists.
ok_ids = nodes['id'].unique()
edges = edges.query('start_id in @ok_ids and end_id in @ok_ids')

# Metapath count after this round of pruning (displayed as the cell output).
num_metapaths(nodes, edges)

6233

In [10]:
# Compare the number of node ids kept so far with the number of distinct ids
# that actually appear as an endpoint of some edge.
connected_ids = set(edges['start_id']) | set(edges['end_id'])
print(len(ok_ids))
print(len(connected_ids))

222102
212121


In [11]:
# Restrict the node table to ids that occur as the start or end of an edge.
ok_ids = list(set(edges['start_id']) | set(edges['end_id']))
nodes = nodes[nodes['id'].isin(ok_ids)]

In [12]:
# Re-check: the kept id list and the set of edge endpoints should now be the
# same size.
connected_ids = set(edges['start_id']) | set(edges['end_id'])
print(len(ok_ids))
print(len(connected_ids))

212121
212121


In [13]:
# Total number of edges remaining after the node removals above.
len(edges)

10902818

In [14]:
# Number of edges whose 'n_pmids' value is at least 5.
len(edges[edges['n_pmids'] >= 5])

1075276

In [15]:
# Number of remaining nodes per label (metanode).
nodes['label'].value_counts()

Chemicals & Drugs              83815
Living Beings                  47441
Disorders                      37728
Genes & Molecular Sequences    19147
Anatomy                        14911
Physiology                      7968
Phenomena                       1111
Name: label, dtype: int64

In [16]:
# Number of remaining edges per edge type.
edges['type'].value_counts()

LOCATION_OF_AloCD         1000717
ASSOCIATED_WITH_DOawDO     731651
INTERACTS_WITH_CDiwG       692722
TREATS_CDtDO               610001
LOCATION_OF_AloDO          540530
LOCATION_OF_AloG           516981
STIMULATES_CDstCD          507388
AFFECTS_DOafLB             471994
PRODUCES_LBpdCD            462977
REGULATES_CDreg>CD         434934
STIMULATES_CDstG           375404
AFFECTS_CDafPS             358135
AFFECTS_CDafA              316910
ASSOCIATED_WITH_GawDO      310501
LOCATION_OF_AloA           264680
AFFECTS_GafPS              247912
INHIBITS_CDinG             238822
AUGMENTS_CDagPS            228281
INTERACTS_WITH_GiwG        227896
RELATED_TO_CDrtCD          222325
PART_OF_GpoLB              208882
REGULATES_Greg>G           196208
DISRUPTS_CDdsPS            180595
AFFECTS_PSafDO             142059
AUGMENTS_GagDO             138579
OCCURS_IN_AoiLB            135471
PROCESS_OF_PSpro>LB        128664
AFFECTS_GafA               121872
INHIBITS_GinDO             113522
TREATS_CDtLB  

In [17]:
# Per-type edge counts, restricted to edges whose 'n_pmids' is at least 5.
edges.loc[edges['n_pmids'] >= 5, 'type'].value_counts()

LOCATION_OF_AloCD         140213
LOCATION_OF_AloDO          81198
AFFECTS_DOafLB             73989
ASSOCIATED_WITH_DOawDO     71635
TREATS_CDtDO               56837
PRODUCES_LBpdCD            54855
LOCATION_OF_AloG           51688
INTERACTS_WITH_CDiwG       49084
LOCATION_OF_AloA           43403
RELATED_TO_CDrtCD          39492
AFFECTS_CDafPS             31850
STIMULATES_CDstCD          27863
OCCURS_IN_AoiLB            24279
AFFECTS_CDafA              23792
STIMULATES_CDstG           23010
REGULATES_CDreg>CD         20990
PROCESS_OF_PSpro>LB        20361
PART_OF_GpoLB              20322
ASSOCIATED_WITH_GawDO      20273
AFFECTS_GafPS              19642
AUGMENTS_CDagPS            18272
ISA_LBi>LB                 15523
AFFECTS_PSafDO             14623
TREATS_CDtLB               14363
AFFECTS_PSafPS             14342
DISRUPTS_CDdsPS            10886
INHIBITS_CDinG             10527
INTERACTS_WITH_GiwG        10164
REGULATES_Greg>G            9946
AUGMENTS_GagDO              6009
AFFECTS_Ga

In [18]:
%%time
# Some edges now duplicated, de-duplicate and combine pmids
print(len(edges))
grpd = edges.groupby(['start_id', 'end_id', 'type'])
edges = grpd['pmids'].apply(lambda Series: set.union(*Series.values)).reset_index()

# re-count the pmid numbers
edges['n_pmids'] = edges['pmids'].apply(len)

print(len(edges))

10902818
10902818
CPU times: user 15min 17s, sys: 2.72 s, total: 15min 19s
Wall time: 15min 19s


In [19]:
# Write the pruned node and edge tables back out, converted to the
# colon-style column layout via gt.add_colons and without the pandas index.
for frame, out_path in (
    (nodes, '../data/nodes_VER31_R_7_metanode.csv'),
    (edges, '../data/edges_VER31_R_7_metanode.csv'),
):
    gt.add_colons(frame).to_csv(out_path, index=False)