In [1]:
import sys
sys.path.append('../../hetnet-ml/src')

import pandas as pd
import graph_tools as gt
from hetio.hetnet import MetaGraph

In [2]:
def num_metapaths(nodes, edges, source='Chemicals & Drugs', target='Disorders', max_length=4):
    """Count the metapaths the current node/edge tables allow between two node types.

    The node and edge frames are passed through ``gt.add_colons`` and then
    ``gt.get_abbrev_dict_and_edge_tuples`` to obtain the abbreviation dict and
    edge tuples from which a ``MetaGraph`` is built.

    Parameters
    ----------
    nodes, edges : DataFrame
        Node and edge tables in the colon-free form used throughout this notebook.
    source, target : str
        Node labels handed to ``MetaGraph.extract_metapaths`` as its first two
        arguments. Defaults reproduce the original hard-coded query.
    max_length : int
        Third argument to ``extract_metapaths``; the notebook prose describes
        the default as "length 4 or fewer".

    Returns
    -------
    int
        Number of metapaths returned by ``extract_metapaths``.
    """
    abbrevs, edge_tuples = gt.get_abbrev_dict_and_edge_tuples(gt.add_colons(nodes), gt.add_colons(edges))
    metagraph = MetaGraph.from_edge_tuples(edge_tuples, abbrevs)
    return len(metagraph.extract_metapaths(source, target, max_length))

In [3]:
# Load the condensed/filtered node and edge tables and pass each through gt.remove_colons.
# NOTE(review): remove_colons presumably strips the ':' from headers such as ':START_ID' — confirm in graph_tools.
# SECURITY: converters={'pmids': eval} calls eval() on every 'pmids' cell, i.e. it executes whatever
# Python expression the CSV contains. Only load trusted files; consider ast.literal_eval instead.
nodes = gt.remove_colons(pd.read_csv('../data/nodes_condensed_filtered_001.csv'))
edges = gt.remove_colons(pd.read_csv('../data/edges_condensed_filtered_001.csv', converters={'pmids':eval}))

In [4]:
# Baseline: metapath count on the freshly loaded tables, before anything is removed.
num_metapaths(nodes, edges)

23231

Right now we have 23,231 potential metapaths of length 4 or fewer. With a very modest 4000 Compounds and 2000 diseases, this will produce over 185 billion Degree-Weighted-Path-Count values upon feature extraction.  

Removal of some edges is necessary. Compound-Compound, Disease-Disease, and Compound-Disease edges have the greatest impact on computational complexity and memory use during metapath feature extraction, because they balloon the potential number of metapaths.  These will be targeted for removal first.

A couple of other edge types which are not very semantically meaningful, but highly populated, will also be removed.

Finally, directed edges produce twice as many metapaths as undirected edges, as you can traverse them in the forward or reverse direction.  Some of these directed edges may be repurposed into undirected edges.

- REMOVE EDGES:
    - Compound AW Disease
    - Compound AG Disease
    - ASSOCIATED_WITH_CDawCD
    - ASSOCIATED_WITH_CDawDO
    - AUGMENTS_DOag>DO
    - PART_OF_Gpo>G
    - NEG_LOCATION_OF_AnloCD 
    - NEG_LOCATION_OF_AnloG 
    - compared_with_PRcfpwPR

Some nodes really have little to do with bio-medical knowledge and these will also be removed.

- REMOVE NODES:
    - Organizations
    - Activities & Behaviors
    - Concepts & Ideas

Other nodes seem like they may or may not have much to do with biomedicine.  These will be examined for possible removal as well.
   
- LOOK AT NODES (for potential removal):
    - Devices
    - Phenomena
    - Living Beings
    - Procedures

Finally, targeting edges that produce large numbers of extra metapaths, some edges will be further condensed in semantic type.

- COMPRESS EDGES:
    - Self-referential: compound, gene, and disease edges   
    - Any 2 metanodes connected by 4 or more edge types

### Removing 'Organizations', 'Activities & Behaviors', and 'Concepts & Ideas'

In [6]:
# Drop every node whose label is one of the three non-biomedical semantic types.
# NOTE(review): `nodes` already went through gt.remove_colons when it was loaded, so the
# call here looks redundant; it is only used to find the index labels to drop — confirm.
remove_types = ['Organizations', 'Activities & Behaviors', 'Concepts & Ideas']
idx = gt.remove_colons(nodes).query('label in @remove_types').index
nodes.drop(idx, inplace=True)  # mutates `nodes` in place

In [8]:
# Keep only the edges whose start and end ids both still exist in `nodes`.
ok_ids = nodes['id'].unique()
edges = edges.query('start_id in @ok_ids and end_id in @ok_ids')

In [9]:
# Metapath count after removing the three non-biomedical node types.
num_metapaths(nodes, edges)

23218

### Begin dropping the edges no longer desired

In [10]:
def change_edge_type(df, from_type, to_type, swap=False):
    """Relabel, in place, every edge of type ``from_type`` as ``to_type``.

    Parameters
    ----------
    df : DataFrame
        Edge table with ``start_id``, ``end_id`` and ``type`` columns. Modified in place.
    from_type : str
        Edge type to look for in the ``type`` column.
    to_type : str
        Replacement value written to the ``type`` column of the matching rows.
    swap : bool, default False
        If True, also exchange ``start_id`` and ``end_id`` on the matching rows,
        i.e. reverse the direction of those edges.

    Returns
    -------
    None
    """
    # Select rows positionally with a boolean mask instead of by index label, so a
    # frame whose index contains duplicate labels only has the matching rows touched.
    mask = (df['type'] == from_type).to_numpy()
    if not mask.any():
        return

    if swap:
        # Assign from a plain array: pandas would otherwise align on column names
        # (and index labels) and silently undo the swap.
        df.loc[mask, ['start_id', 'end_id']] = df.loc[mask, ['end_id', 'start_id']].to_numpy()
    df.loc[mask, 'type'] = to_type
                                             
def merge_edge_types(df, from_list, to_type, swap=False):
    """Collapse several edge types into a single type, in place.

    Each entry of ``from_list`` is handed to ``change_edge_type`` with the same
    ``to_type`` and ``swap`` arguments.

    Parameters
    ----------
    df : DataFrame
        Edge table, modified in place.
    from_list : iterable of str, or str
        Edge types to relabel. A bare string is treated as a one-element list
        rather than being iterated character by character.
    to_type : str
        Edge type every listed type is relabelled to.
    swap : bool, default False
        Forwarded to ``change_edge_type``.

    Returns
    -------
    None
    """
    if isinstance(from_list, str):
        from_list = [from_list]

    for from_type in from_list:
        # Relabelling a type to itself without swapping changes nothing, so skip the
        # pass over the table. With swap=True the call is still needed.
        if from_type == to_type and not swap:
            continue
        change_edge_type(df, from_type, to_type, swap=swap)

def drop_edges_from_list(df, drop_list):
    """Delete, in place, every row of ``df`` whose ``type`` appears in ``drop_list``.

    Parameters
    ----------
    df : DataFrame
        Edge table with a ``type`` column. Modified in place.
    drop_list : list of str
        Edge types to remove.

    Returns
    -------
    None
    """
    doomed_labels = df.index[df['type'].isin(drop_list)]
    df.drop(doomed_labels, inplace=True)

In [11]:
# Edge types to remove outright; all three appear in the REMOVE EDGES list in the markdown above.
drop = ['NEG_LOCATION_OF_AnloCD', 'NEG_LOCATION_OF_AnloG', 'compared_with_PRcfpwPR']

In [12]:
# Removes the listed edge types from `edges` in place (returns nothing).
drop_edges_from_list(edges, drop)

In [13]:
# Metapath count after dropping the NEG_LOCATION_OF and compared_with edge types.
num_metapaths(nodes, edges)

21603

In [14]:
# Drop the compound-disease "associated with" and "augments" edge types
# (the "Compound AW Disease" / "Compound AG Disease" items of the REMOVE EDGES list),
# then re-count metapaths.
drop = ['ASSOCIATED_WITH_CDawDO', 'AUGMENTS_CDagDO']
drop_edges_from_list(edges, drop)
num_metapaths(nodes, edges)

14773

In [15]:
# The associated-with edges in these contexts don't appear to be very meaningful.
# (ASSOCIATED_WITH_CDawDO was already dropped by the previous cell, so listing it again removes nothing.)
to_drop = ['ASSOCIATED_WITH_CDawCD', 'ASSOCIATED_WITH_CDawDO',  'AUGMENTS_DOag>DO', 'PART_OF_Gpo>G']
drop_edges_from_list(edges, to_drop)

# These edges could be combined into a simple regulates edge
to_merge = ['STIMULATES_CDst>CD', 'INHIBITS_CDinCD']
merge_edge_types(edges, to_merge, 'REGULATES_CDreg>CD')

# Same idea for the gene-gene edges: stimulates/inhibits collapse into one regulates edge
to_merge = ['STIMULATES_Gst>G', 'INHIBITS_GinG']
merge_edge_types(edges, to_merge, 'REGULATES_Greg>G')

# These should just be the same... too many differing self-referential edges on diseases.
# The target type is also in the source list; relabelling it to itself leaves those rows unchanged.
merge_edge_types(edges, ['ISA_DOi>DO', 'ASSOCIATED_WITH_DOawDO'], 'ASSOCIATED_WITH_DOawDO')

# make this edge undirected for fewer metapaths.
change_edge_type(edges, 'ISA_CDi>CD', 'RELATED_TO_CDrtCD')

In [16]:
# Metapath count after the drops and edge-type merges in the previous cell.
num_metapaths(nodes, edges)

7889

In [17]:
# Build one inspection table from the nodes and edges (both passed through gt.add_colons first).
# The preview below shows each edge alongside start/end name and label columns.
both = gt.combine_nodes_and_edges(gt.add_colons(nodes), gt.add_colons(edges))

In [18]:
# Preview the first rows of the combined edge/node table.
both.head()

Unnamed: 0,:START_ID,:END_ID,:TYPE,pmids,n_pmids,start_name,end_name,start_label,end_label
0,C0041361,C0025920,ADMINISTERED_TO_CDatLB,{6642564},1,Tumor Antigens,"Mice, Inbred C3H",Chemicals & Drugs,Living Beings
1,C0022924,C1123019,ADMINISTERED_TO_CDatLB,{7364689},1,Lactates,Domestic Sheep,Chemicals & Drugs,Living Beings
2,C0030827,C0031831,ADMINISTERED_TO_CDatLB,{8757705},1,Penicillin G,Physicians,Chemicals & Drugs,Living Beings
3,C0301532,C0031831,ADMINISTERED_TO_CDatLB,"{8179050, 18774045}",2,Multivitamin preparation,Physicians,Chemicals & Drugs,Living Beings
4,C0729237,C0012984,ADMINISTERED_TO_CDatLB,{1827854},1,Photofrin II,Canis familiaris,Chemicals & Drugs,Living Beings


In [23]:
# Collect 10 example edges for each semantic type under review, keeping any edge that has
# that type at either end. Order of `previews` follows the order of the list below.
# random_state is fixed so the same examples are drawn on every Restart & Run All.
previews = []
for sem in ['Devices', 'Phenomena', 'Living Beings', 'Procedures']:
    previews.append(both.query('start_label == @sem or end_label == @sem').sample(10, random_state=0))

In [24]:
# Devices: the 10 sampled edges that have a 'Devices' node at either end
previews[0]

Unnamed: 0,:START_ID,:END_ID,:TYPE,pmids,n_pmids,start_name,end_name,start_label,end_label
14931305,C0183346,C0026056,USES_DVuCD,{21057337},1,"Sling, device",Midazolam,Devices,Chemicals & Drugs
15253818,C0038136,C0183432,USES_PRuDV,"{25657587, 25880766}",2,Standardization,Mercury sphygmomanometer,Procedures,Devices
14398937,C0180202,C0027651,TREATS_DVtDO,{12729775},1,Cryostats,Neoplasm,Devices,Disorders
15250568,C0014098,C0175649,USES_PRuDV,"{7218601, 25490369}",2,Endarterectomy,Prosthesis,Procedures,Devices
15263685,C0204229,C0600199,USES_PRuDV,{784958},1,"Restoration, resin","Cavity Liner, Dental",Procedures,Devices
15252226,C0013806,C0021102,USES_PRuDV,{21816465},1,Electroconvulsive Therapy,Implants,Procedures,Devices
15237548,C1293122,C0021113,USES_PRuDV,"{8436533, 24705238}",2,Augmentation procedure,Artificial Implants,Procedures,Devices
10642690,C0038293,C0009653,LOCATION_OF_AloDV,{2980051},1,Sternum,"Condoms, Male",Anatomy,Devices
14954813,C0438883,C0524491,USES_DVuDV,{24743451},1,Above knee prosthesis,Leg Prosthesis,Devices,Devices
14437012,C0181722,C0679670,TREATS_DVtLB,{25435221},1,Manifolds,network,Devices,Living Beings


In [25]:
# Phenomena: the 10 sampled edges that have a 'Phenomena' node at either end
previews[1]

Unnamed: 0,:START_ID,:END_ID,:TYPE,pmids,n_pmids,start_name,end_name,start_label,end_label
477732,C1451465,C0005495,AFFECTS_CDafPH,"{25848967, 17962579, 12963735}",3,"CRK protein, human",Biogenesis,Chemicals & Drugs,Phenomena
456222,C0018282,C0013203,AFFECTS_CDafPH,{26781210},1,Growth Inhibitors,Drug resistance,Chemicals & Drugs,Phenomena
1442016,C1413931,C0019868,AFFECTS_GafPH,"{11716800, 9198890, 8743505, 3037173, 11097339...",6,ACE gene,Homeostasis,Genes & Molecular Sequences,Phenomena
412241,C0136976,C0563227,AFFECTS_CDafPH,"{12232088, 23964902, 20530216, 17982000, 25542...",9,Phytochrome B,Red light,Chemicals & Drugs,Phenomena
473717,C0005456,C0085813,AFFECTS_CDafPH,{8504810},1,Binding Sites,"Anisotropy, Fluorescence",Chemicals & Drugs,Phenomena
431853,C0014914,C0007382,AFFECTS_CDafPH,{9204000},1,Estradiol 17 beta-Dehydrogenase,Catalysis,Chemicals & Drugs,Phenomena
426637,C0055966,C0678587,AFFECTS_CDafPH,{21602277},1,Clusterin,steady state,Chemicals & Drugs,Phenomena
1750910,C0162691,C1326225,AFFECTS_PHafPS,{27729233},1,Friction,cell homeostasis,Phenomena,Physiology
431157,C0009002,C0005495,AFFECTS_CDafPH,{4370900},1,Clofibrate,Biogenesis,Chemicals & Drugs,Phenomena
1718923,C0038817,C0011615,AFFECTS_PHafDO,"{17073869, 1401486}",2,Sunlight,"Dermatitis, Atopic",Phenomena,Disorders


In [26]:
# Living Beings: the 10 sampled edges that have a 'Living Beings' node at either end
previews[2]

Unnamed: 0,:START_ID,:END_ID,:TYPE,pmids,n_pmids,start_name,end_name,start_label,end_label
11950114,C0073306,C0036025,PART_OF_GpoLB,"{23945946, 1733947, 27390266, 24129494}",4,ribosomal protein L7,Saccharomyces cerevisiae,Genes & Molecular Sequences,Living Beings
12196387,C0598496,C0021585,PROCESS_OF_PSpro>LB,"{23667556, 27595655, 24859468, 16756557, 10620...",8,Gene Silencing,Insecta,Physiology,Living Beings
1004009,C0011164,C0521057,AFFECTS_DOafLB,"{8148961, 27317023, 18318232, 14734837, 100749...",8,Degenerative abnormality,Hyphae,Disorders,Living Beings
12643538,C0599383,C0034085,PRODUCES_LBpdCD,{18304608},1,Marine Organism,Pulmonary Surfactants,Living Beings,Chemicals & Drugs
12327959,C0337845,C0063690,PRODUCES_LBpdCD,{12816089},1,Pygmies,Integrase,Living Beings,Chemicals & Drugs
12186079,C0042789,C0010212,PROCESS_OF_PSpro>LB,{8916776},1,Vision,Professional counsellor,Physiology,Living Beings
8389063,C0039679,C0599840,INTERACTS_WITH_LBiwLB,{9026146},1,Tetrahymena,microbial,Living Beings,Living Beings
1203721,C1268666,C0870221,AFFECTS_DOafLB,"{17241290, 22832109}",2,Circumscribed lesion,Boys,Disorders,Living Beings
1244150,C1442161,C0686882,AFFECTS_DOafLB,{13129521},1,DELETION,Trophozoite,Disorders,Living Beings
19092,C0699612,C0324817,ADMINISTERED_TO_CDatLB,{1312241},1,Osteogen,Papio ursinus,Chemicals & Drugs,Living Beings


In [27]:
# Procedures: the 10 sampled edges that have a 'Procedures' node at either end
previews[3]

Unnamed: 0,:START_ID,:END_ID,:TYPE,pmids,n_pmids,start_name,end_name,start_label,end_label
1817050,C1293900,C0234222,AFFECTS_PRafPS,{26233580},1,Hand grip,Baresthesia,Procedures,Physiology
14759840,C0185010,C0003857,TREATS_PRtDO,{23406825},1,Closure by clip procedure,Congenital arteriovenous malformation,Procedures,Disorders
14650267,C0203075,C0021933,TREATS_PRtDO,"{7959392, 8854346}",2,Lower gastrointestinal tract contrast procedure,Intussusception,Procedures,Disorders
1831608,C1168098,C0600519,AFFECTS_PRafPS,{22954276},1,Pulmonary arterial pressure,Ventricular Remodeling,Procedures,Physiology
14728440,C1303150,C0516977,TREATS_PRtDO,{21767927},1,Disease management program,physical health,Procedures,Disorders
14836025,C0337280,C0027859,TREATS_PRtDO,{3703129},1,Fenestration procedure,Acoustic Neuroma,Procedures,Disorders
14524230,C0014935,C0080179,TREATS_PRtDO,{23604900},1,Estrogen Replacement Therapy,Spinal Fractures,Procedures,Disorders
11365515,C0013103,C0796679,METHOD_OF_PRmoPR,{18931875},1,Drainage procedure,Chemoembolisation,Procedures,Procedures
6213921,C0005851,C0007820,DIAGNOSES_PRdgDO,{10835454},1,Blood Volume Determination,Cerebrovascular Disorders,Procedures,Disorders
6334116,C0696138,C1522225,DIAGNOSES_PRdgDO,{11731130},1,Metabolic analysis,Knock-out,Procedures,Disorders


In [28]:
# I want to keep Phenomena and Living Beings if possible... Some examples seem biologically relevant (Procedures and Devices are removed in the next cell)

In [30]:
# Remove the 'Procedures' and 'Devices' nodes, drop every edge that lost an endpoint,
# and re-count the metapaths.
# NOTE(review): `nodes` already went through gt.remove_colons at load time; the call here looks redundant — confirm.
remove_types = ['Procedures', 'Devices']
idx = gt.remove_colons(nodes).query('label in @remove_types').index
nodes.drop(idx, inplace=True)  # mutates `nodes` in place
ok_ids = nodes['id'].unique()
edges = edges.query('start_id in @ok_ids and end_id in @ok_ids')
num_metapaths(nodes, edges)

4888

In [31]:
# Can't have 4 edge types between Physiology and Chemicals & Drugs (the plan above targets
# any 2 metanodes connected by 4 or more edge types), so drop AFFECTS_CDafPS.
to_drop = ['AFFECTS_CDafPS']
drop_edges_from_list(edges, to_drop)

In [32]:
# Metapath count after dropping AFFECTS_CDafPS.
num_metapaths(nodes, edges)

4345

In [33]:
# Compare the number of node ids kept so far (first line) with the number of distinct ids
# that still appear as an edge endpoint (second line). A gap means some nodes have no edges left.
print(len(ok_ids))
print(len(set(edges['start_id']).union(set(edges['end_id']))))

234567
215817


In [34]:
# Restrict `nodes` to ids that still take part in at least one edge (as start or end).
ok_ids = list(set(edges['start_id']).union(set(edges['end_id'])))
nodes = nodes.query('id in @ok_ids')

In [35]:
# Sanity check: the node table and the edge endpoints should now cover the same number of ids.
# Count the ids in `nodes` itself; len(ok_ids) was built from the edges and would match trivially.
print(nodes['id'].nunique())
print(len(set(edges['start_id']).union(set(edges['end_id']))))

215817
215817


In [36]:
# Total number of edge rows remaining.
len(edges)

10501869

In [37]:
# Number of edge rows whose n_pmids value is at least 5.
len(edges.query('n_pmids >= 5'))

998318

In [38]:
# Number of remaining nodes per label.
nodes['label'].value_counts()

Chemicals & Drugs              84614
Living Beings                  48191
Disorders                      38511
Genes & Molecular Sequences    20539
Anatomy                        15100
Physiology                      7727
Phenomena                       1135
Name: label, dtype: int64

In [39]:
# Number of remaining edge rows per edge type.
edges['type'].value_counts()

LOCATION_OF_AloCD         996348
REGULATES_CDreg>CD        917174
ASSOCIATED_WITH_DOawDO    755777
INTERACTS_WITH_CDiwG      705394
TREATS_CDtDO              599026
LOCATION_OF_AloDO         540408
LOCATION_OF_AloG          534310
AFFECTS_DOafLB            462162
PRODUCES_LBpdCD           449085
STIMULATES_CDstG          373978
AFFECTS_CDafA             313572
ASSOCIATED_WITH_GawDO     309743
LOCATION_OF_AloA          267214
AFFECTS_GafPS             250457
INHIBITS_CDinG            235809
INTERACTS_WITH_GiwG       235187
REGULATES_Greg>G          222314
AUGMENTS_CDagPS           222088
RELATED_TO_CDrtCD         216035
PART_OF_GpoLB             205720
DISRUPTS_CDdsPS           177044
AFFECTS_PSafDO            140466
AUGMENTS_GagDO            136697
PART_OF_Apo>LB            133351
PROCESS_OF_PSpro>LB       126722
AFFECTS_GafA              124156
INHIBITS_GinDO            111738
AFFECTS_PSafPS            101491
TREATS_CDtLB              101489
AFFECTS_CDafPH             74460
ISA_LBi>LB

In [40]:
# Per-type edge counts, restricted to rows with n_pmids of at least 5.
edges.query('n_pmids >= 5')['type'].value_counts()

LOCATION_OF_AloCD         137304
LOCATION_OF_AloDO          77730
ASSOCIATED_WITH_DOawDO     67842
AFFECTS_DOafLB             67678
TREATS_CDtDO               54029
LOCATION_OF_AloG           52690
INTERACTS_WITH_CDiwG       50531
PRODUCES_LBpdCD            49608
REGULATES_CDreg>CD         46830
LOCATION_OF_AloA           42602
RELATED_TO_CDrtCD          38129
AFFECTS_CDafA              23097
STIMULATES_CDstG           22802
PART_OF_Apo>LB             22417
ASSOCIATED_WITH_GawDO      19701
AFFECTS_GafPS              19294
PROCESS_OF_PSpro>LB        19000
PART_OF_GpoLB              17309
AUGMENTS_CDagPS            17251
ISA_LBi>LB                 14953
AFFECTS_PSafDO             13614
AFFECTS_PSafPS             13523
TREATS_CDtLB               13170
INTERACTS_WITH_GiwG        10507
INHIBITS_CDinG             10386
DISRUPTS_CDdsPS            10344
REGULATES_Greg>G            9228
AFFECTS_GafA                5935
AUGMENTS_GagDO              5771
AFFECTS_CDafPH              5752
CAUSES_DOc

In [41]:
%%time
# Some edges now duplicated, de-duplicate and combine pmids.
# (Merging edge types above can leave several rows with the same start_id, end_id and type.)
print(len(edges))
grpd = edges.groupby(['start_id', 'end_id', 'type'])
# One row per (start_id, end_id, type): the group's pmids values are unioned into one set.
# set.union(*...) needs each pmids value to be a set. Only the three key columns and 'pmids'
# survive reset_index(); n_pmids is rebuilt below.
edges = grpd['pmids'].apply(lambda Series: set.union(*Series.values)).reset_index()

# re-count the pmid numbers
edges['n_pmids'] = edges['pmids'].apply(len)

print(len(edges))

10501869
10344754
CPU times: user 14min, sys: 3 s, total: 14min 3s
Wall time: 14min 3s


In [42]:
# Write the final node and edge tables, passing each through gt.add_colons first.
# index=False keeps the DataFrame index out of the CSV.
# NOTE(review): the 'pmids' column holds Python sets, so it will presumably be written as their
# text form and need parsing on reload (the loader at the top of this notebook uses eval) — confirm.
gt.add_colons(nodes).to_csv('../data/nodes_7_metanode.csv', index=False)
gt.add_colons(edges).to_csv('../data/edges_7_metanode.csv', index=False)