## Read DrugMechDB Data

In [4]:
import yaml
import pandas as pd

# Read the DrugMechDB indication-paths dataset into `data`.
# safe_load is used so the YAML cannot construct arbitrary Python objects.
# The encoding is pinned so the read does not depend on the platform default.
with open("../data/drugmechdb/DrugMechDB_indication_paths.yml", 'r', encoding="utf-8") as stream:
    try:
        data = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then re-raise: swallowing the error would
        # leave `data` undefined (or stale from an earlier run of this cell)
        # and make the following cells fail or silently use old data.
        print(exc)
        raise

In [5]:
# Quick sanity check: how many records were loaded and which top-level
# fields the first record carries.
total_records = len(data)
print("Total Data:", total_records)
first_record_keys = data[0].keys()
print("Keys in each data point:", first_record_keys)

Total Data: 1245
Keys in each data point: dict_keys(['directed', 'graph', 'links', 'multigraph', 'nodes'])


In [6]:
# The "nodes" entry of a record is a list with one dict per node on the path.
# Each dict carries:
#   - "id":    the entity id
#   - "label": the entity label
#   - "name":  the entity name

d = data[0]
first_path_nodes = d["nodes"]
print(first_path_nodes)

[{'id': 'MESH:D000068877', 'label': 'Drug', 'name': 'imatinib'}, {'id': 'UniProt:P00519', 'label': 'Protein', 'name': 'BCR/ABL'}, {'id': 'MESH:D015464', 'label': 'Disease', 'name': 'CML (ph+)'}]


In [7]:
# DrugMechDB: gather node ids and labels, both flattened over the whole
# dataset (nodes_list / label_list) and grouped per path
# (path_id_list / path_label_list).
nodes_list, label_list = [], []
path_id_list, path_label_list = [], []
for d in data:
    path_id, path_label = [], []
    for node in d["nodes"]:
        node_id = node["id"]
        path_id.append(node_id)
        node_label = node["label"]
        path_label.append(node_label)
        # Dataset-wide accumulators, in traversal order.
        nodes_list.append(node_id)
        label_list.append(node_label)
        # Echo every Disease node as it is encountered.
        if node_label == "Disease":
            print(node)
    # One entry per path, aligned by position across both lists.
    path_id_list.append(path_id)
    path_label_list.append(path_label)

{'id': 'MESH:D015464', 'label': 'Disease', 'name': 'CML (ph+)'}
{'id': 'MESH:D034721', 'label': 'Disease', 'name': 'Systemic mast cell disease'}
{'id': 'MESH:D010146', 'label': 'Disease', 'name': 'Pain'}
{'id': 'MESH:D005334', 'label': 'Disease', 'name': 'Fever'}
{'id': 'MESH:D010146', 'label': 'Disease', 'name': 'Pain'}
{'id': 'MESH:D013927', 'label': 'Disease', 'name': 'Thrombosis'}
{'id': 'MESH:D010146', 'label': 'Disease', 'name': 'Pain'}
{'id': 'MESH:D004405', 'label': 'Disease', 'name': 'Shigellosis'}
{'id': 'MESH:D007634', 'label': 'Disease', 'name': 'Keratitis'}
{'id': 'MESH:D003586', 'label': 'Disease', 'name': 'CMV infection'}
{'id': 'MESH:D018805', 'label': 'Disease', 'name': 'Bacterial septicemia'}
{'id': 'MESH:D000073605', 'label': 'Disease', 'name': 'Rickettsialpox'}
{'id': 'MESH:D014069', 'label': 'Disease', 'name': 'Streptococcal tonsillitis'}
{'id': 'MESH:D011023', 'label': 'Disease', 'name': 'Staphylococcal pneumonia'}
{'id': 'MESH:D016920', 'label': 'Disease', 'name'

In [8]:
# Distinct node labels seen across all DrugMechDB paths.
set(label_list)

{'BiologicalProcess',
 'Cell',
 'CellularComponent',
 'ChemicalSubstance',
 'Disease',
 'Drug',
 'GeneFamily',
 'GrossAnatomicalStructure',
 'MacromolecularComplex',
 'MolecularActivity',
 'OrganismTaxon',
 'Pathway',
 'PhenotypicFeature',
 'Protein'}

## Read HetNet Nodes

In [9]:
# Load the Hetionet v1.0 node table; the file is tab-separated, which is
# read_table's default delimiter.
hetnet_file = "../data/hetnet/hetionet-v1.0-nodes.tsv"
hetnet_data = pd.read_table(hetnet_file)

In [10]:
# Plain Python list of the values in the node table's "id" column.
node_id_column = hetnet_data["id"]
het_net_nodes = node_id_column.tolist()

In [11]:
# Walk the Hetionet node ids ("<entity type>::<source id>"), recording the
# entity type of every node. Source ids are stored only for a subset of types.
entity_type_hetnet = []
hetnet_node_ids = []
for h in het_net_nodes:
    parts = h.split("::")
    kind = parts[0]
    if kind in ("Biological Process", "Anatomy", "Cellular Component"):
        hetnet_node_ids.append(parts[1])
    elif kind == "Disease":
        # Disease source ids are only echoed, not stored.
        print(parts[1])
    entity_type_hetnet.append(kind)

DOID:0050156
DOID:0050425
DOID:0050741
DOID:0050742
DOID:0060073
DOID:0060119
DOID:10021
DOID:10153
DOID:1024
DOID:10283
DOID:10534
DOID:10608
DOID:10652
DOID:10763
DOID:10811
DOID:10871
DOID:1094
DOID:10941
DOID:10976
DOID:11054
DOID:11119
DOID:1115
DOID:11239
DOID:11476
DOID:11555
DOID:11612
DOID:11615
DOID:11714
DOID:11819
DOID:119
DOID:1192
DOID:11920
DOID:11934
DOID:11949
DOID:12185
DOID:12236
DOID:12306
DOID:12361
DOID:12365
DOID:1245
DOID:12849
DOID:12930
DOID:12995
DOID:1312
DOID:13189
DOID:1319
DOID:13223
DOID:1324
DOID:13241
DOID:13378
DOID:13499
DOID:14004
DOID:14221
DOID:14227
DOID:14268
DOID:14330
DOID:1459
DOID:1595
DOID:1612
DOID:1686
DOID:1725
DOID:175
DOID:1781
DOID:1790
DOID:1793
DOID:1826
DOID:184
DOID:1909
DOID:1936
DOID:1964
DOID:1993
DOID:2043
DOID:216
DOID:2174
DOID:219
DOID:2355
DOID:2377
DOID:2394
DOID:2531
DOID:2596
DOID:263
DOID:2841
DOID:2986
DOID:2994
DOID:2998
DOID:3070
DOID:3083
DOID:3121
DOID:3277
DOID:3310
DOID:3312
DOID:332
DOID:3393
DOID:3565
DOID:357

In [13]:
# Number of Hetionet source ids collected for the Biological Process, Anatomy,
# and Cellular Component node types.
len(hetnet_node_ids)

13174

In [14]:
# Distinct Hetionet entity types (the text before "::" in each node id).
set(entity_type_hetnet)

{'Anatomy',
 'Biological Process',
 'Cellular Component',
 'Compound',
 'Disease',
 'Gene',
 'Molecular Function',
 'Pathway',
 'Pharmacologic Class',
 'Side Effect',
 'Symptom'}

In [65]:
# Reduce every Hetionet node id to a bare identifier by splitting on ":".
# Ids such as "Anatomy::UBERON:0000002" split into four pieces and yield the
# piece at index 3; ids with only three pieces fall back to index 2.
het_net_ids = []
for n in het_net_nodes:
    # The "3" trace line is emitted for every node, before it is known
    # whether index 3 exists.
    print(n, 3)
    pieces = n.split(":")
    if len(pieces) > 3:
        het_net_ids.append(pieces[3])
    else:
        print(n, 2)
        het_net_ids.append(pieces[2])


Anatomy::UBERON:0000002 3
Anatomy::UBERON:0000004 3
Anatomy::UBERON:0000006 3
Anatomy::UBERON:0000007 3
Anatomy::UBERON:0000010 3
Anatomy::UBERON:0000011 3
Anatomy::UBERON:0000013 3
Anatomy::UBERON:0000020 3
Anatomy::UBERON:0000026 3
Anatomy::UBERON:0000029 3
Anatomy::UBERON:0000033 3
Anatomy::UBERON:0000038 3
Anatomy::UBERON:0000042 3
Anatomy::UBERON:0000043 3
Anatomy::UBERON:0000045 3
Anatomy::UBERON:0000053 3
Anatomy::UBERON:0000054 3
Anatomy::UBERON:0000056 3
Anatomy::UBERON:0000057 3
Anatomy::UBERON:0000165 3
Anatomy::UBERON:0000178 3
Anatomy::UBERON:0000211 3
Anatomy::UBERON:0000473 3
Anatomy::UBERON:0000474 3
Anatomy::UBERON:0000482 3
Anatomy::UBERON:0000483 3
Anatomy::UBERON:0000922 3
Anatomy::UBERON:0000941 3
Anatomy::UBERON:0000946 3
Anatomy::UBERON:0000948 3
Anatomy::UBERON:0000949 3
Anatomy::UBERON:0000955 3
Anatomy::UBERON:0000959 3
Anatomy::UBERON:0000964 3
Anatomy::UBERON:0000966 3
Anatomy::UBERON:0000970 3
Anatomy::UBERON:0000974 3
Anatomy::UBERON:0000975 3
Anatomy::UBE

In [66]:
# Print every extracted Hetionet identifier, one per line.
# The guard keeps an empty list from emitting a lone blank line.
if het_net_ids:
    print(*het_net_ids, sep="\n")

0000002
0000004
0000006
0000007
0000010
0000011
0000013
0000020
0000026
0000029
0000033
0000038
0000042
0000043
0000045
0000053
0000054
0000056
0000057
0000165
0000178
0000211
0000473
0000474
0000482
0000483
0000922
0000941
0000946
0000948
0000949
0000955
0000959
0000964
0000966
0000970
0000974
0000975
0000977
0000978
0000982
0000985
0000988
0000989
0000991
0000992
0000993
0000995
0000996
0000997
0000998
0000999
0001000
0001003
0001004
0001007
0001008
0001011
0001013
0001016
0001017
0001021
0001031
0001037
0001044
0001064
0001067
0001070
0001072
0001088
0001089
0001090
0001093
0001103
0001105
0001111
0001130
0001132
0001135
0001140
0001143
0001148
0001174
0001175
0001182
0001183
0001184
0001193
0001194
0001224
0001225
0001228
0001231
0001232
0001235
0001236
0001237
0001264
0001267
0001283
0001285
0001288
0001295
0001296
0001300
0001305
0001310
0001322
0001323
0001324
0001343
0001348
0001352
0001361
0001365
0001394
0001398
0001404
0001406
0001437
0001456
0001460
0001461
0001463
0001464


In [21]:
# Count and print every id in `unique_nodes_ids` that does not occur among the
# extracted Hetionet ids.
# NOTE(review): `unique_nodes_ids` is not defined in any cell visible here;
# presumably it holds the de-duplicated, prefix-stripped DrugMechDB node ids.
# Confirm it is created before this cell on a fresh Restart & Run All.

# Membership tests against a list rescan it on every lookup; build a set once
# so each check is a hash lookup. The printed ids and their order are unchanged.
het_net_id_lookup = set(het_net_ids)

cnt = 0
for u in unique_nodes_ids:
    if u not in het_net_id_lookup:
        cnt += 1
        print(u)

IPR013673
D000069446
D002217
D001424
R-HSA-556833
C017367
D019819
R-HSA-416476﻿
D013927
0012531
R-HSA-2022377
D000067759
P22303
D014661
D062326
16412
D007611
P04275
O00329
P17181
P27958
D015352
D004827
C047340
D000505
C007852
R-HSA-5357956
P0A7J6
P35367
R-HSA-418597
DB06794
D012080
P17252
Q7ZJM1
D012127
D000073605
D005984
P0AD63
Q12809
D020156
D011736
P35968
D001943
C016986
D001369
C016163
C570240
P01275
D003981
D006952
D010612
Q01726
D003023
D001987
D016923
0097746
D009855
P00519
C015715
C487081
Q8TDS4
P05023
IPR000286
Q16236
P23560
D000690
D008694
Q06187
D000172
C471992
P27487
P22748
D013832
D001066
Q13639
P15428
0000092
D019386
C486464
DB06720
0004363
C509700
P13843
D009543
P16234
D019821
C512204
P0A0Z5
Q14524
C587014
D009181
C033781
D002752
D007635
IPR000837
R-HSA-2454202
D006069
D003555
D004381
P12931
IPR033907
0035635
D014801
P28335
0099155
0046911
C551803
D020734
0035623
D009355
D000068759
P08546
D016411
D002062
D014693
D003874
IPR001696
0098719
0030421
P48169
P51649
D013207
C02

In [22]:
# Number of ids from unique_nodes_ids that were not found in het_net_ids.
cnt

1749

In [23]:
# Total number of ids in unique_nodes_ids, for comparison with the unmatched count.
len(unique_nodes_ids)

2266

In [24]:
# Print every extracted Hetionet identifier, one per line.
# The guard keeps an empty list from emitting a lone blank line.
if het_net_ids:
    print(*het_net_ids, sep="\n")

0000002
0000004
0000006
0000007
0000010
0000011
0000013
0000020
0000026
0000029
0000033
0000038
0000042
0000043
0000045
0000053
0000054
0000056
0000057
0000165
0000178
0000211
0000473
0000474
0000482
0000483
0000922
0000941
0000946
0000948
0000949
0000955
0000959
0000964
0000966
0000970
0000974
0000975
0000977
0000978
0000982
0000985
0000988
0000989
0000991
0000992
0000993
0000995
0000996
0000997
0000998
0000999
0001000
0001003
0001004
0001007
0001008
0001011
0001013
0001016
0001017
0001021
0001031
0001037
0001044
0001064
0001067
0001070
0001072
0001088
0001089
0001090
0001093
0001103
0001105
0001111
0001130
0001132
0001135
0001140
0001143
0001148
0001174
0001175
0001182
0001183
0001184
0001193
0001194
0001224
0001225
0001228
0001231
0001232
0001235
0001236
0001237
0001264
0001267
0001283
0001285
0001288
0001295
0001296
0001300
0001305
0001310
0001322
0001323
0001324
0001343
0001348
0001352
0001361
0001365
0001394
0001398
0001404
0001406
0001437
0001456
0001460
0001461
0001463
0001464
