## Hetionet 

In [1]:
from pykeen.datasets import Hetionet
dataset = Hetionet()
dataset

Hetionet(num_entities=45158, num_relations=24, create_inverse_triples=False)

In [2]:
train_hetionet = dataset.training.triples 
valid_hetionet = dataset.validation.triples
test_hetionet = dataset.testing.triples

Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.
Reconstructing all label-based triples. This is expensive and rarely needed.


In [3]:
import pandas as pd

train_df = pd.DataFrame(train_hetionet, columns=["head", "relation", "tail"])
valid_df = pd.DataFrame(valid_hetionet, columns=["head", "relation", "tail"])
test_df = pd.DataFrame(test_hetionet, columns=["head", "relation", "tail"])

hetionet_df = pd.concat([train_df, valid_df, test_df])
hetionet_df

Unnamed: 0,head,relation,tail
0,Anatomy::UBERON:0000002,AdG,Gene::10005
1,Anatomy::UBERON:0000002,AdG,Gene::114804
2,Anatomy::UBERON:0000002,AdG,Gene::118670
3,Anatomy::UBERON:0000002,AdG,Gene::128989
4,Anatomy::UBERON:0000002,AdG,Gene::132851
...,...,...,...
225015,Gene::2260,GpBP,Biological Process::GO:0002764
225016,Gene::2,Gr>G,Gene::55893
225017,Anatomy::UBERON:0001017,AeG,Gene::92595
225018,Gene::89797,GiG,Gene::134637


In [4]:
hetionet_df[hetionet_df['head'].str.match("Compound.*")]

Unnamed: 0,head,relation,tail
1126,Compound::DB00014,CbG,Gene::2798
1127,Compound::DB00014,CcSE,Side Effect::C0000737
1128,Compound::DB00014,CcSE,Side Effect::C0002170
1129,Compound::DB00014,CcSE,Side Effect::C0002418
1130,Compound::DB00014,CcSE,Side Effect::C0002453
...,...,...,...
224885,Compound::DB01149,CcSE,Side Effect::C0314719
224889,Compound::DB00601,CcSE,Side Effect::C0036572
224937,Compound::DB00472,CcSE,Side Effect::C0542351
224983,Compound::DB01083,CcSE,Side Effect::C0042029


In [4]:
hetionet_df["tail"] = hetionet_df["tail"].str.replace('Compound::','')
hetionet_df["head"] = hetionet_df["head"].str.replace('Compound::','')
hetionet_df


Unnamed: 0,head,relation,tail
0,Anatomy::UBERON:0000002,AdG,Gene::10005
1,Anatomy::UBERON:0000002,AdG,Gene::114804
2,Anatomy::UBERON:0000002,AdG,Gene::118670
3,Anatomy::UBERON:0000002,AdG,Gene::128989
4,Anatomy::UBERON:0000002,AdG,Gene::132851
...,...,...,...
225015,Gene::2260,GpBP,Biological Process::GO:0002764
225016,Gene::2,Gr>G,Gene::55893
225017,Anatomy::UBERON:0001017,AeG,Gene::92595
225018,Gene::89797,GiG,Gene::134637


In [22]:
hetionet_df[hetionet_df["head"].str.match("DB\d+")]

Unnamed: 0,head,relation,tail
1126,DB00014,CbG,Gene::2798
1127,DB00014,CcSE,Side Effect::C0000737
1128,DB00014,CcSE,Side Effect::C0002170
1129,DB00014,CcSE,Side Effect::C0002418
1130,DB00014,CcSE,Side Effect::C0002453
...,...,...,...
224885,DB01149,CcSE,Side Effect::C0314719
224889,DB00601,CcSE,Side Effect::C0036572
224937,DB00472,CcSE,Side Effect::C0542351
224983,DB01083,CcSE,Side Effect::C0042029


In [41]:
drugbank_dir = '../data/triplets/'
drugs_drugbank = pd.read_csv(drugbank_dir + 'drug_atc_codes.tsv', sep='\t', index_col=[0])
drugs_drugbank_ids = list(drugs_drugbank.id)
len(drugs_drugbank_ids)

4019

In [27]:
# select only drug - drug relations

drug_drug_relation_df_head = hetionet_df[hetionet_df["head"].str.match("DB\d+")]
drug_drug_relation_df = drug_drug_relation_df_head[drug_drug_relation_df_head["tail"].str.match("DB\d+")]
drug_drug_relation_df.head()

Unnamed: 0,head,relation,tail
1366,DB00014,CrC,DB00183
1765,DB00117,CrC,DB02959
3696,DB00250,CrC,DB01145
3720,DB00256,CrC,DB00453
3769,DB00267,CrC,DB00430


In [40]:
drugs_ids = list(drug_drug_relation_df['head']) + list(drug_drug_relation_df['tail']) 
drugs_ids = set(drugs_ids)
len()

1281

In [45]:
# drugs in both datasets - DrugBank and Hetionet
drugs_in_both_data = list(drugs_ids.intersection(set(drugs_drugbank_ids)))
print(len(drugs_in_both_data))

1088


In [60]:
# remove rows with drugs that are not in DrugBank
mask = drug_drug_relation_df.isin(drugs_in_both_data)
mask['relation'] = True
drug_drug_relation_df = drug_drug_relation_df.mask(~mask).dropna()
drug_drug_relation_df

Unnamed: 0,head,relation,tail
1366,DB00014,CrC,DB00183
3696,DB00250,CrC,DB01145
3769,DB00267,CrC,DB00430
4209,DB00301,CrC,DB00948
5473,DB00560,CrC,DB00595
...,...,...,...
221494,DB00931,CrC,DB01017
221791,DB00247,CrC,DB01200
222802,DB00670,CrC,DB00719
223149,DB00784,CrC,DB00712


In [100]:
# select drug - * relations

drug_head = drug_drug_relation_df_head
mask_head = drug_head['head'].isin(drugs_in_both_data)
drug_head = drug_head[mask_head]

drug_tail = hetionet_df[hetionet_df["tail"].str.match("DB\d+")]
mask_tail = drug_tail['tail'].isin(drugs_in_both_data)
drug_tail = drug_tail[mask_tail]

drug_other_relation_df = pd.concat([drug_head, drug_tail])
drug_other_relation_df = drug_other_relation_df.drop_duplicates()
drug_other_relation_df

Unnamed: 0,head,relation,tail
1126,DB00014,CbG,Gene::2798
1127,DB00014,CcSE,Side Effect::C0000737
1128,DB00014,CcSE,Side Effect::C0002170
1129,DB00014,CcSE,Side Effect::C0002418
1130,DB00014,CcSE,Side Effect::C0002453
...,...,...,...
217668,DB01321,CrC,DB00778
218846,DB00232,CrC,DB01144
222074,DB08974,CrC,DB00643
222880,Pharmacologic Class::N0000175729,PCiC,DB00201


In [104]:
# select * - * relations (both * must be in drug_other_relation_df)

other_head = list(drug_other_relation_df[~drug_other_relation_df["head"].str.match("DB\d+")]["head"])
other_tail = list(drug_other_relation_df[~drug_other_relation_df["tail"].str.match("DB\d+")]["tail"])
other_entities = list(set(other_head + other_tail))

mask_head = hetionet_df['head'].isin(other_entities)
other_head = hetionet_df[mask_head]

mask_tail = hetionet_df['tail'].isin(other_entities)
other_tail = hetionet_df[mask_tail]

other_other_relation_df = pd.concat([other_head, other_tail])
other_other_relation_df = other_other_relation_df.drop_duplicates()
other_other_relation_df

Unnamed: 0,head,relation,tail
8346,Disease::DOID:0050156,DaG,Gene::10144
8347,Disease::DOID:0050156,DdG,Gene::100129884
8348,Disease::DOID:0050156,DlA,Anatomy::UBERON:0001004
8349,Disease::DOID:0050156,DpS,Symptom::D000860
8350,Disease::DOID:0050156,DpS,Symptom::D003371
...,...,...,...
224994,Gene::7405,Gr>G,Gene::131408
225006,Gene::5577,Gr>G,Gene::4864
225009,Anatomy::UBERON:0000955,AdG,Gene::4998
225010,Anatomy::UBERON:0001017,AeG,Gene::10615


In [108]:
# concatenate everything

print('Size of the original Hetionet:', hetionet_df.shape[0])

hetionet_subgraph = pd.concat([drug_drug_relation_df, drug_other_relation_df, other_other_relation_df]) 
hetionet_subgraph = hetionet_subgraph.drop_duplicates()

print('Size of the Hetionet subgraph, based on drugs in DrugBank:', hetionet_subgraph.shape[0])
hetionet_subgraph.head()


Size of the original Hetionet: 2250197
Size of the Hetionet subgraph, based on drugs in DrugBank: 1221982


Unnamed: 0,head,relation,tail
1366,DB00014,CrC,DB00183
3696,DB00250,CrC,DB01145
3769,DB00267,CrC,DB00430
4209,DB00301,CrC,DB00948
5473,DB00560,CrC,DB00595


In [109]:
hetionet_subgraph.to_csv("../data/triplets/hetionet.tsv", sep="\t")