# BioMedGPS - Knowledge Graph Embedding

## Install dependencies

In [None]:
%%bash
# We assume you are using a conda environment
# torchvision==0.14.0 & torch==1.13 only work with CUDA 11.6 or CUDA 11.7 and python==3.10
# -y answers the confirmation prompt up front; a %%bash cell has no interactive stdin to answer it.
mamba create -y -n biomedgps python==3.10

In [None]:
%%bash
# If you have installed CUDA 11.6 or CUDA 11.7, you can use the following command to install torch==1.13
# dglke is compatible with dgl==0.9.0 and dgl==0.9.0 only works with torch==1.13
pip3 install torch==1.13 torchvision==0.14.0

In [None]:
%%bash
# Install dglke from the python/ subdirectory of the awslabs/dgl-ke repository,
# then ogb and dgl pinned to 0.9.0 (the dgl version dglke is compatible with).
pip install git+https://github.com/awslabs/dgl-ke.git#subdirectory=python && pip install ogb dgl==0.9.0

## Prepare Data

In [1]:
import os

import pandas as pd

# The graph data is expected one level above the notebook's working directory.
rootdir = os.path.dirname(os.getcwd())

# Tab-separated relations table; the first row of the file is used as the header.
relations = pd.read_csv(
    os.path.join(rootdir, 'graph_data/relations.tsv'),
    sep='\t',
)

In [2]:
# Preview the first rows of the relations table to check its columns.
relations.head()

Unnamed: 0,raw_source_id,raw_target_id,raw_source_type,raw_target_type,relation_type,resource,pmids,key_sentence,source_id,source_type,target_id,target_type,source_target
0,ENTREZ:2159,CHEMBL:CHEMBL1201414,Gene,Compound,DGIDB::OTHER::Gene:Compound,DGIDB,,,ENTREZ:2159,Gene,DrugBank:DB06822,Compound,Gene:Compound
1,ENTREZ:462,CHEMBL:CHEMBL1201414,Gene,Compound,DGIDB::ACTIVATOR::Gene:Compound,DGIDB,,,ENTREZ:462,Gene,DrugBank:DB06822,Compound,Gene:Compound
2,ENTREZ:462,CHEMBL:CHEMBL1201414,Gene,Compound,DGIDB::OTHER::Gene:Compound,DGIDB,,,ENTREZ:462,Gene,DrugBank:DB06822,Compound,Gene:Compound
3,ENTREZ:2147,CHEMBL:CHEMBL1201662,Gene,Compound,DGIDB::INHIBITOR::Gene:Compound,DGIDB,,,ENTREZ:2147,Gene,DrugBank:DB11095,Compound,Gene:Compound
4,ENTREZ:5733,CHEMBL:CHEMBL1207745,Gene,Compound,DGIDB::AGONIST::Gene:Compound,DGIDB,,,ENTREZ:5733,Gene,DrugBank:DB05229,Compound,Gene:Compound


In [3]:
# Distinct entity types on each side of the relations: target types first, then source types.
relations["target_type"].unique(), relations["source_type"].unique()

(array(['Compound', 'Disease', 'Gene', 'BiologicalProcess',
        'MolecularFunction', 'CellularComponent', 'Symptom', 'SideEffect',
        'Pathway'], dtype=object),
 array(['Gene', 'Compound', 'Disease', 'PharmacologicClass',
        'BiologicalProcess', 'CellularComponent', 'Pathway',
        'MolecularFunction'], dtype=object))

In [5]:
os.makedirs(os.path.join(os.getcwd(), "biomedgps/data"), exist_ok=True)

# Build the head / relation / tail table in a single step, already in the
# column order expected on disk. Entity ids are prefixed with their type,
# e.g. "Gene::ENTREZ:2159".
df = pd.DataFrame({
    'source_id': relations['source_type'] + '::' + relations['source_id'].astype(str),
    'relation_type': relations['relation_type'],
    'target_id': relations['target_type'] + '::' + relations['target_id'].astype(str),
})

# Write the triples without a header row and without the index column.
df.to_csv(
    os.path.join(os.getcwd(), 'biomedgps/data/relations_hrt.tsv'),
    sep='\t',
    index=False,
    header=False,
)

In [6]:
# Reload the header-less triples file; its columns are therefore labelled 0, 1, 2.
df = pd.read_csv(
    os.path.join(os.getcwd(), 'biomedgps/data/relations_hrt.tsv'),
    sep='\t',
    header=None,
)

# Plain Python list of [head, relation, tail] rows for index-based access.
triples = df.to_numpy().tolist()
df.head()

Unnamed: 0,0,1,2
0,Gene::ENTREZ:2159,DGIDB::OTHER::Gene:Compound,Compound::DrugBank:DB06822
1,Gene::ENTREZ:462,DGIDB::ACTIVATOR::Gene:Compound,Compound::DrugBank:DB06822
2,Gene::ENTREZ:462,DGIDB::OTHER::Gene:Compound,Compound::DrugBank:DB06822
3,Gene::ENTREZ:2147,DGIDB::INHIBITOR::Gene:Compound,Compound::DrugBank:DB11095
4,Gene::ENTREZ:5733,DGIDB::AGONIST::Gene:Compound,Compound::DrugBank:DB05229


In [7]:
# Total number of triples; the train/valid/test split sizes are derived from it.
num_triples = len(triples)
num_triples

4592639

In [8]:
import os
import numpy as np

# Split configuration. The seed is fixed so that re-running the notebook
# reproduces exactly the same train/valid/test partition.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9
VALID_FRACTION = 0.05
SPLIT_DIR = "biomedgps/data/train"


def write_triples(path, rows, indices):
    """Write the selected triples to `path` as tab-separated head/relation/tail lines.

    Args:
        path: Destination file; it is created, or truncated if it already exists.
        rows: Sequence of triples, each indexable as [head, relation, tail].
        indices: Iterable of positions in `rows` to write, in output order.
    """
    with open(path, 'w') as f:
        # writelines receives one formatted line per triple; passing it a single
        # string instead would make it iterate over individual characters.
        f.writelines(
            "{}\t{}\t{}\n".format(rows[idx][0], rows[idx][1], rows[idx][2])
            for idx in indices
        )


# Shuffle the triple indices with a seeded generator.
shuffled_idx = np.random.default_rng(SPLIT_SEED).permutation(num_triples)

train_cnt = int(num_triples * TRAIN_FRACTION)
valid_cnt = int(num_triples * VALID_FRACTION)
train_set = shuffled_idx[:train_cnt].tolist()
valid_set = shuffled_idx[train_cnt:train_cnt + valid_cnt].tolist()
# The test set takes everything left over, so rounding in the two counts
# above never drops a triple.
test_set = shuffled_idx[train_cnt + valid_cnt:].tolist()

# The output directory is created here if it does not exist yet.
os.makedirs(SPLIT_DIR, exist_ok=True)

write_triples(os.path.join(SPLIT_DIR, "biomedgps_train.tsv"), triples, train_set)
write_triples(os.path.join(SPLIT_DIR, "biomedgps_valid.tsv"), triples, valid_set)
write_triples(os.path.join(SPLIT_DIR, "biomedgps_test.tsv"), triples, test_set)

print(len(train_set), len(valid_set), len(test_set))

4133375 229631 229633


## Train models with different hyperparameters

In [None]:
%%bash
# Train a TransE_l2 embedding on the biomedgps train/valid/test files with dglke.
# One option per line; the arguments are the same as in the single-line form.
DGLBACKEND=pytorch dglke_train \
    --dataset biomedgps \
    --model_name TransE_l2 \
    --batch_size 1024 \
    --neg_sample_size 256 \
    --hidden_dim 400 \
    --gamma 12 \
    --lr 0.1 \
    --max_step 100000 \
    --log_interval 100 \
    --batch_size_eval 16 \
    --neg_sample_size_eval 10000 \
    -adv \
    --regularization_coef 1.00E-09 \
    --data_path ./biomedgps/data/train \
    --data_files biomedgps_train.tsv biomedgps_valid.tsv biomedgps_test.tsv \
    --format raw_udd_hrt \
    --save_path ./biomedgps/models \
    --gpu 0 \
    --valid \
    --num_proc 10 \
    --num_thread 1 \
    --mix_cpu_gpu