# DRKG - Knowledge Graph Embedding

## Install dependencies

In [None]:
%%bash
# We assume you are using a conda environment
# torchvision==0.14.0 & torch==1.13 only work with CUDA 11.6 or CUDA 11.7 and python==3.10
#
# -y accepts the confirmation prompt: a %%bash cell runs in a subprocess with no
# terminal attached, so nobody can answer it interactively.
#
# NOTE: creating the environment here does not activate it for the notebook kernel.
# Run Jupyter from (or register a kernel for) the `biomedgps` environment before
# executing the install cells that follow.
mamba create -y -n biomedgps python==3.10

In [None]:
%%bash
# Prerequisite: CUDA 11.6 or CUDA 11.7 is already installed.
# Version chain: dglke is compatible with dgl==0.9.0, and dgl==0.9.0 only works with torch==1.13.
pip3 install \
    torch==1.13 \
    torchvision==0.14.0

In [None]:
%%bash
# Install DGL-KE from the `python` subdirectory of its GitHub repository; only if that
# succeeds, install ogb and the pinned dgl release.
pip install "git+https://github.com/awslabs/dgl-ke.git#subdirectory=python" \
    && pip install ogb dgl==0.9.0

## Prepare Data

### [Option1] Download the DRKG data from the official DGL data bucket (S3)

In [None]:
%%bash
# Download the DRKG archive and unpack it into DATA_DIR.
# Stop at the first failing command so tar never runs on a failed download.
set -euo pipefail

# DATA_DIR is used as-is when it is already exported in the kernel's environment.
# If it were unset it would expand to an empty string and the archive would be written
# to /drkg.tar.gz, so fall back to the directory the next cell reads drkg.tsv from.
# NOTE(review): the fallback assumes the archive holds drkg.tsv at its top level -- confirm.
DATA_DIR="${DATA_DIR:-drkg/data}"
mkdir -p "${DATA_DIR}"

wget https://s3.us-west-2.amazonaws.com/dgl-data/dataset/DRKG/drkg.tar.gz -O "${DATA_DIR}/drkg.tar.gz"
tar -xvzf "${DATA_DIR}/drkg.tar.gz" -C "${DATA_DIR}"

In [None]:
import pandas as pd
import numpy as np

# Path of the DRKG edge list: a tab-separated file without a header row.
drkg_file = "drkg/data/drkg.tsv"

# Load the file and turn every row into a plain Python list.
df = pd.read_table(drkg_file, header=None)
triples = df.values.tolist()

# Number of rows read; shown as the cell output.
num_triples = len(triples)
num_triples

### [Option2] Get DRKG + HSDN data from the [biomedgps-data repo](https://github.com/yjcyxky/biomedgps-data)

In [14]:
# Unzip the raw DRKG / HSDN archives (when present) into the data folder

import os
import shutil

# Working layout: <current directory>/drkg-hsdn/{data,models}
root_dir = os.path.join(os.getcwd(), 'drkg-hsdn')
data_dir = os.path.join(root_dir, 'data')
models_dir = os.path.join(root_dir, 'models')
for _folder in (data_dir, models_dir):
    os.makedirs(_folder, exist_ok=True)

raw_drkg_zipfile = os.path.join(data_dir, "raw_drkg.tsv.zip")
raw_hs_zipfile = os.path.join(data_dir, "raw_hsdn.tsv.zip")

# Extract DRKG first, then HSDN; an archive that does not exist is skipped silently.
for _archive in (raw_drkg_zipfile, raw_hs_zipfile):
    if os.path.exists(_archive):
        shutil.unpack_archive(_archive, data_dir)

In [15]:
# Download the DRKG + HSDN raw data
# Skip it, we assume the data files are already in the data folder

# Merge the DRKG + HSDN raw data
import pandas as pd
import os

# Read the identifier columns as text. With pandas' type inference a numeric-looking ID
# can be altered (leading zeros dropped, or integers turned into floats when a value is
# missing) before the next cell joins it into an entity name.
_id_dtypes = {'source_id': str, 'target_id': str}

drkg_data = pd.read_csv(os.path.join(data_dir, 'raw_drkg.tsv'), sep='\t', dtype=_id_dtypes)
print("DRKG data shape: ", drkg_data.shape)
hsdn_data = pd.read_csv(os.path.join(data_dir, 'raw_hsdn.tsv'), sep='\t', dtype=_id_dtypes)
print("HSDN data shape: ", hsdn_data.shape)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it, every row
# label of the HSDN frame repeats a label already used by the DRKG frame.
relations = pd.concat([drkg_data, hsdn_data], ignore_index=True)

# Save the merged data (the row index is not written)
relations.to_csv(os.path.join(data_dir, 'relations.tsv'), sep='\t', index=False)

DRKG data shape:  (5874261, 8)
HSDN data shape:  (140595, 8)


In [16]:
# Entity names have the form "<type>::<id>"; build them for both ends of each relation.
_head_names = relations['source_type'] + '::' + relations['source_id'].astype(str)
_tail_names = relations['target_type'] + '::' + relations['target_id'].astype(str)

# Fill the frame directly in head / relation / tail column order.
df = pd.DataFrame()
df['source_id'] = _head_names
df['relation_type'] = relations['relation_type']
df['target_id'] = _tail_names

# Write the triples without a header row and without the row index
df.to_csv(os.path.join(data_dir, 'relations_hrt.tsv'), sep='\t', index=False, header=False)

# Keep the triples as plain Python lists for the split step; show how many there are.
triples = df.values.tolist()
num_triples = len(triples)
num_triples

6014856

### Split data into train/valid/test

In [17]:
import os
import numpy as np

# Split configuration. A fixed seed makes the train/valid/test split identical on every run.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9
VALID_FRACTION = 0.05  # the test set receives every remaining triple


def _write_triples(path, indices):
    """Write the selected triples to ``path``, one tab-separated line per triple.

    Args:
        path: Output file; it is created or overwritten.
        indices: Positions into the notebook-level ``triples`` list. For each position
            the first three items of that triple are written, in the given order.
    """
    with open(path, 'w') as f:
        f.writelines(
            "{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2])
            for idx in indices
        )


# `seed` holds the shuffled row positions of `triples` (it is not an RNG seed; the name is
# kept so that any other cell referring to it keeps working).
seed = np.random.default_rng(SPLIT_SEED).permutation(num_triples)

train_cnt = int(num_triples * TRAIN_FRACTION)
valid_cnt = int(num_triples * VALID_FRACTION)
train_set = seed[:train_cnt].tolist()
valid_set = seed[train_cnt:train_cnt+valid_cnt].tolist()
test_set = seed[train_cnt+valid_cnt:].tolist()

# The output directory is created here if it does not exist yet.
train_dir = os.path.join(root_dir, "data/train")
train_datafile = os.path.join(train_dir, "train.tsv")
valid_datafile = os.path.join(train_dir, "valid.tsv")
test_datafile = os.path.join(train_dir, "test.tsv")
os.makedirs(train_dir, exist_ok=True)

_write_triples(train_datafile, train_set)
_write_triples(valid_datafile, valid_set)
_write_triples(test_datafile, test_set)

print(len(train_set), len(valid_set), len(test_set))

5413370 300742 300744


## Train models with different hyperparameters

In [9]:
%%bash
# Train a TransE_l2 model with dglke_train on the train/valid/test files under
# ${DATASET_NAME}/data/train, then evaluate on the test set (--test).
# The result is saved under ./${DATASET_NAME}/models.
export DATASET_NAME=drkg-hsdn
DGLBACKEND=pytorch dglke_train \
    --dataset ${DATASET_NAME} \
    --data_path ${DATASET_NAME}/data/train \
    --data_files train.tsv valid.tsv test.tsv \
    --format 'raw_udd_hrt' \
    --model_name TransE_l2 \
    --batch_size 2048 \
    --neg_sample_size 256 \
    --hidden_dim 400 \
    --gamma 12.0 \
    --lr 0.1 \
    --max_step 100000 \
    --log_interval 1000 \
    --batch_size_eval 16 \
    -adv \
    --regularization_coef 1.00E-07 \
    --test \
    --gpu 0 \
    --num_proc 7 \
    --neg_sample_size_eval 10000 \
    --async_update \
    --mix_cpu_gpu \
    --save_path ./${DATASET_NAME}/models

Reading train triples....
Finished. Read 5413370 train triples.
Reading valid triples....
Finished. Read 300742 valid triples.
Reading test triples....
Finished. Read 300744 test triples.




|Train|: 5413370
random partition 5413370 edges into 7 parts
part 0 has 773339 edges
part 1 has 773339 edges
part 2 has 773339 edges
part 3 has 773339 edges
part 4 has 773339 edges
part 5 has 773339 edges
part 6 has 773336 edges
|valid|: 300742
|test|: 300744
Total initialize time 18.423 seconds
[proc 2][Train](1000/100000) average pos_loss: 0.351940309730946[proc 6][Train](1000/100000) average pos_loss: 0.3528962600387713

[proc 0][Train](1000/100000) average pos_loss: 0.3477012766843609[proc 4][Train](1000/100000) average pos_loss: 0.35124006860882945[proc 3][Train](1000/100000) average pos_loss: 0.35531988380077745


[proc 5][Train](1000/100000) average pos_loss: 0.3463028523172716
[proc 2][Train](1000/100000) average neg_loss: 0.6045996388494969
[proc 1][Train](1000/100000) average pos_loss: 0.3492997063137045
[proc 6][Train](1000/100000) average neg_loss: 0.5965299041569233
[proc 3][Train](1000/100000) average neg_loss: 0.6316645026803017
[proc 4][Train](1000/100000) average neg_l