# DRKG - Knowledge Graph Embedding

## Install dependencies

In [None]:
%%bash
# Create the conda environment used by this notebook.
# torch==1.13 / torchvision==0.14.0 are built against CUDA 11.6 or CUDA 11.7; Python 3.10 is used here.
# -y answers the confirmation prompt, which a non-interactive %%bash cell has no way to answer.
mamba create -y -n biomedgps python==3.10

In [None]:
%%bash
# Install torch 1.13 with the matching torchvision release; use this when CUDA 11.6 or CUDA 11.7 is installed.
# The pin matters: dglke is compatible with dgl==0.9.0, and dgl==0.9.0 only works with torch==1.13.
pip3 install \
    torch==1.13 \
    torchvision==0.14.0

In [None]:
%%bash
# Install dgl-ke from the `python` subdirectory of its GitHub repository, then (only if that
# succeeds) install ogb together with the pinned dgl==0.9.0.
pip install "git+https://github.com/awslabs/dgl-ke.git#subdirectory=python" \
    && pip install ogb dgl==0.9.0

## Prepare Data

In [None]:
%%bash
# Download the DRKG archive and unpack it into drkg/data; also create drkg/models,
# which the training command later uses as its save path.
# Stop at the first failing command so a failed download is never handed to tar.
set -euo pipefail

export DATA_DIR=drkg/data
export MODEL_DIR=drkg/models
mkdir -p "${DATA_DIR}" "${MODEL_DIR}"

wget https://s3.us-west-2.amazonaws.com/dgl-data/dataset/DRKG/drkg.tar.gz -O "${DATA_DIR}/drkg.tar.gz"
tar -xvzf "${DATA_DIR}/drkg.tar.gz" -C "${DATA_DIR}"

In [None]:
import pandas as pd
import numpy as np

# Read the headerless, tab-separated DRKG file extracted by the download cell.
drkg_file = "drkg/data/drkg.tsv"
df = pd.read_csv(drkg_file, delimiter="\t", header=None)

# Keep the rows as a plain list of lists; later cells look triples up by row index.
triples = df.to_numpy().tolist()

In [None]:
# Number of rows loaded into `triples`; the bare expression below displays it as the cell output.
num_triples = len(triples)
num_triples

In [None]:
import os

# Fixed seed so the train/valid/test split is the same on every Restart & Run All.
SPLIT_SEED = 42
# Directory that receives the three split files; created below if it is missing.
SPLIT_DIR = "drkg/data/train"


def _write_split(path, rows, indices):
    """Write the rows selected by `indices` to `path`.

    Each selected row is written as one line holding its first three
    fields separated by tabs.

    Args:
        path: Destination file; overwritten if it already exists.
        rows: Sequence of rows, each with at least three fields.
        indices: Iterable of integer positions into `rows`.
    """
    with open(path, "w") as f:
        for idx in indices:
            row = rows[idx]
            f.write("{}\t{}\t{}\n".format(row[0], row[1], row[2]))


# `seed` holds a random permutation of the triple indices (original variable name kept).
# A local Generator is used so NumPy's global random state is left untouched.
seed = np.random.default_rng(SPLIT_SEED).permutation(num_triples)

# 90% train, 5% valid, and whatever remains (about 5%) as test.
train_cnt = int(num_triples * 0.9)
valid_cnt = int(num_triples * 0.05)
train_set = seed[:train_cnt].tolist()
valid_set = seed[train_cnt:train_cnt + valid_cnt].tolist()
test_set = seed[train_cnt + valid_cnt:].tolist()

# The three slices must cover every triple exactly once.
assert len(train_set) + len(valid_set) + len(test_set) == num_triples

os.makedirs(SPLIT_DIR, exist_ok=True)

_write_split(SPLIT_DIR + "/drkg_train.tsv", triples, train_set)
_write_split(SPLIT_DIR + "/drkg_valid.tsv", triples, valid_set)
_write_split(SPLIT_DIR + "/drkg_test.tsv", triples, test_set)

print(len(train_set), len(valid_set), len(test_set))

## Train models with different hyperparameters

In [13]:
# Train a TransE_l2 model with dglke_train on the three split files in ./drkg/data/train
# (passed as train / valid / test in `raw_udd_hrt` format) and save the result under ./drkg/models.
# Runs on GPU 0 with 7 processes (--gpu 0 --num_proc 7), --max_step 100000, logging every 1000 steps.
# NOTE(review): --test presumably triggers evaluation on the test file after training — confirm against the dglke_train docs.
!DGLBACKEND=pytorch dglke_train --dataset drkg --data_path ./drkg/data/train --data_files drkg_train.tsv drkg_valid.tsv drkg_test.tsv --format 'raw_udd_hrt' --model_name TransE_l2 --batch_size 2048 --neg_sample_size 256 --hidden_dim 400 --gamma 12.0 --lr 0.1 --max_step 100000 --log_interval 1000 --batch_size_eval 16 -adv --regularization_coef 1.00E-07 --test --gpu 0 --num_proc 7 --neg_sample_size_eval 10000 --async_update --mix_cpu_gpu --save_path ./drkg/models

Reading train triples....
Finished. Read 5286834 train triples.
Reading valid triples....
Finished. Read 293713 valid triples.
Reading test triples....
Finished. Read 293714 test triples.
|Train|: 5286834
random partition 5286834 edges into 7 parts
part 0 has 755262 edges
part 1 has 755262 edges
part 2 has 755262 edges
part 3 has 755262 edges
part 4 has 755262 edges
part 5 has 755262 edges
part 6 has 755262 edges
|valid|: 293713
|test|: 293714
Total initialize time 18.921 seconds
[proc 2][Train](1000/100000) average pos_loss: 0.5303053002283014
[proc 0][Train](1000/100000) average pos_loss: 0.5433412829106219
[proc 2][Train](1000/100000) average neg_loss: 0.6654514085352421
[proc 3][Train](1000/100000) average pos_loss: 0.5488146484308909[proc 6][Train](1000/100000) average pos_loss: 0.5437324741735174

[proc 0][Train](1000/100000) average neg_loss: 0.6356007323265076
[proc 2][Train](1000/100000) average loss: 0.5978783544003964
[proc 5][Train](1000/100000) average pos_loss: 0.53421221

### Model 1: TransE

### Model 2: TransR