# DRKG - Knowledge Graph Embedding
>
> biomedgps-hsdn-custom-malacards-ctd vs. biomedgps-hsdn-custom-malacards
>
> Based on the biomedgps-hsdn-custom-malacards- dataset, we add the following datasets: ctd.
>

## Install dependencies

In [1]:
%%bash
# We assume you are using a conda environment
# torchvision==0.14.0 & torch==1.13 only work with CUDA==1.16 or CUDA==1.17 and python==3.10
mamba create -n biomedgps python==3.10


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.15.3) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████

Remove existing environment (y/[n])? 



CondaSystemExit: Exiting.



In [None]:
%%bash
# If you have installed CUDA==1.16 or CUDA==1.17, you can use the following command to install torch==1.13
# dglke is compatible with dgl==0.9.0 and dgl==0.9.0 only works with torch==1.13
pip3 install torch==1.13 torchvision==0.14.0

In [None]:
%%bash
pip install git+https://github.com/awslabs/dgl-ke.git#subdirectory=python && pip install ogb dgl==0.9.0

## Prepare Data

### Get formatted + unformatted DRKG from the [biomedgps-data repo](https://github.com/yjcyxky/biomedgps-data)

In [1]:
# Unzip the file

import os, shutil

rootdir = os.path.join(os.getcwd())
datadir = os.path.join(rootdir, 'data')
models_dir = os.path.join(rootdir, 'models')
os.makedirs(datadir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

In [7]:
!cd ./data && unzip ./formatted_drkg.tsv.zip

Archive:  ./formatted_drkg.tsv.zip
caution: filename not matched:  -y
Archive:  ./unformatted_drkg.tsv.zip
caution: filename not matched:  -y


In [8]:
!cd ./data && unzip ./unformatted_drkg.tsv.zip

Archive:  ./unformatted_drkg.tsv.zip
replace unformatted_drkg.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [2]:
import pandas as pd
import os

selected_columns = ["relation_type", "source_type", "source_id", "target_type", "target_id", "resource"]

formatted_drkg_data = pd.read_csv(os.path.join(datadir, 'formatted_drkg.tsv'), sep='\t')
formatted_drkg_data = formatted_drkg_data[selected_columns]
print("Formatted DRKG data shape: ", formatted_drkg_data.shape)

unformatted_drkg_data = pd.read_csv(os.path.join(datadir, 'unformatted_drkg.tsv'), sep='\t')
unformatted_drkg_data = unformatted_drkg_data[selected_columns]
print("Unformatted DRKG data shape: ", unformatted_drkg_data.shape)

duplicated_drkg_data = pd.read_csv(os.path.join(datadir, 'duplicated_drkg.tsv'), sep='\t')
duplicated_drkg_data = duplicated_drkg_data[selected_columns]
print("Duplicated DRKG data shape: ", duplicated_drkg_data.shape)

custom_kg = pd.read_csv(os.path.join(datadir, 'custom_kg.tsv'), sep='\t')
custom_kg = custom_kg[selected_columns]
print("Custom KG data shape: ", custom_kg.shape)

malacards = pd.read_csv(os.path.join(datadir, 'malacards.tsv'), sep='\t')
malacards = malacards[selected_columns]
print("Malacards data shape: ", malacards.shape)

formatted_hsdn = pd.read_csv(os.path.join(datadir, 'formatted_hsdn.tsv'), sep='\t')
formatted_hsdn = formatted_hsdn[selected_columns]
print("Formatted HSDN data shape: ", formatted_hsdn.shape)

formatted_ctd = pd.read_csv(os.path.join(datadir, 'formatted_ctd.tsv'), sep='\t')
formatted_ctd = formatted_ctd[selected_columns]
print("Formatted CTD data shape: ", formatted_ctd.shape)

relations = pd.concat([formatted_drkg_data, unformatted_drkg_data, duplicated_drkg_data, custom_kg, malacards, formatted_hsdn, formatted_ctd])

# Save the merged data
relations.to_csv(os.path.join(datadir, 'relations.tsv'), sep='\t', index=False)

Formatted DRKG data shape:  (5678569, 6)
Unformatted DRKG data shape:  (194412, 6)
Duplicated DRKG data shape:  (2499, 6)
Custom KG data shape:  (602, 6)
Malacards data shape:  (201, 6)
Formatted HSDN data shape:  (130857, 6)


  formatted_ctd = pd.read_csv(os.path.join(datadir, 'formatted_ctd.tsv'), sep='\t')


Formatted CTD data shape:  (34133278, 6)


In [3]:
df = pd.DataFrame()
# Merge the source_type and source_id columns
df['merged_source_id'] = relations['source_type'] + '::' + relations['source_id'].astype(str)

# Merge the target_type and target_id columns
df['merged_target_id'] = relations['target_type'] + '::' + relations['target_id'].astype(str)

df['relation_type'] = relations['relation_type']

# Reorder the columns
df = df[['merged_source_id', 'relation_type', 'merged_target_id']]

# Remove the header
df.to_csv(os.path.join(datadir, 'relations_hrt.tsv'), sep='\t', index=False, header=False)

triples = df.values.tolist()
num_triples = len(triples)
num_triples

40140418

### Split data into train/valid/test

In [4]:
import os 
import numpy as np

# Please make sure the output directory exist.
seed = np.arange(num_triples)
np.random.shuffle(seed)

train_cnt = int(num_triples * 0.9)
valid_cnt = int(num_triples * 0.05)
train_set = seed[:train_cnt]
train_set = train_set.tolist()
valid_set = seed[train_cnt:train_cnt+valid_cnt].tolist()
test_set = seed[train_cnt+valid_cnt:].tolist()

train_dir = os.path.join(rootdir, "data/train")
train_datafile = os.path.join(train_dir, "train.tsv")
valid_datafile = os.path.join(train_dir, "valid.tsv")
test_datafile = os.path.join(train_dir, "test.tsv")
os.makedirs(train_dir, exist_ok=True)

with open(train_datafile, 'w+') as f:
    for idx in train_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))
        
with open(valid_datafile, 'w+') as f:
    for idx in valid_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))

with open(test_datafile, 'w+') as f:
    for idx in test_set:
        f.writelines("{}\t{}\t{}\n".format(triples[idx][0], triples[idx][1], triples[idx][2]))

print(len(train_set), len(valid_set), len(test_set))

36126376 2007020 2007022


## Train models with different hyperparameters

In [None]:
%%bash
# You must change the DATASET_NAME according to your situation
export INDEX=0
export MODEL_DIR=./models
mkdir -p ${MODEL_DIR}

DGLBACKEND=pytorch dglke_train --dataset biomedgps --data_path ./data/train --data_files train.tsv valid.tsv test.tsv --format 'raw_udd_hrt' --model_name TransE_l2 --batch_size 2048 --neg_sample_size 256 --hidden_dim 400 --gamma 12.0 --lr 0.1 --max_step 100000 --log_interval 1000 --batch_size_eval 16 -adv --regularization_coef 1.00E-07 --test --gpu 0 --num_proc 7 --neg_sample_size_eval 10000 --async_update --mix_cpu_gpu --save_path ./models 2>&1 | tee ${MODEL_DIR}/${DATASET_NAME}_${INDEX}_log.txt