In [1]:
import argparse

import torch
from torch.utils.data import DataLoader

from kg_model import KG_model

from ogb.linkproppred import Evaluator, PygLinkPropPredDataset

from pykeen.evaluation import RankBasedEvaluator
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = PygLinkPropPredDataset(name='ogbl-ddi')
data = dataset[0]
data

Data(num_nodes=4267, edge_index=[2, 2135822])

In [12]:
data.edge_index

tensor([[4039, 2424, 4039,  ...,  338,  835, 3554],
        [2424, 4039,  225,  ...,  708, 3554,  835]])

In [4]:
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
train_edge

{'edge': tensor([[4039, 2424],
         [4039,  225],
         [4039, 3901],
         ...,
         [ 647,  708],
         [ 708,  338],
         [ 835, 3554]])}

In [45]:
def convert_to_triples_factory(data):
    tf_data = TriplesFactory.from_labeled_triples(
        data[["head", "relation", "tail"]].values,
        create_inverse_triples=True,
        entity_to_id=None,
        relation_to_id=None,
        compact_id=False 
    )

    print(tf_data.mapped_triples)

    return tf_data

In [46]:
import pandas as pd

# add relation type - interacts with


train = train_edge['edge']
train = torch.tensor([[x[0], 0, x[1]] for x in train])
train_df = pd.DataFrame(train, columns=['head', 'relation', 'tail']).astype(str)

valid = valid_edge['edge']
valid = torch.tensor([[x[0], 0, x[1]] for x in valid])
valid_df = pd.DataFrame(valid, columns=['head', 'relation', 'tail']).astype(str)

test = test_edge['edge']
test = torch.tensor([[x[0], 0, x[1]] for x in test])
test_df = pd.DataFrame(test, columns=['head', 'relation', 'tail']).astype(str)

train_tf = convert_to_triples_factory(train_df)
valid_tf = convert_to_triples_factory(valid_df)
test_tf = convert_to_triples_factory(test_df)

tensor([[   0,    0,  667],
        [   0,    0, 1182],
        [   0,    0, 1280],
        ...,
        [4266,    0, 4250],
        [4266,    0, 4252],
        [4266,    0, 4260]])
tensor([[   0,    0,  729],
        [   1,    0,  681],
        [   1,    0,  768],
        ...,
        [3812,    0, 3722],
        [3812,    0, 3758],
        [3812,    0, 3802]])
tensor([[   0,    0,    3],
        [   0,    0,  185],
        [   0,    0,  187],
        ...,
        [1611,    0, 1562],
        [1611,    0, 1573],
        [1611,    0, 1601]])


In [47]:
train_tf

TriplesFactory(num_entities=4267, num_relations=2, create_inverse_triples=True, num_triples=1067911)

In [48]:
model = KG_model('TransE', train_tf, valid_tf, test_tf, 'ogb')
model.set_params(5, 'Adam', RankBasedEvaluator, 'gpu')
print('Training...')
model.train()
print('Training done')

No random seed is specified. Setting to 2584837398.


Training...


INFO:pykeen.training.training_loop:=> no checkpoint found at 'kg_checkpoints/TransE-ogb_checkpoint.pt'. Creating a new file.
INFO:pykeen.triples.triples_factory:Creating inverse triples.
Training epochs on cuda:0:   0%|                       | 0/5 [00:00<?, ?epoch/s]INFO:pykeen.triples.triples_factory:Creating inverse triples.

Training batches on cuda:0:   0%|                   | 0/8344 [00:00<?, ?batch/s][A
Training batches on cuda:0:   0%|          | 10/8344 [00:00<01:24, 98.75batch/s][A
Training batches on cuda:0:   1%|         | 52/8344 [00:00<00:29, 285.71batch/s][A
Training batches on cuda:0:   1%|        | 101/8344 [00:00<00:21, 375.91batch/s][A
Training batches on cuda:0:   2%|▏       | 153/8344 [00:00<00:19, 429.80batch/s][A
Training batches on cuda:0:   2%|▏       | 200/8344 [00:00<00:18, 443.18batch/s][A
Training batches on cuda:0:   3%|▏       | 249/8344 [00:00<00:17, 456.93batch/s][A
Training batches on cuda:0:   4%|▎       | 303/8344 [00:00<00:16, 481.71batch/s][

Training batches on cuda:0:  61%|████▏  | 5050/8344 [00:09<00:06, 538.00batch/s][A
Training batches on cuda:0:  61%|████▎  | 5104/8344 [00:09<00:06, 537.18batch/s][A
Training batches on cuda:0:  62%|████▎  | 5158/8344 [00:09<00:05, 534.14batch/s][A
Training batches on cuda:0:  62%|████▎  | 5212/8344 [00:09<00:05, 526.56batch/s][A
Training batches on cuda:0:  63%|████▍  | 5266/8344 [00:09<00:05, 528.75batch/s][A
Training batches on cuda:0:  64%|████▍  | 5320/8344 [00:09<00:05, 531.79batch/s][A
Training batches on cuda:0:  64%|████▌  | 5374/8344 [00:10<00:05, 533.88batch/s][A
Training batches on cuda:0:  65%|████▌  | 5428/8344 [00:10<00:05, 535.04batch/s][A
Training batches on cuda:0:  66%|████▌  | 5483/8344 [00:10<00:05, 536.81batch/s][A
Training batches on cuda:0:  66%|████▋  | 5537/8344 [00:10<00:05, 536.99batch/s][A
Training batches on cuda:0:  67%|████▋  | 5591/8344 [00:10<00:05, 536.79batch/s][A
Training batches on cuda:0:  68%|████▋  | 5645/8344 [00:10<00:05, 536.68batc

Training batches on cuda:0:  23%|█▌     | 1921/8344 [00:03<00:12, 533.31batch/s][A
Training batches on cuda:0:  24%|█▋     | 1975/8344 [00:03<00:11, 533.15batch/s][A
Training batches on cuda:0:  24%|█▋     | 2029/8344 [00:03<00:11, 532.92batch/s][A
Training batches on cuda:0:  25%|█▋     | 2083/8344 [00:03<00:11, 533.25batch/s][A
Training batches on cuda:0:  26%|█▊     | 2137/8344 [00:04<00:11, 533.93batch/s][A
Training batches on cuda:0:  26%|█▊     | 2191/8344 [00:04<00:11, 534.21batch/s][A
Training batches on cuda:0:  27%|█▉     | 2245/8344 [00:04<00:11, 534.41batch/s][A
Training batches on cuda:0:  28%|█▉     | 2301/8344 [00:04<00:11, 540.22batch/s][A
Training batches on cuda:0:  28%|█▉     | 2357/8344 [00:04<00:10, 544.35batch/s][A
Training batches on cuda:0:  29%|██     | 2413/8344 [00:04<00:10, 547.20batch/s][A
Training batches on cuda:0:  30%|██     | 2468/8344 [00:04<00:10, 546.17batch/s][A
Training batches on cuda:0:  30%|██     | 2523/8344 [00:04<00:10, 544.13batc

Training batches on cuda:0:  88%|██████▏| 7308/8344 [00:13<00:01, 543.87batch/s][A
Training batches on cuda:0:  88%|██████▏| 7363/8344 [00:13<00:01, 535.66batch/s][A
Training batches on cuda:0:  89%|██████▏| 7417/8344 [00:13<00:01, 534.85batch/s][A
Training batches on cuda:0:  90%|██████▎| 7471/8344 [00:13<00:01, 533.93batch/s][A
Training batches on cuda:0:  90%|██████▎| 7525/8344 [00:13<00:01, 532.94batch/s][A
Training batches on cuda:0:  91%|██████▎| 7580/8344 [00:14<00:01, 537.03batch/s][A
Training batches on cuda:0:  91%|██████▍| 7634/8344 [00:14<00:01, 524.15batch/s][A
Training batches on cuda:0:  92%|██████▍| 7689/8344 [00:14<00:01, 529.47batch/s][A
Training batches on cuda:0:  93%|██████▍| 7744/8344 [00:14<00:01, 534.57batch/s][A
Training batches on cuda:0:  93%|██████▌| 7799/8344 [00:14<00:01, 536.76batch/s][A
Training batches on cuda:0:  94%|██████▌| 7854/8344 [00:14<00:00, 539.32batch/s][A
Training batches on cuda:0:  95%|██████▋| 7909/8344 [00:14<00:00, 541.80batc

Training batches on cuda:0:  50%|███▍   | 4140/8344 [00:07<00:07, 551.40batch/s][A
Training batches on cuda:0:  50%|███▌   | 4196/8344 [00:07<00:07, 545.45batch/s][A
Training batches on cuda:0:  51%|███▌   | 4251/8344 [00:07<00:07, 542.11batch/s][A
Training batches on cuda:0:  52%|███▌   | 4306/8344 [00:08<00:07, 539.34batch/s][A
Training batches on cuda:0:  52%|███▋   | 4360/8344 [00:08<00:07, 537.54batch/s][A
Training batches on cuda:0:  53%|███▋   | 4414/8344 [00:08<00:07, 535.78batch/s][A
Training batches on cuda:0:  54%|███▋   | 4468/8344 [00:08<00:07, 536.14batch/s][A
Training batches on cuda:0:  54%|███▊   | 4524/8344 [00:08<00:07, 540.35batch/s][A
Training batches on cuda:0:  55%|███▊   | 4580/8344 [00:08<00:06, 543.72batch/s][A
Training batches on cuda:0:  56%|███▉   | 4636/8344 [00:08<00:06, 546.14batch/s][A
Training batches on cuda:0:  56%|███▉   | 4692/8344 [00:08<00:06, 547.75batch/s][A
Training batches on cuda:0:  57%|███▉   | 4748/8344 [00:08<00:06, 548.95batc

Training batches on cuda:0:  12%|▉       | 971/8344 [00:01<00:13, 536.32batch/s][A
Training batches on cuda:0:  12%|▊      | 1027/8344 [00:02<00:13, 540.68batch/s][A
Training batches on cuda:0:  13%|▉      | 1082/8344 [00:02<00:13, 536.54batch/s][A
Training batches on cuda:0:  14%|▉      | 1137/8344 [00:02<00:13, 537.63batch/s][A
Training batches on cuda:0:  14%|█      | 1192/8344 [00:02<00:13, 538.50batch/s][A
Training batches on cuda:0:  15%|█      | 1247/8344 [00:02<00:13, 539.18batch/s][A
Training batches on cuda:0:  16%|█      | 1301/8344 [00:02<00:13, 538.66batch/s][A
Training batches on cuda:0:  16%|█▏     | 1355/8344 [00:02<00:13, 535.00batch/s][A
Training batches on cuda:0:  17%|█▏     | 1409/8344 [00:02<00:13, 532.82batch/s][A
Training batches on cuda:0:  18%|█▏     | 1463/8344 [00:02<00:12, 530.77batch/s][A
Training batches on cuda:0:  18%|█▎     | 1517/8344 [00:02<00:12, 529.56batch/s][A
Training batches on cuda:0:  19%|█▎     | 1570/8344 [00:03<00:12, 529.66batc

Training batches on cuda:0:  74%|█████▏ | 6193/8344 [00:11<00:04, 523.38batch/s][A
Training batches on cuda:0:  75%|█████▏ | 6246/8344 [00:11<00:04, 519.81batch/s][A
Training batches on cuda:0:  75%|█████▎ | 6298/8344 [00:11<00:03, 518.44batch/s][A
Training batches on cuda:0:  76%|█████▎ | 6350/8344 [00:12<00:03, 518.37batch/s][A
Training batches on cuda:0:  77%|█████▎ | 6404/8344 [00:12<00:03, 523.19batch/s][A
Training batches on cuda:0:  77%|█████▍ | 6457/8344 [00:12<00:03, 518.92batch/s][A
Training batches on cuda:0:  78%|█████▍ | 6509/8344 [00:12<00:03, 514.25batch/s][A
Training batches on cuda:0:  79%|█████▌ | 6561/8344 [00:12<00:03, 511.06batch/s][A
Training batches on cuda:0:  79%|█████▌ | 6613/8344 [00:12<00:03, 508.70batch/s][A
Training batches on cuda:0:  80%|█████▌ | 6664/8344 [00:12<00:03, 506.95batch/s][A
Training batches on cuda:0:  80%|█████▋ | 6715/8344 [00:12<00:03, 493.58batch/s][A
Training batches on cuda:0:  81%|█████▋ | 6765/8344 [00:12<00:03, 488.45batc

Training batches on cuda:0:  35%|██▍    | 2900/8344 [00:05<00:10, 528.64batch/s][A
Training batches on cuda:0:  35%|██▍    | 2953/8344 [00:05<00:10, 527.51batch/s][A
Training batches on cuda:0:  36%|██▌    | 3006/8344 [00:05<00:10, 524.66batch/s][A
Training batches on cuda:0:  37%|██▌    | 3059/8344 [00:05<00:10, 523.78batch/s][A
Training batches on cuda:0:  37%|██▌    | 3112/8344 [00:05<00:09, 523.26batch/s][A
Training batches on cuda:0:  38%|██▋    | 3165/8344 [00:06<00:09, 519.45batch/s][A
Training batches on cuda:0:  39%|██▋    | 3217/8344 [00:06<00:09, 519.18batch/s][A
Training batches on cuda:0:  39%|██▋    | 3270/8344 [00:06<00:09, 520.14batch/s][A
Training batches on cuda:0:  40%|██▊    | 3323/8344 [00:06<00:09, 520.65batch/s][A
Training batches on cuda:0:  40%|██▊    | 3376/8344 [00:06<00:09, 521.43batch/s][A
Training batches on cuda:0:  41%|██▉    | 3429/8344 [00:06<00:09, 521.76batch/s][A
Training batches on cuda:0:  42%|██▉    | 3482/8344 [00:06<00:09, 521.93batc

Training batches on cuda:0:  96%|██████▋| 8045/8344 [00:15<00:00, 520.84batch/s][A
Training batches on cuda:0:  97%|██████▊| 8098/8344 [00:15<00:00, 520.89batch/s][A
Training batches on cuda:0:  98%|██████▊| 8151/8344 [00:15<00:00, 516.86batch/s][A
Training batches on cuda:0:  98%|██████▉| 8204/8344 [00:15<00:00, 518.03batch/s][A
Training batches on cuda:0:  99%|██████▉| 8257/8344 [00:15<00:00, 518.98batch/s][A
Training batches on cuda:0: 100%|██████▉| 8310/8344 [00:15<00:00, 519.72batch/s][A
Training epochs on cuda:0:  80%|▊| 4/5 [01:19<00:15, 15.98s/epoch, loss=0.229, p[AINFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 5.
Training epochs on cuda:0: 100%|█| 5/5 [01:19<00:00, 15.99s/epoch, loss=0.229, p
INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
Evaluating on cuda:0: 100%|█████████████| 133k/133k [00:41<00:00, 3.22ktriple/s]


Training done


In [52]:
from ogb.linkproppred import Evaluator

evaluator = Evaluator(name = 'ogbl-ddi')
# You can learn the input and output format specification of the evaluator as follows.
print(evaluator.expected_input_format) 
# print(evaluator.expected_output_format) 

==== Expected input format of Evaluator for ogbl-ddi
{'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg}
- y_pred_pos: numpy ndarray or torch tensor of shape (num_edges, ). Torch tensor on GPU is recommended for efficiency.
- y_pred_neg: numpy ndarray or torch tensor of shape (num_edges, ). Torch tensor on GPU is recommended for efficiency.
y_pred_pos is the predicted scores for positive edges.
y_pred_neg is the predicted scores for negative edges.
Note: As the evaluation metric is ranking-based, the predicted scores need to be different for different edges.


In [None]:
# pipeline_result = pipeline(
#     dataset='BioKG',
#     model='TransE',
#     epochs=5,
# )

In [50]:
pipeline_result.get_metric('MRR')

AttributeError: 'KG_model' object has no attribute 'result'

In [12]:
pipeline_result.metric_results.to_df()

Unnamed: 0,Side,Type,Metric,Value
0,head,optimistic,z_arithmetic_mean_rank,763.334352
1,tail,optimistic,z_arithmetic_mean_rank,774.100216
2,both,optimistic,z_arithmetic_mean_rank,1087.127262
3,head,realistic,z_arithmetic_mean_rank,763.334343
4,tail,realistic,z_arithmetic_mean_rank,774.10021
5,both,realistic,z_arithmetic_mean_rank,1087.127251
6,head,pessimistic,z_arithmetic_mean_rank,763.334327
7,tail,pessimistic,z_arithmetic_mean_rank,774.100202
8,both,pessimistic,z_arithmetic_mean_rank,1087.127234
9,head,optimistic,standard_deviation,6107.172591


In [3]:
import pandas as pd
import json

ddi = pd.read_csv("data/triplets/ddi.tsv", sep='\t', index_col=[0])
drug_ingredients = pd.read_csv("data/triplets/ingredients.tsv", sep='\t', index_col=[0])
food_compounds = pd.read_csv("data/triplets/food_compound.tsv", sep='\t', index_col=[0])

In [4]:
ddi.head()

Unnamed: 0,drug1,interaction,drug2
0,Apixaban,increase_anticoagulant_activities,Lepirudin
1,Dabigatran etexilate,increase_anticoagulant_activities,Lepirudin
2,Dasatinib,increase_bleeding,Lepirudin
3,Dasatinib,increase_hemorrhage,Lepirudin
4,Deferasirox,increase_gastrointestinal_bleeding,Lepirudin


In [5]:
compounds = set(food_compounds.compound)

food_compound_dict = dict()

for c in compounds:
    food_compound_dict[c] = list(food_compounds[food_compounds['compound'] == c].food)

food_compound_dict['Zinc'][:10]    

['Garden onion',
 'Mugwort',
 'Roman camomile',
 'Ceylon cinnamon',
 'Common hazelnut',
 'Wild carrot',
 'Cornmint',
 'Evening primrose',
 'Scarlet bean',
 'Cloudberry']

In [6]:
drug_compound_dict = dict()
drugs = set(drug_ingredients.drug_name)
interacting_drugs = set(ddi.drug1)
interacting_drugs.union(set(ddi.drug2))

for d in drugs:
    if d in interacting_drugs:
        drug_compound_dict[d] = list(drug_ingredients[drug_ingredients['drug_name'] == d].ingredient)

drug_compound_dict['Galantamine'][:10]  

['Galantamine']

In [6]:
with open('data/drug_compound_dict.json', 'w') as f:
    f.write(json.dumps(drug_compound_dict))

In [7]:
import itertools 

ingredients = list(drug_compound_dict.values())
ingredients = list(itertools.chain.from_iterable(ingredients))
ingredients = set([x.strip() for x in ingredients])
print('Number of interacting drugs ingredients:', len(ingredients))

print('Number of food compounds:', len(food_compound_dict.keys()))
ingredients.intersection(food_compound_dict.keys())

Number of interacting drugs ingredients: 758
Number of food compounds: 276


{'Adenosine',
 'Allantoin',
 'Ascorbic acid',
 'Atropine',
 'Caffeine',
 'Calcium',
 'Capsaicin',
 'Chromium',
 'Diazepam',
 'Dopamine',
 'Ethanol',
 'Formaldehyde',
 'Glycine',
 'Iron',
 'Melatonin',
 'Nicotinamide',
 'Nicotine',
 'Phenol',
 'Progesterone',
 'Pyridoxine',
 'Selenium',
 'Testosterone',
 'Theophylline',
 'Thiamine',
 'Vitamin D'}

In [19]:
from collections import Counter

def get_similar_food(drug_ingredients, food_compound_dict):
    sim_food = []

    for ingredient in drug_ingredients:
        # print(ingredient)
        foods = food_compound_dict.get(ingredient)
        if foods is not None:
            sim_food += foods    

    if len(sim_food) < 1:
        return []

    counts = Counter(sim_food)
    counts = dict(sorted(counts.items(), key=lambda item: item[1]))
    print(counts)
    return sim_food    

In [20]:
# Find a food with the most overlapping content with the interacting drug:

stop = 0
inspected_drugs = []

for row in ddi.itertuples():
    drug1 = row[1]
    effect = row[2]
    drug2 = row[3]

    if drug2 in inspected_drugs:
        continue

    inspected_drugs.append(drug2)
  
    drug2_ingredients = drug_compound_dict.get(drug2)
    
    if drug2_ingredients is not None:
        sim_food = get_similar_food(drug2_ingredients, food_compound_dict)

        
        

{'Savoy cabbage': 1, 'Kiwi': 1, 'Allium': 1, 'Garden onion': 1, 'Leek': 1, 'Garlic': 1, 'Chives': 1, 'Cashew nut': 1, 'Pineapple': 1, 'Dill': 1, 'Wild celery': 1, 'Peanut': 1, 'Burdock': 1, 'Horseradish': 1, 'Tarragon': 1, 'Asparagus': 1, 'Oat': 1, 'Star fruit': 1, 'Brazil nut': 1, 'Common beet': 1, 'Borage': 1, 'Swede': 1, 'Rape': 1, 'Common cabbage': 1, 'Cauliflower': 1, 'Brussel sprouts': 1, 'Broccoli': 1, 'Chinese cabbage': 1, 'Turnip': 1, 'Pigeon pea': 1, 'Tea': 1, 'Pepper': 1, 'Papaya': 1, 'Safflower': 1, 'Caraway': 1, 'Pecan nut': 1, 'Chestnut': 1, 'Chickpea': 1, 'Endive': 1, 'Chicory': 1, 'Chinese cinnamon': 1, 'Watermelon': 1, 'Lemon': 1, 'Mandarin orange (Clementine, Tangerine)': 1, 'Sweet orange': 1, 'Coffee': 1, 'Arabica coffee': 1, 'Robusta coffee': 1, 'Coriander': 1, 'Saffron': 1, 'Muskmelon': 1, 'Cucumber': 1, 'Cucurbita': 1, 'Wild carrot': 1, 'Japanese persimmon': 1, 'Loquat': 1, 'Common buckwheat': 1, 'Fig': 1, 'Fennel': 1, 'Strawberry': 1, 'Soy bean': 1, 'Sunflower': 