## Knowledge Graph
#### Drug-Food or Drug-Supplements interaction prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

from pykeen.models import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import TransE
from pykeen.evaluation import RankBasedEvaluator

from os import listdir

### Split data into train, validation and test sets

In [3]:
data_dir = 'data/triplets/'

In [4]:
ddi_df = pd.read_csv(data_dir + 'ddi.tsv', sep='\t', index_col=[0])
ddi_df.head()

Unnamed: 0,drug1,interaction,drug2
0,Apixaban,increase_anticoagulant_activities,Lepirudin
1,Dabigatran etexilate,increase_anticoagulant_activities,Lepirudin
2,Dasatinib,increase_bleeding,Lepirudin
3,Dasatinib,increase_hemorrhage,Lepirudin
4,Deferasirox,increase_gastrointestinal_bleeding,Lepirudin


In [5]:
print('total interactions:', ddi_df.shape[0])
print('unique interactions:', len(set(ddi_df.interaction)))

ddi_df.groupby(by=['interaction']).size()

total interactions: 3123450
unique interactions: 205


interaction
decrease_absorption                             576
decrease_adverse_effects                        384
decrease_anticholinergic_activities              26
decrease_anticoagulant_activities              5174
decrease_antihypertensive_activities          63420
                                              ...  
increase_visual_accommodation_disturbances        2
increase_vomiting                                54
increase_water_intoxication                     256
increase_weakness                              1710
increase_weight_gain                             16
Length: 205, dtype: int64

In [6]:
set(ddi_df.interaction)

{'decrease_absorption',
 'decrease_adverse_effects',
 'decrease_anticholinergic_activities',
 'decrease_anticoagulant_activities',
 'decrease_antihypertensive_activities',
 'decrease_antiplatelet_activities',
 'decrease_arrhythmogenic_activities',
 'decrease_bioavailability',
 'decrease_bronchodilatory_activities',
 'decrease_cardiotoxicity',
 'decrease_cytotoxicity',
 'decrease_diuretic_activities',
 'decrease_effectiveness',
 'decrease_excretion_rate',
 'decrease_fluid_retaining_activities',
 'decrease_hypertension',
 'decrease_hypoglycemia',
 'decrease_hypotension',
 'decrease_metabolism',
 'decrease_myopathy',
 'decrease_nephrotoxicity',
 'decrease_neuromuscular_blockade',
 'decrease_protein_binding',
 'decrease_qtc_prolongation',
 'decrease_rhabdomyolysis',
 'decrease_sedation',
 'decrease_seizure',
 'decrease_serum_concentration',
 'decrease_skeletal_muscle_relaxing_activities',
 'decrease_stimulatory_activities',
 'decrease_therapeutic_efficacy',
 'decrease_vasoconstricting_acti

In [7]:
ddi_df.groupby(by=['interaction']).size().min()

2

In [8]:
drug_supplement_df = pd.read_csv(data_dir + 'ds_relations.tsv', sep='\t', index_col=[0])
drug_supplement_df = drug_supplement_df[drug_supplement_df['REL'] != 'has_ingredient']
drug_supplement_df.head()

Unnamed: 0,CUI1,REL,CUI2
689297,DC0477348,has_adverse_effect_on,DC0478433
689298,DC0477348,has_adverse_effect_on,DC0478434
689299,DC0477348,has_adverse_effect_on,DC0478432
689300,DC0477348,has_adverse_effect_on,DC0478435
689301,DC0477348,interacts_with,DC0478436


In [9]:
print('total interactions:', drug_supplement_df.shape[0])
print('unique interactions:', len(set(drug_supplement_df.REL)))

drug_supplement_df.groupby(by=['REL']).size()

total interactions: 18867
unique interactions: 5


REL
has_adverse_effect_on    3120
has_adverse_reaction     2093
has_therapeutic_class    5443
interacts_with           3057
is_effective_for         5154
dtype: int64

In [10]:
def compute_size(n):
    if n == 2:
        return 1, 1
    if n == 3:
        return 1, 2
    if n == 4:
        return 2, 3
    if n == 5:
        return 3, 4
    if n == 6:
        return 4, 5
    # n == 7
    return 4, 6 

In [11]:
# train : valid : test = 80 : 10 : 10
def split_data_relation(df_relation):
    if df_relation.shape[0] <= 7:
        train_size, valid_size = compute_size(df_relation.shape[0])
        
        df_relation = df_relation.sample(frac=1, random_state=42)
        X_train = df_relation.iloc[:train_size]
        X_valid = df_relation.iloc[train_size:valid_size]
        X_test = df_relation.iloc[valid_size:]

    else:
        X_train, X_rem = train_test_split(df_relation, train_size=0.8, random_state=42)
        X_valid, X_test = train_test_split(X_rem, test_size=0.5, random_state=42)
        
    return X_train, X_valid, X_test

In [12]:
# split drug-supplements relation dataset
def split_drug_supplements_dataset(drug_supplement_df):
    relations = set(drug_supplement_df.REL)
    
    train_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])
    valid_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])
    test_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])

    for rel in relations:
        train, valid, test = split_data_relation(drug_supplement_df[drug_supplement_df['REL'] == rel])
        train_triplets = pd.concat([train_triplets, train])
        valid_triplets = pd.concat([valid_triplets, valid])
        test_triplets = pd.concat([test_triplets, test])

    train_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)
    valid_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)
    test_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)    


    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:',valid_triplets.shape[0])
    print('test dataset size:',test_triplets.shape[0])
    
    return train_triplets, valid_triplets, test_triplets

In [13]:
# split drug-drug interaction dataset (from DrugBank)
def split_ddi_dataset(ddi_df):
    interactions = set(ddi_df.interaction)
    
    train_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    valid_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    test_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    
    for inter in interactions:
        train, valid, test = split_data_relation(ddi_df[ddi_df['interaction'] == inter])
        train_triplets = pd.concat([train_triplets, train])
        valid_triplets = pd.concat([valid_triplets, valid])
        test_triplets = pd.concat([test_triplets, test])
        
    train_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)
    valid_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)
    test_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)    


    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:',valid_triplets.shape[0])
    print('test dataset size:',test_triplets.shape[0])
    
    return train_triplets, valid_triplets, test_triplets    
    

In [14]:
# DrugBank drug-drug interactions
print('DrugBank drug-drug interactions')
train_triplets_ddi, valid_triplets_ddi, test_triplets_ddi = split_ddi_dataset(ddi_df)

print()

# Drug Supplement database - drug-suplement interactions
print('Drug Supplement database - drug-suplement interactions')
train_triplets_ds, valid_triplets_ds, test_triplets_ds = split_drug_supplements_dataset(drug_supplement_df)

DrugBank drug-drug interactions
train dataset size: 2498661
validation dataset size: 312349
test dataset size: 312440

Drug Supplement database - drug-suplement interactions
train dataset size: 15092
validation dataset size: 1886
test dataset size: 1889


In [15]:
# all interactions
train_triplets = pd.concat([train_triplets_ddi, train_triplets_ds])
valid_triplets = pd.concat([valid_triplets_ddi, valid_triplets_ds])
test_triplets = pd.concat([test_triplets_ddi, test_triplets_ds])

print('All interactions:')
print('train dataset size:', train_triplets.shape[0])
print('validation dataset size:',valid_triplets.shape[0])
print('test dataset size:',test_triplets.shape[0])

All interactions:
train dataset size: 2513753
validation dataset size: 314235
test dataset size: 314329


In [16]:
all_relations = set(train_triplets.relation)
print('Number of unique interactions:', len(all_relations))
print(list(all_relations)[:10])

Number of unique interactions: 210
['increase_thrombosis', 'decrease_antihypertensive_activities', 'increase_extrapyramidal_symptoms', 'decrease_antiplatelet_activities', 'increase_cutaneous_drug_reaction', 'increase_skeletal_muscle_relaxing_activities', 'decrease_hypertension', 'increase_antihypertensive_activities', 'increase_hyperglycemia', 'increase_vasopressor_activities']


#### Add rest of the data into the training set

In [17]:
files = listdir(data_dir)

for file in files:
    if file == 'ddi.tsv' or file == 'train.tsv' or file == 'test.tsv' or file == 'valid.tsv':
        continue
        
    if file == 'ds_relations.tsv':
        df = pd.read_csv(data_dir + file, sep='\t', index_col=[0])
        df = df[df['REL'] == 'has_ingredient']
        
    else:    
        df = pd.read_csv(data_dir + file, sep='\t', index_col=[0])
    
    df.set_axis(['head', 'relation', 'tail'], axis=1, inplace=True) # copy=False insead of inplace -> newer versionn of pandas?
    train_triplets = pd.concat([train_triplets, df])
    
print('Final size of train dataset (with other relations):', train_triplets.shape[0]) 

Final size of train dataset (with other relations): 4027571


In [18]:
train_triplets = train_triplets.astype(str)
valid_triplets = valid_triplets.astype(str)
test_triplets = test_triplets.astype(str)

In [None]:
# save train, valid and test datasets

train_triplets.to_csv(data_dir + 'train.tsv', sep='\t')
valid_triplets.to_csv(data_dir + 'valid.tsv', sep='\t')
test_triplets.to_csv(data_dir + 'test.tsv', sep='\t')

#### Convert datasets into Triples Factory format

In [19]:
def convert_to_triples_factory(data):
    tf_data = TriplesFactory.from_labeled_triples(
      data[["head", "relation", "tail"]].values,
      create_inverse_triples=False,
      entity_to_id=None,
      relation_to_id=None,
      compact_id=False 
    )
    print(tf_data)  # kam mizeji nejake trojice? - jiny pocet zde a po vytvoreni datasetu
    return tf_data

In [20]:
tf_train = convert_to_triples_factory(train_triplets)
tf_valid = convert_to_triples_factory(valid_triplets)
tf_test = convert_to_triples_factory(test_triplets)


TriplesFactory(num_entities=667521, num_relations=221, create_inverse_triples=False, num_triples=4021167)
TriplesFactory(num_entities=5629, num_relations=187, create_inverse_triples=False, num_triples=314106)
TriplesFactory(num_entities=5593, num_relations=210, create_inverse_triples=False, num_triples=314223)


In [21]:
# take just subset of data for testing

train_sub, _ = tf_train.split(0.15)
valid_sub, _ = tf_valid.split(0.15)
test_sub, _ = tf_test.split(0.15)

using automatically assigned random_state=3079871275
using automatically assigned random_state=2549625383
using automatically assigned random_state=2245768799


In [22]:
train_sub

TriplesFactory(num_entities=667521, num_relations=221, create_inverse_triples=False, num_triples=603175)

### Model

In [None]:
# evaluation_relation_whitelist = all_relations

In [23]:
# creating a model
result = pipeline(
    training=train_sub,
    testing=test_sub,
    validation=valid_sub,
    model='TransE',
    epochs=1,
    evaluator=RankBasedEvaluator,
#     model_kwargs=dict(embedding_dim=128),
    device='gpu',
    optimizer='Adam',
#     evaluation_relation_whitelist=evaluation_relation_whitelist,
#     training_kwargs=dict(
#         num_epochs=2,
#         checkpoint_name='transE_checkpoint.pt',
#         checkpoint_directory='kg_ckeckpoints',
#         checkpoint_frequency=0
#     ),
)

No random seed is specified. Setting to 2262391157.
Training epochs on cuda:0:   0%|                                                                             | 0/1 [00:00<?, ?epoch/s]
Training batches on cuda:0:   0%|                                                                         | 0/2357 [00:00<?, ?batch/s][A
Training batches on cuda:0:   0%|▏                                                                | 7/2357 [00:00<00:37, 62.93batch/s][A
Training batches on cuda:0:   1%|▍                                                               | 15/2357 [00:00<00:33, 69.28batch/s][A
Training batches on cuda:0:   1%|▌                                                               | 23/2357 [00:00<00:32, 71.32batch/s][A
Training batches on cuda:0:   1%|▊                                                               | 31/2357 [00:00<00:32, 72.34batch/s][A
Training batches on cuda:0:   2%|█                                                               | 39/2357 [00:00<00:32, 72

Training batches on cuda:0:  20%|████████████▍                                                  | 463/2357 [00:06<00:25, 74.22batch/s][A
Training batches on cuda:0:  20%|████████████▌                                                  | 471/2357 [00:06<00:25, 74.05batch/s][A
Training batches on cuda:0:  20%|████████████▊                                                  | 479/2357 [00:06<00:25, 74.11batch/s][A
Training batches on cuda:0:  21%|█████████████                                                  | 487/2357 [00:06<00:25, 74.13batch/s][A
Training batches on cuda:0:  21%|█████████████▏                                                 | 495/2357 [00:06<00:25, 74.15batch/s][A
Training batches on cuda:0:  21%|█████████████▍                                                 | 503/2357 [00:06<00:24, 74.19batch/s][A
Training batches on cuda:0:  22%|█████████████▋                                                 | 511/2357 [00:06<00:24, 74.20batch/s][A
Training batches on cuda:0:  22%|█

Training batches on cuda:0:  40%|████████████████████████▉                                      | 935/2357 [00:12<00:19, 73.72batch/s][A
Training batches on cuda:0:  40%|█████████████████████████▏                                     | 943/2357 [00:12<00:19, 73.76batch/s][A
Training batches on cuda:0:  40%|█████████████████████████▍                                     | 951/2357 [00:12<00:19, 73.74batch/s][A
Training batches on cuda:0:  41%|█████████████████████████▋                                     | 959/2357 [00:12<00:18, 73.63batch/s][A
Training batches on cuda:0:  41%|█████████████████████████▊                                     | 967/2357 [00:13<00:18, 73.59batch/s][A
Training batches on cuda:0:  41%|██████████████████████████                                     | 975/2357 [00:13<00:18, 73.51batch/s][A
Training batches on cuda:0:  42%|██████████████████████████▎                                    | 983/2357 [00:13<00:18, 73.63batch/s][A
Training batches on cuda:0:  42%|█

Training batches on cuda:0:  60%|█████████████████████████████████████                         | 1407/2357 [00:19<00:12, 74.41batch/s][A
Training batches on cuda:0:  60%|█████████████████████████████████████▏                        | 1415/2357 [00:19<00:12, 74.42batch/s][A
Training batches on cuda:0:  60%|█████████████████████████████████████▍                        | 1423/2357 [00:19<00:12, 74.36batch/s][A
Training batches on cuda:0:  61%|█████████████████████████████████████▋                        | 1431/2357 [00:19<00:12, 74.29batch/s][A
Training batches on cuda:0:  61%|█████████████████████████████████████▊                        | 1439/2357 [00:19<00:12, 74.24batch/s][A
Training batches on cuda:0:  61%|██████████████████████████████████████                        | 1447/2357 [00:19<00:12, 74.20batch/s][A
Training batches on cuda:0:  62%|██████████████████████████████████████▎                       | 1455/2357 [00:19<00:12, 73.89batch/s][A
Training batches on cuda:0:  62%|█

Training batches on cuda:0:  80%|█████████████████████████████████████████████████▍            | 1879/2357 [00:25<00:06, 74.00batch/s][A
Training batches on cuda:0:  80%|█████████████████████████████████████████████████▋            | 1887/2357 [00:25<00:06, 74.00batch/s][A
Training batches on cuda:0:  80%|█████████████████████████████████████████████████▊            | 1895/2357 [00:25<00:06, 73.96batch/s][A
Training batches on cuda:0:  81%|██████████████████████████████████████████████████            | 1903/2357 [00:25<00:06, 74.00batch/s][A
Training batches on cuda:0:  81%|██████████████████████████████████████████████████▎           | 1911/2357 [00:25<00:06, 73.97batch/s][A
Training batches on cuda:0:  81%|██████████████████████████████████████████████████▍           | 1919/2357 [00:25<00:05, 73.64batch/s][A
Training batches on cuda:0:  82%|██████████████████████████████████████████████████▋           | 1927/2357 [00:26<00:05, 73.76batch/s][A
Training batches on cuda:0:  82%|█

Training batches on cuda:0: 100%|█████████████████████████████████████████████████████████████▊| 2351/2357 [00:31<00:00, 73.38batch/s][A
Training epochs on cuda:0: 100%|███████████████████████████████████████████| 1/1 [00:32<00:00, 32.03s/epoch, loss=0.92, prev_loss=nan][A
INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=16.
Evaluating on cuda:0: 100%|███████████████████████████████████████████████████████████████████| 47.1k/47.1k [02:19<00:00, 339triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 140.48s seconds


In [None]:
result.plot()

In [24]:
hits_at_10 = result.get_metric('hits@10')
hits_at_10

2.121655740139605e-05

In [28]:
result.training

TriplesFactory(num_entities=667521, num_relations=221, create_inverse_triples=False, num_triples=603175)

In [25]:
# predictions

# model = result.model

# predictions_df = predict.get_prediction_df(
#     model, 
#     triples_factory=result.training, 
#     tail_label='Kiwi', 
#     relation_label='increase_sleep_disorders')

# predictions_df.head(15)

predicted_tails_df = predict.get_tail_prediction_df(
        model = result.model, 
        head_label = "Ibuprofen", 
        relation_label = "decrease_adverse_effects", 
        triples_factory = result.training,
    )

predicted_tails_df.head(20)

Unnamed: 0,tail_id,tail_label,score,in_training
504720,504720,Ibuprofen,-4.569305,False
318122,318122,DC0016603,-5.347391,False
621,621,Ethanol,-5.379491,False
410346,410346,DC0447253,-5.523694,False
650,650,Famotidine,-5.703913,False
53917,53917,Cadmium Iodatum Tablets,-5.744159,False
485653,485653,Glonoinum Pellets,-5.784556,False
639475,639475,VITAMIN WORLD(R) - Fennel Seed 480MG,-5.785741,False
289171,289171,DB03723,-5.804255,False
74348,74348,Cyctek Chai Hu Gui Zhi Gan Jiang Granule,-5.821579,False


In [None]:
# evaluation
# # ????
# evaluator = RankBasedEvaluator()

# triples = tf_test.mapped_triples

# eval_results = evaluator.evaluate(
#     model=model,
#     mapped_triples=triples,
#     batch_size=1024,
#     additional_filter_triples=[
#        train_sub.mapped_triples,
#        valid_sub.mapped_triples,
#     ],
# )

In [None]:
# eval_results.to_df()

In [None]:
result.metric_results.to_df()

In [None]:
result.save_to_directory("results")