## Knowledge Graph
#### Drug-Food or Drug-Supplements interaction prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from pykeen.predict import predict_target
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory, CoreTriplesFactory
from pykeen.models import TransE
from pykeen.evaluation import RankBasedEvaluator, OGBEvaluator

from os import listdir

from kg_model import KG_model

### Split data into train, validation and test sets

In [3]:
data_dir = '../data/triplets/'

In [None]:
ddi_df = pd.read_csv(data_dir + 'ddi.tsv', sep='\t', index_col=[0])
ddi_df.head()

In [None]:
print('total interactions:', ddi_df.shape[0])
print('unique interactions:', len(set(ddi_df.interaction)))

interaction_counts = ddi_df.groupby(by=['interaction']).size()
interaction_counts

In [None]:
set(ddi_df.interaction)

In [None]:
drug_supplement_df = pd.read_csv(data_dir + 'ds_relations.tsv', sep='\t', index_col=[0])
# drug_supplement_df = drug_supplement_df[drug_supplement_df['REL'] != 'has_ingredient']
drug_supplement_df.head()

In [None]:
print('total interactions:', drug_supplement_df.shape[0])
print('unique interactions:', len(set(drug_supplement_df.REL)))

ds_interaction_counts = drug_supplement_df.groupby(by=['REL']).size()
ds_interaction_counts

In [None]:
interactions_count = pd.concat([interaction_counts, ds_interaction_counts])
interactions_count.to_csv('interaction_counts.csv', header=['interaction_count'])

In [None]:
def compute_size(n):
    if n == 2:
        return 1, 1
    if n == 3:
        return 1, 2
    if n == 4:
        return 2, 3
    if n == 5:
        return 3, 4
    if n == 6:
        return 4, 5
    # n == 7
    return 4, 6 

In [None]:
# train : valid : test = 80 : 10 : 10
def split_data_relation(df_relation):
    
    # too few triplets with the realtion
    if df_relation.shape[0] <= 7:
        train_size, valid_size = compute_size(df_relation.shape[0])
        
        # shuffle df_relation
        df_relation = df_relation.sample(frac=1, random_state=42)
        
        X_train = df_relation.iloc[:train_size]
        X_valid = df_relation.iloc[train_size:valid_size]
        X_test = df_relation.iloc[valid_size:]

    else:
        X_train, X_rem = train_test_split(df_relation, train_size=0.8, random_state=42)
        X_valid, X_test = train_test_split(X_rem, test_size=0.5, random_state=42)
        
    return X_train, X_valid, X_test

In [None]:
# split drug-supplements relation dataset
def split_drug_supplements_dataset(drug_supplement_df):
    relations = set(drug_supplement_df.REL)
    
    train_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])
    valid_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])
    test_triplets = pd.DataFrame(columns=['CUI1', 'REL', 'CUI2'])

    for rel in relations:
        train, valid, test = split_data_relation(drug_supplement_df[drug_supplement_df['REL'] == rel])
        train_triplets = pd.concat([train_triplets, train])
        valid_triplets = pd.concat([valid_triplets, valid])
        test_triplets = pd.concat([test_triplets, test])

    train_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)
    valid_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)
    test_triplets.rename(columns={'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}, inplace=True)    


    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:',valid_triplets.shape[0])
    print('test dataset size:',test_triplets.shape[0])
    
    return train_triplets, valid_triplets, test_triplets

In [None]:
# split drug-drug interaction dataset (from DrugBank)
def split_ddi_dataset(ddi_df):
    interactions = set(ddi_df.interaction)
    
    train_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    valid_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    test_triplets = pd.DataFrame(columns=['drug1', 'interaction', 'drug2'])
    
    for inter in interactions:
        train, valid, test = split_data_relation(ddi_df[ddi_df['interaction'] == inter])
        train_triplets = pd.concat([train_triplets, train])
        valid_triplets = pd.concat([valid_triplets, valid])
        test_triplets = pd.concat([test_triplets, test])
        
    train_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)
    valid_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)
    test_triplets.rename(columns={'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}, inplace=True)    


    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:',valid_triplets.shape[0])
    print('test dataset size:',test_triplets.shape[0])
    
    return train_triplets, valid_triplets, test_triplets    
    

In [None]:
# DrugBank drug-drug interactions
print('DrugBank drug-drug interactions')
train_triplets_ddi, valid_triplets_ddi, test_triplets_ddi = split_ddi_dataset(ddi_df)

print()

# Drug Supplement database - drug-suplement interactions
print('Drug Supplement database - drug-suplement interactions')
train_triplets_ds, valid_triplets_ds, test_triplets_ds = split_drug_supplements_dataset(drug_supplement_df)

In [None]:
# all interactions
train_triplets = pd.concat([train_triplets_ddi, train_triplets_ds])
valid_triplets = pd.concat([valid_triplets_ddi, valid_triplets_ds])
test_triplets = pd.concat([test_triplets_ddi, test_triplets_ds])

print('All interactions:')
print('train dataset size:', train_triplets.shape[0])
print('validation dataset size:',valid_triplets.shape[0])
print('test dataset size:',test_triplets.shape[0])

In [None]:
all_relations = set(train_triplets.relation)
print('Number of unique interactions:', len(all_relations))
print(list(all_relations)[:10])

#### Add rest of the data into the training set

In [None]:
files = listdir(data_dir)

for file in files:
    if file == 'ddi.tsv' or file == '.ipynb_checkpoints' or file == 'ds_relations.tsv':
        continue
    if 'train' in file or 'valid' in file or 'test' in file:
        continue
           
    df = pd.read_csv(data_dir + file, sep='\t', index_col=[0])
    
    # if file == 'ds_relations.tsv':
    #     df = df[df['REL'] == 'has_ingredient']
    
    df.set_axis(['head', 'relation', 'tail'], axis=1, inplace=True) 
    train_triplets = pd.concat([train_triplets, df])
    
print('Final size of train dataset (with other relations):', train_triplets.shape[0]) 

In [None]:
train_triplets = train_triplets.astype(str)
valid_triplets = valid_triplets.astype(str)
test_triplets = test_triplets.astype(str)

In [None]:
# save train, valid and test datasets

train_triplets.to_csv(data_dir + 'train.tsv', sep='\t')
valid_triplets.to_csv(data_dir + 'valid.tsv', sep='\t')
test_triplets.to_csv(data_dir + 'test.tsv', sep='\t')

In [4]:
# read data

specification = '_with_biokg'

train_triplets = pd.read_csv(data_dir + 'train' + specification + '.tsv', sep='\t')
valid_triplets = pd.read_csv(data_dir + 'valid' + specification + '.tsv', sep='\t')
test_triplets = pd.read_csv(data_dir + 'test' + specification + '.tsv', sep='\t')

In [None]:
train_triplets

In [5]:
common_drugs = pd.read_csv('../data/common_drugs.csv', sep=';')
common_drugs = common_drugs['DrugBank_id'].values

In [None]:
train_triplets[train_triplets['head'] == common_drugs[3]]

#### Convert datasets into Triples Factory format

In [6]:
def convert_to_triples_factory(data):
    tf_data = TriplesFactory.from_labeled_triples(
      data[["head", "relation", "tail"]].values,
      create_inverse_triples=False,
      entity_to_id=None,
      relation_to_id=None,
      compact_id=False 
    )
    print(tf_data)  
    return tf_data


In [7]:
tf_train = convert_to_triples_factory(train_triplets.astype(str))
tf_valid = convert_to_triples_factory(valid_triplets.astype(str))
tf_test = convert_to_triples_factory(test_triplets.astype(str))



TriplesFactory(num_entities=33046, num_relations=58, create_inverse_triples=False, num_triples=2074440)
TriplesFactory(num_entities=2402, num_relations=4, create_inverse_triples=False, num_triples=164903)
TriplesFactory(num_entities=2415, num_relations=4, create_inverse_triples=False, num_triples=164926)


In [8]:
# take just subset of data for testing

train_sub, _ = tf_train.split(0.15)
valid_sub, _ = tf_valid.split(0.15)
test_sub, _ = tf_test.split(0.15)

using automatically assigned random_state=2378562106
using automatically assigned random_state=481061183
using automatically assigned random_state=770684229


### Model

In [9]:
params = {'epochs': 2, 
          'optimizer': 'adam', 
          'learning_rate': 0.005,
          'loss': 'MarginRankingLoss',
          'batch': 512,
          'embedding_dim': 300,
          'margin': 0.64,
          'evaluator': 'ogb'
         }
params['epochs']

2

In [10]:
model_kg = KG_model('complex', tf_train, tf_valid, tf_test, 'jupyter_test')
model_kg.set_params2(params)

print('Training...')
model_kg.train()
print('Training done')

No random seed is specified. Setting to 784705745.


Training...


TypeError: __init__() missing 1 required positional argument: 'evaluation_factory'

In [None]:
# model_kg.trained_model.metric_results.to_df()

head = common_drugs[0]
relation = 'negative'

pred = predict_target(
            model = model_kg.trained_model.model, 
            head = head, 
            relation = relation, 
            triples_factory = model_kg.trained_model.training,
        )
pred

In [None]:
model_kg.trained_model.get_metric('mrr')

In [None]:
# creating a model
result = pipeline(
    training=train_sub,
    testing=test_sub,
    validation=valid_sub,
    model='RGCN',
    epochs=1,
#     evaluator=RankBasedEvaluator,
    model_kwargs=dict(embedding_dim=20, num_layers=1, interaction='distmult'),
#     device='gpu',
#     optimizer='Adam',
#     training_kwargs=dict(
#         batch_size=32,
#         use_tqdm=True
# #         num_epochs=2,
# #         checkpoint_name='transE_checkpoint.pt',
# #         checkpoint_directory='kg_ckeckpoints',
# #         checkpoint_frequency=0
#     ),
#     use_tqdm=True,
)

In [None]:
result.plot()

In [None]:
hits_at_10 = result.get_metric('hits@10')
hits_at_10

In [None]:
result.training

In [None]:
result.metric_results.to_df()

In [None]:
result.save_to_directory("results")

---------------------------------------------------------------------------------------

## KG parameters setting

In [15]:
from pykeen.datasets import Hetionet, BioKG
dataset = Hetionet()
dataset

Hetionet(num_entities=45158, num_relations=24, create_inverse_triples=False)

In [None]:
labels = dataset.training.entity_id_to_label.values()
for l in labels:
    if 'Compound' in l:
        print(l)

In [26]:
result = pipeline(
#     dataset=BioKG,
    training=tf_train,
    vlidation=tf_valid,
    model='ComplEx',
    epochs=5,
    evaluator=OGBEvaluator(tf_test),
    model_kwargs={
        'embedding_dim': 1000
    },
    optimizer='Adam',
    optimizer_kwargs = dict(
                lr = 0.001
    ),
    loss = 'MarginRankingLoss',
    device='gpu'
)

ValueError: OGB evaluator is already filtered, but not dynamically like other evaluators because it requires pre-calculated filtered negative triples. Therefore, it is not allowed to accept filtered=True

In [25]:
tf_test

TriplesFactory(num_entities=2415, num_relations=4, create_inverse_triples=False, num_triples=164926)

In [None]:
model = result.model
model

In [None]:
result.plot_losses()
plt.show()

In [None]:
result.metric_results.to_df()

In [None]:
result.get_metric('mrr')

In [None]:
from pykeen.hpo import hpo_pipeline_from_config

In [None]:
config = {
    'optuna': dict(
        n_trials=5,
    ),
    'pipeline': dict(
        training = tf_train,
        validation = tf_valid,
        testing = tf_test,
        model='TransR',
        model_kwargs_ranges=dict(
               embedding_dim=dict(type=int, low=20, high=160, q=20),
        ),
        optimizer='Adam',
        optimizer_kwargs=dict(lr=0.01),
        loss='marginranking',
        loss_kwargs=dict(margin=1),
        training_loop='slcwa',
        training_kwargs=dict(num_epochs=100, batch_size=128),
        negative_sampler='basic',
        negative_sampler_kwargs=dict(num_negs_per_pos=1),
        evaluator_kwargs=dict(filtered=True),
        evaluation_kwargs=dict(batch_size=128),
        stopper='early',
        stopper_kwargs=dict(frequency=5, patience=2, relative_delta=0.002),
    )
}

In [None]:
hpo_pipeline_result = hpo_pipeline_from_config(config)

In [None]:
hpo_pipeline_result.save_to_directory('hpo_results')

In [None]:
from pykeen.pipeline import pipeline_from_config

In [None]:
config2 = {
    "metadata": {
    "title": "nations - try",
    "comments": "comment"
  },
  "pipeline": {
    "dataset": "nations",
    "model": "TransE",
    "model_kwargs": {
      "embedding_dim": 50,
      "scoring_fct_norm": 1
    },
    "optimizer": "SGD",
    "optimizer_kwargs": {
      "lr": 0.01
    },
    "loss": "MarginRankingLoss",
    "loss_kwargs": {
      "reduction": "mean",
      "margin": 1
    },
    "training_loop": "slcwa",
    "negative_sampler": "basic",
    "negative_sampler_kwargs": {
      "num_negs_per_pos": 1
    },
    "training_kwargs": {
      "num_epochs": 100,
      "batch_size": 32
    },
    "evaluator_kwargs": {
      "filtered": True
    }
  }
}

pipeline_result = pipeline_from_config(config2)

In [None]:
pipeline_result.metric_results.to_df()

## Result tracking

In [None]:
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
from pykeen.trackers import ResultTracker


dataset = get_dataset(dataset="nations")

pipeline_result = pipeline(
    model='RotatE',
    dataset=dataset,
    result_tracker="console",
    result_tracker_kwargs = dict(metric_filter='.*head.realistic.hits_at_10.*'),
    training_kwargs = dict(
        num_epochs = 5,
        callbacks="validation-loss",
        callback_kwargs=dict(
            evaluation_triples=dataset.validation.mapped_triples,
            prefix="validation",
        ),
    )    
)



In [None]:
dataset