## Knowledge Graph
#### Drug-food or drug-supplement interaction prediction

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from pykeen.models import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import TransE
from pykeen.evaluation import RankBasedEvaluator

  from .autonotebook import tqdm as notebook_tqdm


### Split data into train, validation and test sets

In [2]:
# Directory (relative to the notebook) that holds the tab-separated triplet files read below.
data_dir = 'data/triplets/'

In [3]:
# Load the drug-drug interaction triplets; the first TSV column is used as the row index.
ddi_df = pd.read_csv(data_dir + 'ddi.tsv', sep='\t', index_col=[0])
ddi_df.head()

Unnamed: 0,drug1,interaction,drug2
0,Apixaban,increase_anticoagulant_activities,Lepirudin
1,Dabigatran etexilate,increase_anticoagulant_activities,Lepirudin
2,Dasatinib,increase_bleeding,Lepirudin
3,Dasatinib,increase_hemorrhage,Lepirudin
4,Deferasirox,increase_gastrointestinal_bleeding,Lepirudin


In [4]:
# Total number of rows and number of distinct interaction labels.
print('total interactions:', ddi_df.shape[0])
print('unique interactions:', len(set(ddi_df.interaction)))

# Number of rows per interaction label.
ddi_df.groupby(by=['interaction']).size()

total interactions: 3123450
unique interactions: 205


interaction
decrease_absorption                             576
decrease_adverse_effects                        384
decrease_anticholinergic_activities              26
decrease_anticoagulant_activities              5174
decrease_antihypertensive_activities          63420
                                              ...  
increase_visual_accommodation_disturbances        2
increase_vomiting                                54
increase_water_intoxication                     256
increase_weakness                              1710
increase_weight_gain                             16
Length: 205, dtype: int64

In [5]:
# Row count of the smallest interaction group.
ddi_df.groupby(by=['interaction']).size().min()

2

In [6]:
# Load the drug-supplement relations (first TSV column becomes the index)
# and drop every row whose relation is 'has_ingredient'.
drug_supplement_df = pd.read_csv(data_dir + 'ds_relations.tsv', sep='\t', index_col=[0])
drug_supplement_df = drug_supplement_df[drug_supplement_df['REL'] != 'has_ingredient']
drug_supplement_df.head()

Unnamed: 0,CUI1,REL,CUI2
689297,DC0477348,has_adverse_effect_on,DC0478433
689298,DC0477348,has_adverse_effect_on,DC0478434
689299,DC0477348,has_adverse_effect_on,DC0478432
689300,DC0477348,has_adverse_effect_on,DC0478435
689301,DC0477348,interacts_with,DC0478436


In [7]:
# Total number of rows and number of distinct relation labels after filtering.
print('total interactions:', drug_supplement_df.shape[0])
print('unique interactions:', len(set(drug_supplement_df.REL)))

# Number of rows per relation label.
drug_supplement_df.groupby(by=['REL']).size()

total interactions: 18867
unique interactions: 5


REL
has_adverse_effect_on    3120
has_adverse_reaction     2093
has_therapeutic_class    5443
interacts_with           3057
is_effective_for         5154
dtype: int64

In [8]:
def compute_size(n):
    """Return the two slice boundaries used to split a very small group of ``n`` rows.

    The result is a pair ``(train_end, valid_end)``: the caller takes rows
    ``[:train_end]`` for training, ``[train_end:valid_end]`` for validation
    and ``[valid_end:]`` for testing.

    Explicit boundaries exist for ``n`` from 2 to 6; every other value
    (the caller only passes ``n <= 7``) falls back to ``(4, 6)``.

    NOTE(review): the fallback also covers ``n < 2``, and for ``n == 7`` it
    yields a 4/2/1 split, which does not follow the pattern of the smaller
    sizes -- confirm this is intended.
    """
    # n -> (train_end, valid_end)
    boundaries = {
        2: (1, 1),
        3: (1, 2),
        4: (2, 3),
        5: (3, 4),
        6: (4, 5),
    }
    return boundaries.get(n, (4, 6))

In [9]:
# train : valid : test = 80 : 10 : 10
def split_data_relation(df_relation):
    """Split the rows of a single relation into train, validation and test parts.

    Groups with more than 7 rows are split 80/10/10 with two consecutive
    ``train_test_split`` calls (seed 42). Groups with at most 7 rows are
    shuffled (seed 42) and cut at the boundaries given by ``compute_size``.

    Returns a tuple ``(X_train, X_valid, X_test)`` of DataFrames.
    """
    n_rows = df_relation.shape[0]

    if n_rows > 7:
        # 80 % for training, then halve the remaining 20 %.
        X_train, X_rem = train_test_split(df_relation, train_size=0.8, random_state=42)
        X_valid, X_test = train_test_split(X_rem, test_size=0.5, random_state=42)
        return X_train, X_valid, X_test

    # Tiny group: shuffle once and slice at the hand-picked boundaries.
    train_end, valid_end = compute_size(n_rows)
    shuffled = df_relation.sample(frac=1, random_state=42)
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:valid_end],
        shuffled.iloc[valid_end:],
    )

In [10]:
# split drug-supplements relation dataset
def split_drug_supplements_dataset(drug_supplement_df):
    """Split the drug-supplement triples into train/validation/test sets, relation by relation.

    Every relation (value of the ``REL`` column) is split on its own with
    ``split_data_relation`` and the per-relation parts are then stacked.
    The ``CUI1``/``REL``/``CUI2`` columns are renamed to
    ``head``/``relation``/``tail`` in the returned frames.

    Relations are processed in sorted order, so the row order of the result
    is the same on every run (iterating over a ``set`` of strings is not).
    Rows whose ``REL`` is missing are left out.

    Prints the size of each split and returns
    ``(train_triplets, valid_triplets, test_triplets)``.
    """
    column_map = {'CUI1': 'head', 'REL': 'relation', 'CUI2': 'tail'}

    train_parts, valid_parts, test_parts = [], [], []
    # One pass over the frame instead of one boolean filter per relation.
    for _, relation_rows in drug_supplement_df.groupby('REL', sort=True):
        train, valid, test = split_data_relation(relation_rows)
        train_parts.append(train)
        valid_parts.append(valid)
        test_parts.append(test)

    def _combine(parts):
        # Concatenate once at the end; growing a frame inside the loop copies it every time.
        non_empty = [part for part in parts if not part.empty]
        if not non_empty:
            return pd.DataFrame(columns=list(column_map.values()))
        return pd.concat(non_empty).rename(columns=column_map)

    train_triplets = _combine(train_parts)
    valid_triplets = _combine(valid_parts)
    test_triplets = _combine(test_parts)

    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:', valid_triplets.shape[0])
    print('test dataset size:', test_triplets.shape[0])

    return train_triplets, valid_triplets, test_triplets

In [11]:
# Per-relation train/validation/test split of the drug-supplement triples.
train_triplets, valid_triplets, test_triplets = split_drug_supplements_dataset(drug_supplement_df)

train dataset size: 15092
validation dataset size: 1886
test dataset size: 1889


In [12]:
# split drug-drug interaction dataset (from DrugBank)
def split_ddi_dataset(ddi_df):
    """Split the drug-drug interaction triples into train/validation/test sets, interaction by interaction.

    Every interaction type (value of the ``interaction`` column) is split on
    its own with ``split_data_relation`` and the per-interaction parts are
    then stacked. The ``drug1``/``interaction``/``drug2`` columns are renamed
    to ``head``/``relation``/``tail`` in the returned frames.

    Interaction types are processed in sorted order, so the row order of the
    result is the same on every run (iterating over a ``set`` of strings is
    not). Rows whose ``interaction`` is missing are left out.

    Prints the size of each split and returns
    ``(train_triplets, valid_triplets, test_triplets)``.
    """
    column_map = {'drug1': 'head', 'interaction': 'relation', 'drug2': 'tail'}

    train_parts, valid_parts, test_parts = [], [], []
    # One pass over the frame instead of one boolean filter per interaction type.
    for _, interaction_rows in ddi_df.groupby('interaction', sort=True):
        train, valid, test = split_data_relation(interaction_rows)
        train_parts.append(train)
        valid_parts.append(valid)
        test_parts.append(test)

    def _combine(parts):
        # Concatenate once at the end; growing a frame inside the loop copies it every time.
        non_empty = [part for part in parts if not part.empty]
        if not non_empty:
            return pd.DataFrame(columns=list(column_map.values()))
        return pd.concat(non_empty).rename(columns=column_map)

    train_triplets = _combine(train_parts)
    valid_triplets = _combine(valid_parts)
    test_triplets = _combine(test_parts)

    print('train dataset size:', train_triplets.shape[0])
    print('validation dataset size:', valid_triplets.shape[0])
    print('test dataset size:', test_triplets.shape[0])

    return train_triplets, valid_triplets, test_triplets
    

In [13]:
# Per-interaction train/validation/test split of the drug-drug interaction triples.
train_triplets_ddi, valid_triplets_ddi, test_triplets_ddi = split_ddi_dataset(ddi_df)

train dataset size: 2498661
validation dataset size: 312349
test dataset size: 312440


#### Add the rest of the data to the training set

In [None]:
# TODO

#### Convert the datasets into the PyKEEN TriplesFactory format

In [None]:
def convert_to_triples_factory(data, entity_to_id=None, relation_to_id=None):
    """Build a PyKEEN ``TriplesFactory`` from a DataFrame of labeled triples.

    Parameters
    ----------
    data
        DataFrame with ``head``, ``relation`` and ``tail`` columns.
    entity_to_id, relation_to_id
        Optional label -> ID mappings forwarded to
        ``TriplesFactory.from_labeled_triples``. With the default ``None``
        the mappings are derived from ``data`` itself. Pass the training
        factory's ``entity_to_id`` / ``relation_to_id`` when converting the
        validation and test splits so that all splits share the same IDs.

    Returns
    -------
    The created factory; it is also printed as a quick sanity check.
    """
    labeled_triples = data[["head", "relation", "tail"]].values
    tf_data = TriplesFactory.from_labeled_triples(
        labeled_triples,
        create_inverse_triples=False,
        entity_to_id=entity_to_id,
        relation_to_id=relation_to_id,
        compact_id=False,
    )
    # Open question (translated from the original note): where do some triples
    # go missing? The count printed here differs from the count after the
    # dataset is created.
    # NOTE(review): presumably PyKEEN drops triples whose labels are absent from
    # a supplied mapping -- confirm against the PyKEEN documentation.
    print(tf_data)
    return tf_data

In [None]:
# The training factory defines the entity/relation label -> ID mappings.
tf_train = convert_to_triples_factory(train_triplets)

# Validation and test triples must be encoded with the *training* mappings.
# Building each split with its own mapping assigns unrelated integer IDs, so the
# trained embeddings would be evaluated against the wrong entities/relations.
# NOTE(review): triples containing labels unseen in training are presumably
# filtered out by PyKEEN here -- compare the printed triple counts.
tf_valid = TriplesFactory.from_labeled_triples(
    valid_triplets[["head", "relation", "tail"]].values,
    create_inverse_triples=False,
    entity_to_id=tf_train.entity_to_id,
    relation_to_id=tf_train.relation_to_id,
    compact_id=False,
)
print(tf_valid)

tf_test = TriplesFactory.from_labeled_triples(
    test_triplets[["head", "relation", "tail"]].values,
    create_inverse_triples=False,
    entity_to_id=tf_train.entity_to_id,
    relation_to_id=tf_train.relation_to_id,
    compact_id=False,
)
print(tf_test)


### Model

In [None]:
# creating a model
# Train a TransE model on the training factory and evaluate it with the
# rank-based evaluator. The seed is fixed (same value as the data splits)
# so that a re-run of the notebook reproduces the same model.
result = pipeline(
    training=tf_train,
    testing=tf_test,
    validation=tf_valid,
    model='TransE',
    evaluator=RankBasedEvaluator,
    random_seed=42,
)

In [None]:
# predictions

# Trained model produced by the pipeline run.
model = result.model

# Tail predictions for head 'DC0477356' and relation 'has_adverse_effect_on';
# the training factory is passed along (presumably for the label <-> ID mapping).
# NOTE(review): `pykeen.models.predict` may not exist in newer PyKEEN releases --
# confirm against the installed version.
predictions_df = predict.get_tail_prediction_df(model, 'DC0477356', 'has_adverse_effect_on', triples_factory=result.training)
predictions_df

In [None]:
# evaluation
# NOTE(review): the pipeline call already received the test factory; this cell
# runs a separate, explicit evaluation of the same model -- confirm both are needed.
evaluator = RankBasedEvaluator()

# ID-encoded test triples to be ranked.
triples = tf_test.mapped_triples

# Training and validation triples are supplied as additional filter triples
# (presumably so that already-known true triples are not counted against the
# model when ranking -- see the PyKEEN evaluation docs).
eval_results = evaluator.evaluate(
    model=model,
    mapped_triples=triples,
    batch_size=1024,
    additional_filter_triples=[
       tf_train.mapped_triples,
       tf_valid.mapped_triples,
    ],
)

In [None]:
# Show the evaluation metrics in tabular form.
eval_results.to_df()