In [1]:
import pandas as pd

from pykeen.models import predict
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import TransE
from pykeen.evaluation import RankBasedEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
models_dir = 'kg_checkpoints/'

In [3]:
drugs = ['Ibuprofen', 'Galantamine', 'Losartan', 'Daunorubicin', 'Clocortolone']

In [None]:
relations = {'decrease_absorption',
 'decrease_adverse_effects',
 'decrease_anticholinergic_activities',
 'decrease_anticoagulant_activities',
 'decrease_antihypertensive_activities',
 'decrease_antiplatelet_activities',
 'decrease_arrhythmogenic_activities',
 'decrease_bioavailability',
 'decrease_bronchodilatory_activities',
 'decrease_cardiotoxicity',
 'decrease_cytotoxicity',
 'decrease_diuretic_activities',
 'decrease_effectiveness',
 'decrease_excretion_rate',
 'decrease_fluid_retaining_activities',
 'decrease_hypertension',
 'decrease_hypoglycemia',
 'decrease_hypotension',
 'decrease_metabolism',
 'decrease_myopathy',
 'decrease_nephrotoxicity',
 'decrease_neuromuscular_blockade',
 'decrease_protein_binding',
 'decrease_qtc_prolongation',
 'decrease_rhabdomyolysis',
 'decrease_sedation',
 'decrease_seizure',
 'decrease_serum_concentration',
 'decrease_skeletal_muscle_relaxing_activities',
 'decrease_stimulatory_activities',
 'decrease_therapeutic_efficacy',
 'decrease_vasoconstricting_activities',
 'decrease_vasopressor_activities',
 'increase_adverse_effects',
 'increase_alpha-adrenergic_activities',
 'increase_analgesic_activities',
 'increase_anemia',
 'increase_angioedema',
 'increase_anti-angiogenesis',
 'increase_anticholinergic_activities',
 'increase_anticoagulant_activities',
 'increase_anticonvulsant_toxicity',
 'increase_antihypertensive_activities',
 'increase_antiplatelet_activities',
 'increase_antipsychotic_activities',
 'increase_arrhythmogenic_activities',
 'increase_atrioventricular_blocking_(av_block)_activities',
 'increase_bioavailability',
 'increase_bleeding',
 'increase_bradycardia',
 'increase_bronchoconstrictory_activities',
 'increase_bronchospasm',
 'increase_bruising',
 'increase_cardiac_arrest',
 'increase_cardiac_arrhythmia',
 'increase_cardiodepressant_activities',
 'increase_cardiotoxicity',
 'increase_cardiovascular_complications',
 'increase_cardiovascular_impairment',
 'increase_change_in_thyroid_function_activities',
 'increase_cns_depression_activities',
 'increase_cns_stimulation',
 'increase_confusion',
 'increase_congestive_heart_failure',
 'increase_constipation',
 'increase_convulsion',
 'increase_cutaneous_drug_reaction',
 'increase_cytopenia',
 'increase_death',
 'increase_decreased_alertness_activities',
 'increase_dermatologic_adverse_activities',
 'increase_diuretic_activities',
 'increase_dizziness',
 'increase_drowsiness',
 'increase_dyspnea',
 'increase_edema_formation',
 'increase_electrolyte_disturbance_activities',
 'increase_electrolyte_imbalance',
 'increase_elevated_creatine_kinase_(cpk)',
 'increase_encephalopathy',
 'increase_excretion_rate',
 'increase_extrapyramidal_symptoms',
 'increase_facial_flushing',
 'increase_fluid_retaining_activities',
 'increase_fluid_retention',
 'increase_gastrointestinal_bleeding',
 'increase_gastrointestinal_irritation',
 'increase_gastrointestinal_motility_reducing_activities',
 'increase_gastrointestinal_ulceration',
 'increase_generalized_seizure',
 'increase_gouty_arthritis',
 'increase_granulocytopenia',
 'increase_hemorrhage',
 'increase_hemorrhagic_cystitis',
 'increase_hepatotoxic_activities',
 'increase_hyperbilirubinemia',
 'increase_hypercalcemia',
 'increase_hypercoagulability',
 'increase_hyperglycemia',
 'increase_hyperkalemia',
 'increase_hyperkinetic_symptoms',
 'increase_hypersensitivity_reaction',
 'increase_hypertension',
 'increase_hyperthermia',
 'increase_hypertrichosis',
 'increase_hyperuricemia',
 'increase_hypocalcemia',
 'increase_hypoglycemia',
 'increase_hypokalemia',
 'increase_hypolipidaemic_activities',
 'increase_hypomagnesemia',
 'increase_hypomania',
 'increase_hyponatremia',
 'increase_hypotension',
 'increase_hypothyroid_activities',
 'increase_hypotonia',
 'increase_immunosuppressive_activities',
 'increase_increased_glucose',
 'increase_increased_serum_creatinine',
 'increase_increased_transaminases',
 'increase_infection',
 'increase_intraocular_pressure',
 'increase_irritability',
 'increase_ischemic_colitis',
 'increase_jaw_osteonecrosis',
 'increase_lactic_acidosis',
 'increase_leukopenia',
 'increase_liver_damage',
 'increase_liver_enzyme_elevations',
 'increase_metabolic_acidosis',
 'increase_metabolism',
 'increase_methemoglobinemia',
 'increase_mucosal_ulceration',
 'increase_myelosuppression',
 'increase_myocardial_depression',
 'increase_myoglobinuria',
 'increase_myopathic_rhabdomyolysis_activities',
 'increase_myopathy',
 'increase_nausea',
 'increase_nephrotoxicity',
 'increase_neuroexcitatory_activities',
 'increase_neuroleptic_malignant_syndrome',
 'increase_neuromuscular_blockade',
 'increase_neurotoxic_activities',
 'increase_neutropenia',
 'increase_nitritoid_reactions',
 'increase_oligohydrosis',
 'increase_opioid_antagonism_activities',
 'increase_opioid_toxicity',
 'increase_orthostatic_hypotension',
 'increase_osteomalacia',
 'increase_ototoxicity',
 'increase_pancreatitis_activities',
 'increase_peptic_ulcer',
 'increase_peripheral_neuropathy',
 'increase_photosensitizing_activities',
 'increase_priapism',
 'increase_pseudotumor_cerebri',
 'increase_psychotic_reaction',
 'increase_pulmonary_toxicity',
 'increase_qtc_prolongation',
 'increase_rash',
 'increase_reduced_gastrointestinal_motility',
 'increase_reduced_intravascular_volume',
 'increase_renal_failure',
 'increase_respiratory_depression',
 'increase_rhabdomyolysis',
 'increase_sedation',
 'increase_seizure',
 'increase_serotonergic_activities',
 'increase_serotonin_syndrome',
 'increase_serum_concentration',
 'increase_severe_leukopenia',
 'increase_shortness_of_breath',
 'increase_sinus_node_depression',
 'increase_skeletal_muscle_relaxing_activities',
 'increase_sleep_disorders',
 'increase_smooth_muscle_relaxing_activities',
 'increase_somnolence',
 'increase_stevens-johnson_syndrome',
 'increase_sympathomimetic_activities',
 'increase_syncope',
 'increase_tachycardia',
 'increase_tardive_dyskinesia',
 'increase_tendinopathy',
 'increase_teratogenic_activities',
 'increase_therapeutic_efficacy',
 'increase_thrombocytopenia',
 'increase_thromboembolism',
 'increase_thrombogenic_activities',
 'increase_thrombosis',
 'increase_torsade_de_pointes',
 'increase_ulceration',
 'increase_urinary_retention',
 'increase_uterotonic_activities',
 'increase_vasoconstricting_activities',
 'increase_vasodilatory_activities',
 'increase_vasopressor_activities',
 'increase_vasospastic_reactions',
 'increase_ventricular_arrhythmias',
 'increase_visual_accommodation_disturbances',
 'increase_vomiting',
 'increase_water_intoxication',
 'increase_weakness',
 'increase_weight_gain',
 'has_adverse_effect_on',
 'has_adverse_reaction',
 'has_therapeutic_class',
 'interacts_with',
 'is_effective_for'}

In [4]:
def convert_to_triples_factory(data):
    tf_data = TriplesFactory.from_labeled_triples(
      data[["head", "relation", "tail"]].values,
      create_inverse_triples=False,
      entity_to_id=None,
      relation_to_id=None,
      compact_id=False 
    )
    #print(tf_data)  # kam mizeji nejake trojice? - jiny pocet zde a po vytvoreni datasetu
    return tf_data

In [2]:
train = pd.read_csv('data/triplets/train.tsv', sep='\t', index_col=[0], engine='python')
valid = pd.read_csv('data/triplets/valid.tsv', sep='\t', index_col=[0], engine='python')
test = pd.read_csv('data/triplets/test.tsv', sep='\t', index_col=[0], engine='python')

In [None]:
tf_train = convert_to_triples_factory(train.astype(str))
tf_valid = convert_to_triples_factory(valid.astype(str))
tf_test = convert_to_triples_factory(test.astype(str))

### Analyze datasets

In [15]:
print('Counts of different relations:')

relations = ['has_name', 'in_subclass', 'contains', 'has_inchi_key', 'has_molecule',
             'has_name', 'contains_salt', 'has_cas_number', 'has_compound', 'is_type', 'maps_to']

for rel in relations:
    count = train[train['relation'] == rel].shape[0]
    print(rel, ":", count)




Counts of different relations:
has_name : 228067
in_subclass : 10436
contains : 207289
has_inchi_key : 13695
has_molecule : 216
has_name : 228067
contains_salt : 2578
has_cas_number : 2830
has_compound : 5375
is_type : 144536
maps_to : 209499


### Experiment with KG

In [8]:
result = pipeline(
        dataset = 'BioKG',
        model = 'TransE',
        optimizer = 'Adam',
        evaluator = RankBasedEvaluator,
        epochs = 2,
        device = 'gpu',
#         training_kwargs = dict(
#             num_epochs = 20,
#             checkpoint_name = model + '_checkpoint.pt',
#             checkpoint_directory = 'kg_checkpoints',
#             checkpoint_frequency = 10
#         ),
    )

INFO:pykeen.pipeline.api:Using device: gpu
INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///work/.data/pykeen/datasets/biokg/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///work/.data/pykeen/datasets/biokg/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.triples.triples_factory:Loading from file:///work/.data/pykeen/datasets/biokg/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.triples.triples_factory:Loading from file:///work/.data/pykeen/datasets/biokg/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
Training epochs on cuda:0:   0%|                                                          | 0/2 [00:00<?, ?epoch/s]
Training batches on cuda:0:   0%|                                                      | 0/6463 [00:00<?, ?batch/s][A
Training batches on cuda:0:   0%|                                            | 14/6463 [00:00<00:46, 138.01batch/s][A
Training batches on cuda:0:   1%|▎      

Training batches on cuda:0:  66%|███████████████████████████▊              | 4286/6463 [00:13<00:06, 329.18batch/s][A
Training batches on cuda:0:  67%|████████████████████████████              | 4319/6463 [00:13<00:06, 329.28batch/s][A
Training batches on cuda:0:  67%|████████████████████████████▎             | 4352/6463 [00:13<00:06, 329.34batch/s][A
Training batches on cuda:0:  68%|████████████████████████████▍             | 4385/6463 [00:13<00:06, 329.46batch/s][A
Training batches on cuda:0:  68%|████████████████████████████▋             | 4418/6463 [00:13<00:06, 329.43batch/s][A
Training batches on cuda:0:  69%|████████████████████████████▉             | 4451/6463 [00:13<00:06, 329.53batch/s][A
Training batches on cuda:0:  69%|█████████████████████████████▏            | 4484/6463 [00:13<00:06, 329.61batch/s][A
Training batches on cuda:0:  70%|█████████████████████████████▎            | 4517/6463 [00:13<00:05, 329.73batch/s][A
Training batches on cuda:0:  70%|███████████████

Training batches on cuda:0:  34%|██████████████▍                           | 2222/6463 [00:06<00:13, 324.02batch/s][A
Training batches on cuda:0:  35%|██████████████▋                           | 2255/6463 [00:07<00:12, 323.91batch/s][A
Training batches on cuda:0:  35%|██████████████▊                           | 2288/6463 [00:07<00:12, 324.26batch/s][A
Training batches on cuda:0:  36%|███████████████                           | 2321/6463 [00:07<00:12, 324.69batch/s][A
Training batches on cuda:0:  36%|███████████████▎                          | 2354/6463 [00:07<00:12, 324.73batch/s][A
Training batches on cuda:0:  37%|███████████████▌                          | 2387/6463 [00:07<00:12, 325.10batch/s][A
Training batches on cuda:0:  37%|███████████████▋                          | 2420/6463 [00:07<00:12, 324.96batch/s][A
Training batches on cuda:0:  38%|███████████████▉                          | 2453/6463 [00:07<00:12, 324.66batch/s][A
Training batches on cuda:0:  38%|███████████████

In [9]:
result.get_metric('hits@10')

0.05345744680851064

In [11]:
df = result.metric_results.to_df()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df)

     Side         Type                               Metric         Value
0    head   optimistic          inverse_geometric_mean_rank  2.343598e-03
1    tail   optimistic          inverse_geometric_mean_rank  2.533924e-03
2    both   optimistic          inverse_geometric_mean_rank  2.436904e-03
3    head    realistic          inverse_geometric_mean_rank  2.343595e-03
4    tail    realistic          inverse_geometric_mean_rank  2.533919e-03
5    both    realistic          inverse_geometric_mean_rank  2.436901e-03
6    head  pessimistic          inverse_geometric_mean_rank  2.343591e-03
7    tail  pessimistic          inverse_geometric_mean_rank  2.533917e-03
8    both  pessimistic          inverse_geometric_mean_rank  2.436896e-03
9    head   optimistic   adjusted_geometric_mean_rank_index  9.889857e-01
10   tail   optimistic   adjusted_geometric_mean_rank_index  9.898066e-01
11   both   optimistic   adjusted_geometric_mean_rank_index  9.894041e-01
12   head    realistic   adjusted_geom