In [1]:
import pandas as pd
import itertools
import numpy as np

In [42]:
prediction_dir = '../predictions/'
triplets_dir = '../data/triplets/'
drugbank_dir = '../data/drugbank/'
specification = 'best_pipeline2.2-200epochs'
data = '/drugbank/'
food_compounds_names_path = '../data/triplets/compounds_names.tsv'

### Create id - name mappings

In [3]:
drug_name_map = pd.read_csv(drugbank_dir + 'drug_id_name_map.csv', sep=',', index_col=[0])
food_name_map = pd.read_csv(triplets_dir + 'food_name.tsv', sep='\t', index_col=[0])
food_compound_map = pd.read_csv(triplets_dir + 'compounds_names.tsv', sep='\t', index_col=[0])

In [4]:
drug_ids = drug_name_map.id
drug_names = drug_name_map.drug_name
drug_id_map_dict = dict(zip(drug_ids, drug_names))

food_ids = food_name_map.public_id
food_names = food_name_map.name
food_id_map_dict = dict(zip(food_ids, food_names))

compound_ids = food_compound_map.compound_id
compound_names = food_compound_map.name
compound_id_map_dict = dict(zip(compound_ids, compound_names))

### Load drugs and foods 

In [5]:
common_drugs = pd.read_csv('../data/common_drugs_num_interactions.csv', sep=';')
common_drugs = common_drugs.dropna()
common_drugs_ids = common_drugs.db_id.values

# print(common_drugs)
print(common_drugs_ids)

['DB00321' 'DB00091' 'DB00564' ... 'DB11237' 'DB00878' 'DB00768']


In [34]:
with open('../data/foods4predictions.txt', 'r') as f:
    foods = f.readlines()

foods = [food.strip() for food in foods]
# foods[:10]

### Load predictions for specific drug/food

In [35]:
def get_predictions(prediction_file):
    try:
        predictions = pd.read_csv(prediction_file, sep=',', index_col=[0])
    except:
        return None
    # keep just drug/food/food compound predictions
    predictions['node_type'] = list(itertools.repeat('xxx', predictions.shape[0]))
    predictions.loc[predictions['tail_label'].str.contains("DB\d+", regex=True), 'node_type'] = "drug"
    predictions.loc[predictions['tail_label'].str.contains("FDB"), 'node_type'] = "food_compound"
    predictions.loc[predictions['tail_label'].str.contains("FOOD"), 'node_type'] = "food"
    predictions = predictions.loc[predictions['node_type'] != 'xxx']
    
    return predictions

# assign entity names to ids
def assign_names(predictions):
    for row in predictions.iterrows():
        tail = row[1].tail_label
        node_type = row[1].node_type

        if node_type == 'drug':
            tail_name = drug_id_map_dict[tail]
        elif node_type == 'food':
            tail_name = food_id_map_dict[tail]
        else:
            tail_name = compound_id_map_dict[tail]

        print(tail, tail_name)    

In [43]:
# drug predictions
drug_id = common_drugs_ids[10]
prediction_file = prediction_dir + specification + data +'complex_' + drug_id + '_interacts_' + specification + '.csv'

print('Interactions with', drug_id_map_dict[drug_id])

predictions = get_predictions(prediction_file)
print(predictions)
print()
assign_names(predictions)

Interactions with Zonisamide
      tail_id      score tail_label node_type
3532     3532  11.472713    DB06212      drug
3048     3048  11.335865    DB01144      drug
2668     2668  11.303630    DB00742      drug
3240     3240  10.999071    DB01396      drug
4043     4043  10.925892    DB09407      drug
...       ...        ...        ...       ...
3608     3608   9.562111    DB06700      drug
3603     3603   9.555648    DB06694      drug
2215     2215   9.550627    DB00270      drug
3887     3887   9.543069    DB09111      drug
3104     3104   9.542161    DB01201      drug

[100 rows x 4 columns]

DB06212 Tolvaptan
DB01144 Diclofenamide
DB00742 Mannitol
DB01396 Digitoxin
DB09407 Magnesium chloride
DB00955 Netilmicin
DB00220 Nelfinavir
DB14500 Potassium
DB06283 Ziconotide
DB01026 Ketoconazole
DB01211 Clarithromycin
DB13967 Patent Blue
DB00421 Spironolactone
DB00390 Digoxin
DB11901 Apalutamide
DB09401 Isosorbide
DB00185 Cevimeline
DB01111 Colistimethate
DB00932 Tipranavir
DB09015 Canren

In [None]:
predictions.in_testing.sum()

In [40]:
# food predicitons
food_id = foods[11]
prediction_file = prediction_dir + specification + data +'complex_' + food_id + '_interacts_' + specification + '.csv'

print('Interactions with', compound_id_map_dict[food_id])

predictions = get_predictions(prediction_file)
print(predictions)
print()
assign_names(predictions)

Interactions with Pterostilbene
      tail_id     score tail_label      node_type
7414     7414  0.444300  FOOD00394           food
2965     2965  0.443989    DB01057           drug
4887     4887  0.409787    DB15270           drug
4954     4954  0.375224    DB16746           drug
4146     4146  0.350639    DB10966           drug
4115     4115  0.317418    DB10370           drug
6981     6981  0.317366  FDB012521  food_compound
3892     3892  0.306063    DB09116           drug
4299     4299  0.302085    DB11575           drug
4838     4838  0.290141    DB14712           drug

FOOD00394 Lambsquarters
DB01057 Echothiophate
DB15270 Efgartigimod alfa
DB16746 Elivaldogene autotemcel
DB10966 Sorghum bicolor subsp. drummondii pollen
DB10370 Festuca pratensis pollen
FDB012521 Campesterol
DB09116 Calcium carbimide
DB11575 Grazoprevir
DB14712 Elapegademase


In [46]:
# # check if the predicted drug/food is in different interaction with the same drug in the training data

# idx = drug_id
# snd_idx = 'DB11166'

# train_triplets = pd.read_csv(triplets_dir + 'train_with_biokg.tsv', sep='\t', index_col=[0])
# valid_triplets = pd.read_csv(triplets_dir + 'valid_with_biokg.tsv', sep='\t', index_col=[0])
# test_triplets = pd.read_csv(triplets_dir + 'test_with_biokg.tsv', sep='\t', index_col=[0])

# filtered_triplets = train_triplets.loc[train_triplets.index == idx]
# in_train = filtered_triplets.loc[filtered_triplets['tail'] == snd_idx].any().sum()

# filtered_triplets = valid_triplets.loc[valid_triplets.index == idx]
# in_valid = filtered_triplets.loc[filtered_triplets['tail'] == snd_idx].any().sum()

# filtered_triplets = test_triplets.loc[test_triplets.index == idx]
# in_test = filtered_triplets.loc[filtered_triplets['tail'] == snd_idx].any().sum()

# print(f'Relation in triplets:')
# print(f'- train:', 'yes' if in_train else 'no')
# print(f'- valid:', 'yes' if in_valid else 'no')
# print(f'- test:', 'yes' if in_test else 'no')
      

### Metrics

In [20]:
# hits@k (how many predicted triplets in the first k positions are in test data)

hits10 = []
hits20 = []
hits30 = []
hits100 = []

for drug in common_drugs_ids:
    prediction_file = prediction_dir + specification + '/complex_' + drug + '_negative_' + specification + '.csv'
    preds = get_predictions(prediction_file)
    
    if preds is None:
        continue
        
    hits10.append(preds.in_testing[:10].sum())
    hits20.append(preds.in_testing[:20].sum())
    hits30.append(preds.in_testing[:30].sum())
    hits100.append(preds.in_testing[:100].sum())
    

print('Avg. hits@10:', np.mean(hits10))
print('Avg. hits@20:', np.mean(hits20))
print('Avg. hits@30:', np.mean(hits30))
print('Avg. hits@100:', np.mean(hits100))

Avg. hits@10: nan
Avg. hits@20: nan
Avg. hits@30: nan
Avg. hits@100: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [21]:
# MRR (https://en.wikipedia.org/wiki/Mean_reciprocal_rank)
# if the predicted triplet isn't in the first 100 test data -> index = 100

mrr = []

for drug in common_drugs_ids:
    prediction_file = prediction_dir + specification + '/complex_' + drug + '_negative_' + specification + '.csv'
    preds = get_predictions(prediction_file)
    
    if preds is None:
        continue
    
    try:
        idx = 1 / (list(preds.in_testing).index(True) + 1)
    except:
        idx = 1 / 100
    mrr.append(idx)
    
print('MRR:', np.mean(mrr))

MRR: nan
