In [4]:
import pandas as pd
from kge.model import KgeModel
from kge.util.io import load_checkpoint
from dotenv import dotenv_values

In [37]:
# Paths come from the local .env file so nothing machine-specific is hard-coded here.
env = dotenv_values()
libkge_path = env['LIBKGE_PATH']
selfloops_data_path = libkge_path + '/data/selfloops'


def _load_name_to_id(path):
    """Read a two-column, tab-separated ``<id> <name>`` file into a dict.

    Args:
        path: location of the ``.del`` file to read.

    Returns:
        dict mapping each name (second column) to its id (first column) as int.

    Raises:
        ValueError: if a non-empty line does not split into exactly two
            tab-separated fields, or if the id field is not an integer.
    """
    name_to_id = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            raw_id, name = line.split('\t')
            name_to_id[name] = int(raw_id)
    return name_to_id


# Create index dicts for relation and entity names
rel_name_to_id = _load_name_to_id(f'{selfloops_data_path}/relation_ids.del')
ent_name_to_id = _load_name_to_id(f'{selfloops_data_path}/entity_ids.del')

# Load embeds: rebuild the model from the checkpoint and pull the two
# embedding weight matrices out of its state dict.
checkpoint = load_checkpoint('../3_minimising_training/experiments/20240603-094003-RDK_n10_psi_0.01/checkpoint_00002/checkpoint_00002.pt')
embeds = KgeModel.create_from(checkpoint).state_dict()
rel_embeds = embeds['_relation_embedder._embeddings.weight']
ent_embeds = embeds['_entity_embedder._embeddings.weight']

Loading configuration of dataset selfloops from /Users/fu19841/Documents/thesis/kge/data/selfloops ...
Found psi value: 0.01
Created sparse neighbour adjacency of shape: torch.Size([19734, 19734])
Found psi value: 0.01
Created sparse neighbour adjacency of shape: torch.Size([19734, 19734])




In [28]:
# Relation under study: its name, its integer id, and its learned embedding row.
abdominal_rel = 'C0000731'
abdominal_id = rel_name_to_id[abdominal_rel]
abdominal_vec = rel_embeds[abdominal_id]

# Print the shape, mean and standard deviation, then the raw vector itself.
for summary in (
    abdominal_vec.shape,
    abdominal_vec.mean(),
    abdominal_vec.std(),
    abdominal_vec,
):
    print(summary)

torch.Size([256])
tensor(0.0374)
tensor(0.5251)
tensor([-4.6859e-01,  3.0182e-01, -1.3695e-01, -4.4108e-01, -3.5706e-01,
        -8.2026e-01, -1.3660e-01,  6.2356e-01,  3.5136e-01, -3.8047e-01,
        -1.2287e-02, -1.1036e-01, -3.0345e-01,  4.9830e-02,  1.3788e+00,
         1.2056e-01,  4.9325e-01,  7.5324e-01, -4.0527e-01, -4.2192e-02,
        -3.5734e-01, -2.4516e-01,  6.5433e-02,  4.0608e-01, -4.4806e-01,
         1.3571e-01,  4.9731e-01,  8.7099e-01, -3.6035e-01, -4.2999e-01,
         5.9221e-01, -7.6736e-01, -1.9323e-01, -5.3344e-01, -3.9821e-01,
        -6.7146e-01,  1.0868e-01,  1.7251e-01,  2.1471e-01,  2.8824e-01,
         8.7110e-01, -1.0096e-01,  8.3933e-03,  8.6110e-01, -4.7824e-01,
         2.5608e-01,  1.4841e-01, -5.7521e-02,  8.4381e-01, -2.2492e-01,
         9.0598e-01, -7.8605e-01, -1.1085e+00,  5.1028e-01,  7.6411e-01,
         6.5879e-01, -2.5337e-01,  3.2712e-01,  7.8576e-01,  1.0580e+00,
         5.1987e-01, -5.1474e-01, -5.3540e-01,  1.0487e-01, -8.3258e-02,
   

In [55]:
# Load polypharmacy holdout and keep only the edges of the relation under study.
holdout_all = pd.read_csv('../../data/selfloops/holdout.tsv', sep='\t', header=None)
holdout_all.columns = ['h', 'r', 't']

# Boolean mask instead of an f-string-built query(): no quoting issues if the
# relation name ever contains a quote character, and no in-place mutation.
# .copy() makes `holdout` an independent frame so columns can be added to it later.
# The original row labels are kept on purpose (no reset_index here).
holdout = holdout_all.loc[holdout_all['r'] == abdominal_rel].copy()

# Every node that appears as head or tail of a retained edge.
holdout_nodes = set(holdout['h']) | set(holdout['t'])
print(len(holdout_nodes))
holdout

433


Unnamed: 0,h,r,t
0,CID000003937,C0000731,CID006398970
1,CID000004829,C0000731,CID000056959
2,CID000005291,C0000731,CID000060831
3,CID000001971,C0000731,CID000003883
4,CID000004900,C0000731,CID000060835
...,...,...,...
1232,CID000003386,C0000731,CID000005402
1233,CID000004679,C0000731,CID000060871
1234,CID000002554,C0000731,CID000004927
1235,CID000005029,C0000731,CID000060852


In [54]:
# Load false holdout edges: one TSV per relation, stored under THESIS_PATH.
fake_holdout_path = env['THESIS_PATH'] + '/Chapter3/analysis/assessment/false_edges'
false_holdout = pd.read_csv(
    f'{fake_holdout_path}/{abdominal_rel}.tsv',
    sep='\t',
    header=None,
)
false_holdout.columns = ['h', 'r', 't']

# Every node that appears as head or tail of a false edge.
false_heads = set(false_holdout['h'].unique())
false_tails = set(false_holdout['t'].unique())
false_holdout_nodes = false_heads | false_tails
print(len(false_holdout_nodes))
false_holdout

632


Unnamed: 0,h,r,t
0,CID000077993,C0000731,CID000004594
1,CID000002522,C0000731,CID000005878
2,CID000163742,C0000731,CID000125889
3,CID000004411,C0000731,CID000003825
4,CID000002512,C0000731,CID000005372
...,...,...,...
1232,CID000056339,C0000731,CID000001983
1233,CID001349907,C0000731,CID000005650
1234,CID000060787,C0000731,CID000002610
1235,CID000002656,C0000731,CID000034312


# 1. Check for overlapping edges

In [56]:
# Stack real and false edges and count how many (h, r, t) triples occur more than once.
# Duplicates are judged on the triple only, so the check still works when this cell
# is re-run after extra columns (e.g. the is_real label added further down) exist.
merged_holdout = pd.concat([holdout, false_holdout])
start_len = len(merged_holdout)
merged_holdout = merged_holdout.drop_duplicates(subset=['h', 'r', 't'])
print(f'After dropping duplicates, lost {start_len - len(merged_holdout)} edges')

After dropping duplicates, lost 0 edges


# 2. Check scores being produced

In [70]:
from kge.model.simple import SimplEScorer
from sklearn.metrics import roc_auc_score, average_precision_score

# Scorer built from the checkpoint's config; used by score_row below.
simple = SimplEScorer(checkpoint['config'], 'selfloops')


def score_row(row):
    """Score one edge against the relation embedding ``abdominal_vec``.

    Args:
        row: object exposing ``h`` and ``t`` attributes (e.g. a DataFrame row)
            whose values are entity names present in ``ent_name_to_id``.

    Returns:
        The scalar produced by ``simple.score_emb`` for the head, relation and
        tail embeddings with ``combine='spo'``, converted with ``.item()``.

    Raises:
        KeyError: if ``row.h`` or ``row.t`` is not a key of ``ent_name_to_id``.
    """
    h_vec = ent_embeds[ent_name_to_id[row.h]]
    t_vec = ent_embeds[ent_name_to_id[row.t]]
    # reshape(1, -1) turns each vector into a batch of one without hard-coding
    # the embedding dimension (previously fixed at 256).
    return simple.score_emb(
        h_vec.reshape(1, -1),
        abdominal_vec.reshape(1, -1),
        t_vec.reshape(1, -1),
        combine='spo',
    ).item()

# Turn off pandas' chained-assignment check for the whole session.
# NOTE(review): cells below assign scores via chained indexing
# (`df['score'].loc[i] = ...`); with this option set to None pandas stays
# silent about that pattern, so mistakes in those assignments are easy to miss.
pd.options.mode.chained_assignment = None


In [82]:
# Deliberately buggy version, kept to reproduce the problem.
# pd.concat keeps each input frame's own row labels, so the merged frame
# contains repeated index labels (visible in the printed output, where e.g.
# label 478 appears on both a real and a false edge).
holdout['is_real'] = 1
false_holdout['is_real'] = 0
merged_holdout = pd.concat([holdout, false_holdout])  # index NOT reset -> duplicate row labels
merged_holdout['score'] = None
for i, row in merged_holdout.iterrows():
    # .loc[i] selects by label. When label i occurs twice, this assignment
    # writes the same value to both rows, so a real edge and a false edge that
    # share a label end up with identical scores (see the paired rows in the output).
    merged_holdout['score'].loc[i] = score_row(row)

# Rank edges from highest to lowest score and save the ranking.
merged_holdout.sort_values('score', ascending=False, inplace=True)
merged_holdout.to_csv('C731_psi_1e-2_epoch2.csv', index=False)
print(merged_holdout)

# Average precision of the scores against the real/false labels.
# With the paired identical scores above this comes out at ~0.5.
labels = merged_holdout.is_real.values
preds = merged_holdout['score'].values
prc = average_precision_score(labels, preds)

print('AUPRC = ', prc)

                 h         r             t  is_real      score
478   CID000002578  C0000731  CID000004594        1    1.69134
478   CID000004679  C0000731  CID000054688        0    1.69134
1030  CID000003365  C0000731  CID000003929        1   1.563685
1030  CID000003403  C0000731  CID000083786        0   1.563685
216   CID000003658  C0000731  CID005487301        0   1.467639
...            ...       ...           ...      ...        ...
314   CID005473385  C0000731  CID000002673        0 -35.707001
403   CID000003494  C0000731  CID000005482        0 -36.472023
403   CID000003899  C0000731  CID000004829        1 -36.472023
540   CID000002675  C0000731  CID000002232        0 -41.089203
540   CID000002088  C0000731  CID000003148        1 -41.089203

[2474 rows x 5 columns]
AUPRC =  0.49999999999999994


# Now repeat the exact same code, but reset the index after merging:

In [83]:
# Label real and false edges, then stack them into a single frame.
holdout['is_real'] = 1
false_holdout['is_real'] = 0

# THE FIX: reset the index after merging so every row has a unique label.
# Without it the two input frames contribute overlapping labels and
# label-based score assignment hits two rows at once.
merged_holdout = pd.concat([holdout, false_holdout]).reset_index(drop=True)
assert merged_holdout.index.is_unique, 'merged frame must have unique row labels'

# Score every edge in one pass. This replaces the chained
# `merged_holdout['score'].loc[i] = ...` assignment, which writes to a temporary
# object under pandas Copy-on-Write, and yields a float column rather than
# an object column initialised with None.
merged_holdout['score'] = merged_holdout.apply(score_row, axis=1)

# Rank edges from highest to lowest score and save the ranking.
merged_holdout = merged_holdout.sort_values('score', ascending=False)
merged_holdout.to_csv('C731_psi_1e-2_epoch2_fixed.csv', index=False)
print(merged_holdout)

# Average precision of the scores against the real/false labels.
labels = merged_holdout.is_real.values
preds = merged_holdout['score'].values
prc = average_precision_score(labels, preds)

print('AUPRC = ', prc)

                 h         r             t  is_real      score
306   CID000003419  C0000731  CID000005090        1   2.725994
612   CID000003365  C0000731  CID000005379        1   2.385819
85    CID000003957  C0000731  CID000004635        1   2.365933
392   CID000002666  C0000731  CID000005203        1   2.356106
919   CID000002083  C0000731  CID000005523        1   2.352815
...            ...       ...           ...      ...        ...
1668  CID000005482  C0000731  CID000004893        0 -32.934185
2332  CID000051263  C0000731  CID003086258        0 -33.108044
1551  CID005473385  C0000731  CID000002673        0 -35.707001
1640  CID000003494  C0000731  CID000005482        0 -36.472023
1777  CID000002675  C0000731  CID000002232        0 -41.089203

[2474 rows x 5 columns]
AUPRC =  0.9580013310271165
