## Evaluating my KG on OGB dataset

In [1]:
import pandas as pd
import json
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch_geometric.utils import negative_sampling
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv

from kg_model import KG_model

from ogb.linkproppred import Evaluator, PygLinkPropPredDataset

from pykeen.evaluation import RankBasedEvaluator
from pykeen.triples import CoreTriplesFactory
from pykeen.pipeline import pipeline, pipeline_from_config 

In [2]:
# Root directory handed to the OGB dataset loaders below as their `root`.
dataset_dir = '../data/dataset-ogb/'

### Load and prepare data

In [None]:
# Load ogbl-ddi from `dataset_dir`, applying ToSparseTensor as the dataset transform.
dataset = PygLinkPropPredDataset(name='ogbl-ddi', root=dataset_dir, transform=T.ToSparseTensor())
data = dataset[0]
# Bare expression: shown as the notebook cell's output.
data.adj_t

In [None]:
# Fetch the dataset's edge split and unpack its train / valid / test parts.
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
# Bare expression: shown as the notebook cell's output.
train_edge

In [None]:
# Show the shapes of the negative and positive validation edge tensors.
for edge_key in ('edge_neg', 'edge'):
    print(valid_edge[edge_key].shape)

In [None]:
def convert_to_triples_factory(data, num_entities, num_relations):
    """Wrap already-indexed triples in a PyKEEN ``CoreTriplesFactory``.

    Args:
        data: Triples passed through unchanged as the factory's first
            argument (callers in this notebook pass an integer tensor with
            head / relation / tail columns).
        num_entities: Forwarded as the factory's ``num_entities``.
        num_relations: Forwarded as the factory's ``num_relations``.

    Returns:
        The constructed ``CoreTriplesFactory``; it is also printed as a quick
        sanity check.
    """
    factory = CoreTriplesFactory(
        data,
        num_entities=num_entities,
        num_relations=num_relations,
    )
    print(factory)
    return factory

In [None]:
# add relation type - interacts with


def _edges_to_triples(edges, relation_id=0):
    """Turn an (N, 2) edge tensor into an (N, 3) head/relation/tail tensor.

    ogbl-ddi has a single edge type, so every edge gets the same relation id.
    The conversion is vectorized; the previous per-row Python loop over the
    edge tensor was the slow part of this cell.
    """
    edges = torch.as_tensor(edges, dtype=torch.long)
    relation = torch.full((edges.size(0), 1), relation_id, dtype=torch.long)
    return torch.cat([edges[:, :1], relation, edges[:, 1:2]], dim=1)


def _triples_to_frame(triples):
    """Build the head/relation/tail DataFrame used when exporting a split."""
    # Convert explicitly to NumPy so pandas receives a plain integer array.
    return pd.DataFrame(triples.numpy(), columns=['head', 'relation', 'tail'])


train = _edges_to_triples(train_edge['edge'])
train_df = _triples_to_frame(train)

valid = _edges_to_triples(valid_edge['edge'])
valid_df = _triples_to_frame(valid)

valid_neg = _edges_to_triples(valid_edge['edge_neg'])

test = _edges_to_triples(test_edge['edge'])
test_df = _triples_to_frame(test)

test_neg = _edges_to_triples(test_edge['edge_neg'])

num_entities = data.num_nodes

# Single relation type, hence num_relations == 1.
train_tf = convert_to_triples_factory(train, num_entities, 1)
valid_tf = convert_to_triples_factory(valid, num_entities, 1)
test_tf = convert_to_triples_factory(test, num_entities, 1)

In [None]:
# save dataset split to txt files
import os

# Write next to the downloaded dataset: the training config in this notebook
# reads '../data/dataset-ogb/ogbl_ddi-my_split/{train,valid,test}.txt', which is
# exactly `dataset_dir + 'ogbl_ddi-my_split/'`.
dir_data_my_split = dataset_dir + 'ogbl_ddi-my_split/'
# DataFrame.to_csv does not create missing directories.
os.makedirs(dir_data_my_split, exist_ok=True)

# Tab-separated head / relation / tail, no header and no index column.
train_df.to_csv(dir_data_my_split + 'train.txt', sep='\t', header=False, index=False)
valid_df.to_csv(dir_data_my_split + 'valid.txt', sep='\t', header=False, index=False)
test_df.to_csv(dir_data_my_split + 'test.txt', sep='\t', header=False, index=False)

### Train my KG model

In [None]:
# model_kg = KG_model('transe', train_tf, valid_tf, test_tf, 'ogb-ddi')
# model_kg.set_params(10, 'Adam', RankBasedEvaluator, 'gpu')
# print('Training...')
# model_kg.train()
# print('Training done')

In [None]:
# pipeline_result = model_kg.trained_model
# pipeline_result.plot_losses()

In [None]:
# Configuration dictionary consumed by `pipeline_from_config`: a ComplEx model
# trained on the exported ogbl-ddi split files.
config = {
    'metadata': {
        'title': 'ComplEx',
    },
    'pipeline': {
        # Split files (tab-separated head / relation / tail).
        'training': '../data/dataset-ogb/ogbl_ddi-my_split/train.txt',
        'validation': '../data/dataset-ogb/ogbl_ddi-my_split/valid.txt',
        'testing': '../data/dataset-ogb/ogbl_ddi-my_split/test.txt',
        # Model and optimisation.
        'model': 'ComplEx',
        'model_kwargs': {
            'embedding_dim': 1000,
        },
        'optimizer': 'Adam',
        'optimizer_kwargs': {'lr': 0.001},
        'loss': 'marginranking',
        'loss_kwargs': {'margin': 2.64},
        # Training loop and checkpointing.
        'training_loop': 'slcwa',
        'training_kwargs': {
            'num_epochs': 20,
            'batch_size': 512,
            'checkpoint_name': 'Complex_checkpoint-ogb-ddi.pt',
            'checkpoint_directory': 'kg_checkpoints',
            'checkpoint_frequency': 5,
        },
        'device': 'gpu',
        'negative_sampler': 'basic',
        'negative_sampler_kwargs': {'num_negs_per_pos': 94},
        # Evaluation.
        'evaluator': 'rankbased',
        'evaluator_kwargs': {'filtered': True},
        'evaluation_kwargs': {'batch_size': 64},
        # NOTE(review): no stopper `frequency` is set here; confirm that
        # patience=10 can actually trigger within num_epochs=20.
        'stopper': 'early',
        'stopper_kwargs': {
            'patience': 10,
            'relative_delta': 0.002,
        },
    },
}

In [None]:
# Run the PyKEEN pipeline described by `config`; the result object is reused by the cells below.
pipeline_result = pipeline_from_config(config)

In [None]:
# Plot the losses recorded by the pipeline run.
pipeline_result.plot_losses()

### Compute scores for given triplets

In [None]:
# compute scores for positive and negative triplets
batch_size = 512


def _score_triples(model, triples, batch_size):
    """Score an (N, 3) tensor of head/relation/tail ids in mini-batches.

    Returns a 1-D CPU tensor with one score per triple (empty for empty input).

    Differences from the previous hand-written loops:
      * iterating with ``torch.split`` never yields an empty trailing batch
        when N is a multiple of ``batch_size``;
      * ``reshape(-1)`` keeps a one-row batch 1-D (``squeeze()`` produced a
        0-d tensor that ``torch.cat`` cannot concatenate);
      * batches are moved to the model's device and scored without building
        an autograd graph.
    """
    if triples.size(0) == 0:
        return torch.empty(0)
    device = next(model.parameters()).device
    scores = []
    with torch.no_grad():
        for batch in torch.split(triples, batch_size):
            scores.append(model.score_hrt(batch.to(device)).reshape(-1).cpu())
    return torch.cat(scores, dim=0)


# NOTE(review): `train` / `valid` / `test` (and the *_neg tensors) hold the raw
# OGB node ids. The model was trained from the exported text files; confirm
# that it uses the same integer ids (PyKEEN may build its own label-to-id
# mapping when loading triples from a path).
scoring_model = pipeline_result.model

pos_train_pred = _score_triples(scoring_model, train, batch_size)

pos_valid_pred = _score_triples(scoring_model, valid, batch_size)
neg_valid_pred = _score_triples(scoring_model, valid_neg, batch_size)

pos_test_pred = _score_triples(scoring_model, test, batch_size)
neg_test_pred = _score_triples(scoring_model, test_neg, batch_size)

In [None]:
# Print the minimum and maximum score of each positive / negative score tensor.
print(f'Score ranges train: {torch.min(pos_train_pred)} - {torch.max(pos_train_pred)}')
print(f'Score ranges valid: {torch.min(pos_valid_pred)} - {torch.max(pos_valid_pred)}')
print(f'Score ranges valid neg: {torch.min(neg_valid_pred)} - {torch.max(neg_valid_pred)}')
print(f'Score ranges test: {torch.min(pos_test_pred)} - {torch.max(pos_test_pred)}')
print(f'Score ranges test neg: {torch.min(neg_test_pred)} - {torch.max(neg_test_pred)}')

### Evaluate my results

In [None]:
# Evaluate the computed scores - hits@K

evaluator = Evaluator(name='ogbl-ddi')

# (positive scores, negative scores) per reported split, in train / valid /
# test order. The train positives are ranked against the validation negatives.
score_pairs = (
    (pos_train_pred, neg_valid_pred),
    (pos_valid_pred, neg_valid_pred),
    (pos_test_pred, neg_test_pred),
)

results = {}
for K in [10, 20, 30]:
    evaluator.K = K
    metric_key = f'hits@{K}'
    train_hits, valid_hits, test_hits = (
        evaluator.eval({'y_pred_pos': pos_scores, 'y_pred_neg': neg_scores})[metric_key]
        for pos_scores, neg_scores in score_pairs
    )
    results[f'Hits@{K}'] = (train_hits, valid_hits, test_hits)

# Report every cut-off as percentages.
for hits, result in results.items():
    train_hits, valid_hits, test_hits = result
    print(hits)
    print(f'Train: {100 * train_hits:.2f}%')
    print(f'Valid: {100 * valid_hits:.2f}%')
    print(f'Test: {100 * test_hits:.2f}%')


In [None]:
# `model_kg` is never created in this notebook (the cell that builds it is
# commented out), so read the metrics from the pipeline result produced by
# `pipeline_from_config(config)` instead.
print(pipeline_result.get_metric('hits@1'))
print(pipeline_result.get_metric('hits@5'))
print(pipeline_result.get_metric('hits@10'))

In [None]:
# `model_kg` is never created in this notebook (the cell that builds it is
# commented out); the trained pipeline result is `pipeline_result`.
print(pipeline_result.get_metric('mrr'))

### BioKG dataset

In [None]:
# Load ogbl-biokg from `dataset_dir`; this rebinds `dataset` and `data` used by the DDI cells.
dataset = PygLinkPropPredDataset(name='ogbl-biokg', root=dataset_dir, transform=T.ToSparseTensor())
data = dataset[0]
# Bare expression: shown as the notebook cell's output.
data

In [None]:
# Fetch the BioKG edge split and unpack its train / valid / test parts.
split_edge = dataset.get_edge_split()
train_triples, valid_triples, test_triples = split_edge["train"], split_edge["valid"], split_edge["test"]

In [None]:
# Number of relation types = largest relation id in the training split + 1.
# `.max()` reduces the array natively instead of iterating it element by
# element through the built-in `max`.
int(train_triples['relation'].max()) + 1

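Each split dictionary (`train_triples`, and likewise `valid_triples` / `test_triples`) contains:
* head type (e.g. disease)
* head - tensor
* relation - tensor
* tail type (e.g. protein)
* tail - tensor
* head_neg / tail_neg - tensors of negative candidates, read by the evaluation code below (presumably present only in the valid/test splits)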


In [None]:
# Build head / relation / tail DataFrames for the three BioKG splits.
# NOTE(review): only 'head', 'relation' and 'tail' are read here; the split's
# head/tail *type* fields are ignored. If the head/tail ids are local to their
# node type, entities of different types will collide - confirm against the
# OGB ogbl-biokg documentation.
# relation_name_id = data['edge_index_dict'].keys()
head = train_triples['head']
relation = train_triples['relation']
tail = train_triples['tail']
train_df = pd.DataFrame({'head': head, 'relation': relation, 'tail': tail})
# Quick look at the first training rows.
print(train_df.head())

head = valid_triples['head']
relation = valid_triples['relation']
tail = valid_triples['tail']
valid_df = pd.DataFrame({'head': head, 'relation': relation, 'tail': tail})


head = test_triples['head']
relation = test_triples['relation']
tail = test_triples['tail']
test_df = pd.DataFrame({'head': head, 'relation': relation, 'tail': tail})


In [None]:
# Total entity count: sum of the per-node-type counts of the BioKG graph.
num_entities = sum(dataset[0]['num_nodes_dict'].values())
# Relation ids start at 0, so the count is the largest training id + 1.
# `.max()` reduces the array natively instead of iterating it element by
# element through the built-in `max`.
num_relations = int(train_triples['relation'].max()) + 1

In [None]:
# Wrap each BioKG split in a triples factory (rebinds the factories built for DDI above).
train_tf = convert_to_triples_factory(torch.tensor(train_df.values), num_entities, num_relations)
valid_tf = convert_to_triples_factory(torch.tensor(valid_df.values), num_entities, num_relations)
test_tf = convert_to_triples_factory(torch.tensor(test_df.values), num_entities, num_relations)

In [None]:
import os

# Export the BioKG split next to the downloaded dataset.
dir_data_my_split = dataset_dir + 'ogbl_biokg-my_split/'
# DataFrame.to_csv does not create missing directories.
os.makedirs(dir_data_my_split, exist_ok=True)

# Tab-separated head / relation / tail, no header and no index column.
train_df.to_csv(dir_data_my_split + 'train.txt', sep='\t', header=False, index=False)
valid_df.to_csv(dir_data_my_split + 'valid.txt', sep='\t', header=False, index=False)
test_df.to_csv(dir_data_my_split + 'test.txt', sep='\t', header=False, index=False)

In [5]:
model_name = 'ComplEx'
specification = 'ogb-biokg'

# The result is stored in a variable that is also called `pipeline` (later
# cells read it under that name), which overwrites the function imported at the
# top of the notebook. Importing the function under an alias here keeps this
# cell re-runnable after the name has been rebound.
from pykeen.pipeline import pipeline as run_pipeline

# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error
# raised while only a few hundred MiB of the GPU were free; confirm the GPU is
# not occupied by another process (or reduce `embedding_dim`) before re-running.
pipeline = run_pipeline(
    training='../data/dataset-ogb/ogbl_biokg-my_split/train.txt',
    validation='../data/dataset-ogb/ogbl_biokg-my_split/valid.txt',
    testing='../data/dataset-ogb/ogbl_biokg-my_split/test.txt',
    model=model_name,
    model_kwargs=dict(
        embedding_dim=1000,
    ),
    loss='MarginRankingLoss',
    optimizer='Adam',
    optimizer_kwargs=dict(
        lr=0.001,
    ),
    evaluator='rankbased',
    device='gpu',
    training_kwargs=dict(
        batch_size=100,
        num_epochs=5,
        # Resolves to 'ComplEx-ogb-biokg_checkpoint.pt'.
        checkpoint_name=model_name + '-' + specification + '_checkpoint.pt',
        checkpoint_directory='kg_checkpoints',
    ),
)

INFO:pykeen.pipeline.api:=> no training loop checkpoint file found at 'kg_checkpoints/ComplEx-ogb-biokg_checkpoint.pt'. Creating a new file.
INFO:pykeen.pipeline.api:Using device: gpu


OutOfMemoryError: CUDA out of memory. Tried to allocate 344.00 MiB (GPU 0; 7.79 GiB total capacity; 360.64 MiB already allocated; 261.25 MiB free; 386.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# `pipeline` is the result object assigned by the BioKG training cell, not the PyKEEN function.
pipeline.plot_losses()

In [None]:
# Show the pipeline's metric results as a DataFrame (notebook cell output).
pipeline.metric_results.to_df()

In [None]:
# Show the 'mrr' metric of the BioKG pipeline result (notebook cell output).
pipeline.get_metric('mrr')

In [None]:
from collections import defaultdict

def _score_in_chunks(model, triples, chunk_size=16384):
    """Score an (N, 3) tensor of head/relation/tail ids; return (N,) scores on CPU.

    Scoring happens in bounded chunks on the model's own device so that a large
    set of corrupted triples does not have to fit on the device at once.
    """
    device = next(model.parameters()).device
    parts = [
        model.score_hrt(chunk.to(device)).reshape(-1).cpu()
        for chunk in torch.split(triples, chunk_size)
    ]
    return torch.cat(parts, dim=0)


def test_step(trained_model, test_triplets, num_entities, num_relations):
    """Evaluate a trained pipeline result with the OGB ``ogbl-biokg`` evaluator.

    Every positive triple is ranked against its head-corrupted candidates
    (``head_neg``) and, separately, against its tail-corrupted candidates
    (``tail_neg``); the positive triple's own relation is kept in both cases.

    Args:
        trained_model: Object exposing the trained model as ``.model`` (a
            module with a ``score_hrt`` method).
        test_triplets: Split dictionary with ``'head'``, ``'relation'``,
            ``'tail'`` (one entry per triple) and ``'head_neg'``,
            ``'tail_neg'`` (one row of candidates per triple).
        num_entities: Unused; kept so existing calls keep working.
        num_relations: Unused; kept so existing calls keep working.

    Returns:
        Dict mapping each metric name returned by the evaluator to its mean
        over all evaluated rankings (empty if the split has no triples).

    NOTE(review): the ids in ``test_triplets`` are passed to the model as-is;
    confirm they live in the same id space the model was trained with.
    """
    evaluator = Evaluator(name='ogbl-biokg')
    batch_size = 512
    model = trained_model.model
    test_logs = defaultdict(list)

    # Read everything from the split that was passed in (the previous version
    # read the global `test_triples` regardless of the argument).
    heads = torch.as_tensor(test_triplets['head'], dtype=torch.long)
    relations = torch.as_tensor(test_triplets['relation'], dtype=torch.long)
    tails = torch.as_tensor(test_triplets['tail'], dtype=torch.long)
    head_negs = torch.as_tensor(test_triplets['head_neg'], dtype=torch.long)
    tail_negs = torch.as_tensor(test_triplets['tail_neg'], dtype=torch.long)
    num_triples = heads.size(0)

    with torch.no_grad():
        # Stepping by batch_size never produces an empty trailing batch.
        for start in range(0, num_triples, batch_size):
            stop = start + batch_size
            head = heads[start:stop]
            relation = relations[start:stop]
            tail = tails[start:stop]

            positive = torch.stack([head, relation, tail], dim=1)
            pos_score = _score_in_chunks(model, positive)

            corruptions = (
                ('head', head_negs[start:stop]),
                ('tail', tail_negs[start:stop]),
            )
            for side, candidates in corruptions:
                num_neg = candidates.size(1)
                rel_rep = relation.unsqueeze(1).expand(-1, num_neg)
                if side == 'head':
                    head_rep = candidates
                    tail_rep = tail.unsqueeze(1).expand(-1, num_neg)
                else:
                    head_rep = head.unsqueeze(1).expand(-1, num_neg)
                    tail_rep = candidates

                # (batch, num_neg, 3) -> (batch * num_neg, 3), the layout score_hrt is given.
                negative = torch.stack([head_rep, rel_rep, tail_rep], dim=2).reshape(-1, 3)
                neg_score = _score_in_chunks(model, negative).view(-1, num_neg)

                batch_results = evaluator.eval({'y_pred_pos': pos_score,
                                                'y_pred_neg': neg_score})
                for metric in batch_results:
                    test_logs[metric].append(batch_results[metric])

    metrics = {}
    for metric in test_logs:
        metrics[metric] = torch.cat(test_logs[metric]).mean().item()

    return metrics

In [None]:
# `pipeline` is the BioKG pipeline result; the validation split is passed as the triples to evaluate.
metrics = test_step(pipeline, valid_triples, num_entities, num_relations)

In [None]:
# Bare expression: shows the metrics dict as the notebook cell's output.
metrics