In [1]:
import sys 
sys.path.append("..")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch, torch_geometric
from models import training_utils, prediction_utils, exp_utils, mlp_baseline
from exploration import explor_utils

In [2]:
data_folder = "/biodata/nyanovsky/datasets/dti/processed/v2/"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

node_df = pd.read_csv(data_folder+"dti_tensor_df.csv",index_col=0)
#load data
datasets, node_map = training_utils.load_data(data_folder,load_inverted_map=False,load_test=True)

train_set, val_set, test_set = datasets

gene_feature_dict = training_utils.load_feature_dict(data_folder+"prot_features_64.txt", data_folder+"prot_features_ids.txt", 
                                                    node_df, "gene")

In [3]:
import yaml
configs_folder = "/biodata/nyanovsky/datasets/dti/best_models/"
with open(configs_folder+"sage_config.yaml","r") as file:
    sage_config = yaml.safe_load(file)
train_params = sage_config["train"]
gral_params = sage_config["gral"]

In [4]:
gral_params

{'L2_norm': False,
 'batch_norm': False,
 'dropout': 0.0,
 'hidden_channels': 32,
 'layer_connectivity': 'False',
 'macro_aggregation': 'sum',
 'msg_passing_layers': 4,
 'normalize_output': False,
 'post_process_layers': 0,
 'pre_process_layers': 0}

In [5]:
train, val, test = exp_utils.init_features(train_set,val_set,test_set, train_params, gene_feature_dict)

  data_object[nodetype].x[tensor_idxs] = nodetype_embs


In [23]:
mlp = mlp_baseline.MLP_model(params={"hidden_channels":32, "batch_norm":False, "dropout":0.0, "post_process_layers":4},
                             metadata=train_set.metadata(), sup_type=[("gene","chg","chem")])

In [24]:
versiones = ["v2","v_test/seed_0","v_test/seed_1", "v_test/seed_2", "v_test/seed_3", "v_test/seed_4",
             "v_test/seed_5", "v_test/seed_6", "v_test/seed_7", "v_test/seed_8"]

In [19]:
full_set = torch.load(data_folder+"dti_full_dataset.pt")
negative_sampler = training_utils.NegativeSampler(full_set,("gene","chg","chem"),full_set["gene"]["degree_chg"],full_set["chem"]["degree_chg"])

In [26]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score

In [38]:
perf = {"auc":[], "acc":[], "ap":[],"precision":[], "recall":[]}
for version in versiones:
    data_folder = f"/biodata/nyanovsky/datasets/dti/processed/{version}/"
    dataset, node_map = training_utils.load_data(data_folder,load_inverted_map=False,load_test=True)
    node_df = pd.read_csv(data_folder+"dti_tensor_df.csv",index_col=0)
    full_set = torch.load(data_folder+"dti_full_dataset.pt")
    negative_sampler = training_utils.NegativeSampler(full_set,("gene","chg","chem"),full_set["gene"]["degree_chg"],full_set["chem"]["degree_chg"])
    train, val, test = exp_utils.init_features(dataset[0], dataset[1], dataset[2], train_params, gene_feature_dict)
    model = exp_utils.train_model(mlp, train_params, train, val, negative_sampler)[0]
    model = model.to("cpu")
    encodings = model.encoder(test.x_dict)

    model.eval()
    with torch.no_grad():
        predictor = prediction_utils.Predictor(node_df,encodings)
        preds = predictor.predict_supervision_edges(test,("gene","chg","chem"))
        y_true = preds.label.values
        y_score = preds.score.values
        y_pred_labels = preds.score.values.round()

        auc = roc_auc_score(y_true,y_score)
        acc = accuracy_score(y_true,y_pred_labels) 
        ap = average_precision_score(y_true,y_score) 
        precision = precision_score(y_true,y_pred_labels) 
        recall = recall_score(y_true,y_pred_labels) 

        perf["auc"].append(auc)
        perf["acc"].append(acc)
        perf["ap"].append(ap)
        perf["precision"].append(precision)
        perf["recall"].append(recall)
    print(version + " done\n")

v2 done

v_test/seed_0 done

v_test/seed_1 done

v_test/seed_2 done

v_test/seed_3 done

v_test/seed_4 done

v_test/seed_5 done

v_test/seed_6 done

v_test/seed_7 done

v_test/seed_8 done



In [39]:
for key in perf.keys():
    print(f"mean {key} = {np.mean(perf[key])} $\pm$ {np.std(perf[key])}")

mean auc = 0.5756983634258861 $\pm$ 0.009152883052088586
mean acc = 0.5542589215941651 $\pm$ 0.007088625182493206
mean ap = 0.5749191926584549 $\pm$ 0.007135015422395094
mean precision = 0.561170433011522 $\pm$ 0.009441659393076358
mean recall = 0.5031518624641833 $\pm$ 0.05523539298195471
