### Baseline models

Two models, with no graph structure
1. Simple logistic regression. Following Otto's (?) suggestion. How does very simple model perform?

ans: better than graph transformer. lol

2. MLP classifier. Classifier model, no graph structure / network info

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, f1_score, precision_score, recall_score

import torch.nn as nn
import torch
import pandas as pd
import numpy as np

import torch_geometric
from torch_geometric.data import Data
import h5py
from torch_geometric.utils import add_self_loops

import sys
sys.path.append('../')



***
**SMG data**

In [2]:
def load_h5_graph(PATH, LABEL_PATH, ppi, seed=42):
    f = h5py.File(f'{PATH}/{ppi}_multiomics.h5', 'r')
    # Build edge indices from the network matrix
    network = f['network'][:]
    src, dst = np.nonzero(network)
    edge_index = torch.tensor(np.vstack((src, dst)), dtype=torch.long)

    # Load node features and assign a node "name" attribute if desired
    features = f['features'][:]
    x = torch.from_numpy(features)
    num_nodes = x.size(0)
    node_name = f['gene_names'][...,-1].astype(str)

    # Retrieve gene names and create a mapping: gene name -> node index
    gene_name = f['gene_names'][...,-1].astype(str)
    gene_map = {g: i for i, g in enumerate(gene_name)}  # gene name -> node index

    # Originally, the code combined several label arrays but then reads a health.tsv.
    # Here we read the health.tsv file and extract the symbols.
    # Ensure that PATH is defined in your environment.
    label_df = pd.read_csv(LABEL_PATH, sep='\t').astype(str) # TODO fix this for druggable gene prediction
    label_symbols = label_df['symbol'].tolist()

    # Determine positive nodes: indices that appear in both the health.tsv and gene_name list
    mask = [gene_map[g] for g in sorted(list(set(label_symbols) & set(gene_name)))]

    # Randomly select negative samples from those nodes not in the positive mask.
    np.random.seed(seed)
    all_indices = set(range(len(gene_name)))
    negative_candidates = sorted(list(all_indices - set(mask)))
    neg_sample_size = min(len(mask), len(gene_name) - len(mask))
    neg_mask = np.random.choice(negative_candidates, size=neg_sample_size, replace=False).tolist()

    print("Negative mask indices:", neg_mask)

    # Create a label vector (1 for positive, 0 for negative)
    y = torch.zeros(len(gene_name), dtype=torch.float)
    y[mask] = 1
    y = y.unsqueeze(1)  # shape: [num_nodes, 1]

    # Combine positive and negative indices for the split
    final_mask = mask + neg_mask
    final_labels = y[final_mask].squeeze(1).numpy()  # converting to numpy for stratification

    # Split indices into train, test, and validation sets using stratification
    train_idx, test_idx, _, _ = train_test_split(final_mask, final_labels, test_size=0.2,
                                                    shuffle=True, stratify=final_labels, random_state=seed)
    train_idx, val_idx, _, _ = train_test_split(train_idx, y[train_idx].numpy().squeeze(1),
                                                test_size=0.2, shuffle=True,
                                                stratify=y[train_idx].numpy().squeeze(1), random_state=seed)

    # Create boolean masks for all nodes
    train_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    test_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    val_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True
    val_mask[val_idx] = True

    # Add self-loops to the edge_index
    edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)

    # Build the PyTorch Geometric data object
    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask.unsqueeze(1)  # unsqueeze if you want to mimic the original shape
    data.test_mask = test_mask.unsqueeze(1)
    data.val_mask = val_mask.unsqueeze(1)
    data.name = node_name  # optional: storing node names

    return data

In [3]:
PATH = '../data/real/smg_data/'
LABEL_PATH = '../data/real/smg_data/health.tsv'
ppis = ['CPDB', 'IRefIndex', 'IRefIndex_2015', 'PCNet', 'STRINGdb']#'Multinet',

data = {}
seeds = [0,1,2]

for ppi in ppis:
    for seed in seeds:
        data[ppi, seed] = load_h5_graph(PATH, LABEL_PATH, ppi, seed=seed)

Negative mask indices: [8999, 13560, 10056, 5238, 4400, 10126, 7257, 9287, 1328, 10029, 3798, 1343, 4006, 10315, 12558, 6469, 8637, 6127, 3586, 785, 5161, 5253, 2224, 7950, 7645, 12498, 12624, 9395, 13098, 9368, 7476, 3001, 6913, 2409, 2090, 3433, 4462, 5038, 4739, 5096, 10199, 4624, 7788, 7438, 542, 10380, 4442, 3545, 4161, 3051, 10863, 11284, 11717, 3395, 12867, 7054, 8812, 12216, 6740, 1955, 5830, 7993, 6318, 11760, 6879, 13045, 7607, 2732, 501, 11409, 8966, 12040, 13196, 10590, 4523, 12691, 3070, 2058, 2204, 5668, 12618, 4245, 4588, 537, 387, 8507, 4056, 12163, 4326, 11298]
Negative mask indices: [5243, 2450, 5909, 8874, 5635, 1343, 9229, 8495, 8700, 2673, 6224, 11148, 7952, 2444, 12904, 6947, 2322, 10057, 38, 9950, 1331, 4230, 881, 8834, 13602, 11429, 10796, 12282, 2504, 6762, 9169, 3298, 12296, 8425, 11250, 5144, 3068, 7362, 10595, 10015, 4886, 8878, 7636, 5285, 1828, 6509, 12316, 11240, 11162, 6636, 3643, 3846, 13532, 3633, 4301, 11960, 11194, 4740, 2250, 3970, 4268, 7221, 96, 6

In [4]:
data

{('CPDB',
  0): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  1): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  2): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('IRefIndex',
  0): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  1): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  2): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex_2015',
  0): Data(x=[12129, 64], edge_inde

***
**Logistic regression**

In [8]:
def log_reg(X_train, y_train, X_test, y_test):
    clf = LogisticRegression(max_iter=1000, random_state=0)
    clf.fit(X_train, y_train)

    acc = accuracy_score(y_test, clf.predict(X_test)) * 100
        # Get predictions and predicted probabilities for the positive class
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    
    # Compute metrics
    acc = accuracy_score(y_test, y_pred) * 100
    auc = roc_auc_score(y_test, y_prob)
    aupr = average_precision_score(y_test, y_prob)
    f1 = f1_score(y_test, y_pred)
    print(f"F1 score: {f1:.4f}")
    
    print(f"Logistic regression model accuracy: {acc:.2f}%")
    print(f"AUC: {auc:.4f}")
    print(f"AUPR: {aupr:.4f}")
    
    return acc, auc, aupr, f1

In [9]:
log_reg_results = {}


for ppi in ['CPDB']:
    for seed in [0,1,2]:
        mask_train = data[ppi,seed].train_mask.squeeze()  
        mask_test = data[ppi,seed].test_mask.squeeze()

        X_train = data[ppi,seed].x[mask_train].numpy()  
        y_train = data[ppi,seed].y[mask_train].numpy()
        X_test = data[ppi,seed].x[mask_test].numpy()  
        y_test = data[ppi,seed].y[mask_test].numpy()

        print(f"Training on {ppi} data")
        
        log_reg_results[ppi,seed] = log_reg(X_train, y_train, X_test, y_test)

Training on CPDB data
F1 score: 0.7097
Logistic regression model accuracy: 75.00%
AUC: 0.7284
AUPR: 0.8129
Training on CPDB data
F1 score: 0.4828
Logistic regression model accuracy: 58.33%
AUC: 0.7130
AUPR: 0.7160
Training on CPDB data
F1 score: 0.5405
Logistic regression model accuracy: 52.78%
AUC: 0.6049
AUPR: 0.7182


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
log_reg_results

{('CPDB', 0): (75.0,
  np.float64(0.7283950617283951),
  np.float64(0.8128956046413354),
  np.float64(0.7096774193548387)),
 ('CPDB', 1): (58.333333333333336,
  np.float64(0.712962962962963),
  np.float64(0.7160144957997348),
  np.float64(0.4827586206896552)),
 ('CPDB', 2): (52.77777777777778,
  np.float64(0.6049382716049383),
  np.float64(0.7181861391971391),
  np.float64(0.5405405405405406))}

In [11]:

# Extract metric values
accuracies = [val[0] for val in log_reg_results.values()]
aucs = [val[1] for val in log_reg_results.values()]
auprs = [val[2] for val in log_reg_results.values()]
f1s = [val[3] for val in log_reg_results.values()]
# Calculate means
mean_acc = np.mean(accuracies)
mean_auc = np.mean(aucs)
mean_aupr = np.mean(auprs)
mean_f1 = np.mean(f1s)
# Calculate standard deviations
std_acc = np.std(accuracies)
std_auc = np.std(aucs)
std_aupr = np.std(auprs)
std_f1 = np.std(f1s)
print(f"Mean Accuracy: {mean_acc:.2f}%")
print(f"Standard Deviation Accuracy: {std_acc:.2f}")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Standard Deviation AUC: {std_auc:.4f}")
print(f"Mean AUPR: {mean_aupr:.4f}")
print(f"Standard Deviation AUPR: {std_aupr:.4f}")
print(f"Mean F1: {mean_f1:.4f}")
print(f"Standard Deviation F1: {std_f1:.4f}")

Mean Accuracy: 62.04%
Standard Deviation Accuracy: 9.44
Mean AUC: 0.6821
Standard Deviation AUC: 0.0549
Mean AUPR: 0.7490
Standard Deviation AUPR: 0.0452
Mean F1: 0.5777
Standard Deviation F1: 0.0963


In [105]:
def log_reg_cv(X_train, y_train, X_test=None, y_test=None, cv=10):
    clf = LogisticRegression(max_iter=1000, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"Logistic Regression CV Mean Accuracy: {scores.mean()*100:.2f}% (±{scores.std()*100:.2f}%)")
    print(f"Logistic Regression CV Median Accuracy: {np.median(scores)*100:.2f}% (±{scores.std()*100:.2f}%)")
    return scores


In [119]:
log_reg_cv_results = {}
for ppi in ppis:
    mask_train = data[ppi,42].train_mask.squeeze()  
    mask_test = data[ppi,42].test_mask.squeeze()

    X_train = data[ppi,42].x[mask_train].numpy()  
    y_train = data[ppi,42].y[mask_train].numpy().ravel()
    # X_test = data[ppi].x[mask_test].numpy()  
    # y_test = data[ppi].y[mask_test].numpy()
    print(f"Evaluating {ppi} data with CV")
    log_reg_cv_results[ppi,42] = log_reg_cv(X_train, y_train)

Evaluating CPDB data with CV
Logistic Regression CV Mean Accuracy: 70.30% (±12.89%)
Logistic Regression CV Median Accuracy: 72.73% (±12.89%)
Evaluating IRefIndex data with CV
Logistic Regression CV Mean Accuracy: 75.91% (±10.95%)
Logistic Regression CV Median Accuracy: 74.24% (±10.95%)
Evaluating IRefIndex_2015 data with CV
Logistic Regression CV Mean Accuracy: 69.00% (±12.64%)
Logistic Regression CV Median Accuracy: 66.82% (±12.64%)
Evaluating PCNet data with CV
Logistic Regression CV Mean Accuracy: 72.05% (±9.20%)
Logistic Regression CV Median Accuracy: 75.00% (±9.20%)
Evaluating STRINGdb data with CV
Logistic Regression CV Mean Accuracy: 71.89% (±10.47%)
Logistic Regression CV Median Accuracy: 72.73% (±10.47%)


In [109]:
def log_reg_grid_search(X, y):
    param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
    clf = LogisticRegression(max_iter=1000, random_state=42)
    grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
    grid_search.fit(X, y)
    print("Best params:", grid_search.best_params_)
    print(f"Best CV Accuracy: {grid_search.best_score_*100:.2f}%")
    return grid_search

In [120]:
log_reg_grid_results = {}
for ppi in ppis:
    mask_train = data[ppi,42].train_mask.squeeze()  
    mask_test = data[ppi,42].test_mask.squeeze()

    X_train = data[ppi,42].x[mask_train].numpy()  
    y_train = data[ppi,42].y[mask_train].numpy().ravel()
    print(f"Grid search on {ppi} data")
    log_reg_grid_results[ppi,42] = log_reg_grid_search(X_train, y_train)

Grid search on CPDB data
Best params: {'C': 0.1}
Best CV Accuracy: 71.14%
Grid search on IRefIndex data
Best params: {'C': 10}
Best CV Accuracy: 78.48%
Grid search on IRefIndex_2015 data
Best params: {'C': 1}
Best CV Accuracy: 69.00%
Grid search on PCNet data
Best params: {'C': 1}
Best CV Accuracy: 72.05%
Grid search on STRINGdb data
Best params: {'C': 10}
Best CV Accuracy: 76.44%


***
**MLP**

In [12]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout = 0.2):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(32, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [26]:
def train_mlp(X_train, X_test, y_train, y_test, lr=0.001, decay = 5e-4, hidden_size=64, n_epochs = 100):
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    model = MLP(X_train.shape[1], hidden_size, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=decay)

    for epoch in range(n_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        predictions = outputs.argmax(dim=1)
        acc = accuracy_score(y_train, predictions)
        auroc = roc_auc_score(y_train, predictions)
        aupr = average_precision_score(y_train, predictions)
        f1 = f1_score(y_train, predictions)
        print(f"Epoch {epoch} - Loss: {loss.item()} - Accuracy: {acc:.3f}")

    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test, predicted)
        test_auroc = roc_auc_score(y_test, predicted)
        test_aupr = average_precision_score(y_test, predicted)
        test_f1 = f1_score(y_test, predicted)
        print(f"MLP model test accuracy: {acc:.3f}")

    return model, test_acc, test_auroc, test_aupr, test_f1, predicted

In [27]:
mlp_results = {}
for ppi in ppis:
    mask_train = data[ppi,0].train_mask.squeeze()  
    mask_test = data[ppi,0].test_mask.squeeze()

    X_train = torch.tensor(data[ppi,0].x[mask_train].numpy(), dtype=torch.float32)
    y_train = torch.tensor([y.item() for yz in data[ppi,0].y[mask_train] for y in yz], dtype=torch.long)
    X_test = torch.tensor(data[ppi,0].x[mask_test].numpy(), dtype=torch.float32)  
    y_test = torch.tensor([y.item() for yz in data[ppi,0].y[mask_test] for y in yz], dtype=torch.long)

    # data[ppi].x = torch.tensor(data[ppi].x, dtype=torch.float32)
    # labels = torch.tensor([y.item() for yz in data[ppi].y for y in yz], dtype=torch.long)

    #labels = torch.tensor(labels, dtype=torch.long)
    model, test_acc, test_auroc, test_aupr, test_f1, test_pred = train_mlp(X_train, X_test, y_train, y_test)
    mlp_results[ppi] = (model, test_acc, test_f1, test_pred)

Epoch 0 - Loss: 0.6946852803230286 - Accuracy: 0.496
Epoch 1 - Loss: 0.6918052434921265 - Accuracy: 0.522
Epoch 2 - Loss: 0.69182950258255 - Accuracy: 0.513
Epoch 3 - Loss: 0.6921141743659973 - Accuracy: 0.487
Epoch 4 - Loss: 0.6898582577705383 - Accuracy: 0.530
Epoch 5 - Loss: 0.6902645826339722 - Accuracy: 0.539
Epoch 6 - Loss: 0.6883981227874756 - Accuracy: 0.557
Epoch 7 - Loss: 0.689025342464447 - Accuracy: 0.609
Epoch 8 - Loss: 0.6899604201316833 - Accuracy: 0.548
Epoch 9 - Loss: 0.6855391263961792 - Accuracy: 0.635
Epoch 10 - Loss: 0.6856611967086792 - Accuracy: 0.626
Epoch 11 - Loss: 0.6859285831451416 - Accuracy: 0.635
Epoch 12 - Loss: 0.6857849955558777 - Accuracy: 0.565
Epoch 13 - Loss: 0.6809494495391846 - Accuracy: 0.635
Epoch 14 - Loss: 0.6795617341995239 - Accuracy: 0.600
Epoch 15 - Loss: 0.6832281947135925 - Accuracy: 0.557
Epoch 16 - Loss: 0.6820057034492493 - Accuracy: 0.600
Epoch 17 - Loss: 0.6807754039764404 - Accuracy: 0.652
Epoch 18 - Loss: 0.6772734522819519 - Acc

In [28]:
mlp_results

{'CPDB': (MLP(
    (layers): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=64, out_features=32, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=32, out_features=2, bias=True)
    )
  ),
  0.75,
  np.float64(0.7272727272727273),
  tensor([0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
          0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0])),
 'IRefIndex': (MLP(
    (layers): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=64, out_features=32, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=32, out_features=2, bias=True)
    )
  ),
  0.7027027027027027,
  np.float64(0.6666666666666666),
  tensor([0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0

In [30]:
data

{('CPDB',
  0): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  1): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  2): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('IRefIndex',
  0): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  1): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  2): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex_2015',
  0): Data(x=[12129, 64], edge_inde

In [29]:
data['CPDB', 0].train_mask
X_train = torch.tensor(data['CPDB', 0].x[mask_train].numpy(), dtype=torch.float32)
X_train

IndexError: The shape of the mask [13179] at index 0 does not match the shape of the indexed tensor [13627, 64] at index 0

In [31]:
mlp_results_multi_seed = {}
seeds = [0,1,2]
for ppi in ppis:
    for seed in seeds:
        mask_train = data[ppi, seed].train_mask.squeeze()  
        mask_test = data[ppi, seed].test_mask.squeeze()

        X_train = torch.tensor(data[ppi, seed].x[mask_train].numpy(), dtype=torch.float32)
        y_train = torch.tensor([y.item() for yz in data[ppi, seed].y[mask_train] for y in yz], dtype=torch.long)
        X_test = torch.tensor(data[ppi, seed].x[mask_test].numpy(), dtype=torch.float32)  
        y_test = torch.tensor([y.item() for yz in data[ppi, seed].y[mask_test] for y in yz], dtype=torch.long)

        print(f"Running MLP experiments on {ppi, seed} data")
        models, test_acc, test_auroc, test_aupr,test_f1, _ = train_mlp(X_train, X_test, y_train, y_test)
        mlp_results_multi_seed[ppi, seed] = (test_acc, test_auroc, test_aupr,test_f1)

Running MLP experiments on ('CPDB', 0) data
Epoch 0 - Loss: 0.6976980566978455 - Accuracy: 0.504
Epoch 1 - Loss: 0.6935251355171204 - Accuracy: 0.504
Epoch 2 - Loss: 0.6956667900085449 - Accuracy: 0.504
Epoch 3 - Loss: 0.695032000541687 - Accuracy: 0.504
Epoch 4 - Loss: 0.6931697726249695 - Accuracy: 0.504
Epoch 5 - Loss: 0.6926687359809875 - Accuracy: 0.504
Epoch 6 - Loss: 0.6914736032485962 - Accuracy: 0.504
Epoch 7 - Loss: 0.6930235624313354 - Accuracy: 0.504
Epoch 8 - Loss: 0.6906492710113525 - Accuracy: 0.513
Epoch 9 - Loss: 0.6907798647880554 - Accuracy: 0.539
Epoch 10 - Loss: 0.6886951327323914 - Accuracy: 0.530
Epoch 11 - Loss: 0.6923489570617676 - Accuracy: 0.513
Epoch 12 - Loss: 0.6893332004547119 - Accuracy: 0.539
Epoch 13 - Loss: 0.6877790093421936 - Accuracy: 0.609
Epoch 14 - Loss: 0.6873703002929688 - Accuracy: 0.600
Epoch 15 - Loss: 0.6878857016563416 - Accuracy: 0.591
Epoch 16 - Loss: 0.6869164109230042 - Accuracy: 0.574
Epoch 17 - Loss: 0.6856655478477478 - Accuracy: 0

In [32]:
mlp_results_multi_seed

{('CPDB', 0): (0.7777777777777778,
  np.float64(0.7777777777777777),
  np.float64(0.7380952380952381),
  np.float64(0.75)),
 ('CPDB', 1): (0.7222222222222222,
  np.float64(0.7222222222222223),
  np.float64(0.6746031746031746),
  np.float64(0.6875)),
 ('CPDB', 2): (0.6666666666666666,
  np.float64(0.6666666666666667),
  np.float64(0.6111111111111112),
  np.float64(0.6666666666666666)),
 ('IRefIndex', 0): (0.7837837837837838,
  np.float64(0.7792397660818714),
  np.float64(0.7493743743743744),
  np.float64(0.7333333333333333)),
 ('IRefIndex', 1): (0.7297297297297297,
  np.float64(0.7339181286549707),
  np.float64(0.7060947587263376),
  np.float64(0.6875)),
 ('IRefIndex', 2): (0.7837837837837838,
  np.float64(0.7821637426900584),
  np.float64(0.7219406906906907),
  np.float64(0.7647058823529411)),
 ('IRefIndex_2015', 0): (0.8235294117647058,
  np.float64(0.8235294117647058),
  np.float64(0.7554179566563467),
  np.float64(0.8333333333333334)),
 ('IRefIndex_2015', 1): (0.7647058823529411,
  

In [38]:
mlp_results_multi_seed_df = pd.DataFrame(columns=[
    'PPI', 'Mean Accuracy', 'Std Accuracy', 
    'Mean AUC', 'Std AUC', 'Mean AUPR', 'Std AUPR', 'Mean F1', 'Std F1'
    ])

for i, ppi in enumerate(ppis):
    acc_ppi = []
    auc_ppi = []
    aupr_ppi =[]
    f1_ppi = []
    for seed in seeds:
        #print(f'Accuracy for {ppi,seed}: {mlp_results_multi_seed[ppi, seed][0]}')
        acc_ppi.append(mlp_results_multi_seed[ppi, seed][0])
        auc_ppi.append(mlp_results_multi_seed[ppi, seed][1])
        aupr_ppi.append(mlp_results_multi_seed[ppi, seed][2])
        f1_ppi.append(mlp_results_multi_seed[ppi, seed][3])
    mean_acc = np.mean(acc_ppi)
    std_acc = np.std(acc_ppi)
    mean_auc = np.mean(auc_ppi)
    std_auc = np.std(auc_ppi)
    mean_aupr = np.mean(aupr_ppi)
    std_aupr = np.std(aupr_ppi)
    mean_f1 = np.mean(f1_ppi)
    std_f1 = np.std(f1_ppi)

    print(f"Mean accuracy for {ppi}: {mean_acc:.3f} ±{std_acc:.3f}")
    print(f"Mean AUC for {ppi}: {mean_auc:.3f} ±{std_auc:.3f}")
    print(f"Mean AUPR for {ppi}: {mean_aupr:.3f} ±{std_aupr:.3f}")
    print(f"Mean F1 for {ppi}: {mean_f1:.3f} ±{std_f1:.3f}")

    mlp_results_multi_seed_df.loc[i] = [
        ppi, mean_acc, std_acc, mean_auc, std_auc, mean_aupr, std_aupr, mean_f1, std_f1
    ]

mlp_results_multi_seed_df.to_csv('../results/mlp_results_upd.tsv', sep='\t',index=False)


Mean accuracy for CPDB: 0.722 ±0.045
Mean AUC for CPDB: 0.722 ±0.045
Mean AUPR for CPDB: 0.675 ±0.052
Mean F1 for CPDB: 0.701 ±0.035
Mean accuracy for IRefIndex: 0.766 ±0.025
Mean AUC for IRefIndex: 0.765 ±0.022
Mean AUPR for IRefIndex: 0.726 ±0.018
Mean F1 for IRefIndex: 0.729 ±0.032
Mean accuracy for IRefIndex_2015: 0.735 ±0.087
Mean AUC for IRefIndex_2015: 0.735 ±0.087
Mean AUPR for IRefIndex_2015: 0.685 ±0.071
Mean F1 for IRefIndex_2015: 0.688 ±0.151
Mean accuracy for PCNet: 0.702 ±0.087
Mean AUC for PCNet: 0.702 ±0.087
Mean AUPR for PCNet: 0.651 ±0.077
Mean F1 for PCNet: 0.680 ±0.104
Mean accuracy for STRINGdb: 0.733 ±0.059
Mean AUC for STRINGdb: 0.734 ±0.059
Mean AUPR for STRINGdb: 0.673 ±0.041
Mean F1 for STRINGdb: 0.723 ±0.067


In [39]:
mlp_results_multi_seed_df

Unnamed: 0,PPI,Mean Accuracy,Std Accuracy,Mean AUC,Std AUC,Mean AUPR,Std AUPR,Mean F1,Std F1
0,CPDB,0.722222,0.045361,0.722222,0.045361,0.674603,0.051841,0.701389,0.03541
1,IRefIndex,0.765766,0.025481,0.765107,0.022086,0.725803,0.017879,0.728513,0.031703
2,IRefIndex_2015,0.735294,0.086586,0.735294,0.086586,0.685139,0.070801,0.687778,0.150809
3,PCNet,0.701754,0.086838,0.701754,0.086838,0.65117,0.077121,0.679667,0.10427
4,STRINGdb,0.733333,0.058709,0.73366,0.058884,0.672959,0.040614,0.722516,0.067465


In [111]:
def run_mlp_experiment(X_train, X_test, y_train, y_test, seeds=[42, 7, 21]):
    accuracies = []
    models = []
    for seed in seeds:
        torch.manual_seed(seed)
        model, acc, _ = train_mlp(X_train, X_test, y_train, y_test)
        accuracies.append(acc)
        models.append(model)
        print(f"Seed {seed} - Test Accuracy: {acc:.2f}%")
    avg_acc = np.mean(accuracies)
    print(f"Average Test Accuracy over seeds: {avg_acc:.2f}%")
    return models, accuracies

In [112]:
mlp_results_multi_seed = {}
for ppi in ppis:
    mask_train = data[ppi].train_mask.squeeze()  
    mask_test = data[ppi].test_mask.squeeze()

    X_train = torch.tensor(data[ppi].x[mask_train].numpy(), dtype=torch.float32)
    y_train = torch.tensor([y.item() for yz in data[ppi].y[mask_train] for y in yz], dtype=torch.long)
    X_test = torch.tensor(data[ppi].x[mask_test].numpy(), dtype=torch.float32)  
    y_test = torch.tensor([y.item() for yz in data[ppi].y[mask_test] for y in yz], dtype=torch.long)

    print(f"Running MLP experiments on {ppi} data")
    models, accuracies = run_mlp_experiment(X_train, X_test, y_train, y_test)
    mlp_results_multi_seed[ppi] = (models, accuracies)

Running MLP experiments on CPDB data
Epoch 0 - Loss: 0.6943416595458984 - Accuracy: 49.57%
Epoch 1 - Loss: 0.6936794519424438 - Accuracy: 49.57%
Epoch 2 - Loss: 0.6930168271064758 - Accuracy: 49.57%
Epoch 3 - Loss: 0.6923506259918213 - Accuracy: 49.57%
Epoch 4 - Loss: 0.6916755437850952 - Accuracy: 50.43%
Epoch 5 - Loss: 0.6909906268119812 - Accuracy: 50.43%
Epoch 6 - Loss: 0.6902964115142822 - Accuracy: 51.30%
Epoch 7 - Loss: 0.6895898580551147 - Accuracy: 53.04%
Epoch 8 - Loss: 0.6888704895973206 - Accuracy: 63.48%
Epoch 9 - Loss: 0.6881344318389893 - Accuracy: 68.70%
Epoch 10 - Loss: 0.6873695254325867 - Accuracy: 74.78%
Epoch 11 - Loss: 0.6865664720535278 - Accuracy: 78.26%
Epoch 12 - Loss: 0.6857249140739441 - Accuracy: 75.65%
Epoch 13 - Loss: 0.6848435401916504 - Accuracy: 76.52%
Epoch 14 - Loss: 0.683896541595459 - Accuracy: 75.65%
Epoch 15 - Loss: 0.6828877329826355 - Accuracy: 78.26%
Epoch 16 - Loss: 0.6818037629127502 - Accuracy: 79.13%
Epoch 17 - Loss: 0.6806365847587585 - A