### Baseline models

Two models, with no graph structure
1. Simple logistic regression. Following Otto's (?) suggestion. How does very simple model perform?

ans: better than graph transformer. lol

2. MLP classifier. Classifier model, no graph structure / network info

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

import torch.nn as nn
import torch
import pandas as pd
import numpy as np

import torch_geometric
from torch_geometric.data import Data
import h5py
from torch_geometric.utils import add_self_loops

import sys
sys.path.append('../')
from src.models.modig import MODIG


***
**SMG data**

In [2]:
def load_h5_graph(PATH, LABEL_PATH, ppi, seed=42):
    f = h5py.File(f'{PATH}/{ppi}_multiomics.h5', 'r')
    # Build edge indices from the network matrix
    network = f['network'][:]
    src, dst = np.nonzero(network)
    edge_index = torch.tensor(np.vstack((src, dst)), dtype=torch.long)

    # Load node features and assign a node "name" attribute if desired
    features = f['features'][:]
    x = torch.from_numpy(features)
    num_nodes = x.size(0)
    node_name = f['gene_names'][...,-1].astype(str)

    # Retrieve gene names and create a mapping: gene name -> node index
    gene_name = f['gene_names'][...,-1].astype(str)
    gene_map = {g: i for i, g in enumerate(gene_name)}  # gene name -> node index

    # Originally, the code combined several label arrays but then reads a health.tsv.
    # Here we read the health.tsv file and extract the symbols.
    # Ensure that PATH is defined in your environment.
    label_df = pd.read_csv(LABEL_PATH, sep='\t').astype(str) # TODO fix this for druggable gene prediction
    label_symbols = label_df['symbol'].tolist()

    # Determine positive nodes: indices that appear in both the health.tsv and gene_name list
    mask = [gene_map[g] for g in sorted(list(set(label_symbols) & set(gene_name)))]

    # Randomly select negative samples from those nodes not in the positive mask.
    np.random.seed(seed)
    all_indices = set(range(len(gene_name)))
    negative_candidates = sorted(list(all_indices - set(mask)))
    neg_sample_size = min(len(mask), len(gene_name) - len(mask))
    neg_mask = np.random.choice(negative_candidates, size=neg_sample_size, replace=False).tolist()

    print("Negative mask indices:", neg_mask)

    # Create a label vector (1 for positive, 0 for negative)
    y = torch.zeros(len(gene_name), dtype=torch.float)
    y[mask] = 1
    y = y.unsqueeze(1)  # shape: [num_nodes, 1]

    # Combine positive and negative indices for the split
    final_mask = mask + neg_mask
    final_labels = y[final_mask].squeeze(1).numpy()  # converting to numpy for stratification

    # Split indices into train, test, and validation sets using stratification
    train_idx, test_idx, _, _ = train_test_split(final_mask, final_labels, test_size=0.2,
                                                    shuffle=True, stratify=final_labels, random_state=seed)
    train_idx, val_idx, _, _ = train_test_split(train_idx, y[train_idx].numpy().squeeze(1),
                                                test_size=0.2, shuffle=True,
                                                stratify=y[train_idx].numpy().squeeze(1), random_state=seed)

    # Create boolean masks for all nodes
    train_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    test_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    val_mask = torch.zeros(len(gene_name), dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True
    val_mask[val_idx] = True

    # Add self-loops to the edge_index
    edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)

    # Build the PyTorch Geometric data object
    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask.unsqueeze(1)  # unsqueeze if you want to mimic the original shape
    data.test_mask = test_mask.unsqueeze(1)
    data.val_mask = val_mask.unsqueeze(1)
    data.name = node_name  # optional: storing node names

    return data

In [4]:
PATH = '../data/real/smg_data/'
LABEL_PATH = '../data/real/smg_data/health.tsv'
ppis = ['CPDB', 'IRefIndex', 'IRefIndex_2015', 'PCNet', 'STRINGdb']#'Multinet',

data = {}
seeds = [0,1,2]

for ppi in ppis:
    for seed in seeds:
        data[ppi, seed] = load_h5_graph(PATH, LABEL_PATH, ppi, seed=seed)

Negative mask indices: [8999, 13560, 10056, 5238, 4400, 10126, 7257, 9287, 1328, 10029, 3798, 1343, 4006, 10315, 12558, 6469, 8637, 6127, 3586, 785, 5161, 5253, 2224, 7950, 7645, 12498, 12624, 9395, 13098, 9368, 7476, 3001, 6913, 2409, 2090, 3433, 4462, 5038, 4739, 5096, 10199, 4624, 7788, 7438, 542, 10380, 4442, 3545, 4161, 3051, 10863, 11284, 11717, 3395, 12867, 7054, 8812, 12216, 6740, 1955, 5830, 7993, 6318, 11760, 6879, 13045, 7607, 2732, 501, 11409, 8966, 12040, 13196, 10590, 4523, 12691, 3070, 2058, 2204, 5668, 12618, 4245, 4588, 537, 387, 8507, 4056, 12163, 4326, 11298]
Negative mask indices: [5243, 2450, 5909, 8874, 5635, 1343, 9229, 8495, 8700, 2673, 6224, 11148, 7952, 2444, 12904, 6947, 2322, 10057, 38, 9950, 1331, 4230, 881, 8834, 13602, 11429, 10796, 12282, 2504, 6762, 9169, 3298, 12296, 8425, 11250, 5144, 3068, 7362, 10595, 10015, 4886, 8878, 7636, 5285, 1828, 6509, 12316, 11240, 11162, 6636, 3643, 3846, 13532, 3633, 4301, 11960, 11194, 4740, 2250, 3970, 4268, 7221, 96, 6

In [5]:
data

{('CPDB',
  0): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  1): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('CPDB',
  2): Data(x=[13627, 64], edge_index=[2, 518005], y=[13627, 1], train_mask=[13627, 1], test_mask=[13627, 1], val_mask=[13627, 1], name=[13627]),
 ('IRefIndex',
  0): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  1): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex',
  2): Data(x=[17013, 64], edge_index=[2, 760150], y=[17013, 1], train_mask=[17013, 1], test_mask=[17013, 1], val_mask=[17013, 1], name=[17013]),
 ('IRefIndex_2015',
  0): Data(x=[12129, 64], edge_inde

***
**Logistic regression**

In [6]:
def log_reg(X_train, y_train, X_test, y_test):
    clf = LogisticRegression(max_iter=1000, random_state=0)
    clf.fit(X_train, y_train)

    acc = accuracy_score(y_test, clf.predict(X_test)) * 100
        # Get predictions and predicted probabilities for the positive class
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]
    
    # Compute metrics
    acc = accuracy_score(y_test, y_pred) * 100
    auc = roc_auc_score(y_test, y_prob)
    aupr = average_precision_score(y_test, y_prob)
    
    print(f"Logistic regression model accuracy: {acc:.2f}%")
    print(f"AUC: {auc:.4f}")
    print(f"AUPR: {aupr:.4f}")
    
    return acc, auc, aupr

In [11]:
log_reg_results = {}


for ppi in ['CPDB']:
    for seed in [0,1,2]:
        mask_train = data[ppi,seed].train_mask.squeeze()  
        mask_test = data[ppi,seed].test_mask.squeeze()

        X_train = data[ppi,seed].x[mask_train].numpy()  
        y_train = data[ppi,seed].y[mask_train].numpy()
        X_test = data[ppi,seed].x[mask_test].numpy()  
        y_test = data[ppi,seed].y[mask_test].numpy()

        print(f"Training on {ppi} data")
        
        log_reg_results[ppi,seed] = log_reg(X_train, y_train, X_test, y_test)

Training on CPDB data
Logistic regression model accuracy: 75.00%
AUC: 0.7284
AUPR: 0.8129
Training on CPDB data
Logistic regression model accuracy: 58.33%
AUC: 0.7130
AUPR: 0.7160
Training on CPDB data
Logistic regression model accuracy: 52.78%
AUC: 0.6049
AUPR: 0.7182


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [12]:
log_reg_results

{('CPDB', 0): (75.0,
  np.float64(0.7283950617283951),
  np.float64(0.8128956046413354)),
 ('CPDB', 1): (58.333333333333336,
  np.float64(0.712962962962963),
  np.float64(0.7160144957997348)),
 ('CPDB', 2): (52.77777777777778,
  np.float64(0.6049382716049383),
  np.float64(0.7181861391971391))}

In [13]:

# Extract metric values
accuracies = [val[0] for val in log_reg_results.values()]
aucs = [val[1] for val in log_reg_results.values()]
auprs = [val[2] for val in log_reg_results.values()]

# Calculate means
mean_acc = np.mean(accuracies)
mean_auc = np.mean(aucs)
mean_aupr = np.mean(auprs)

# Calculate standard deviations
std_acc = np.std(accuracies)
std_auc = np.std(aucs)
std_aupr = np.std(auprs)

print(f"Mean Accuracy: {mean_acc:.2f}%")
print(f"Standard Deviation Accuracy: {std_acc:.2f}")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Standard Deviation AUC: {std_auc:.4f}")
print(f"Mean AUPR: {mean_aupr:.4f}")
print(f"Standard Deviation AUPR: {std_aupr:.4f}")

Mean Accuracy: 62.04%
Standard Deviation Accuracy: 9.44
Mean AUC: 0.6821
Standard Deviation AUC: 0.0549
Mean AUPR: 0.7490
Standard Deviation AUPR: 0.0452


In [105]:
def log_reg_cv(X_train, y_train, X_test=None, y_test=None, cv=10):
    clf = LogisticRegression(max_iter=1000, random_state=42)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"Logistic Regression CV Mean Accuracy: {scores.mean()*100:.2f}% (±{scores.std()*100:.2f}%)")
    print(f"Logistic Regression CV Median Accuracy: {np.median(scores)*100:.2f}% (±{scores.std()*100:.2f}%)")
    return scores


In [119]:
log_reg_cv_results = {}
for ppi in ppis:
    mask_train = data[ppi,42].train_mask.squeeze()  
    mask_test = data[ppi,42].test_mask.squeeze()

    X_train = data[ppi,42].x[mask_train].numpy()  
    y_train = data[ppi,42].y[mask_train].numpy().ravel()
    # X_test = data[ppi].x[mask_test].numpy()  
    # y_test = data[ppi].y[mask_test].numpy()
    print(f"Evaluating {ppi} data with CV")
    log_reg_cv_results[ppi,42] = log_reg_cv(X_train, y_train)

Evaluating CPDB data with CV
Logistic Regression CV Mean Accuracy: 70.30% (±12.89%)
Logistic Regression CV Median Accuracy: 72.73% (±12.89%)
Evaluating IRefIndex data with CV
Logistic Regression CV Mean Accuracy: 75.91% (±10.95%)
Logistic Regression CV Median Accuracy: 74.24% (±10.95%)
Evaluating IRefIndex_2015 data with CV
Logistic Regression CV Mean Accuracy: 69.00% (±12.64%)
Logistic Regression CV Median Accuracy: 66.82% (±12.64%)
Evaluating PCNet data with CV
Logistic Regression CV Mean Accuracy: 72.05% (±9.20%)
Logistic Regression CV Median Accuracy: 75.00% (±9.20%)
Evaluating STRINGdb data with CV
Logistic Regression CV Mean Accuracy: 71.89% (±10.47%)
Logistic Regression CV Median Accuracy: 72.73% (±10.47%)


In [109]:
def log_reg_grid_search(X, y):
    param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
    clf = LogisticRegression(max_iter=1000, random_state=42)
    grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
    grid_search.fit(X, y)
    print("Best params:", grid_search.best_params_)
    print(f"Best CV Accuracy: {grid_search.best_score_*100:.2f}%")
    return grid_search

In [120]:
log_reg_grid_results = {}
for ppi in ppis:
    mask_train = data[ppi,42].train_mask.squeeze()  
    mask_test = data[ppi,42].test_mask.squeeze()

    X_train = data[ppi,42].x[mask_train].numpy()  
    y_train = data[ppi,42].y[mask_train].numpy().ravel()
    print(f"Grid search on {ppi} data")
    log_reg_grid_results[ppi,42] = log_reg_grid_search(X_train, y_train)

Grid search on CPDB data
Best params: {'C': 0.1}
Best CV Accuracy: 71.14%
Grid search on IRefIndex data
Best params: {'C': 10}
Best CV Accuracy: 78.48%
Grid search on IRefIndex_2015 data
Best params: {'C': 1}
Best CV Accuracy: 69.00%
Grid search on PCNet data
Best params: {'C': 1}
Best CV Accuracy: 72.05%
Grid search on STRINGdb data
Best params: {'C': 10}
Best CV Accuracy: 76.44%


***
**MLP**

In [121]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, dropout = 0.2):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(32, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [149]:
def train_mlp(X_train, X_test, y_train, y_test, lr=0.001, decay = 5e-4, hidden_size=64, n_epochs = 100):
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    model = MLP(X_train.shape[1], hidden_size, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=decay)

    for epoch in range(n_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        predictions = outputs.argmax(dim=1)
        acc = accuracy_score(y_train, predictions) * 100
        auroc = roc_auc_score(y_train, predictions) * 100
        aupr = average_precision_score(y_train, predictions) * 100
        print(f"Epoch {epoch} - Loss: {loss.item()} - Accuracy: {acc:.2f}%")

    with torch.no_grad():
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        test_acc = accuracy_score(y_test, predicted) * 100
        test_auroc = roc_auc_score(y_train, predictions) * 100
        test_aupr = average_precision_score(y_train, predictions) * 100
        print(f"MLP model test accuracy: {acc:.2f}%")

    return model, test_acc, test_auroc, test_aupr, predicted

In [150]:
mlp_results = {}
for ppi in ppis:
    mask_train = data[ppi,42].train_mask.squeeze()  
    mask_test = data[ppi,42].test_mask.squeeze()

    X_train = torch.tensor(data[ppi,42].x[mask_train].numpy(), dtype=torch.float32)
    y_train = torch.tensor([y.item() for yz in data[ppi,42].y[mask_train] for y in yz], dtype=torch.long)
    X_test = torch.tensor(data[ppi,42].x[mask_test].numpy(), dtype=torch.float32)  
    y_test = torch.tensor([y.item() for yz in data[ppi,42].y[mask_test] for y in yz], dtype=torch.long)

    # data[ppi].x = torch.tensor(data[ppi].x, dtype=torch.float32)
    # labels = torch.tensor([y.item() for yz in data[ppi].y for y in yz], dtype=torch.long)

    #labels = torch.tensor(labels, dtype=torch.long)
    model, test_acc, test_auroc, test_aupr, test_pred = train_mlp(X_train, X_test, y_train, y_test)
    mlp_results[ppi] = (model, test_acc, test_pred)

Epoch 0 - Loss: 0.6971542835235596 - Accuracy: 49.57%
Epoch 1 - Loss: 0.6971838474273682 - Accuracy: 49.57%
Epoch 2 - Loss: 0.696101725101471 - Accuracy: 49.57%
Epoch 3 - Loss: 0.6940796375274658 - Accuracy: 49.57%
Epoch 4 - Loss: 0.6953411102294922 - Accuracy: 49.57%
Epoch 5 - Loss: 0.6924697756767273 - Accuracy: 49.57%
Epoch 6 - Loss: 0.6901835799217224 - Accuracy: 49.57%
Epoch 7 - Loss: 0.6895936131477356 - Accuracy: 50.43%
Epoch 8 - Loss: 0.6907529234886169 - Accuracy: 51.30%
Epoch 9 - Loss: 0.6906548142433167 - Accuracy: 52.17%
Epoch 10 - Loss: 0.6899601817131042 - Accuracy: 51.30%
Epoch 11 - Loss: 0.6847442388534546 - Accuracy: 59.13%
Epoch 12 - Loss: 0.6853486895561218 - Accuracy: 63.48%
Epoch 13 - Loss: 0.683454155921936 - Accuracy: 60.00%
Epoch 14 - Loss: 0.6805643439292908 - Accuracy: 68.70%
Epoch 15 - Loss: 0.6792022585868835 - Accuracy: 73.91%
Epoch 16 - Loss: 0.680633544921875 - Accuracy: 69.57%
Epoch 17 - Loss: 0.6793858408927917 - Accuracy: 72.17%
Epoch 18 - Loss: 0.6766

In [152]:
mlp_results

{'CPDB': (MLP(
    (layers): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=64, out_features=32, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=32, out_features=2, bias=True)
    )
  ),
  63.888888888888886,
  tensor([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
          0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1])),
 'IRefIndex': (MLP(
    (layers): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=64, out_features=32, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=32, out_features=2, bias=True)
    )
  ),
  78.37837837837837,
  tensor([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
          0, 0, 1, 1, 0, 1, 1

In [132]:
data['CPDB', 42].train_mask
X_train = torch.tensor(data['CPDB', 42].x[mask_train].numpy(), dtype=torch.float32)
X_train

tensor([[0.0128, 0.0943, 0.0281,  ..., 0.8540, 0.0000, 0.0000],
        [0.0128, 0.0000, 0.0000,  ..., 0.5044, 0.1762, 0.2714],
        [0.0252, 0.4505, 0.3567,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0159, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0158, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3040, 0.0000]])

In [153]:
mlp_results_multi_seed = {}
for ppi in ppis:
    for seed in seeds:
        mask_train = data[ppi, seed].train_mask.squeeze()  
        mask_test = data[ppi, seed].test_mask.squeeze()

        X_train = torch.tensor(data[ppi, seed].x[mask_train].numpy(), dtype=torch.float32)
        y_train = torch.tensor([y.item() for yz in data[ppi, seed].y[mask_train] for y in yz], dtype=torch.long)
        X_test = torch.tensor(data[ppi, seed].x[mask_test].numpy(), dtype=torch.float32)  
        y_test = torch.tensor([y.item() for yz in data[ppi, seed].y[mask_test] for y in yz], dtype=torch.long)

        print(f"Running MLP experiments on {ppi, seed} data")
        models, test_acc, test_auroc, test_aupr, _ = train_mlp(X_train, X_test, y_train, y_test)
        mlp_results_multi_seed[ppi, seed] = (test_acc, test_auroc, test_aupr)

Running MLP experiments on ('CPDB', 42) data
Epoch 0 - Loss: 0.6937367916107178 - Accuracy: 49.57%
Epoch 1 - Loss: 0.6926459670066833 - Accuracy: 48.70%
Epoch 2 - Loss: 0.6898705959320068 - Accuracy: 53.91%
Epoch 3 - Loss: 0.6913906335830688 - Accuracy: 50.43%
Epoch 4 - Loss: 0.6842337846755981 - Accuracy: 59.13%
Epoch 5 - Loss: 0.6906545758247375 - Accuracy: 48.70%
Epoch 6 - Loss: 0.6872045993804932 - Accuracy: 59.13%
Epoch 7 - Loss: 0.6847443580627441 - Accuracy: 58.26%
Epoch 8 - Loss: 0.6830286383628845 - Accuracy: 63.48%
Epoch 9 - Loss: 0.6802035570144653 - Accuracy: 64.35%
Epoch 10 - Loss: 0.681562066078186 - Accuracy: 63.48%
Epoch 11 - Loss: 0.680666983127594 - Accuracy: 63.48%
Epoch 12 - Loss: 0.6811996698379517 - Accuracy: 62.61%
Epoch 13 - Loss: 0.6788191795349121 - Accuracy: 66.96%
Epoch 14 - Loss: 0.6760696172714233 - Accuracy: 72.17%
Epoch 15 - Loss: 0.6770618557929993 - Accuracy: 67.83%
Epoch 16 - Loss: 0.6755555272102356 - Accuracy: 67.83%
Epoch 17 - Loss: 0.6743602156639

In [154]:
mlp_results_multi_seed

{('CPDB', 42): (66.66666666666666,
  np.float64(90.45674531155476),
  np.float64(87.62300667847893)),
 ('CPDB', 7): (75.0,
  np.float64(84.31639443436177),
  np.float64(79.60824949987767)),
 ('CPDB', 21): (72.22222222222221,
  np.float64(92.13551119177255),
  np.float64(90.43243560406032)),
 ('IRefIndex', 42): (81.08108108108108,
  np.float64(94.00935125657512),
  np.float64(91.1516731855715)),
 ('IRefIndex', 7): (75.67567567567568,
  np.float64(92.2998246639392),
  np.float64(89.24023143623869)),
 ('IRefIndex', 21): (75.67567567567568,
  np.float64(85.44710695499707),
  np.float64(80.77271387616214)),
 ('IRefIndex_2015', 42): (76.47058823529412,
  np.float64(88.83647798742139),
  np.float64(86.53621495327103)),
 ('IRefIndex_2015', 7): (70.58823529411765,
  np.float64(88.74912648497553),
  np.float64(85.45122157485812)),
 ('IRefIndex_2015', 21): (67.64705882352942,
  np.float64(89.67505241090147),
  np.float64(87.07569211779229)),
 ('PCNet', 42): (68.42105263157895,
  np.float64(90.095

In [166]:
mlp_results_multi_seed_df = pd.DataFrame(columns=[
    'PPI', 'Mean Accuracy', 'Std Accuracy', 
    'Mean AUC', 'Std AUC', 'Mean AUPR', 'Std AUPR'
    ])

for i, ppi in enumerate(ppis):
    acc_ppi = []
    auc_ppi = []
    aupr_ppi =[]
    for seed in seeds:
        #print(f'Accuracy for {ppi,seed}: {mlp_results_multi_seed[ppi, seed][0]}')
        acc_ppi.append(mlp_results_multi_seed[ppi, seed][0])
        auc_ppi.append(mlp_results_multi_seed[ppi, seed][1])
        aupr_ppi.append(mlp_results_multi_seed[ppi, seed][2])
    mean_acc = np.mean(acc_ppi)
    std_acc = np.std(acc_ppi)
    mean_auc = np.mean(auc_ppi)
    std_auc = np.std(auc_ppi)
    mean_aupr = np.mean(aupr_ppi)
    std_aupr = np.std(aupr_ppi)
    print(f"Mean accuracy for {ppi}: {mean_acc:.2f}% (±{std_acc:.2f}%)")
    print(f"Mean AUC for {ppi}: {mean_auc:.2f}% (±{std_auc:.2f}%)")
    print(f"Mean AUPR for {ppi}: {mean_aupr:.2f}% (±{std_aupr:.2f}%)")

    mlp_results_multi_seed_df.loc[i] = [
        ppi, mean_acc, std_acc, mean_auc, std_auc, mean_aupr, std_aupr
    ]

mlp_results_multi_seed_df.to_csv('../results/mlp_results.tsv', sep='\t',index=False)


Mean accuracy for CPDB: 71.30% (±3.46%)
Mean AUC for CPDB: 88.97% (±3.36%)
Mean AUPR for CPDB: 85.89% (±4.59%)
Mean accuracy for IRefIndex: 77.48% (±2.55%)
Mean AUC for IRefIndex: 90.59% (±3.70%)
Mean AUPR for IRefIndex: 87.05% (±4.51%)
Mean accuracy for IRefIndex_2015: 71.57% (±3.67%)
Mean AUC for IRefIndex_2015: 89.09% (±0.42%)
Mean AUPR for IRefIndex_2015: 86.35% (±0.68%)
Mean accuracy for PCNet: 66.67% (±1.24%)
Mean AUC for PCNet: 89.24% (±1.78%)
Mean AUPR for PCNet: 85.85% (±2.69%)
Mean accuracy for STRINGdb: 73.33% (±5.39%)
Mean AUC for STRINGdb: 89.79% (±2.38%)
Mean AUPR for STRINGdb: 86.98% (±3.46%)


In [165]:
mlp_results_multi_seed_df

Unnamed: 0,PPI,Mean Accuracy,Std Accuracy,Mean AUC,Std AUC,Mean AUPR,Std AUPR
0,CPDB,71.296296,3.464498,88.96955,3.360899,85.887897,4.586117
1,IRefIndex,77.477477,2.548133,90.585428,3.699763,87.054873,4.510177
2,IRefIndex_2015,71.568627,3.668292,89.086886,0.417423,86.354376,0.675537
3,PCNet,66.666667,1.240538,89.24408,1.783448,85.845997,2.68859
4,STRINGdb,73.333333,5.38748,89.788961,2.377346,86.981166,3.456111


In [111]:
def run_mlp_experiment(X_train, X_test, y_train, y_test, seeds=[42, 7, 21]):
    accuracies = []
    models = []
    for seed in seeds:
        torch.manual_seed(seed)
        model, acc, _ = train_mlp(X_train, X_test, y_train, y_test)
        accuracies.append(acc)
        models.append(model)
        print(f"Seed {seed} - Test Accuracy: {acc:.2f}%")
    avg_acc = np.mean(accuracies)
    print(f"Average Test Accuracy over seeds: {avg_acc:.2f}%")
    return models, accuracies

In [112]:
mlp_results_multi_seed = {}
for ppi in ppis:
    mask_train = data[ppi].train_mask.squeeze()  
    mask_test = data[ppi].test_mask.squeeze()

    X_train = torch.tensor(data[ppi].x[mask_train].numpy(), dtype=torch.float32)
    y_train = torch.tensor([y.item() for yz in data[ppi].y[mask_train] for y in yz], dtype=torch.long)
    X_test = torch.tensor(data[ppi].x[mask_test].numpy(), dtype=torch.float32)  
    y_test = torch.tensor([y.item() for yz in data[ppi].y[mask_test] for y in yz], dtype=torch.long)

    print(f"Running MLP experiments on {ppi} data")
    models, accuracies = run_mlp_experiment(X_train, X_test, y_train, y_test)
    mlp_results_multi_seed[ppi] = (models, accuracies)

Running MLP experiments on CPDB data
Epoch 0 - Loss: 0.6943416595458984 - Accuracy: 49.57%
Epoch 1 - Loss: 0.6936794519424438 - Accuracy: 49.57%
Epoch 2 - Loss: 0.6930168271064758 - Accuracy: 49.57%
Epoch 3 - Loss: 0.6923506259918213 - Accuracy: 49.57%
Epoch 4 - Loss: 0.6916755437850952 - Accuracy: 50.43%
Epoch 5 - Loss: 0.6909906268119812 - Accuracy: 50.43%
Epoch 6 - Loss: 0.6902964115142822 - Accuracy: 51.30%
Epoch 7 - Loss: 0.6895898580551147 - Accuracy: 53.04%
Epoch 8 - Loss: 0.6888704895973206 - Accuracy: 63.48%
Epoch 9 - Loss: 0.6881344318389893 - Accuracy: 68.70%
Epoch 10 - Loss: 0.6873695254325867 - Accuracy: 74.78%
Epoch 11 - Loss: 0.6865664720535278 - Accuracy: 78.26%
Epoch 12 - Loss: 0.6857249140739441 - Accuracy: 75.65%
Epoch 13 - Loss: 0.6848435401916504 - Accuracy: 76.52%
Epoch 14 - Loss: 0.683896541595459 - Accuracy: 75.65%
Epoch 15 - Loss: 0.6828877329826355 - Accuracy: 78.26%
Epoch 16 - Loss: 0.6818037629127502 - Accuracy: 79.13%
Epoch 17 - Loss: 0.6806365847587585 - A