In [1]:
!pip install torch_geometric
!pip install torchmetrics
!pip install rdkit
!pip install pytorch_lightning

Collecting torch_geometric
  Downloading torch_geometric-2.5.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.0
Collecting torchmetrics
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.1 torchmetrics-1.3.1
Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
In

In [2]:
import pandas as pd
import pytorch_lightning as pl
import torch
import torchmetrics
import torch.nn as nn

from torch_geometric.nn import GAT, VGAE
from torch_geometric.utils import from_smiles, negative_sampling, train_test_split_edges
from torch_geometric.transforms import RandomLinkSplit

from random import sample, shuffle
import numpy as np
import pickle

## Load the Data (Incomplete)
* requires conversion to proper format
* requires train-val-test split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

## Load pubchem10M Dataset
PC10M_path = "/content/drive/Shareddrives/cs_89_network_science/data/pubchem10M/"
df_PC10M_train = pd.read_parquet(PC10M_path + "pubchem_10m_train.parquet")
df_PC10M_val = pd.read_parquet(PC10M_path + "pubchem_10m_val.parquet")

# Load clintox and tox21 smiles representations
df_clintox = pd.read_csv("/content/drive/Shareddrives/cs_89_network_science/data/clintox.csv")
df_tox21 = pd.read_csv("/content/drive/Shareddrives/cs_89_network_science/data/tox21.csv")

Mounted at /content/drive


In [4]:
# Convert SMILES to a graph data object with PyTorch Geometric
for i in range(10):
  print(from_smiles(df_clintox.get("smiles")[i]))

Data(x=[24, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC')
Data(x=[12, 9], edge_index=[2, 24], edge_attr=[24, 3], smiles='[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl')
Data(x=[14, 9], edge_index=[2, 26], edge_attr=[26, 3], smiles='[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)[O-])O)O')
Data(x=[22, 9], edge_index=[2, 46], edge_attr=[46, 3], smiles='[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])N)cc2)/C=C1)\N')
Data(x=[27, 9], edge_index=[2, 56], edge_attr=[56, 3], smiles='[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(=[NH+]/[H])/N')
Data(x=[4, 9], edge_index=[2, 6], edge_attr=[6, 3], smiles='[N+](=O)([O-])[O-]')
Data(x=[2, 9], edge_index=[2, 2], edge_attr=[2, 3], smiles='[N]=O')
Data(x=[0, 9], edge_index=[2, 0], edge_attr=[0, 3], smiles='[NH4][Pt]([NH4])(Cl)Cl')
Data(x=[5, 9], edge_index=[2, 8], edge_attr=[8, 3], smiles='[O-][99Tc](=O)(=O)=O')
Data(x=[5, 9], edge_index=[2, 8], edge_attr=[8, 3], smiles='[O-]P(=O)([O-])F')


## Encoder

In [5]:
class GATEncoder(nn.Module):
  def __init__(self, in_channels, num_layers, hidden_dim=64):
    super().__init__()
    self.in_channels=in_channels
    self.embed_dim = hidden_dim
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim

    self.layers = nn.ModuleList()
    self.gat = GAT(in_channels, hidden_channels=hidden_dim, num_layers=num_layers)

    self.mu = nn.Linear(hidden_dim, hidden_dim)
    self.logvar = nn.Linear(hidden_dim, hidden_dim)

  def forward(self, x_feats, edge_index, edge_attr):
    x_feats = self.gat(x=x_feats, edge_index=edge_index, edge_attr=edge_attr)
    #x_feats = self.mlp(x_feats)

    mu = self.mu(x_feats)
    logvar = self.logvar(x_feats)

    return mu, logvar

In [6]:
import pickle

## Define the VGAE
Current value for hidden channels is arbitrary. Layers are restricted to 2 due to general decrease in performance with subsequent layers.

In [8]:
from_smiles_xchannels = 9
input_features = from_smiles_xchannels

model = VGAE(encoder=GATEncoder(in_channels=input_features, hidden_dim=64, num_layers=2))

with open("/content/sample_data/16dim_pretrain_save_SMART", "rb") as f:
  model = pickle.load(f)['model']

#device = torch.device("cuda")  # necessary to utilize GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Set to run on", device, end=".")
model = model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

epochs = 0
g_loss = -1

Set to run on cpu.

# Training


In [9]:
def train(cstart, cend, training_data, bigiter):
  model.train()
  optimizer.zero_grad()
  iter = bigiter[cstart:cend]
  print("[----------------------------------------------------------------------------------------------------]",
        end="\r[")
  pinterval = pmark = (len(iter))/101
  for idx in range(len(iter)):
    i = iter[idx]
    if idx > pmark:
      print("-", end='')
      pmark += pinterval
    data = from_smiles(training_data.get("smiles")[i])
    data = data.to(device)
    data.x = data.x.type(torch.float)
    # Likely preferable to aggregate in some manner. Leaving for now just to create structure of train/test functions
    # such that they run
    mu, logstd = model.forward(data.x, data.edge_index, edge_attr=data.edge_attr)
    model.__setattr__('__mu__', mu)
    model.__setattr__('__logstd__', logstd)
    loss = model.kl_loss()
    loss.backward()
  print("]", end ="\r")
  optimizer.step()

  global g_loss, epochs
  g_loss = loss
  epochs += 1

  return float(loss)

def test(cstart, cend, testing_data):
  model.eval()
  with torch.no_grad():
    iter = list(range(len(testing_data)))
    test_cases = iter[cstart:cend]
    divisor = len(iter)
    avgs = [0, 0]
    print("Test Eval:", end=" ")
    for i in test_cases:
      data = from_smiles(testing_data.get("smiles")[i])
      if data.edge_index.size()[1] > 0:
        data = data.to(device)
        data.x = data.x.type(torch.float)
        data.neg_edge_index = negative_sampling(data.edge_index)
        z = model.encode(data.x, data.edge_index, edge_attr=data.edge_attr)
        if z.size()[0] > 2:
          auc, ap = model.test(z, data.edge_index, data.neg_edge_index)
          if i < 3:
            print(round(auc, 2), round(ap, 2), end=", ")
          avgs[0] += auc
          avgs[1] += ap
        else:
          divisor -= 1
      else:
        divisor -= 1
    avgs = [x/divisor for x in avgs]
    print("AUC=" + str(round(avgs[0], 5)), "AP=" + str(round(avgs[1], 5)), end=".")
  print("")


In [None]:
seg_size = 10000

for i in range(100):
  iter = list(range(len(df_PC10M_train)))
  shuffle(iter)
  j = 0
  while j < len(df_PC10M_train):
    print("Round", str(i+1) + "-" + str(int(j/seg_size+1)), "gives us:", train(j, j+seg_size, df_PC10M_train, iter))
    test(0, 1000, df_PC10M_val)
    j += seg_size
    torch.save({'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': g_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/Bmodel")
  print()

Round 1-1 gives us: 4.047802925109863
Test Eval: 0.55 0.66, 0.73 0.68, 0.59 0.67, AUC=0.64085 AP=0.66658.
Round 1-2 gives us: 3.172560453414917
Test Eval: 0.64 0.66, 0.87 0.84, 0.65 0.65, AUC=0.68432 AP=0.69096.
Round 1-3 gives us: 2.8102564811706543
Test Eval: 0.82 0.82, 0.78 0.71, 0.78 0.77, AUC=0.73036 AP=0.72093.
Round 1-4 gives us: 2.5504932403564453
Test Eval: 0.85 0.85, 0.88 0.83, 0.79 0.73, AUC=0.74404 AP=0.73097.
Round 1-5 gives us: 1.8263213634490967
Test Eval: 0.8 0.79, 0.77 0.7, 0.76 0.7, AUC=0.74468 AP=0.72927.
Round 1-6 gives us: 1.9318976402282715
Test Eval: 0.74 0.73, 0.7 0.66, 0.79 0.73, AUC=0.74109 AP=0.72838.
Round 1-7 gives us: 1.198912262916565
Test Eval: 0.74 0.73, 0.7 0.7, 0.74 0.72, AUC=0.7072 AP=0.70536.
Round 1-8 gives us: 0.9805225729942322
Test Eval: 0.62 0.67, 0.74 0.76, 0.72 0.72, AUC=0.68477 AP=0.69651.
Round 1-9 gives us: 0.9241567850112915
Test Eval: 0.58 0.62, 0.77 0.77, 0.75 0.74, AUC=0.69889 AP=0.70931.
Round 1-10 gives us: 0.7996111512184143
Test Ev

KeyboardInterrupt: 

## Fine-tuning

In [None]:
# Combine smiles representations from each
df_smiles = pd.concat([df[['smiles']] for df in [df_clintox, df_tox21]])
df_smiles = df_smiles.reset_index(drop=True)

# Split data for 70% training, 15% validation, 15% testing.
idx = list(range(len(df_smiles)))
shuffle(idx)
train_idx = idx[:int(0.7*len(df_smiles))]
val_idx = idx[int(0.7*len(df_smiles)):int(0.85*len(df_smiles))]
test_idx = idx[int(0.85*len(df_smiles)):]
df_smiles['train'] = df_smiles['test'] = df_smiles['val'] = 0
df_smiles.loc[train_idx, 'train'] = 1
df_smiles.loc[test_idx, 'test'] = 1
df_smiles.loc[val_idx, 'val'] = 1

#train_smiles = []
#for sr in df_smiles[df_smiles['train']==1].get("smiles"):
#  if sr not in train_smiles:
#    train_smiles.append(sr)
#
#test_smiles = []
#for sr in df_smiles[df_smiles['test']==1].get("smiles"):
#  if sr not in test_smiles:
#    test_smiles.append(sr)
#
#train_smiles = [from_smiles(sr) for sr in train_smiles]
#test_smiles = [from_smiles(sr) for sr in test_smiles]

df_smiles_train = df_smiles[df_smiles['train']==1]
df_smiles_train = df_smiles_train.reset_index(drop=True)
df_smiles_test = df_smiles[df_smiles['test']==1]
df_smiles_test = df_smiles_test.reset_index(drop=True)

In [None]:
for i in range(100):
  iter = list(range(len(df_smiles_train)))
  shuffle(iter)
  print("Round", str(i+1), "gives us:", train(0, len(df_smiles_train), df_smiles_train, iter))
  test(0, len(df_smiles_test), df_smiles_test)

Round 1 gives us: 0.052082013338804245
Test Eval: 0.75 0.7, 0.56 0.73, 0.44 0.6, AUC=0.71717 AP=0.72635.
Round 2 gives us: 0.11347097903490067
Test Eval: 0.71 0.68, 0.56 0.73, 0.5 0.67, AUC=0.72439 AP=0.73092.
Round 3 gives us: 0.047754157334566116
Test Eval: 0.81 0.79, 0.56 0.73, 0.56 0.69, AUC=0.73646 AP=0.73841.
Round 4 gives us: 0.024495922029018402
Test Eval: 0.65 0.63, 0.56 0.73, 0.44 0.6, AUC=0.73649 AP=0.73856.
Round 5 gives us: 0.0405842661857605
Test Eval: 0.66 0.72, 0.56 0.73, 0.38 0.58, AUC=0.71036 AP=0.72143.
Round 6 gives us: 0.04774768278002739
Test Eval: 0.58 0.67, 0.56 0.73, 0.44 0.6, AUC=0.68252 AP=0.70688.
Round 7 gives us: 0.054346248507499695
Test Eval: 0.58 0.69, 0.56 0.73, 0.38 0.58, AUC=0.67049 AP=0.70068.
Round 8 gives us: 0.032825060188770294
Test Eval: 0.62 0.69, 0.56 0.73, 0.5 0.75, AUC=0.68253 AP=0.70598.
Round 9 gives us: 0.03731738030910492
Test Eval: 0.74 0.74, 0.56 0.73, 0.38 0.58, AUC=0.71733 AP=0.72626.
Round 10 gives us: 0.03422272950410843
Test Eval

KeyboardInterrupt: 

In [None]:
torch.save({'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': g_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/Bmodel_finetuned")

## Classification

In [None]:
def manual_sum_pooling(node_embeddings, batch):
    num_graphs = batch.max().item() + 1
    graph_embedding = torch.zeros((num_graphs, node_embeddings.size(1)), device=node_embeddings.device)
    for graph_id in range(num_graphs):
        graph_embedding[graph_id] = node_embeddings[batch == graph_id].sum(dim=0)
    return graph_embedding

# Define classifer model
class VGAEClassifier(nn.Module):
  def __init__(self, base_model, num_classes, lr=1e-5):
    super().__init__()
    self.base_model = base_model
    self.num_classes = num_classes
    self.fc = nn.Linear(base_model.encoder.hidden_dim, num_classes)
    self.lr = lr

    # Use BCEWithLogitsLoss for multi-label binary classification
    self.criterion = nn.BCEWithLogitsLoss()

    # Initialize metrics specifically for multi-label with the correct number of labels
    self.accuracy = torchmetrics.Accuracy(average='macro', task="multilabel", num_classes=num_classes, num_labels=num_classes)
    # For F1 and AUC, specifying num_classes for multilabel task
    self.f1_score = torchmetrics.F1Score(num_classes=num_classes, average='macro', task="multilabel", num_labels=num_classes)
    self.auc_roc = torchmetrics.AUROC(num_classes=num_classes, average='macro', task="multilabel", num_labels=num_classes)

  def forward(self, x, edge_index, edge_attr, batch=None):
    node_embeddings = self.base_model.encode(x, edge_index, edge_attr)

    if batch is not None:
        graph_embedding = manual_sum_pooling(node_embeddings, batch)
    else:
        graph_embedding = torch.sum(node_embeddings, dim=0)
    out = self.fc(graph_embedding)
    return out

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Load the VGAE model
#model_vgae = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=18, num_layers=2)) # 18 for Amodel; 64 for Bmodel q
model_vgae = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=64, num_layers=2))

checkpoint = torch.load("/content/drive/Shareddrives/cs_89_network_science/saves/Bmodel_finetuned")
model_vgae.load_state_dict(checkpoint['model_state_dict'])

epochs = checkpoint['epoch']
g_loss = checkpoint['loss']

# Initialize the classifiers
model_clintox = VGAEClassifier(model_vgae, 2)
optimizer_clintox = model_clintox.configure_optimizers()

model_tox21 = VGAEClassifier(model_vgae, 12)
optimizer_tox21 = model_tox21.configure_optimizers()

# Set to gpu if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Set to run on", device, end=".")
model_clintox = model_clintox.to(device)
model_tox21 = model_tox21.to(device)

Set to run on cpu.

In [None]:
# Split data for 70% training, 15% validation, 15% testing.
for df in [df_clintox, df_tox21]:
  n = len(df)
  idx = list(range(n))
  shuffle(idx)
  train_idx = idx[:int(0.7*n)]
  val_idx = idx[int(0.7*n):int(0.85*n)]
  test_idx = idx[int(0.85*n):]
  df['train'] = df['test'] = df['val'] = 0
  df.loc[train_idx, 'train'] = 1
  df.loc[test_idx, 'test'] = 1
  df.loc[val_idx, 'val'] = 1

In [None]:
def train(training_data, critcol, iter):
  model.train()
  optimizer.zero_grad()
  print("[----------------------------------------------------------------------------------------------------]",
        end="\r[")
  pinterval = pmark = (len(iter))/101
  for idx in range(len(iter)):
    i = iter[idx]
    if idx > pmark:
      print("-", end='')
      pmark += pinterval

    data = from_smiles(training_data.get("smiles").iloc[i])
    data = data.to(device)
    data.x = data.x.type(torch.float)

    y = torch.Tensor(training_data[critcol].iloc[i])
    y_hat = model.forward(data.x, data.edge_index, edge_attr=data.edge_attr)
    y_pred = torch.sigmoid(y_hat)

    loss = model.criterion(y_pred, y)
    loss.backward()
  print("]", end ="\r")
  optimizer.step()

  global g_loss, epochs
  g_loss = loss
  epochs += 1

  return float(loss)

def test(testing_data, critcol):
  model.eval()
  test_cases = list(range(len(testing_data)))
  reals = []
  preds = []
  print("Test Eval:", end=" ")
  with torch.no_grad():
    for i in test_cases:
      data = from_smiles(testing_data.get("smiles").iloc[i])
      data = data.to(device)
      data.x = data.x.type(torch.float)

      y = torch.Tensor(testing_data[critcol].iloc[i])
      y_hat = model.forward(data.x, data.edge_index, edge_attr=data.edge_attr)
      y_pred = torch.sigmoid(y_hat)

      reals.append(y)
      preds.append(y_pred)
  reals = torch.stack(reals)
  preds = torch.stack(preds)
  auroc = model.auc_roc(preds, reals.int())
  acc = model.accuracy(preds, reals.int())
  f1 = model.f1_score(preds, reals.int())

  print("accuracy=" + str(round(float(acc), 5)) + ",",
        "f1=" + str(round(float(f1), 5)) + ",",
        "auc_roc=" + str(round(float(auroc), 5)),
        end=".")
  print("")

def val(val_data, critcol):
    loss = 0
    model.eval()
    with torch.no_grad():
        for i in range(len(val_data)):
            data = from_smiles(val_data.get("smiles").iloc[i])
            data = data.to(device)
            data.x = data.x.type(torch.float)

            y = torch.Tensor(val_data[critcol].iloc[i])
            y_hat = model.forward(data.x, data.edge_index, edge_attr=data.edge_attr)
            y_pred = torch.sigmoid(y_hat)

            loss += model.criterion(y_pred, y)
    return loss / len(val_data)

In [None]:
# clintox training
trainset = df_clintox[df_clintox['train']==1]
valset = df_clintox[df_clintox['val']==1]
criterion = ['FDA_APPROVED', 'CT_TOX']
model = model_clintox
optimizer = optimizer_clintox

for i in range(100):
  iter = list(range(len(trainset)))
  shuffle(iter)
  print("Round", str(i+1), "gives us:", train(trainset, criterion, iter))
  test(valset, criterion)

Round 1 gives us: 0.5248650312423706
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 2 gives us: 0.7635267376899719
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 3 gives us: 0.6409727931022644
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 4 gives us: 0.6820686459541321
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 5 gives us: 0.6288365721702576
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 6 gives us: 0.5609068274497986
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 7 gives us: 0.5652252435684204
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 8 gives us: 0.6941674947738647
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 9 gives us: 0.6439749002456665
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 10 gives us: 0.7356199622154236
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/A.
Round 11 gives us: 0.6601439714431763
Test Eval: accuracy=0.96205, f1=0.48438, auc_roc=N/

KeyboardInterrupt: 

In [None]:
torch.save({'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': g_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/Bclintox")

In [None]:
import pickle

In [None]:
k = 5

In [None]:
# Load the VGAE model
#model_vgae = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=18, num_layers=2)) # 18 for Amodel; 64 for Bmodel q
model_vgae_clintox = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=64, num_layers=2))
model_vgae_tox21 = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=64, num_layers=2))

checkpoint = torch.load("/content/drive/Shareddrives/cs_89_network_science/saves/Bmodel") # eschew fine-tuned for now
model_vgae_clintox.load_state_dict(checkpoint['model_state_dict'])
model_vgae_tox21.load_state_dict(checkpoint['model_state_dict'])

with open("/content/drive/Shareddrives/cs_89_network_science/saves/128dim_pretrain_save", "rb") as f:
  lm = pickle.load(f)
model_vgae_clintox = lm['model']

#epochs = checkpoint['epoch']
#g_loss = checkpoint['loss']

# Initialize the classifiers
model_clintox = VGAEClassifier(model_vgae_clintox, 2)
optimizer_clintox = model_clintox.configure_optimizers()

model_tox21 = VGAEClassifier(model_vgae_tox21, 12)
optimizer_tox21 = model_tox21.configure_optimizers()


# Set to gpu if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Set to run on", device, end=".")
model_clintox = model_clintox.to(device)
model_tox21 = model_tox21.to(device)

Set to run on cpu.

In [None]:
# clintox training
df_clintox_k = pd.read_csv("/content/drive/Shareddrives/cs_89_network_science/data/clintox_fivefoldsplit#" + str(k) + ".csv")
trainset = df_clintox_k[df_clintox_k['train']==1].reset_index(drop=True)
valset = df_clintox_k[df_clintox_k['val']==1].reset_index(drop=True)
testset = df_clintox_k[df_clintox_k['test']==1].reset_index(drop=True)
criterion = ['FDA_APPROVED', 'CT_TOX']
model = model_clintox
optimizer = optimizer_clintox

val_loss = 99999999
for i in range(100):
  iter = list(range(len(trainset)))
  shuffle(iter)
  print("Round", str(i+1), "gives us:", train(trainset, criterion, iter), end=". ")
  new_val_loss = val(valset, criterion)
  if new_val_loss < val_loss:
    print("New Checkpoint:", float(new_val_loss), end="; ")
    val_loss = new_val_loss
    torch.save({'epoch': i+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/large_clintox")
  test(testset, criterion)

Round 1 gives us: 0.8302482962608337. New Checkpoint: 0.7025930881500244; Test Eval: accuracy=0.86577, f1=0.45421, auc_roc=0.44483.
Round 2 gives us: 0.7130303978919983. New Checkpoint: 0.6879845857620239; Test Eval: accuracy=0.89933, f1=0.4735, auc_roc=0.4006.
Round 3 gives us: 0.6296921372413635. New Checkpoint: 0.6746369004249573; Test Eval: accuracy=0.9094, f1=0.47902, auc_roc=0.39805.
Round 4 gives us: 0.6328834891319275. New Checkpoint: 0.6625080108642578; Test Eval: accuracy=0.91275, f1=0.48084, auc_roc=0.40406.
Round 5 gives us: 0.5214331150054932. New Checkpoint: 0.6515225172042847; Test Eval: accuracy=0.91275, f1=0.48084, auc_roc=0.40875.
Round 6 gives us: 0.8097246289253235. New Checkpoint: 0.6415560245513916; Test Eval: accuracy=0.91275, f1=0.48084, auc_roc=0.41129.
Round 7 gives us: 0.6440235376358032. New Checkpoint: 0.6324648857116699; Test Eval: accuracy=0.91275, f1=0.48084, auc_roc=0.41401.
Round 8 gives us: 0.5093182325363159. New Checkpoint: 0.6241958737373352; Test 

## Results
128-dim model with full epoch of pubchem.
### Clintox Results
Recorded at 100 epochs. "large_clintox"

k=1; test loss=0.5417; test acc=0.9396; test f1=0.4698; test auroc=0.3245;

k=2; test loss=0.5498; test acc=0.94631; test f1=0.48797; test auroc=0.20531;

k=3; test loss=0.5433; test acc=0.90268; test f1=0.47535; test auroc=0.3946;

k=4; test loss=0.5274; test acc=0.92617; test f1=0.48264; test auroc=0.33376;

k=5; test loss=; test acc=; test f1=; test auroc=;


## Results
Note these are of the 64-dim model, with very limited pretraining (exact epoch number tbc).
### Clintox Results
Recorded at 100 epochs. "C_clintox"

k=1; test loss=0.6249; test acc=0.93289; test f1=0.46309; test auroc=;
(acc & f1 held at epochs 56-68, 69-88, 89-92, 93-100; loss continued decreasing)

(Out of curiosity, recorder at 200 epochs):
test loss=0.5748; test acc=0.93624; test f1=0.46644; test auroc=;
(acc & f1 held at 101-126, 127-200)

k=3; test loss=; test acc=; test f1=; test auroc=;

k=4; test loss=; test acc=; test f1=; test auroc=;

k=5; test loss=; test acc=; test f1=; test auroc=;

In [None]:
# Flatten the results and create a DataFrame
results = {'clintox_1': [0.5417, 0.9396, 0.4698, 0.3245, 100],
           'clintox_2': [0.5498, 0.94631, 0.48797, 0.20531, 100],
           'clintox_3': [0.5433, 0.90268, 0.47535, 0.3946, 100],
           'clintox_4': [0.5274, 0.92617, 0.48264, 0.33376, 100],
           'clintox_5': [0.5311, 0.91275, 0.48084, 0.42356, 100]
           }
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['test_loss', 'test_acc', 'test_f1', 'test_auc_roc', 'epochs'])

# Display the resulting DataFrame
print(df_results)
df_results.to_csv('/content/drive/Shareddrives/cs_89_network_science/results/VGAE_GAT.csv')

           test_loss  test_acc  test_f1  test_auc_roc  epochs
clintox_1     0.5417   0.93960  0.46980       0.32450     100
clintox_2     0.5498   0.94631  0.48797       0.20531     100
clintox_3     0.5433   0.90268  0.47535       0.39460     100
clintox_4     0.5274   0.92617  0.48264       0.33376     100
clintox_5     0.5311   0.91275  0.48084       0.42356     100


In [None]:
checkpoint = torch.load("/content/drive/Shareddrives/cs_89_network_science/saves/C_clintox") # eschew fine-tuned for now
model = VGAEClassifier(VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=64, num_layers=2)), 2)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
test(testset, criterion)

Test Eval: accuracy=0.93624, f1=0.48264, auc_roc=0.33333,.


In [None]:
# tox21 training
df_tox21_nt0 = df_tox21.fillna(0)
trainset = df_tox21_nt0[df_tox21_nt0['train']==1]
valset = df_tox21_nt0[df_tox21_nt0['val']==1]
criterion = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD','NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
             'SR-HSE', 'SR-MMP', 'SR-p53']
model = model_tox21
optimizer = optimizer_tox21

for i in range(50):
  iter = list(range(len(trainset)))
  shuffle(iter)
  print("Round", str(i+1), "gives us:", train(trainset, criterion, iter))
  test(valset, criterion)

Round 1 gives us: 0.8848485946655273
Test Eval: accuracy=0.5201, f1=0.03134, auc_roc=N/A.
Round 2 gives us: 0.8945091366767883
Test Eval: accuracy=0.5235, f1=0.0312, auc_roc=N/A.
Round 3 gives us: 0.9534947276115417
Test Eval: accuracy=0.52641, f1=0.03099, auc_roc=N/A.
Round 4 gives us: 0.9648405909538269
Test Eval: accuracy=0.52905, f1=0.03064, auc_roc=N/A.
Round 5 gives us: 0.964924156665802
Test Eval: accuracy=0.53265, f1=0.03037, auc_roc=N/A.
Round 6 gives us: 1.0000156164169312
Test Eval: accuracy=0.53557, f1=0.03023, auc_roc=N/A.
Round 7 gives us: 0.9938910007476807
Test Eval: accuracy=0.53848, f1=0.02995, auc_roc=N/A.
Round 8 gives us: 0.9891119599342346
Test Eval: accuracy=0.54042, f1=0.02981, auc_roc=N/A.
Round 9 gives us: 0.8456937670707703
Test Eval: accuracy=0.54319, f1=0.0296, auc_roc=N/A.
Round 10 gives us: 0.958885133266449
Test Eval: accuracy=0.54652, f1=0.02953, auc_roc=N/A.
Round 11 gives us: 0.7586655616760254
Test Eval: accuracy=0.54867, f1=0.02926, auc_roc=N/A.
Rou

KeyboardInterrupt: 

In [None]:
torch.save({'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': g_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/tox21_NA0")

## Results
128-dim model with full epoch of pubchem.
### Tox21 Results
Recorded at 100 epochs. "large_clintox"
72. New Checkpoint: 0.7655649781227112; Test Eval: accuracy=0.94184, f1=0.0, auc_roc=0.36136,.
k=1; test loss=0.5417; test acc=0.9396; test f1=0.4698; test auroc=0.3245;

In [None]:
k = 5

# Load the VGAE model
#model_vgae = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=18, num_layers=2)) # 18 for Amodel; 64 for Bmodel q
model_vgae_tox21 = VGAE(encoder=GATEncoder(in_channels=9, hidden_dim=64, num_layers=2))

checkpoint = torch.load("/content/drive/Shareddrives/cs_89_network_science/saves/Bmodel") # eschew fine-tuned for now
model_vgae_tox21.load_state_dict(checkpoint['model_state_dict'])

with open("/content/drive/Shareddrives/cs_89_network_science/saves/128dim_pretrain_save", "rb") as f:
  lm = pickle.load(f)
model_vgae_tox21 = lm['model']

# Initialize the classifiers
model_tox21 = VGAEClassifier(model_vgae_tox21, 12)
optimizer_tox21 = model_tox21.configure_optimizers()

# Set to gpu if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Set to run on", device, end=".")
model_tox21 = model_tox21.to(device)

# clintox training
df_k = pd.read_csv("/content/drive/Shareddrives/cs_89_network_science/data/tox21_fivefoldsplit#" + str(k) + ".csv").fillna(0)
trainset = df_k[df_k['train']==1].reset_index(drop=True)
valset = df_k[df_k['val']==1].reset_index(drop=True)
testset = df_k[df_k['test']==1].reset_index(drop=True)
criterion = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD','NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
model = model_tox21
optimizer = optimizer_tox21

val_loss = 99999999
for i in range(50):
  iter = list(range(len(trainset)))
  shuffle(iter)
  print("Round", str(i+1), "gives us:", train(trainset, criterion, iter), end=". ")
  new_val_loss = val(valset, criterion)
  if new_val_loss < val_loss:
    print("New Checkpoint:", float(new_val_loss), end="; ")
    val_loss = new_val_loss
    torch.save({'epoch': i+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,},
               "/content/drive/Shareddrives/cs_89_network_science/saves/large_clintox")
  test(testset, criterion)

Round 1 gives us: 0.998526394367218. New Checkpoint: 0.9389772415161133; Test Eval: accuracy=0.71907, f1=0.08972, auc_roc=0.46331.
Round 2 gives us: 1.0467541217803955. New Checkpoint: 0.9342154264450073; Test Eval: accuracy=0.81102, f1=0.06578, auc_roc=0.44314.
Round 3 gives us: 0.8920257687568665. New Checkpoint: 0.9294883012771606; Test Eval: accuracy=0.87028, f1=0.05389, auc_roc=0.42867.
Round 4 gives us: 0.932929277420044. New Checkpoint: 0.9248011708259583; Test Eval: accuracy=0.9012, f1=0.04495, auc_roc=0.42109.
Round 5 gives us: 0.8399574160575867. New Checkpoint: 0.9201619625091553; Test Eval: accuracy=0.91573, f1=0.0361, auc_roc=0.4163.
Round 6 gives us: 0.8218849301338196. New Checkpoint: 0.9155727028846741; Test Eval: accuracy=0.9231, f1=0.03673, auc_roc=0.41275.
Round 7 gives us: 0.994641125202179. New Checkpoint: 0.9110381603240967; Test Eval: accuracy=0.92902, f1=0.03552, auc_roc=0.40994.
Round 8 gives us: 0.8918437957763672. New Checkpoint: 0.906566321849823; Test Eval:

In [None]:
test(trainset, criterion)

Test Eval: accuracy=0.93837, f1=0.0, auc_roc=0.37312.


In [None]:
# Flatten the results and create a DataFrame
results = {'clintox_1': [0.5417, 0.9396, 0.4698, 0.3245, 100],
           'clintox_2': [0.5498, 0.94631, 0.48797, 0.20531, 100],
           'clintox_3': [0.5433, 0.90268, 0.47535, 0.3946, 100],
           'clintox_4': [0.5274, 0.92617, 0.48264, 0.33376, 100],
           'clintox_5': [0.5311, 0.91275, 0.48084, 0.42356, 100],
           'tox21_1': [0.7757, 0.94184, 0.0, 0.36223, 50],
           'tox21_2': [0.7668, 0.93654, 0.0, 0.36198, 50],
           'tox21_3': [0.7668, 0.93071, 0.0, 0.34779, 50],
           'tox21_4': [0.7747, 0.93914, 0.0, 0.38622, 50],
           'tox21_5': [0.7811, 0.94085, 0.0, 0.3702]
           }
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['test_loss', 'test_acc', 'test_f1', 'test_auc_roc', 'epochs'])

# Display the resulting DataFrame
print(df_results)
df_results.to_csv('/content/drive/Shareddrives/cs_89_network_science/results/VGAE_GAT.csv')

           test_loss  test_acc  test_f1  test_auc_roc  epochs
clintox_1     0.5417   0.93960  0.46980       0.32450   100.0
clintox_2     0.5498   0.94631  0.48797       0.20531   100.0
clintox_3     0.5433   0.90268  0.47535       0.39460   100.0
clintox_4     0.5274   0.92617  0.48264       0.33376   100.0
clintox_5     0.5311   0.91275  0.48084       0.42356   100.0
tox21_1       0.7757   0.94184  0.00000       0.36223    50.0
tox21_2       0.7668   0.93654  0.00000       0.36198    50.0
tox21_3       0.7668   0.93071  0.00000       0.34779    50.0
tox21_4       0.7747   0.93914  0.00000       0.38622    50.0
tox21_5       0.7811   0.94085  0.00000       0.37020     NaN


In [None]:
with open("temp2", "wb") as f:
  pickle.dump(model, f)

In [None]:
(0.36223 + 0.36198 + 0.34779 + 0.38622 + 00.37020)/5

0.365684