<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/drug%20discovery%20with%20BACE%20dataset/bace_dataset_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# Time how long it takes to install packages

from timeit import default_timer as timer
from IPython.display import Javascript

display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

start_time = timer()

torch_geometric_start_time = timer()
!pip install torch_geometric
torch_geometric_end_time = timer()

deep_chem_start_time = timer()
!pip install deepchem
deep_chem_end_time = timer()

end_time = timer()

print(f"Time for cell to run: {end_time-start_time:.4f}")
print(f"torch_geometric time: {torch_geometric_end_time-torch_geometric_start_time:.4f}")
print(f"deep_chem time: {deep_chem_end_time-deep_chem_start_time:.4f}")

<IPython.core.display.Javascript object>

Time for cell to run: 15.2031
torch_geometric time: 8.8664
deep_chem time: 6.3365


In [53]:
# Import necessary modules
import requests
from pathlib import Path
import os.path as osp
import shutil

import torch
import torch_geometric
from torch_geometric.data import InMemoryDataset, Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, GraphConv, GATConv, MLP, GINConv, global_max_pool, SAGPooling, TopKPooling, GINEConv
from torch.nn import Linear, ReLU, Dropout, Softmax
import torch.nn as nn
import torch.nn.functional as F

import deepchem as dc
from deepchem.feat.graph_data import GraphData

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
from pandas import DataFrame

import random

from tqdm.auto import tqdm
from timeit import default_timer as timer

from IPython.display import Javascript
import pickle

from typing import Callable, Optional, Any


import warnings

In [151]:
# Show the working directory. `os` itself is never imported in this notebook
# (only `os.path as osp`), so use the already-imported pathlib.Path instead.
print(Path.cwd())

/content/drive/MyDrive/Colab Notebooks


In [165]:
# Convert bace_dataset_utils file to python file
# The .py is written into the same Drive folder as the source notebook (per nbconvert's log),
# which is where the later `from bace_dataset_utils import ...` cell expects to find it
# when that folder is the working directory.
# NOTE(review): the absolute Drive path is hard-coded; this cell only works with Drive mounted at /content/drive.
!jupyter nbconvert --to python "/content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.ipynb" --output bace_dataset_utils

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.ipynb to python
[NbConvertApp] Writing 33137 bytes to /content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.py


In [153]:
# Move python file to current directory
# shutil.move('/content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.py', './bace_dataset_utils.py')

In [166]:
from bace_dataset_utils import (MoleculeDataset, GCNClassifier, GraphConvClassifier, GATClassifier, GINConvClassifier, GINEConvClassifier,
                                train_step, test_step, moving_average, train, run_model_repeats, gcn_callable, gat_callable, graphconv_callable,
                                ginconv_callable, adam_optimizer_callable, new_metric_func, average_model_metrics, loss_acc_auc_plots)

In [167]:
# Download the BACE csv straight into Colab if it is not already present
data_path = Path("data/")
bace_path = data_path / "raw"
bace_csv_path = bace_path / "bace.csv"

if bace_path.is_dir():
  print(f"{bace_path} is already a directory")
else:
  print(f"{bace_path} is not a directory, creating one")
  bace_path.mkdir(parents=True, exist_ok=True)

# Gate the download on the file, not the directory: an existing but empty
# data/raw (e.g. after an interrupted run) must still trigger the download.
if not bace_csv_path.is_file():
  print("Downloading data")
  request = requests.get("https://raw.githubusercontent.com/lnsayer/personal_repo/main/drug%20discovery%20with%20BACE%20dataset/data/bace.csv",
                         timeout=60)
  # Fail loudly on 4xx/5xx instead of saving an error page as the dataset
  request.raise_for_status()
  # Only touch the file once the content is in hand, so a failed request
  # never leaves an empty bace.csv behind
  bace_csv_path.write_bytes(request.content)

data/raw is already a directory


In [168]:
# Build the molecule dataset from data/ and shuffle it once before splitting
original_dataset = MoleculeDataset(root="data/", csv_file="bace.csv").shuffle()
dataset = original_dataset

# Position of the train/test boundary: the leading 80% of samples go to training
train_indices = int(len(dataset) * 0.8)

# Contiguous 80/20 split of the shuffled dataset
train_dataset, test_dataset = dataset[:train_indices], dataset[train_indices:]

In [169]:
# Worker count passed to both DataLoaders
num_workers = 4
# Create dataloaders from the datasets
# Training batches are reshuffled on every pass over the data
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=num_workers)
# Evaluation gains nothing from shuffling; a fixed order keeps the test batches
# (and any per-batch metrics) reproducible from epoch to epoch
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=num_workers)



# GCN training runs

In [178]:
# GCN Classifier Training Run
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Colab-specific: asks the front end to cap this cell's output height (maxHeight 300)
display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'))

# --- Model dimensions ---
# Number of features of the graphs, read from the second axis of `x` in the first training batch
num_features = next(iter(train_dataloader)).x.shape[1]
num_hidden_channels, num_out_channels = 128, 2
pool_method = global_mean_pool

# --- Training schedule ---
# with global mean pooling, best is around 100 epochs
nb_epochs = 300
# Consumed by the training helpers; presumably the moving-average window and the
# early-stopping patience — confirm against bace_dataset_utils
window_size, patience = 10, 30

# --- Loss ---
criterion = torch.nn.CrossEntropyLoss()


<IPython.core.display.Javascript object>



In [179]:
gcn_callable

In [180]:
run_model_repeats

In [181]:
def _reset_model_parameters(model: torch.nn.Module) -> None:
  """Re-initialise every (sub)module of `model` that exposes `reset_parameters()`."""
  for module in model.modules():
    reset = getattr(module, "reset_parameters", None)
    if callable(reset):
      reset()


def run_model_repeats(model: torch.nn.Module,
                      optimizer_: Callable[..., torch.optim.Optimizer],
                      criterion_: torch.nn.Module,
                      models_directory: Optional[Path] = None,
                      num_hidden_channels: int = 128,
                      pool_method: Any = global_mean_pool,
                      nb_epochs: int = 300,
                      nb_repeats: int = 1,
                      window_size: int = 10,
                      patience: int = 50,
                      reset_between_repeats: bool = True):
  """
  Trains `model` `nb_repeats` times and optionally saves each run.

  NOTE: this notebook-local definition shadows the `run_model_repeats`
  imported from `bace_dataset_utils` earlier in the notebook.

  Each repeat builds a fresh optimizer via `optimizer_(model.parameters())` and
  calls `train(...)` on the notebook-level `train_dataloader` / `test_dataloader`.

  Args:
    model: The model instance to train (trained in place).
    optimizer_: Callable that receives the model parameters and returns an optimizer.
    criterion_: Loss module forwarded to `train`.
    models_directory: If provided, the directory is created when missing,
      `train` receives a `.pth` path inside it as `model_save_path`, and the
      object returned by `train` is pickled next to it as `*_results.pkl`.
      If omitted, nothing is written to disk.
    num_hidden_channels: Only used to build the save file names.
    pool_method: Only used (via `__name__`) to build the save file names.
    nb_epochs: Forwarded to `train` as `epochs`; also part of the file names.
    nb_repeats: Number of training runs.
    window_size: Forwarded to `train`.
    patience: Forwarded to `train`.
    reset_between_repeats: When True (default), parameters are re-initialised
      before every repeat after the first, so repeats are independent runs
      instead of each one continuing from the previous repeat's weights.

  Returns:
    None. Results are only persisted when `models_directory` is given.
  """
  if models_directory:
    # Accept str as well as Path, and make sure the target directory exists
    models_directory = Path(models_directory)
    models_directory.mkdir(parents=True, exist_ok=True)

  for i in range(nb_repeats):
    # The first repeat trains the model exactly as passed in; later repeats
    # would otherwise keep training the already-fitted weights.
    if reset_between_repeats and i > 0:
      _reset_model_parameters(model)

    if models_directory:
      run_name = f"{i}_{num_hidden_channels}_{nb_epochs}_{pool_method.__name__}"
      model_save_path = models_directory / f"{run_name}.pth"
    else:
      model_save_path = None

    # New optimizer per repeat so no optimizer state leaks between runs
    optimizer = optimizer_(model.parameters())

    # Use the criterion that was passed in, not a same-named notebook global
    results = train(model,
        train_dataloader,
        test_dataloader,
        optimizer,
        criterion_,
        epochs = nb_epochs,
        model_save_path = model_save_path,
        window_size=window_size,
        patience=patience)

    if models_directory:
      with open(models_directory / f"{run_name}_results.pkl", 'wb') as f:
        pickle.dump(results, f)
      # Report success only after the pickle has actually been written
      print("Saved results of this model")
    else:
      print("Did not save results of this model")

In [184]:
# Short GCN run: 2 repeats of 2 epochs; no models_directory is passed, so nothing is saved
num_features = next(iter(train_dataloader)).x.shape[1]
pool_method = global_mean_pool

print(pool_method)

# Pass the same num_hidden_channels / pool_method the model is built with, so the
# metadata used for save file names can never drift from the actual architecture.
run_model_repeats(model = GCNClassifier(num_features, num_hidden_channels, num_out_channels, pool_method),
                  optimizer_ = adam_optimizer_callable,
                  criterion_ = criterion,
                  num_hidden_channels = num_hidden_channels,
                  pool_method = pool_method,
                  nb_epochs = 2,
                  nb_repeats = 2,
                  window_size = 10,
                  patience = 50)

<function global_mean_pool at 0x7db6c6f571c0>


  0%|          | 0/2 [00:00<?, ?it/s]

NameError: name 'pool_method' is not defined

In [174]:
gcn_callable()

NameError: name 'num_features' is not defined