<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/drug%20discovery%20with%20BACE%20dataset/bace_dataset_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Time how long it takes to install packages

from timeit import default_timer as timer
from IPython.display import Javascript

display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

start_time = timer()

torch_geometric_start_time = timer()
!pip install torch_geometric
torch_geometric_end_time = timer()

deep_chem_start_time = timer()
!pip install deepchem
deep_chem_end_time = timer()

end_time = timer()

print(f"Time for cell to run: {end_time-start_time:.4f}")
print(f"torch_geometric time: {torch_geometric_end_time-torch_geometric_start_time:.4f}")
print(f"deep_chem time: {deep_chem_end_time-deep_chem_start_time:.4f}")

<IPython.core.display.Javascript object>

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3
Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem)
  Downloading rdkit-2024.3.3-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2024.3.3
Time for cell to run: 38.2477
torch_geometric time: 24.8651
deep_chem time: 13.3824


In [4]:
# Import necessary modules
import requests
from pathlib import Path
import os.path as osp
import shutil
import os

import torch
import torch_geometric
from torch_geometric.data import InMemoryDataset, Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, GraphConv, GATConv, MLP, GINConv, global_max_pool, SAGPooling, TopKPooling, GINEConv
from torch.nn import Linear, ReLU, Dropout, Softmax
import torch.nn as nn
import torch.nn.functional as F

import deepchem as dc
from deepchem.feat.graph_data import GraphData

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
from pandas import DataFrame

import random

from tqdm.auto import tqdm
from timeit import default_timer as timer

from IPython.display import Javascript
import pickle

from typing import Callable, Optional, Any


import warnings

In [5]:
# Show the kernel's current working directory (relative paths below resolve against it)
print(Path.cwd())

/content


In [6]:
# Convert the bace_dataset_utils notebook into an importable Python script.
# nbconvert writes bace_dataset_utils.py next to the source notebook (i.e. in the
# Drive folder), not into the current working directory.
# NOTE(review): the path is under /content/drive, but no cell visible in this
# notebook mounts Google Drive — confirm the mount happens before this cell runs.
!jupyter nbconvert --to python "/content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.ipynb" --output bace_dataset_utils

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.ipynb to python
[NbConvertApp] Writing 33137 bytes to /content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.py


In [9]:
# Relocate the generated script from Drive into the working directory so that
# `import bace_dataset_utils` can find it; the destination path is the cell's output.
converted_script = '/content/drive/MyDrive/Colab Notebooks/bace_dataset_utils.py'
shutil.move(converted_script, './bace_dataset_utils.py')

'./bace_dataset_utils.py'

In [10]:
from bace_dataset_utils import (MoleculeDataset, GCNClassifier, GraphConvClassifier, GATClassifier, GINConvClassifier, GINEConvClassifier,
                                train_step, test_step, moving_average, train, run_model_repeats, gcn_callable, gat_callable, graphconv_callable,
                                ginconv_callable, adam_optimizer_callable, new_metric_func, average_model_metrics, loss_acc_auc_plots)

<IPython.core.display.Javascript object>

Time for cell to run: 15.2847
torch_geometric time: 8.7481
deep_chem time: 6.5366
hello world


In [11]:
# Download the BACE csv straight into Colab (into data/raw/bace.csv).
#
# The download is keyed on the csv file itself rather than on the directory, so
# a pre-existing but empty data/raw folder still gets the file. The response is
# validated before anything is written, so a failed request can never leave an
# empty or HTML-error "csv" behind for the dataset class to read.
data_path = Path("data/")
bace_path = data_path / "raw"
bace_csv_path = bace_path / "bace.csv"
bace_csv_url = "https://raw.githubusercontent.com/lnsayer/personal_repo/main/drug%20discovery%20with%20BACE%20dataset/data/bace.csv"

if bace_path.is_dir():
  print(f"{bace_path} is already a directory")
else:
  print(f"{bace_path} is not a directory, creating one")
  bace_path.mkdir(parents=True, exist_ok=True)

if not bace_csv_path.is_file():
  print("Downloading data")
  # timeout stops the cell hanging forever on a stalled connection
  request = requests.get(bace_csv_url, timeout=60)
  # Raises requests.HTTPError on 4xx/5xx instead of silently saving the error page
  request.raise_for_status()
  bace_csv_path.write_bytes(request.content)

data/raw is not a directory, creating one
Downloading data


In [12]:
# Build the molecule dataset, shuffle it, and split it 80/20 into train/test.
#
# The torch RNG is seeded first so the shuffle — and therefore the split — is
# the same on every Restart & Run All.
# NOTE(review): assumes MoleculeDataset.shuffle() draws its permutation from the
# global torch RNG, as torch_geometric's Dataset.shuffle does — confirm in
# bace_dataset_utils.
split_seed = 42
torch.manual_seed(split_seed)

original_dataset = MoleculeDataset(root = "data/", csv_file = "bace.csv").shuffle()
dataset = original_dataset

# Despite the name, this is a single integer: the number of graphs in the
# training split, used as the slice boundary below.
train_indices = int(0.8*len(dataset))

train_dataset = dataset[:train_indices]
test_dataset = dataset[train_indices:]

Processing...
Done!


In [13]:
# Worker processes per DataLoader: at most 4, but never more than the CPUs the
# runtime actually has (os.cpu_count() may return None, hence the fallback to 1).
num_workers = min(4, os.cpu_count() or 1)
# Create the dataloaders from the dataset splits.
# Training batches are reshuffled every epoch; the test loader keeps a fixed
# order so evaluation passes iterate over the data identically each time.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=num_workers)



# GCN training runs

In [14]:
# Shared settings for the GCN classifier training runs
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Silence RuntimeWarning messages for the rest of the session
warnings.filterwarnings("ignore", category=RuntimeWarning)

# --- model dimensions ---
# Width of the node-feature matrix, read off the first training batch
num_features = next(iter(train_dataloader)).x.size(1)
num_hidden_channels, num_out_channels = 128, 2
pool_method = global_mean_pool

# --- training schedule ---
# Epoch budget; earlier note: with global mean pooling the best result is
# reached at around 100 epochs
nb_epochs = 300
window_size, patience = 10, 30
criterion = nn.CrossEntropyLoss()


<IPython.core.display.Javascript object>



In [15]:
# Leftover inspection cell: evaluates the imported `gcn_callable` name so the
# notebook displays its repr. It does not call it and assigns nothing.
gcn_callable

In [16]:
# Leftover inspection cell: evaluates the imported `run_model_repeats` name so
# the notebook displays its repr. It does not call it and assigns nothing.
run_model_repeats

In [19]:
# Short GCN run: 2 repeats of 2 epochs each via run_model_repeats
# (presumably a smoke test of the helper rather than a real training run).
#
# NOTE(review): in the recorded output this cell fails with
# "NameError: name 'pool_method' is not defined" after the progress bar starts,
# even though `pool_method` is assigned just below and also passed as a keyword.
# Presumably the name is being looked up inside bace_dataset_utils, whose module
# globals do not include variables defined in this notebook — confirm there and
# have the helper use its own arguments instead of globals.
# NOTE(review): `model` receives an already-built GCNClassifier instance while
# `optimizer_` receives a callable; check which of the two run_model_repeats
# expects for `model` (an imported `gcn_callable` exists).

# Re-derived here although both names are already set in the GCN settings cell
num_features = next(iter(train_dataloader)).x.shape[1]
pool_method = global_mean_pool

print(pool_method)

# The literals 128 / 10 below repeat num_hidden_channels / window_size from the
# settings cell; nb_epochs and patience deliberately differ from the values there.
run_model_repeats(model = GCNClassifier(num_features, num_hidden_channels, num_out_channels, global_mean_pool),
                  optimizer_ = adam_optimizer_callable,
                  criterion_ = criterion,
                  num_hidden_channels = 128,
                  pool_method = global_mean_pool,
                  nb_epochs = 2,
                  nb_repeats = 2,
                  window_size = 10,
                  patience = 50)

<function global_mean_pool at 0x7e2866258e50>


  0%|          | 0/2 [00:00<?, ?it/s]

NameError: name 'pool_method' is not defined

In [None]:
# Call the imported model factory with no arguments.
# NOTE(review): the recorded output is "NameError: name 'num_features' is not
# defined". Presumably gcn_callable reads `num_features` from the globals of
# bace_dataset_utils, where this notebook's variables are not visible — confirm,
# and pass the value in explicitly rather than relying on a global.
gcn_callable()

NameError: name 'num_features' is not defined