<a href="https://colab.research.google.com/github/lnsayer/personal_repo/blob/main/drug%20discovery%20with%20BACE%20dataset/bace_dataset_going_modular.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

os.makedirs("going_modular", exist_ok=True)

In [3]:
print(os.getcwd())

/content


In [24]:
%%writefile going_modular/get_data.py

import requests
from pathlib import Path
import pandas as pd

# If want to download bace csv straight into colab
data_path = Path("going_modular/data/")
bace_path = data_path / "raw"

if bace_path.is_dir():
  print(f"{bace_path} is already a directory")
else:
  print(f"{bace_path} is not a directory, creating one")
  bace_path.mkdir(parents=True, exist_ok=True)

  with open(bace_path / "bace.csv", "wb") as f:
    request = requests.get("https://raw.githubusercontent.com/lnsayer/personal_repo/main/drug%20discovery%20with%20BACE%20dataset/data/bace.csv")
    print("Downloading data")
    f.write(request.content)

# Resave the csv files without unnecessary columns
bace_df = pd.read_csv(bace_path/ "bace.csv")
bace_df = bace_df[["mol", "CID", "Class", "Model", "pIC50"]]
bace_df.to_csv(bace_path/"bace.csv")

bace_df

Overwriting going_modular/get_data.py


In [25]:
!python going_modular/get_data.py

going_modular/data/raw is already a directory


In [23]:
import pandas as pd
bace_df = pd.read_csv("going_modular/data/raw/bace.csv")
bace_df

Unnamed: 0.1,Unnamed: 0,mol,CID,Class,Model,pIC50
0,0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,BACE_1,1,Train,9.154901
1,1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,BACE_2,1,Train,8.853872
2,2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_3,1,Train,8.698970
3,3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,BACE_4,1,Train,8.698970
4,4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,BACE_5,1,Train,8.698970
...,...,...,...,...,...,...
1508,1508,Clc1cc2nc(n(c2cc1)C(CC(=O)NCC1CCOCC1)CC)N,BACE_1543,0,Test,3.000000
1509,1509,Clc1cc2nc(n(c2cc1)C(CC(=O)NCc1ncccc1)CC)N,BACE_1544,0,Test,3.000000
1510,1510,Brc1cc(ccc1)C1CC1C=1N=C(N)N(C)C(=O)C=1,BACE_1545,0,Test,2.953115
1511,1511,O=C1N(C)C(=NC(=C1)C1CC1c1cc(ccc1)-c1ccccc1)N,BACE_1546,0,Test,2.733298


In [60]:
%%writefile going_modular/data_setup.py

from timeit import default_timer as timer
import subprocess

start_time = timer()
try:
  import torch_geometric
  import deepchem as dc
  print("entered try")
except:
  subprocess.call(['pip', 'install', 'torch_geometric'])
  subprocess.call(['pip', 'install', 'deepchem'])
  print("entered except")
end_time = timer()
print(f"Time to install packages: {end_time-start_time:.4f}")

import torch
import torch_geometric
from torch_geometric.data import InMemoryDataset, Dataset, Data
from torch_geometric.loader import DataLoader

import deepchem as dc
from deepchem.feat.graph_data import GraphData
import os
import pandas as pd
import os.path as osp


# Custom torch geometric Dataset class to store the samples and their corresponding labels

class MoleculeDataset(Dataset):
  def __init__(self, root, csv_file, transform=None, pre_transform=None, pre_filter=None):
    """
    Constructor method of the class

    :root = Path where the dataset should be stored. This folder is split
    into raw_dir (downloaded dataset) and processed_dir(processed data).
    :csv_file = Desired name of the CSV file to be saved.
    : transform, pre_transform, pre_filter = optional transforms
    """
    self.csv_file = csv_file
    super().__init__(root, transform, pre_transform, pre_filter)

  @property
  def raw_file_names(self):
    """
    If this file exists in raw_dir, the download is not triggered/
    (the download function is not implemented here)
    """
    return self.csv_file

  @property
  def processed_file_names(self):
    """
    If these files are found in raw_dir, processing is skipped
    """
    self.data = pd.read_csv(self.raw_paths[0]).reset_index()

    return [f'data_{i}.pt' for i in list(self.data.index)]

  def download(self):
    """
    No need to download the csv file as it is already downloaded
    """
    pass

  def process(self):
    """
    Converts molecules with SMILES formats into PyTorch graphs. Uses Deepchem's MolGraphConvFeaturizer to create a graph
    and then convert that to a torch graph with to_pyg_graph. Saves these in the processed directory.
    """
    self.data = pd.read_csv(self.raw_paths[0]).reset_index()
    featurizer=dc.feat.MolGraphConvFeaturizer(use_edges=True)

    for idx, row in self.data.iterrows():
      # Featurize molecule and convert to torch graph
      smiles = row['mol']
      label = row['Class']
      pic50 = row['pIC50']

      out = featurizer.featurize(smiles)
      pyg_out = GraphData.to_pyg_graph(out[0])
      pyg_out.Class = torch.tensor([label])
      pyg_out.smiles = smiles
      pyg_out.pic50 = pic50

      # data = Data(x=pyg_out.x, edge_index=pyg_out.edge_index, edge_attr=pyg_out.edge_attr,
      #            y=torch.tensor([label]), dtype = torch.float)

      torch.save(pyg_out, osp.join(self.processed_dir, f'data_{idx}.pt'))

  def len(self):
    """
    Returns number of samples in the dataset
    """
    return len(self.processed_file_names)

  def get(self, idx):
    """
    Loads a single graph
    """
    data = torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))
    return data

NUM_WORKERS = os.cpu_count()

def create_dataloaders(root_directory: str,
                       batch_size: int,
                       num_workers: int=NUM_WORKERS,
                       train_fraction: float=0.8):
  dataset = MoleculeDataset(root = root_directory, csv_file = "bace.csv").shuffle()

  train_indices = int(train_fraction*len(dataset))

  train_dataset = dataset[:train_indices]
  test_dataset = dataset[train_indices:]

  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
  test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

  return dataset, train_dataloader, test_dataloader


Overwriting going_modular/data_setup.py


In [61]:
%%writefile going_modular/train.py

import data_setup
from timeit import default_timer as timer

start_time = timer()
dataset, train_dataloader, test_dataloader = data_setup.create_dataloaders(root_directory = "going_modular/data/",
                                                                batch_size = 32)
end_time = timer()

print(f"Created dataset, train_dataloader, test_dataloader, took {end_time - start_time:.4f}s")

Overwriting going_modular/train.py


In [62]:
from IPython.display import Javascript


display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

!python going_modular/train.py

<IPython.core.display.Javascript object>

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-07-17 09:53:39.013505: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 09:53:39.013587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 09:53:39.016301: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-17 09:53:39.030263: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow w