Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and prediction paths used later live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Report the CUDA toolkit version of this runtime (the DGL wheel index used in the install cell is CUDA-specific)
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


Install Libraries

In [None]:
!pip install  dgl -f https://data.dgl.ai/wheels/cu118/repo.html
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html
!pip install dgllife
!pip install torch
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/cu118/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu118/dgl-1.1.0%2Bcu118-cp310-cp310-manylinux1_x86_64.whl (86.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.4/86.4 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl
Successfully installed dgl-1.1.0+cu118
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting isort>=5.10.1 (from dglgo)
  Downloading isort-5.12.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31

Import Statements

In [None]:
import os
import io
import pickle
import shutil
import warnings
import statistics
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
from random import seed
from random import shuffle
import matplotlib.pyplot as plt
from itertools import combinations

import deepchem as dc
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.models.layers import GraphConv, GraphPool, GraphGather

from keras import backend as K
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Reshape, LSTM, Dropout, BatchNormalization, Conv2D, Flatten

from scipy.stats import pearsonr
from scipy.stats import wasserstein_distance

from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow.keras.layers as layers

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import SimilarityMaps

#Seed
#Seed every random number generator the notebook draws from (Python, NumPy, TensorFlow, PyTorch),
#not only Python's `random`, so the fold shuffle stays fixed and model training varies less between runs
import torch

SEED = 13
seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)



Path to load/save models and predictions

In [None]:
#Folder on the mounted Drive that receives each model's per-fold prediction pickles
path = "/content/drive/MyDrive/Miscellaneous Code/SpectraPredictions/"
#Create the folder up front so a missing directory cannot fail the pickle dump at the end of a training run
os.makedirs(path, exist_ok = True)

Load Dataset into 5 separate folds

In [None]:
from collections import Counter

#The pickle holds a dict with "smiles" and "sequences" entries.
#NOTE: pickle.load can execute arbitrary code, so only open dataset files that come from a trusted source.
with open("/content/drive/MyDrive/Colab Notebooks/NIST Dataset.pickle", "rb") as file_to_read:
    d = pickle.load(file_to_read)

smiles = np.asarray(d["smiles"])
sequences = np.asarray(d["sequences"])

#The single carbon node (methane) breaks the MolGraphConv featurizer below, so this compound is manually removed.
#A boolean mask works whether "C" is present once, several times, or not at all.
keep = smiles != "C"
smiles = smiles[keep]
sequences = sequences[keep]

#Zip each data sequence
dataset = list(zip(smiles, sequences))
shuffle(dataset)

#Extract compounds that occur more than once so that repeats aren't distributed across folds.
#Occurrences are counted once up front instead of rescanning the full SMILES list for every molecule.
occurrences = Counter(d["smiles"])
single_occurence_molecules = [x for x in dataset if occurrences[x[0]] <= 1]
multiple_occurence_molecules = [x for x in dataset if occurrences[x[0]] > 1]
#Independent copy: the working list below is extended afterwards, so an alias would not keep the repeats alone
multis = list(multiple_occurence_molecules)

#Create folds
folds = {}
fold_size = len(single_occurence_molecules) // 5
for i in range(1, 6):
    folds[i] = single_occurence_molecules[((i - 1) * fold_size):(i * fold_size)]

#Add whatever wasn't added from single occurrences to the end of multiple occurrences
multiple_occurence_molecules += single_occurence_molecules[(5 * fold_size):]
mult_fold_size = len(multiple_occurence_molecules) // 5

#Group the remaining molecules by SMILES, keeping first-appearance order (dicts preserve insertion order)
groups = {}
for molecule in multiple_occurence_molecules:
    groups.setdefault(molecule[0], []).append(molecule)

#Deal the groups out round-robin (fold 1, 2, ..., 5, 1, ...) so that all repeat occurrences
#of a compound always land within the same fold
current_fold = 0
for group_index, group in enumerate(groups.values()):
    current_fold = (group_index % 5) + 1
    folds[current_fold].extend(group)

#Print the length of each fold
for i in range(1, 6):
    print(len(folds[i]))

1501
1503
1500
1505
1496


Create test and train sets

In [None]:
#Helper Functions
def normalize(s):
    """Scale a series so that its largest value becomes 1.

    Args:
        s: Iterable of numbers (list, tuple or 1-D array).

    Returns:
        A new list with every element of ``s`` multiplied by ``1 / max(s)``.
        When the maximum is 0 the scale factor is 0, so every element maps to
        zero. An empty input gives an empty list.
    """
    values = list(s)
    if not values:
        return []
    maxval = max(values)
    # Pick the scale before dividing: a zero maximum must not reach the division
    scale = 0 if maxval == 0 else 1 / maxval
    return [j * scale for j in values]

def floor_out(x, threshold = 0.01):
    """Zero out every value that does not exceed a noise floor.

    Args:
        x: Iterable of numbers.
        threshold: Values must be strictly greater than this to be kept;
            defaults to 0.01.

    Returns:
        A list of the same length as ``x`` in which each value that is not
        greater than ``threshold`` has been replaced by 0.
    """
    return [j if j > threshold else 0 for j in x]

def normal_many(x):
    """Apply normalize() and then floor_out() to each series in x and stack the results into an array"""
    processed_rows = []
    for series in x:
        scaled = normalize(series)
        processed_rows.append(floor_out(scaled))
    return np.array(processed_rows)

#Create fold sets
#Index of the first spectrum point kept; everything before it is sliced away from the train and test targets
SPECTRUM_START = 432
dataset_splits = {i: {} for i in range(1, 6)}
for i in range(1, 6):
  #For each i-th split, the testing set will be the i-th fold
  test = folds[i]
  #...and the training set is the concatenation of the four remaining folds
  train = []
  for x in range(1, 6):
    if x != i:
      train += folds[x]

  #Each dataset entry is a (smiles, sequence) pair
  dataset_splits[i]["test_smiles"] = [j[0] for j in test]
  dataset_splits[i]["test_y"] = normal_many([j[1] for j in test])[:,SPECTRUM_START:]
  dataset_splits[i]["train_smiles"] = [j[0] for j in train]
  dataset_splits[i]["train_y"] = normal_many([j[1] for j in train])[:,SPECTRUM_START:]


Define Loss Functions

In [None]:
def euc_dist_keras(y_true, y_pred):
    """Loss: Euclidean distance between target and prediction, reduced over the last axis (which is kept with size 1)"""
    squared_difference = K.square(y_true - y_pred)
    summed = K.sum(squared_difference, axis = -1, keepdims = True)
    return K.sqrt(summed)

def pearson_first(y_true, y_pred):
    """Pearson correlation coefficient of two 1-D series; the accompanying p-value is discarded"""
    result = pearsonr(y_true, y_pred)
    coefficient = result[0]
    return coefficient

def wrapped_pearson_correlation(y_true, y_pred):
    """Run pearson_first through tf.py_function and return the result as a float32 tensor"""
    return tf.py_function(
        func = pearson_first,
        inp = [y_true, y_pred],
        Tout = tf.float32,
    )

Run SMILES through DC Featurizer

In [None]:
#Circular fingerprints (radius 2, size 1024) for the test and train SMILES of every split
featurizer = dc.feat.CircularFingerprint(radius = 2, size = 1024, chiral = False, features = False)
for i in range(1, 6):
  for subset in ("test", "train"):
    dataset_splits[i][subset + "_x"] = featurizer.featurize(dataset_splits[i][subset + "_smiles"])

Add Graph Featurization

In [None]:
#ConvMol graph features for the test and train SMILES of every split
graph_featurizer = dc.feat.ConvMolFeaturizer()
for i in range(1, 6):
  for subset in ("test", "train"):
    dataset_splits[i][subset + "_x_graph"] = graph_featurizer.featurize(dataset_splits[i][subset + "_smiles"])


Graph Data Generator

In [None]:
def data_generator(dataset, epochs = 1, batch_size = None):
    """Yield one (inputs, labels, weights) tuple per batch of a ConvMol-featurized dataset.

    Args:
        dataset: Object whose ``iterbatches(batch_size, epochs, deterministic=...,
            pad_batches=...)`` yields ``(X, y, w, ids)`` batches; presumably a
            DeepChem dataset holding ConvMol objects (TODO confirm against callers).
        epochs: Number of passes over ``dataset``.
        batch_size: Molecules per batch. When omitted, a module-level
            ``batch_size`` is used if one is defined, otherwise 64 (the batch
            size the models in this notebook are built with).

    Yields:
        ``(inputs, labels, weights)`` for each batch, where ``inputs`` is the list
        ``[atom_features, deg_slice, membership, *deg_adjacency_lists[1:]]`` of the
        agglomerated batch, ``labels`` is ``[y_b]`` and ``weights`` is ``[w_b]``.
    """
    if batch_size is None:
        # Keep honouring a notebook-level `batch_size` if one exists
        batch_size = globals().get("batch_size", 64)

    for X_b, y_b, w_b, ids_b in dataset.iterbatches(batch_size, epochs, deterministic = True, pad_batches = True):
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice, multiConvMol.membership]
        # Degree-0 adjacency list is skipped; the remaining ones are appended in order.
        # `inputs` stays a plain list: its members have different shapes.
        inputs.extend(multiConvMol.get_deg_adjacency_lists()[1:])
        labels = [y_b]
        weights = [w_b]
        # Yield inside the loop so every batch is emitted, not only the last one
        yield (inputs, labels, weights)

def clean(arr):
    """Convert every item of arr to float and return the non-NaN values as a list"""
    #Helper Function for DC featurizers
    as_floats = [float(item) for item in arr]
    kept = []
    for value in as_floats:
        if np.isnan(value):
            continue
        kept.append(value)
    return kept

Other Models Featurizers and Training

In [None]:
#GraphConv featurization
#ConvMol features may already be present in dataset_splits from an earlier cell; only compute what is
#missing so the folds are not featurized twice, while this cell still works when run on its own
graph_featurizer = dc.feat.ConvMolFeaturizer()
for i in range(1, 6):
  for subset in ("test", "train"):
    key = subset + "_x_graph"
    if key not in dataset_splits[i]:
      dataset_splits[i][key] = graph_featurizer.featurize(dataset_splits[i][subset + "_smiles"])

In [None]:
#Weave featurization
#Stored under the "*_x_mpnn" keys of every split
mpnn = dc.feat.WeaveFeaturizer()
for i in range(1, 6):
  for subset in ("test", "train"):
    dataset_splits[i][subset + "_x_mpnn"] = mpnn.featurize(dataset_splits[i][subset + "_smiles"])

In [None]:
#MolGraphConv featurization
#Edge features are included (use_edges=True); results are stored as plain lists under the "*_x_mgc" keys
gat = dc.feat.MolGraphConvFeaturizer(use_edges=True)
for i in range(1, 6):
  for subset in ("test", "train"):
    featurized = gat.featurize(dataset_splits[i][subset + "_smiles"])
    dataset_splits[i][subset + "_x_mgc"] = list(featurized)

GCN

In [None]:
#GraphConvModel training loop
#For every split: fit on the training folds, predict the test fold, score each predicted spectrum
#against its target with the Pearson correlation, then pickle the raw predictions.
#(pickle is already imported in the imports cell, so it is not re-imported here.)
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X = split["train_x_graph"], y = split["train_y"])
    dtest = dc.data.NumpyDataset(X = split["test_x_graph"], y = split["test_y"])
    gcnmodel = dc.models.GraphConvModel(
        1800,
        mode = 'regression',
        dropout = 0.1,
        batch_normalize = True,
        dense_layer_size = 2048,
        batch_size = 64,
        learning_rate = 0.001,
        activation_fns = [tf.nn.sigmoid] * 7,
    )
    gcnmodel.fit(dtrain, nb_epoch = 100)

    #Collect evaluation metrics
    g1predictions = gcnmodel.predict(dtest)
    graph_r2s = []
    total_r2, count = 0, 0
    for x, target in enumerate(split["test_y"]):
        current_r2 = wrapped_pearson_correlation(normalize(g1predictions[x]), target)
        #A NaN score adds nothing to the running total but still counts towards the fold average
        if not np.isnan(current_r2):
            total_r2 += current_r2
        graph_r2s.append(current_r2)
        count += 1
    current_fold_loss = round(float(total_r2 / count), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    #clean() converts to float itself and drops NaN scores, so the summary statistics exclude them
    gr2 = clean(graph_r2s)
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    fold_predictions_path = path + "GC_" + str(i) + "_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(g1predictions, handle)

R2 Loss for fold 1 : 0.78268
I: 1 Mean 0.7826799059858786 Median 0.8166015148162842 STDev 0.16707830807219093
R2 Loss for fold 2 : 0.75424
I: 2 Mean 0.7542366366548413 Median 0.7876438498497009 STDev 0.17751610789087374
R2 Loss for fold 3 : 0.81345
I: 3 Mean 0.8134498006825646 Median 0.8551957309246063 STDev 0.15041162379188527




R2 Loss for fold 4 : 0.78765
I: 4 Mean 0.7881760836351028 Median 0.8426811099052429 STDev 0.18534759265484863
R2 Loss for fold 5 : 0.77158
I: 5 Mean 0.7715848307598402 Median 0.8202866613864899 STDev 0.17708218476898757


MPNN

In [None]:
#MPNN model training loop
from deepchem.models.torch_models import MPNNModel
import dgl

#For every split: fit on the training folds, predict the test fold, score and save the predictions
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X = split["train_x_mgc"], y = split["train_y"])
    dtest = dc.data.NumpyDataset(X = split["test_x_mgc"], y = split["test_y"])
    #NOTE(review): confirm which of these keyword arguments MPNNModel actually consumes
    model = MPNNModel(
        1800,
        mode = 'regression',
        dropout = 0.1,
        batch_normalize = True,
        dense_layer_size = 2048,
        batch_size = 64,
        n_pair_feat = 14,
        n_atom_feat = 75,
    )
    model.fit(dtrain, nb_epoch = 100)

    #Collect evaluation metrics
    graph_r2s = []
    g2predictions = model.predict(dtest)
    total_r2, count = 0, 0
    for x, target in enumerate(split["test_y"]):
        current_r2 = wrapped_pearson_correlation(normalize(g2predictions[x]), target)
        #A NaN score adds nothing to the running total but still counts towards the fold average
        if not np.isnan(current_r2):
            total_r2 += current_r2
        graph_r2s.append(current_r2)
        count += 1
    current_fold_loss = round(float(total_r2 / count), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    #NaN scores are dropped by clean() before the summary statistics are computed
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    fold_predictions_path = f"{path}MPNN_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(g2predictions, handle)

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


  assert input.numel() == input.storage().size(), (


R2 Loss for fold 1 : 0.86846
I: 1 Mean 0.8684604604648479 Median 0.9117663502693176 STDev 0.14041430869531368
R2 Loss for fold 2 : 0.86286
I: 2 Mean 0.8628573239156982 Median 0.9146058559417725 STDev 0.15230558812402234
R2 Loss for fold 3 : 0.87391
I: 3 Mean 0.8739131643250585 Median 0.9148865342140198 STDev 0.13244216886571603
R2 Loss for fold 4 : 0.8592
I: 4 Mean 0.8597730922071818 Median 0.9120941460132599 STDev 0.1576911603480587
R2 Loss for fold 5 : 0.87027
I: 5 Mean 0.8702684227367216 Median 0.9176366031169891 STDev 0.1448158321177


AttentiveFP

In [None]:
#AttentiveFP model training loop
#(deepchem is already imported as dc in the imports cell, so only the model class is imported here.)
from deepchem.models import AttentiveFPModel

#For every split: fit on the training folds, predict the test fold, score and save the predictions
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X = split["train_x_mgc"], y = split["train_y"])
    dtest = dc.data.NumpyDataset(X = split["test_x_mgc"], y = split["test_y"])
    #NOTE(review): activation_fns = "p" looks like a placeholder; confirm which keyword arguments AttentiveFPModel actually consumes
    fpmodel = AttentiveFPModel(
        n_tasks = 1800,
        mode = 'regression',
        dropout = 0.1,
        batch_normalize = True,
        dense_layer_size = 2048,
        batch_size = 64,
        learning_rate = 0.001,
        activation_fns = "p",
    )

    fpmodel.fit(dtrain, nb_epoch = 100)
    fppredictions = fpmodel.predict(dtest)

    #Collect evaluation metrics
    graph_r2s = []
    total_r2, count = 0, 0
    for x, target in enumerate(split["test_y"]):
        current_r2 = wrapped_pearson_correlation(normalize(fppredictions[x]), target)
        #A NaN score adds nothing to the running total but still counts towards the fold average
        if not np.isnan(current_r2):
            total_r2 += current_r2
        graph_r2s.append(current_r2)
        count += 1
    current_fold_loss = round(float(total_r2 / count), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    #clean() converts to float itself and drops NaN scores, so the summary statistics exclude them
    gr2 = clean(graph_r2s)
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    fold_predictions_path = path + "AFP_" + str(i) + "_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(fppredictions, handle)

R2 Loss for fold 1 : 0.88973
I: 1 Mean 0.8897326930117688 Median 0.9348607063293457 STDev 0.13656071402891767
R2 Loss for fold 2 : 0.88605
I: 2 Mean 0.8860499247659496 Median 0.9350899457931519 STDev 0.13984195648038159
R2 Loss for fold 3 : 0.88952
I: 3 Mean 0.8895247695371509 Median 0.9374888837337494 STDev 0.13027806755725838




R2 Loss for fold 4 : 0.88083
I: 4 Mean 0.881416023201111 Median 0.9341515004634857 STDev 0.1495300033128919
R2 Loss for fold 5 : 0.88883
I: 5 Mean 0.8888323200757012 Median 0.9384139478206635 STDev 0.13985106077743867


GAT Model

In [None]:
#GAT model training loop
#For every split: fit on the training folds, predict the test fold, score and save the predictions
for i in range(1, 6):
    split = dataset_splits[i]
    dtrain = dc.data.NumpyDataset(X = split["train_x_mgc"], y = split["train_y"])
    dtest = dc.data.NumpyDataset(X = split["test_x_mgc"], y = split["test_y"])
    model = dc.models.GATModel(
        1800,
        mode = 'regression',
        dropout = 0.1,
        graph_attention_layers = [64, 64],
        batch_normalize = True,
        dense_layer_size = 2048,
        batch_size = 64,
        learning_rate = 0.001,
    )
    model.fit(dtrain, nb_epoch = 100)
    gatpredictions = model.predict(dtest)

    #Collect evaluation metrics
    graph_r2s = []
    total_r2, count = 0, 0
    for x, target in enumerate(split["test_y"]):
        current_r2 = wrapped_pearson_correlation(normalize(gatpredictions[x]), target)
        #A NaN score adds nothing to the running total but still counts towards the fold average
        if not np.isnan(current_r2):
            total_r2 += current_r2
        graph_r2s.append(current_r2)
        count += 1
    current_fold_loss = round(float(total_r2 / count), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    #NaN scores are dropped by clean() before the summary statistics are computed
    gr2 = clean(list(map(float, graph_r2s)))
    print("I:", i, "Mean", statistics.mean(gr2), "Median", statistics.median(gr2), "STDev", statistics.stdev(gr2))

    fold_predictions_path = f"{path}GAT_{i}_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(gatpredictions, handle)

R2 Loss for fold 1 : 0.81116
I: 1 Mean 0.811163037599652 Median 0.8485574722290039 STDev 0.1533507744152288
R2 Loss for fold 2 : 0.80345
I: 2 Mean 0.8034500384062568 Median 0.8431214094161987 STDev 0.15703145370518626
R2 Loss for fold 3 : 0.81251
I: 3 Mean 0.812512859771649 Median 0.8478283584117889 STDev 0.14583377658594088
R2 Loss for fold 4 : 0.80498
I: 4 Mean 0.8055115403759818 Median 0.8454619348049164 STDev 0.16049066387768288
R2 Loss for fold 5 : 0.80723
I: 5 Mean 0.8072259483670935 Median 0.8390827775001526 STDev 0.14946917200372078


MorganFP/DNN Model

In [None]:
#MorganFP model using a dense layer as output
#For every split: build a fresh network, fit on the training folds, predict the test fold, score and save
for i in range(1, 6):
    split = dataset_splits[i]

    #Fully connected stack from the 1024-wide fingerprint input to the 1800-wide sigmoid output
    fpmodel = Sequential()
    fpmodel.add(Dense(4096, input_dim = 1024))
    fpmodel.add(BatchNormalization())
    fpmodel.add(Dropout(0.1))
    fpmodel.add(Dense(2048, activation = "relu"))
    fpmodel.add(BatchNormalization())
    fpmodel.add(Dropout(0.1))
    fpmodel.add(Dense(1024, activation = "relu"))
    fpmodel.add(Dense(1800, activation = "sigmoid"))

    fpmodel.compile(loss = euc_dist_keras, optimizer = "Adam")
    fpmodel.fit(split["train_x"], split["train_y"], batch_size = 64, epochs = 100, verbose = 0)

    #Collect evaluation metrics
    morganpredictions = fpmodel.predict(split["test_x"])
    total_r2, count = 0, 0
    fp_r2s = []
    for x, prediction in enumerate(morganpredictions):
        current_r2 = wrapped_pearson_correlation(normalize(prediction), split["test_y"][x])
        #A NaN score adds nothing to the running total but still counts towards the fold average
        if not np.isnan(current_r2):
            total_r2 += current_r2
        fp_r2s.append(current_r2)
        count += 1

    current_fold_loss = round(float(total_r2 / count), 5)
    print("R2 Loss for fold", i, ":", current_fold_loss)

    #clean() converts to float itself and drops NaN scores, so the summary statistics exclude them
    cleanfpr2s = clean(fp_r2s)
    print("I", i, statistics.mean(cleanfpr2s), statistics.median(cleanfpr2s), statistics.stdev(cleanfpr2s))

    fold_predictions_path = path + "MFP_" + str(i) + "_preds.pickle"
    with open(fold_predictions_path, 'wb') as handle:
        pickle.dump(morganpredictions, handle)

R2 Loss for fold 1 : 0.86806
I 1 0.8680567330681408 0.9198525547981262 0.15146768710067954
R2 Loss for fold 2 : 0.86172
I 2 0.8617206382405952 0.9236180186271667 0.15817543832473255
R2 Loss for fold 3 : 0.87118
I 3 0.8711828431952745 0.9273976683616638 0.1468683184148113
R2 Loss for fold 4 : 0.85997
I 4 0.8605389280338831 0.9221876263618469 0.16141339017634002
R2 Loss for fold 5 : 0.86896
I 5 0.8689623830795169 0.9259579479694366 0.15243854976736723
