In [1]:
import warnings
warnings.filterwarnings("ignore")

import time
import copy
import pickle

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.metrics import mean_squared_error, r2_score, make_scorer

import deepchem as dc

In [2]:
def rmse(estimator, X_eval, y_eval):
    y_hat = estimator.predict(X_eval)
    return np.sqrt(mean_squared_error(y_eval.flatten(), y_hat.flatten()))


def r2(estimator, X_eval, y_eval):
    y_hat = estimator.predict(X_eval)
    return r2_score(y_eval.flatten(), y_hat.flatten())


def peason_r(estimator, X_eval, y_eval):
    y_hat = estimator.predict(X_eval)
    return np.corrcoef(y_eval.flatten(), y_hat.flatten())[0, 1]


def peason_r_metric(y_true, y_pred):
    return np.corrcoef(y_true.flatten(), y_pred.flatten())[0, 1]

peason_r_score = make_scorer(peason_r_metric)

# Create train/test datasets

In [3]:
train_file = "data/process/pdY_train.csv"
test_file = "data/process/pdY_test.csv"
chembl_27_file = "data/process/pdY_chembl_27.csv"
vietherbs_file = "data/process/pdY_vietherbs.csv"

tasks = ["dG"]
ntasks = len(tasks)
featurizer_func = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks, feature_field='smiles', featurizer=featurizer_func)

train_dataset = loader.create_dataset(train_file)
test_dataset = loader.create_dataset(test_file)

chembl_27_dataset = loader.create_dataset(chembl_27_file)
vietherbs_dataset = loader.create_dataset(vietherbs_file)

Failed to featurize datapoint 377, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 378, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 379, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 380, None. Appending empty array
Exception mes

Failed to featurize datapoint 1936, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 1978, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 2120, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 2302, None. Appending empty array
Exception

# Default

In [9]:
model = dc.models.GraphConvModel(ntasks, mode='regression', model_dir="models/graph_conv_01")
model.restore()

print("Train RMSE:", rmse(model, train_dataset, train_dataset.y))
print("Train Pearson's R:", peason_r(model, train_dataset, train_dataset.y))

print("Test RMSE:", rmse(model, test_dataset, test_dataset.y))
print("Test Pearson's R:", peason_r(model, test_dataset, test_dataset.y))

Train RMSE: 2.0656511167574054
Train Pearson's R: 0.7214282041072938
Test RMSE: 2.178783089278693
Test Pearson's R: 0.609781554656271


## `graph_conv_layers = [64, 64, 64]`, `dense_layer_size=128`, `dropout=0`

In [10]:
batch_size = 64
batches_per_epoch = 600/batch_size
learning_rate = dc.models.optimizers.ExponentialDecay(0.0001, 0.9, batches_per_epoch)

model = dc.models.GraphConvModel(ntasks, 
                                 graph_conv_layers=[64, 64, 64], 
                                 dense_layer_size=128,
                                 dropout=0.,
                                 batch_normalize=True,
                                 mode='regression', 
                                 batch_size=batch_size, 
                                 learning_rate=learning_rate)

metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, metric)
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

print("Train RMSE:", rmse(model, train_dataset, train_dataset.y))
print("Train Pearson's R:", peason_r(model, train_dataset, train_dataset.y))

print("Test RMSE:", rmse(model, test_dataset, test_dataset.y))
print("Test Pearson's R:", peason_r(model, test_dataset, test_dataset.y))





Step 100 validation: rms_score=5.77003
Step 200 validation: rms_score=4.20323
Step 300 validation: rms_score=2.74132
Step 400 validation: rms_score=1.85557
Step 500 validation: rms_score=1.44241
Step 600 validation: rms_score=1.28334
Step 700 validation: rms_score=1.25131
Train RMSE: 0.6844424440680644
Train Pearson's R: 0.9030769602347032
Test RMSE: 1.2513115244439372
Test Pearson's R: 0.6806038223231273


##  `graph_conv_layers = [128, 128, 128,  128, 128]`, `dense_layer_size=256`, `dropout=0.0`

In [12]:
model = dc.models.GraphConvModel(ntasks, 
                                 graph_conv_layers=[128, 128, 128, 128, 128], 
                                 dense_layer_size=256,
                                 dropout=0.0,
                                 batch_normalize=True,
                                 mode='regression', 
                                 batch_size=64)

metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, metric)
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

print("Train RMSE:", rmse(model, train_dataset, train_dataset.y))
print("Train Pearson's R:", peason_r(model, train_dataset, train_dataset.y))

print("Test RMSE:", rmse(model, test_dataset, test_dataset.y))
print("Test Pearson's R:", peason_r(model, test_dataset, test_dataset.y))

Step 100 validation: rms_score=1.55199
Step 200 validation: rms_score=2.09454
Step 300 validation: rms_score=1.64835
Step 400 validation: rms_score=1.17031
Step 500 validation: rms_score=1.08215
Step 600 validation: rms_score=1.07372
Step 700 validation: rms_score=1.09179
Train RMSE: 0.5086564617883529
Train Pearson's R: 0.9479088499923816
Test RMSE: 1.0917886065207112
Test Pearson's R: 0.7480186549121637


In [15]:
y_test_pred = model.predict(test_dataset)
test_pred_df = pd.DataFrame({"smiles": test_dataset.ids, "dG": test_dataset.y.flatten(), "pred": y_test_pred.flatten()})
test_pred_df.to_csv("results/graph_conv/test_pred.csv", index=False)

## graph_conv_layers = [64, 128,], dense_layer_size=128, dropout=0.0

In [16]:
model = dc.models.GraphConvModel(ntasks, 
                                 graph_conv_layers=[512, 512, 512], 
                                 dense_layer_size=512,
                                 dropout=0.,
                                 batch_normalize=True,
                                 mode='regression', 
                                 batch_size=64)

metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, metric)
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

print("Train RMSE:", rmse(model, train_dataset, train_dataset.y))
print("Train Pearson's R:", peason_r(model, train_dataset, train_dataset.y))

print("Test RMSE:", rmse(model, test_dataset, test_dataset.y))
print("Test Pearson's R:", peason_r(model, test_dataset, test_dataset.y))

Step 100 validation: rms_score=4.82356
Step 200 validation: rms_score=5.44167
Step 300 validation: rms_score=3.64936
Train RMSE: 2.274671702167959
Train Pearson's R: 0.661570009933509
Test RMSE: 2.444204925293292
Test Pearson's R: 0.4468855228492178
