In [None]:
import warnings
warnings.filterwarnings("ignore")

import time
import copy
import os
import pickle

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.metrics import mean_squared_error, r2_score, make_scorer

import deepchem as dc

In [None]:
def rmse(estimator, X_eval, y_eval):
    """Root-mean-squared error of ``estimator``'s predictions on ``X_eval``.

    Parameters
    ----------
    estimator : object
        Anything exposing ``predict(X_eval)``.
    X_eval : object
        Passed to ``estimator.predict`` unchanged.
    y_eval : array-like
        Reference values; any shape, flattened before scoring.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(y_true, y_pred))`` over the flattened values.
    """
    y_hat = estimator.predict(X_eval)
    # np.asarray(...).ravel() also accepts lists and pandas objects, whereas the
    # ndarray-only ``.flatten()`` method raises AttributeError on those.
    y_true_flat = np.asarray(y_eval).ravel()
    y_pred_flat = np.asarray(y_hat).ravel()
    return np.sqrt(mean_squared_error(y_true_flat, y_pred_flat))


def r2(estimator, X_eval, y_eval):
    """Coefficient of determination of ``estimator``'s predictions on ``X_eval``.

    Parameters
    ----------
    estimator : object
        Anything exposing ``predict(X_eval)``.
    X_eval : object
        Passed to ``estimator.predict`` unchanged.
    y_eval : array-like
        Reference values; any shape, flattened before scoring.

    Returns
    -------
    float
        ``r2_score(y_true, y_pred)`` over the flattened values.
    """
    y_hat = estimator.predict(X_eval)
    # np.asarray(...).ravel() also accepts lists and pandas objects, whereas the
    # ndarray-only ``.flatten()`` method raises AttributeError on those.
    y_true_flat = np.asarray(y_eval).ravel()
    y_pred_flat = np.asarray(y_hat).ravel()
    return r2_score(y_true_flat, y_pred_flat)


def peason_r(estimator, X_eval, y_eval):
    """Pearson correlation between ``estimator``'s predictions and ``y_eval``.

    The function name keeps its existing spelling so current callers keep working.

    Parameters
    ----------
    estimator : object
        Anything exposing ``predict(X_eval)``.
    X_eval : object
        Passed to ``estimator.predict`` unchanged.
    y_eval : array-like
        Reference values; any shape, flattened before scoring.

    Returns
    -------
    float
        Off-diagonal entry of ``np.corrcoef`` for the two flattened vectors.
    """
    y_hat = estimator.predict(X_eval)
    # np.asarray(...).ravel() also accepts lists and pandas objects, whereas the
    # ndarray-only ``.flatten()`` method raises AttributeError on those.
    y_true_flat = np.asarray(y_eval).ravel()
    y_pred_flat = np.asarray(y_hat).ravel()
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross term.
    return np.corrcoef(y_true_flat, y_pred_flat)[0, 1]


def peason_r_metric(y_true, y_pred):
    """Pearson correlation between two sets of values.

    Parameters
    ----------
    y_true : array-like
        Reference values; any shape, flattened before scoring.
    y_pred : array-like
        Predicted values; any shape, flattened before scoring.

    Returns
    -------
    float
        Off-diagonal entry of ``np.corrcoef`` for the two flattened vectors.
    """
    # np.asarray(...).ravel() also accepts lists and pandas objects, whereas the
    # ndarray-only ``.flatten()`` method raises AttributeError on those.
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross term.
    return np.corrcoef(true_flat, pred_flat)[0, 1]

# Scorer object built from the plain (y_true, y_pred) metric, using
# make_scorer's default options.
peason_r_score = make_scorer(score_func=peason_r_metric)

# Create train/test datasets

In [None]:
# Pre-processed CSV inputs.
train_file, test_file = "data/process/pdY_train.csv", "data/process/pdY_test.csv"
chembl_27_file = "data/process/pdY_chembl_27.csv"
vietherbs_file = "data/process/pdY_vietherbs.csv"

# Single regression target column.
tasks = ["dG"]
ntasks = len(tasks)

# One loader is shared by all four files: it reads the 'smiles' column and
# featurizes it with ConvMolFeaturizer.
featurizer_func = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field='smiles',
    featurizer=featurizer_func,
)

# Train/test split used for fitting and evaluation.
train_dataset = loader.create_dataset(train_file)
test_dataset = loader.create_dataset(test_file)

# Additional datasets loaded through the same pipeline.
chembl_27_dataset = loader.create_dataset(chembl_27_file)
vietherbs_dataset = loader.create_dataset(vietherbs_file)

# Default

In [None]:
# GraphConvModel with only the task count, mode and output directory specified;
# every other hyperparameter is left at the library default.
model = dc.models.GraphConvModel(
    ntasks,
    mode='regression',
    model_dir="models/graph_conv_01",
)
model.fit(train_dataset, nb_epoch=50)

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))

# NOTE: pickling the model to "models/graph_conv_01.pkl" is intentionally left
# disabled in this cell.

In [None]:
# Rebuild a model object pointing at the directory used by the training cell
# and call restore() on it (presumably reloading the saved weights from
# model_dir -- confirm against the DeepChem docs).
model = dc.models.GraphConvModel(
    ntasks,
    mode='regression',
    model_dir="models/graph_conv_01",
)
model.restore()

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))

## `graph_conv_layers = [64, 64, 64]`, `dense_layer_size=128`, `dropout=0`

In [None]:
batch_size = 64
# NOTE(review): 600 is hard-coded; it looks like the intended number of
# training examples -- confirm it matches len(train_dataset).
n_train_examples = 600
batches_per_epoch = n_train_examples / batch_size
# Positional arguments: 0.0001, 0.9 and batches_per_epoch (presumably initial
# rate, decay factor and decay steps -- confirm against the DeepChem docs).
learning_rate = dc.models.optimizers.ExponentialDecay(0.0001, 0.9, batches_per_epoch)

model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[64, 64, 64],
    dense_layer_size=128,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=batch_size,
    learning_rate=learning_rate,
)

metric = dc.metrics.Metric(dc.metrics.rms_score)
# ValidationCallback documents `metrics` as a list of Metric objects, so the
# single metric is wrapped in a list instead of being passed bare.
# NOTE(review): the callback monitors the *test* set during training.
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))

In [None]:
# Reload the network trained in the previous cell and re-score it.
#
# The earlier version of this cell built GraphConvModel(ntasks, mode='regression')
# with no model_dir and default layers. That object points at a new directory
# with no checkpoint in it, and its architecture differs from the
# [64, 64, 64] / 128 network trained above, so restore() could not load those
# weights. Here the directory of the in-kernel trained model is reused and the
# hyperparameters are repeated so the variables line up.
trained_model_dir = model.model_dir
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[64, 64, 64],
    dense_layer_size=128,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=batch_size,
    model_dir=trained_model_dir,
)
model.restore()

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))

## `graph_conv_layers = [128, 128, 128, 128, 128]`, `dense_layer_size=256`, `dropout=0.0`

In [None]:
# Five graph-convolution layers of width 128 followed by a 256-unit dense
# layer, with batch normalization and no dropout.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[128, 128, 128, 128, 128],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    mode='regression',
    batch_size=64,
)

metric = dc.metrics.Metric(dc.metrics.rms_score)
# ValidationCallback documents `metrics` as a list of Metric objects, so the
# single metric is wrapped in a list instead of being passed bare.
# NOTE(review): the callback monitors the *test* set during training.
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))

In [None]:
# Score the test set with whichever `model` is currently live in the kernel and
# store ids, reference values and predictions side by side.
y_test_pred = model.predict(test_dataset)
test_pred_df = pd.DataFrame({
    "smiles": test_dataset.ids,
    "dG": test_dataset.y.flatten(),
    "pred": y_test_pred.flatten(),
})

test_pred_path = "results/graph_conv/test_pred.csv"
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(test_pred_path), exist_ok=True)
test_pred_df.to_csv(test_pred_path, index=False)

## `graph_conv_layers = [512, 512, 512]`, `dense_layer_size=512`, `dropout=0.0`

In [None]:
# Three graph-convolution layers of width 512 followed by a 512-unit dense
# layer, with batch normalization and no dropout.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[512, 512, 512],
    dense_layer_size=512,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=64,
)

metric = dc.metrics.Metric(dc.metrics.rms_score)
# ValidationCallback documents `metrics` as a list of Metric objects, so the
# single metric is wrapped in a list instead of being passed bare.
# NOTE(review): the callback monitors the *test* set during training.
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

# (label, scoring helper, dataset) rows, reported in a fixed order.
for row_label, row_scorer, row_data in (
    ("Train RMSE:", rmse, train_dataset),
    ("Train Pearson's R:", peason_r, train_dataset),
    ("Test RMSE:", rmse, test_dataset),
    ("Test Pearson's R:", peason_r, test_dataset),
):
    print(row_label, row_scorer(model, row_data, row_data.y))