In [None]:
import warnings
warnings.filterwarnings("ignore")

import time
import copy
import pickle

import numpy as np
import pandas as pd
# `display` and `HTML` are imported from the public `IPython.display` module;
# the former `IPython.core.display` path is deprecated (IPython >= 7.14 warns
# about it) and is not guaranteed to keep exporting these names.
from IPython.display import display, HTML
# Inject CSS that sets the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.metrics import mean_squared_error, r2_score, make_scorer

import deepchem as dc

In [None]:
def rmse(estimator, X_eval, y_eval):
    """Root-mean-squared error of ``estimator`` on an evaluation set.

    Parameters
    ----------
    estimator : object exposing ``predict(X_eval)``.
    X_eval : inputs handed unchanged to ``estimator.predict``.
    y_eval : reference targets; must provide ``.flatten()``.

    Returns
    -------
    Square root of the mean squared error between the flattened targets
    and the flattened predictions.
    """
    predicted = estimator.predict(X_eval)
    observed_flat = y_eval.flatten()
    predicted_flat = predicted.flatten()
    mse = mean_squared_error(observed_flat, predicted_flat)
    return np.sqrt(mse)


def r2(estimator, X_eval, y_eval):
    """Coefficient of determination of ``estimator`` on an evaluation set.

    Predictions come from ``estimator.predict(X_eval)``; both the targets
    and the predictions are flattened before being passed to ``r2_score``.
    """
    predicted = estimator.predict(X_eval)
    observed_flat = y_eval.flatten()
    predicted_flat = predicted.flatten()
    return r2_score(observed_flat, predicted_flat)


def peason_r(estimator, X_eval, y_eval):
    """Pearson correlation between targets and the estimator's predictions.

    Parameters
    ----------
    estimator : object exposing ``predict(X_eval)``.
    X_eval : inputs handed unchanged to ``estimator.predict``.
    y_eval : reference targets; must provide ``.flatten()``.

    Returns
    -------
    The off-diagonal entry of ``np.corrcoef`` computed on the flattened
    targets and flattened predictions.
    """
    predicted = estimator.predict(X_eval)
    observed_flat = y_eval.flatten()
    predicted_flat = predicted.flatten()
    corr_matrix = np.corrcoef(observed_flat, predicted_flat)
    return corr_matrix[0, 1]


def peason_r_metric(y_true, y_pred):
    """Pearson correlation between two arrays of targets and predictions.

    Both inputs are flattened first; the result is the off-diagonal entry
    of the 2x2 matrix returned by ``np.corrcoef``.
    """
    true_flat = y_true.flatten()
    pred_flat = y_pred.flatten()
    corr_matrix = np.corrcoef(true_flat, pred_flat)
    return corr_matrix[0, 1]

# Scorer object built from `peason_r_metric` with sklearn's `make_scorer`
# (default arguments). Not referenced by the cells visible in this notebook.
peason_r_score = make_scorer(peason_r_metric)

# Create train/test datasets

In [None]:
# One-off preprocessing: split the labelled descriptor table into the
# train/test CSVs read by the dataset-loading cell.
# The cell used to be switched off with a bare `assert False`, which aborts
# "Restart & Run All" with an AssertionError. An explicit opt-in flag keeps
# the cell inert by default without raising; set it to True to rewrite the CSVs.
REGENERATE_SPLIT_CSVS = False

if REGENERATE_SPLIT_CSVS:
    pdXY = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed_std.csv")

    PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]

    # Rows flagged "test" form the held-out set.
    ii = pdXY["train_test"] == "test"
    pdXY.loc[ii, PDY_COLS].to_csv("data/process/pdY_test.csv", index=False)

    # Rows flagged "train" or "val" are written together as the training set.
    jj = pdXY["train_test"].isin(["train", "val"])
    pdXY.loc[jj, PDY_COLS].to_csv("data/process/pdY_train.csv", index=False)


In [None]:
# Single regression target, read from the "dG" column of both CSVs.
tasks = ["dG"]
ntasks = len(tasks)

# CSVs holding the train and test rows.
train_file = "data/process/pdY_train.csv"
test_file = "data/process/pdY_test.csv"

# Molecules are taken from the "smiles" column and featurized with
# ConvMolFeaturizer by the CSV loader.
featurizer_func = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=tasks,
    featurizer=featurizer_func,
    feature_field="smiles",
)

train_dataset = loader.create_dataset(train_file)
test_dataset = loader.create_dataset(test_file)

In [None]:
# Show the ids of the loaded training dataset as the cell output.
train_dataset.ids

# Default

In [None]:
# Baseline: GraphConvModel with only the task count, mode and checkpoint
# directory specified; trained for 50 epochs on the training set.
model = dc.models.GraphConvModel(
    ntasks,
    model_dir="models/graph/gconv_default",
    mode="regression",
)
model.fit(train_dataset, nb_epoch=50)

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Rebuild the baseline model on the same checkpoint directory and call
# restore() so the metrics below come from the saved weights.
model = dc.models.GraphConvModel(
    ntasks,
    model_dir="models/graph/gconv_default",
    mode="regression",
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

## `graph_conv_layers = [64, 64, 64]`, `dense_layer_size=128`, `dropout=0`

In [None]:
# Settings for the gconv_01 run; the model cells that follow read these names.
model_dir = "models/graph/gconv_01"
batch_size = 64
# NOTE(review): 600 is presumably the (approximate) number of training
# molecules — TODO confirm against len(train_dataset). The division yields a
# float (9.375), not an integer batch count.
batches_per_epoch = 600/batch_size
# Learning-rate schedule built from 0.0001, 0.9 and `batches_per_epoch`
# (presumably initial rate, decay factor and decay steps — confirm against
# the deepchem ExponentialDecay signature).
learning_rate = dc.models.optimizers.ExponentialDecay(0.0001, 0.9, batches_per_epoch)


In [None]:
# Train the [64, 64, 64] / dense-128 graph-conv model.
# `model_dir`, `batch_size` and `learning_rate` come from the configuration
# cell directly above; they used to be re-declared here verbatim, which
# invited the two copies drifting apart.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[64, 64, 64],
    dense_layer_size=128,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=batch_size,
    learning_rate=learning_rate,
    model_dir=model_dir,
)

# RMSE monitoring callback (second argument is the interval, 100).
# `metrics` is passed as a list, the form ValidationCallback documents.
# NOTE(review): the callback evaluates on `test_dataset`, so the held-out
# test set is observed during training — consider a separate validation split.
metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Rebuild the [64, 64, 64] / dense-128 model with the same settings and
# checkpoint directory, then call restore() to load the saved weights.
model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[64, 64, 64],
    dense_layer_size=128,
    dropout=0.0,
    batch_normalize=True,
    batch_size=batch_size,
    learning_rate=learning_rate,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

## `graph_conv_layers = [128, 128, 128, 128, 128]`, `dense_layer_size=256`, `dropout=0.0`

In [None]:
# Checkpoint directory passed as `model_dir=` by the model cells of this section.
model_dir = "models/graph/gconv_02"

In [None]:
# Train the five-layer (128 units each) / dense-256 graph-conv model for
# 100 epochs, checkpointing into `model_dir`.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[128, 128, 128, 128, 128],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    mode='regression',
    batch_size=64,
    model_dir=model_dir,
)

# RMSE monitoring callback (second argument is the interval, 100).
# `metrics` is passed as a list, the form ValidationCallback documents.
# NOTE(review): the callback evaluates on `test_dataset`, so the held-out
# test set is observed during training — consider a separate validation split.
metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=100, callbacks=callback)

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Rebuild the five-layer (128 units each) / dense-256 model on the same
# checkpoint directory, then call restore() to load the saved weights.
model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[128, 128, 128, 128, 128],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    batch_size=64,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

## `graph_conv_layers = [512, 512, 512]`, `dense_layer_size=512`, `dropout=0.0`

In [None]:
# Checkpoint directory passed as `model_dir=` by the model cells of this section.
model_dir = "models/graph/gconv_03"

In [None]:
# Train the [512, 512, 512] / dense-512 graph-conv model for 50 epochs,
# checkpointing into `model_dir`.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[512, 512, 512],
    dense_layer_size=512,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=64,
    model_dir=model_dir,
)

# RMSE monitoring callback (second argument is the interval, 100).
# `metrics` is passed as a list, the form ValidationCallback documents.
# NOTE(review): the callback evaluates on `test_dataset`, so the held-out
# test set is observed during training — consider a separate validation split.
metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Rebuild the [512, 512, 512] / dense-512 model on the same checkpoint
# directory, then call restore() to load the saved weights.
model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[512, 512, 512],
    dense_layer_size=512,
    dropout=0.0,
    batch_normalize=True,
    batch_size=64,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

# `graph_conv_layers = [256, 256, 256]`, `dense_layer_size=256`, `dropout=0.0`

In [None]:
# Checkpoint directory passed as `model_dir=` by the model cells of this section.
model_dir = "models/graph/gconv_04"

In [None]:
# Train the [256, 256, 256] / dense-256 graph-conv model for 50 epochs,
# checkpointing into `model_dir`.
model = dc.models.GraphConvModel(
    ntasks,
    graph_conv_layers=[256, 256, 256],
    dense_layer_size=256,
    dropout=0.,
    batch_normalize=True,
    mode='regression',
    batch_size=64,
    model_dir=model_dir,
)

# RMSE monitoring callback (second argument is the interval, 100).
# `metrics` is passed as a list, the form ValidationCallback documents.
# NOTE(review): the callback evaluates on `test_dataset`, so the held-out
# test set is observed during training — consider a separate validation split.
metric = dc.metrics.Metric(dc.metrics.rms_score)
callback = dc.models.ValidationCallback(test_dataset, 100, [metric])
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Rebuild the [256, 256, 256] / dense-256 model on the same checkpoint
# directory, then call restore() to load the saved weights.
model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[256, 256, 256],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    batch_size=64,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

# Predict on the test set

In [None]:
# Model used for the test-set predictions: the [256, 256, 256] / dense-256
# configuration, rebuilt on the current `model_dir` and restored from it.
model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[256, 256, 256],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    batch_size=64,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Two-column table: first column of the test targets ("dG") next to the first
# column of the model's predictions ("pred"), written to CSV without the index.
df00 = pd.DataFrame(
    {
        "dG": test_dataset.y[:, 0],
        "pred": model.predict(test_dataset)[:, 0],
    }
)
df00.to_csv("results/graph_conv/test_pred.csv", index=False)
print(df00.shape)
df00.head()

# Predict on chembl_27

In [None]:
# One-off preprocessing: extract the label columns of the chembl_27 descriptor
# table into the CSV read by the loading cell below.
# The cell used to be switched off with a bare `assert False`, which aborts
# "Restart & Run All" with an AssertionError. An explicit opt-in flag keeps
# the cell inert by default without raising; set it to True to rewrite the CSV.
REGENERATE_CHEMBL_27_CSV = False

if REGENERATE_CHEMBL_27_CSV:
    pdXY = pd.read_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft_imputed_std.csv")

    PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]

    pdXY[PDY_COLS].to_csv("data/process/pdY_chembl_27.csv", index=False)

In [None]:
# Single regression target, read from the "dG" column of the CSV.
tasks = ["dG"]
ntasks = len(tasks)

chembl_27_file = "data/process/pdY_chembl_27.csv"

# Molecules are taken from the "smiles" column and featurized with
# ConvMolFeaturizer by the CSV loader.
featurizer_func = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=tasks,
    featurizer=featurizer_func,
    feature_field="smiles",
)

chembl_27_dataset = loader.create_dataset(chembl_27_file)

In [None]:
# Persist the featurized chembl_27 dataset under data/process/.
# `DiskDataset.save_to_disk()` takes no path argument (it only rewrites the
# metadata in the dataset's current directory), so the previous call raised a
# TypeError. `move()` relocates the dataset's files to the target directory.
# NOTE(review): move() is expected to replace an existing directory at that
# path — confirm against the installed deepchem version.
chembl_27_dataset.move("data/process/chembl_27_dataset")

In [None]:
# Model used for the chembl_27 predictions: the [256, 256, 256] / dense-256
# configuration, rebuilt on the gconv_04 checkpoint directory and restored.
model_dir = "models/graph/gconv_04"

model = dc.models.GraphConvModel(
    ntasks,
    mode="regression",
    graph_conv_layers=[256, 256, 256],
    dense_layer_size=256,
    dropout=0.0,
    batch_normalize=True,
    batch_size=64,
    model_dir=model_dir,
)
model.restore()

# Report RMSE and Pearson's R on the train split, then on the test split.
for rmse_label, r_label, eval_set in (
    ("Train RMSE:", "Train Pearson's R:", train_dataset),
    ("Test RMSE:", "Test Pearson's R:", test_dataset),
):
    print(rmse_label, rmse(model, eval_set, eval_set.y))
    print(r_label, peason_r(model, eval_set, eval_set.y))

In [None]:
# Reload the chembl_27 label CSV and preview it.
# `display_df` is neither defined nor imported in the visible cells, so the
# previous call raised a NameError on a fresh kernel; ending the cell with the
# frame itself renders the same rich preview.
pdY_chembl_27 = pd.read_csv(chembl_27_file)
print(pdY_chembl_27.shape)
pdY_chembl_27.head()